Implemented finer grained neighbor list. Also various minor optimizations.

e04283fe · Peter Eastman · 56d7ed02 · e04283fe · e04283fe · e04283fe
Commit e04283fe authored Mar 10, 2009 by Peter Eastman
12 changed files
--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -47,7 +47,7 @@ using namespace std;

 static void calcForces(OpenMMContextImpl& context, CudaPlatform::PlatformData& data) {
    _gpuContext* gpu = data.gpu;
-    if (data.stepCount%100 == 0)
+    if (data.nonbondedMethod != NO_CUTOFF && data.stepCount%100 == 0)
        gpuReorderAtoms(gpu);
    data.stepCount++;
    kClearForces(gpu);

--- a/platforms/cuda/src/kernels/gpu.cpp
+++ b/platforms/cuda/src/kernels/gpu.cpp
@@ -3127,10 +3127,10 @@ void gpuReorderAtoms(gpuContext gpu)

        // Select a bin for each molecule, then sort them by bin.

-        bool useHilbert = (numMolecules > 5000); // For small systems, a simple zigzag curve works better than a Hilbert curve.
+        bool useHilbert = (numMolecules > 5000 || atoms.size() > 8); // For small systems, a simple zigzag curve works better than a Hilbert curve.
        float binWidth;
        if (useHilbert)
-        binWidth = max(max(maxx-minx, maxy-miny), maxz-minz)/255.0;
+            binWidth = max(max(maxx-minx, maxy-miny), maxz-minz)/255.0;
        else
            binWidth = 0.2*sqrt(gpu->sim.nonbondedCutoffSqr);
        int xbins = 1 + (int) ((maxx-minx)/binWidth);

--- a/platforms/cuda/src/kernels/kCalculateCDLJForces.cu
+++ b/platforms/cuda/src/kernels/kCalculateCDLJForces.cu
@@ -140,12 +140,14 @@ void kCalculateCDLJForces(gpuContext gpu)
            }
            gpu->psInteractionCount->Download();
            numWithInteractions = gpu->psInteractionCount->_pSysData[0];
+            kFindInteractionsWithinBlocksCutoff_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                    sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
            if (gpu->bOutputBufferPerWarp)
                kCalculateCDLJCutoffByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
+                        (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
            else
                kCalculateCDLJCutoffForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
+                        (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
            LAUNCHERROR("kCalculateCDLJCutoffForces");
            break;
        case PERIODIC:
@@ -162,12 +164,14 @@ void kCalculateCDLJForces(gpuContext gpu)
            }
            gpu->psInteractionCount->Download();
            numWithInteractions = gpu->psInteractionCount->_pSysData[0];
+            kFindInteractionsWithinBlocksPeriodic_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                    sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
            if (gpu->bOutputBufferPerWarp)
                kCalculateCDLJPeriodicByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
+                        (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
            else
                kCalculateCDLJPeriodicForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
+                        (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
            LAUNCHERROR("kCalculateCDLJPeriodicForces");
    }
 }
\ No newline at end of file
--- a/platforms/cuda/src/kernels/kCalculateCDLJForces.h
+++ b/platforms/cuda/src/kernels/kCalculateCDLJForces.h
@@ -42,6 +42,9 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
    unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
    int pos = warp*numWorkUnits/totalWarps;
    int end = (warp+1)*numWorkUnits/totalWarps;
+#ifdef USE_CUTOFF
+    float3* tempBuffer = (float3*) &sA[cSim.nonbond_threads_per_block];
+#endif

    int lasty = -1;
    while (pos < end)
@@ -210,47 +213,142 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
            apos.w                 *= cSim.epsfac;
            if (!bExclusionFlag)
            {
-                for (unsigned int j = 0; j < GRID; j++)
+#ifdef USE_CUTOFF
+                unsigned int flags = cSim.pInteractionFlag[pos];
+                if (flags == 0)
                {
-                    dx              = psA[tj].x - apos.x;
-                    dy              = psA[tj].y - apos.y;
-                    dz              = psA[tj].z - apos.z;
+                    // No interactions in this block.
+                }
+                else if (flags == 0xFFFFFFFF)
+#endif
+                {
+                    // Compute all interactions within this block.
+
+                    for (unsigned int j = 0; j < GRID; j++)
+                    {
+                        dx              = psA[tj].x - apos.x;
+                        dy              = psA[tj].y - apos.y;
+                        dz              = psA[tj].z - apos.z;
 #ifdef USE_PERIODIC
-                    dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
-                    dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
-                    dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
+                        dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
+                        dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
+                        dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
 #endif
-                    r2              = dx * dx + dy * dy + dz * dz;
-                    invR            = 1.0f / sqrt(r2);
-                    sig             = a.x + psA[tj].sig;
-                    sig2            = invR * sig;
-                    sig2           *= sig2;
-                    sig6            = sig2 * sig2 * sig2;
-                    eps             = a.y * psA[tj].eps;
-                    dEdR            = eps * (12.0f * sig6 - 6.0f) * sig6;
+                        r2              = dx * dx + dy * dy + dz * dz;
+                        invR            = 1.0f / sqrt(r2);
+                        sig             = a.x + psA[tj].sig;
+                        sig2            = invR * sig;
+                        sig2           *= sig2;
+                        sig6            = sig2 * sig2 * sig2;
+                        eps             = a.y * psA[tj].eps;
+                        dEdR            = eps * (12.0f * sig6 - 6.0f) * sig6;
 #ifdef USE_CUTOFF
-                    dEdR           += apos.w * psA[tj].q * (invR - 2.0f * cSim.reactionFieldK * r2);
+                        dEdR           += apos.w * psA[tj].q * (invR - 2.0f * cSim.reactionFieldK * r2);
 #else
-                    dEdR           += apos.w * psA[tj].q * invR;
+                        dEdR           += apos.w * psA[tj].q * invR;
 #endif
-                    dEdR           *= invR * invR;
+                        dEdR           *= invR * invR;
 #ifdef USE_CUTOFF
-                    if (r2 > cSim.nonbondedCutoffSqr)
-                    {
-                        dEdR = 0.0f;
+                        if (r2 > cSim.nonbondedCutoffSqr)
+                        {
+                            dEdR = 0.0f;
+                        }
+#endif
+                        dx             *= dEdR;
+                        dy             *= dEdR;
+                        dz             *= dEdR;
+                        af.x           -= dx;
+                        af.y           -= dy;
+                        af.z           -= dz;
+                        psA[tj].fx     += dx;
+                        psA[tj].fy     += dy;
+                        psA[tj].fz     += dz;
+                        tj              = (tj + 1) & (GRID - 1);
                    }
+                }
+#ifdef USE_CUTOFF
+                else
+                {
+                    // Compute only a subset of the interactions in this block.
+
+                    for (unsigned int j = 0; j < GRID; j++)
+                    {
+                        if ((flags&(1<<j)) != 0)
+                        {
+                            dx              = psA[j].x - apos.x;
+                            dy              = psA[j].y - apos.y;
+                            dz              = psA[j].z - apos.z;
+#ifdef USE_PERIODIC
+                            dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
+                            dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
+                            dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
 #endif
-                    dx             *= dEdR;
-                    dy             *= dEdR;
-                    dz             *= dEdR;
-                    af.x           -= dx;
-                    af.y           -= dy;
-                    af.z           -= dz;
-                    psA[tj].fx     += dx;
-                    psA[tj].fy     += dy;
-                    psA[tj].fz     += dz;
-                    tj              = (tj + 1) & (GRID - 1);
+                            r2              = dx * dx + dy * dy + dz * dz;
+                            invR            = 1.0f / sqrt(r2);
+                            sig             = a.x + psA[j].sig;
+                            sig2            = invR * sig;
+                            sig2           *= sig2;
+                            sig6            = sig2 * sig2 * sig2;
+                            eps             = a.y * psA[j].eps;
+                            dEdR            = eps * (12.0f * sig6 - 6.0f) * sig6;
+#ifdef USE_CUTOFF
+                            dEdR           += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2);
+#else
+                            dEdR           += apos.w * psA[j].q * invR;
+#endif
+                            dEdR           *= invR * invR;
+#ifdef USE_CUTOFF
+                            if (r2 > cSim.nonbondedCutoffSqr)
+                            {
+                                dEdR = 0.0f;
+                            }
+#endif
+                            dx             *= dEdR;
+                            dy             *= dEdR;
+                            dz             *= dEdR;
+                            af.x           -= dx;
+                            af.y           -= dy;
+                            af.z           -= dz;
+                            tempBuffer[threadIdx.x].x = dx;
+                            tempBuffer[threadIdx.x].y = dy;
+                            tempBuffer[threadIdx.x].z = dz;
+
+                            // Sum the forces on atom j.
+
+                            if (tgx % 2 == 0)
+                            {
+                                tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+1].x;
+                                tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+1].y;
+                                tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+1].z;
+                            }
+                            if (tgx % 4 == 0)
+                            {
+                                tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+2].x;
+                                tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+2].y;
+                                tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+2].z;
+                            }
+                            if (tgx % 8 == 0)
+                            {
+                                tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+4].x;
+                                tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+4].y;
+                                tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+4].z;
+                            }
+                            if (tgx % 16 == 0)
+                            {
+                                tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+8].x;
+                                tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+8].y;
+                                tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+8].z;
+                            }
+                            if (tgx == 0)
+                            {
+                                psA[j].fx += tempBuffer[threadIdx.x].x + tempBuffer[threadIdx.x+16].x;
+                                psA[j].fy += tempBuffer[threadIdx.x].y + tempBuffer[threadIdx.x+16].y;
+                                psA[j].fz += tempBuffer[threadIdx.x].z + tempBuffer[threadIdx.x+16].z;
+                            }
+                        }
+                    }
                }
+#endif
            }
            else  // bExclusion
            {

--- a/platforms/cuda/src/kernels/kCalculateCDLJObcGbsaForces1.cu
+++ b/platforms/cuda/src/kernels/kCalculateCDLJObcGbsaForces1.cu
@@ -106,6 +106,8 @@ extern __global__ void kFindBlockBoundsCutoff_kernel();
 extern __global__ void kFindBlockBoundsPeriodic_kernel();
 extern __global__ void kFindBlocksWithInteractionsCutoff_kernel();
 extern __global__ void kFindBlocksWithInteractionsPeriodic_kernel();
+extern __global__ void kFindInteractionsWithinBlocksCutoff_kernel(unsigned int*, int);
+extern __global__ void kFindInteractionsWithinBlocksPeriodic_kernel(unsigned int*, int);

 void kCalculateCDLJObcGbsaForces1(gpuContext gpu)
 {
@@ -145,19 +147,21 @@ void kCalculateCDLJObcGbsaForces1(gpuContext gpu)
                exit(-1);
            }
            gpu->psInteractionCount->Download();
+            numWithInteractions = gpu->psInteractionCount->_pSysData[0];
+            kFindInteractionsWithinBlocksCutoff_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                    sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
            if (gpu->bRecalculateBornRadii)
            {
                kCalculateObcGbsaBornSum(gpu);
                kReduceObcGbsaBornSum(gpu);
            }
-            numWithInteractions = gpu->psInteractionCount->_pSysData[0];
            if (gpu->bOutputBufferPerWarp)
                kCalculateCDLJObcGbsaCutoffByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
+                        (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
            else
                kCalculateCDLJObcGbsaCutoffForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
-            LAUNCHERROR("kCalculateCDLJCutoffForces");
+                        (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
+            LAUNCHERROR("kCalculateCDLJObcGbsaCutoffForces1");
            break;
        case PERIODIC:
            kFindBlockBoundsPeriodic_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
@@ -172,19 +176,21 @@ void kCalculateCDLJObcGbsaForces1(gpuContext gpu)
                exit(-1);
            }
            gpu->psInteractionCount->Download();
+            numWithInteractions = gpu->psInteractionCount->_pSysData[0];
+            kFindInteractionsWithinBlocksPeriodic_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                    sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
            if (gpu->bRecalculateBornRadii)
            {
                kCalculateObcGbsaBornSum(gpu);
                kReduceObcGbsaBornSum(gpu);
            }
-            numWithInteractions = gpu->psInteractionCount->_pSysData[0];
            if (gpu->bOutputBufferPerWarp)
                kCalculateCDLJObcGbsaPeriodicByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
+                        (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
            else
                kCalculateCDLJObcGbsaPeriodicForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
-            LAUNCHERROR("kCalculateCDLJPeriodicForces");
+                        (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
+            LAUNCHERROR("kCalculateCDLJObcGbsaPeriodicForces1");
            break;
    }
 }
--- a/platforms/cuda/src/kernels/kCalculateCDLJObcGbsaForces1.h
+++ b/platforms/cuda/src/kernels/kCalculateCDLJObcGbsaForces1.h
@@ -42,6 +42,9 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int*
    unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
    int pos = warp*numWorkUnits/totalWarps;
    int end = (warp+1)*numWorkUnits/totalWarps;
+#ifdef USE_CUTOFF
+    float* tempBuffer = (float*) &sA[cSim.nonbond_threads_per_block];
+#endif

    int lasty = -1;
    while (pos < end)
@@ -241,63 +244,184 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int*
            apos.w                     *= cSim.epsfac;
            if (!bExclusionFlag)
            {
-                for (int j = 0; j < GRID; j++)
+#ifdef USE_CUTOFF
+                unsigned int flags = cSim.pInteractionFlag[pos];
+                if (flags == 0)
                {
-                    float dx                = psA[tj].x - apos.x;
-                    float dy                = psA[tj].y - apos.y;
-                    float dz                = psA[tj].z - apos.z;
+                    // No interactions in this block.
+                }
+                else if (flags == 0xFFFFFFFF)
+#endif
+                {
+                    // Compute all interactions within this block.
+
+                    for (int j = 0; j < GRID; j++)
+                    {
+                        float dx                = psA[tj].x - apos.x;
+                        float dy                = psA[tj].y - apos.y;
+                        float dz                = psA[tj].z - apos.z;
 #ifdef USE_PERIODIC
-                    dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
-                    dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
-                    dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
+                        dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
+                        dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
+                        dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
 #endif
-                    float r2                = dx * dx + dy * dy + dz * dz;
+                        float r2                = dx * dx + dy * dy + dz * dz;

-                    // CDLJ part
-                    float invR              = 1.0f / sqrt(r2);
-                    float sig               = a.x + psA[tj].sig;
-                    float sig2              = invR * sig;
-                    sig2                   *= sig2;
-                    float sig6              = sig2 * sig2 * sig2;
-                    float eps               = a.y * psA[tj].eps;
-                    float dEdR              = eps * (12.0f * sig6 - 6.0f) * sig6;
+                        // CDLJ part
+                        float invR              = 1.0f / sqrt(r2);
+                        float sig               = a.x + psA[tj].sig;
+                        float sig2              = invR * sig;
+                        sig2                   *= sig2;
+                        float sig6              = sig2 * sig2 * sig2;
+                        float eps               = a.y * psA[tj].eps;
+                        float dEdR              = eps * (12.0f * sig6 - 6.0f) * sig6;
 #ifdef USE_CUTOFF
-                    dEdR                   += apos.w * psA[tj].q * (invR - 2.0f * cSim.reactionFieldK * r2);
+                        dEdR                   += apos.w * psA[tj].q * (invR - 2.0f * cSim.reactionFieldK * r2);
 #else
-                    dEdR                   += apos.w * psA[tj].q * invR;
+                        dEdR                   += apos.w * psA[tj].q * invR;
 #endif
-                    dEdR                   *= invR * invR;
+                        dEdR                   *= invR * invR;

-                    // ObcGbsaForce1 part
-                    float alpha2_ij         = br * psA[tj].br;
-                    float D_ij              = r2 / (4.0f * alpha2_ij);
-                    float expTerm           = exp(-D_ij);
-                    float denominator2      = r2 + alpha2_ij * expTerm;
-                    float denominator       = sqrt(denominator2);
-                    float Gpol              = (q2 * psA[tj].q) / (denominator * denominator2);
-                    float dGpol_dalpha2_ij  = -0.5f * Gpol * expTerm * (1.0f + D_ij);
-                    af.w                   += dGpol_dalpha2_ij * psA[tj].br;
-                    psA[tj].fb             += dGpol_dalpha2_ij * br;
-                    dEdR                   += Gpol * (1.0f - 0.25f * expTerm);
+                        // ObcGbsaForce1 part
+                        float alpha2_ij         = br * psA[tj].br;
+                        float D_ij              = r2 / (4.0f * alpha2_ij);
+                        float expTerm           = exp(-D_ij);
+                        float denominator2      = r2 + alpha2_ij * expTerm;
+                        float denominator       = sqrt(denominator2);
+                        float Gpol              = (q2 * psA[tj].q) / (denominator * denominator2);
+                        float dGpol_dalpha2_ij  = -0.5f * Gpol * expTerm * (1.0f + D_ij);
+                        af.w                   += dGpol_dalpha2_ij * psA[tj].br;
+                        psA[tj].fb             += dGpol_dalpha2_ij * br;
+                        dEdR                   += Gpol * (1.0f - 0.25f * expTerm);
 #ifdef USE_CUTOFF
-                    if (r2 > cSim.nonbondedCutoffSqr)
-                    {
-                        dEdR = 0.0f;
+                        if (r2 > cSim.nonbondedCutoffSqr)
+                        {
+                            dEdR = 0.0f;
+                        }
+#endif
+
+                        // Add forces
+                        dx                     *= dEdR;
+                        dy                     *= dEdR;
+                        dz                     *= dEdR;
+                        af.x                   -= dx;
+                        af.y                   -= dy;
+                        af.z                   -= dz;
+                        psA[tj].fx             += dx;
+                        psA[tj].fy             += dy;
+                        psA[tj].fz             += dz;
+                        tj                      = (tj + 1) & (GRID - 1);
                    }
+                }
+#ifdef USE_CUTOFF
+                else
+                {
+                    // Compute only a subset of the interactions in this block.
+
+                    for (int j = 0; j < GRID; j++)
+                    {
+                        if ((flags&(1<<j)) != 0)
+                        {
+                            float dx                = psA[j].x - apos.x;
+                            float dy                = psA[j].y - apos.y;
+                            float dz                = psA[j].z - apos.z;
+#ifdef USE_PERIODIC
+                            dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
+                            dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
+                            dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
 #endif
+                            float r2                = dx * dx + dy * dy + dz * dz;

-                    // Add forces
-                    dx                     *= dEdR;
-                    dy                     *= dEdR;
-                    dz                     *= dEdR;
-                    af.x                   -= dx;
-                    af.y                   -= dy;
-                    af.z                   -= dz;
-                    psA[tj].fx             += dx;
-                    psA[tj].fy             += dy;
-                    psA[tj].fz             += dz;
-                    tj                      = (tj + 1) & (GRID - 1);
+                            // CDLJ part
+                            float invR              = 1.0f / sqrt(r2);
+                            float sig               = a.x + psA[j].sig;
+                            float sig2              = invR * sig;
+                            sig2                   *= sig2;
+                            float sig6              = sig2 * sig2 * sig2;
+                            float eps               = a.y * psA[j].eps;
+                            float dEdR              = eps * (12.0f * sig6 - 6.0f) * sig6;
+#ifdef USE_CUTOFF
+                            dEdR                   += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2);
+#else
+                            dEdR                   += apos.w * psA[j].q * invR;
+#endif
+                            dEdR                   *= invR * invR;
+
+                            // ObcGbsaForce1 part
+                            float alpha2_ij         = br * psA[j].br;
+                            float D_ij              = r2 / (4.0f * alpha2_ij);
+                            float expTerm           = exp(-D_ij);
+                            float denominator2      = r2 + alpha2_ij * expTerm;
+                            float denominator       = sqrt(denominator2);
+                            float Gpol              = (q2 * psA[j].q) / (denominator * denominator2);
+                            float dGpol_dalpha2_ij  = -0.5f * Gpol * expTerm * (1.0f + D_ij);
+                            af.w                   += dGpol_dalpha2_ij * psA[j].br;
+                            dEdR                   += Gpol * (1.0f - 0.25f * expTerm);
+
+                            // Sum the Born forces.
+
+                            tempBuffer[threadIdx.x] = dGpol_dalpha2_ij * br;
+                            if (tgx % 2 == 0)
+                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+1];
+                            if (tgx % 4 == 0)
+                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+2];
+                            if (tgx % 8 == 0)
+                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+4];
+                            if (tgx % 16 == 0)
+                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+8];
+                            if (tgx == 0)
+                                psA[j].fb += tempBuffer[threadIdx.x] + tempBuffer[threadIdx.x+16];
+#ifdef USE_CUTOFF
+                            if (r2 > cSim.nonbondedCutoffSqr)
+                            {
+                                dEdR = 0.0f;
+                            }
+#endif
+
+                            // Add forces
+                            dx                     *= dEdR;
+                            dy                     *= dEdR;
+                            dz                     *= dEdR;
+                            af.x                   -= dx;
+                            af.y                   -= dy;
+                            af.z                   -= dz;
+                            tempBuffer[threadIdx.x] = dx;
+                            if (tgx % 2 == 0)
+                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+1];
+                            if (tgx % 4 == 0)
+                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+2];
+                            if (tgx % 8 == 0)
+                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+4];
+                            if (tgx % 16 == 0)
+                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+8];
+                            if (tgx == 0)
+                                psA[j].fx += tempBuffer[threadIdx.x] + tempBuffer[threadIdx.x+16];
+                            tempBuffer[threadIdx.x] = dy;
+                            if (tgx % 2 == 0)
+                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+1];
+                            if (tgx % 4 == 0)
+                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+2];
+                            if (tgx % 8 == 0)
+                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+4];
+                            if (tgx % 16 == 0)
+                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+8];
+                            if (tgx == 0)
+                                psA[j].fy += tempBuffer[threadIdx.x] + tempBuffer[threadIdx.x+16];
+                            tempBuffer[threadIdx.x] = dz;
+                            if (tgx % 2 == 0)
+                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+1];
+                            if (tgx % 4 == 0)
+                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+2];
+                            if (tgx % 8 == 0)
+                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+4];
+                            if (tgx % 16 == 0)
+                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+8];
+                            if (tgx == 0)
+                                psA[j].fz += tempBuffer[threadIdx.x] + tempBuffer[threadIdx.x+16];
+                        }
+                    }
                }
+#endif
            }
            else  // bExclusion
            {

--- a/platforms/cuda/src/kernels/kCalculateObcGbsaBornSum.cu
+++ b/platforms/cuda/src/kernels/kCalculateObcGbsaBornSum.cu
@@ -187,19 +187,19 @@ void kCalculateObcGbsaBornSum(gpuContext gpu)
            numWithInteractions = gpu->psInteractionCount->_pSysData[0];
            if (gpu->bOutputBufferPerWarp)
                kCalculateObcGbsaCutoffByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
+                        (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
            else
                kCalculateObcGbsaCutoffBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
+                        (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
            break;
        case PERIODIC:
            numWithInteractions = gpu->psInteractionCount->_pSysData[0];
            if (gpu->bOutputBufferPerWarp)
                kCalculateObcGbsaPeriodicByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
+                        (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
            else
                kCalculateObcGbsaPeriodicBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
+                        (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
            break;
    }
    LAUNCHERROR("kCalculateBornSum");

--- a/platforms/cuda/src/kernels/kCalculateObcGbsaBornSum.h
+++ b/platforms/cuda/src/kernels/kCalculateObcGbsaBornSum.h
@@ -43,6 +43,9 @@ __global__ void METHOD_NAME(kCalculateObcGbsa, BornSum_kernel)(unsigned int* wor
 #ifdef USE_OUTPUT_BUFFER_PER_WARP
    unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
 #endif
+#ifdef USE_CUTOFF
+    float* tempBuffer = (float*) &sA[cSim.nonbond_threads_per_block];
+#endif

    while (pos >= 0)
    {
@@ -107,8 +110,8 @@ __global__ void METHOD_NAME(kCalculateObcGbsa, BornSum_kernel)(unsigned int* wor
                                         (0.50f * rInverse * ratio) +
                                         (0.25f * psA[j].sr * psA[j].sr * rInverse) *
                                         (l_ij2 - u_ij2);
-
-                        if (ar.x < (psA[j].r - r))
+                        float rj = psA[j].r;
+                        if (ar.x < (rj - r))
                        {
                            apos.w += 2.0f * ((1.0f / ar.x) - l_ij);
                        }
@@ -142,69 +145,169 @@ __global__ void METHOD_NAME(kCalculateObcGbsa, BornSum_kernel)(unsigned int* wor
            sA[threadIdx.x].sr              = temp1.y;
            sA[threadIdx.x].sum = apos.w    = 0.0f;

-            for (unsigned int j = 0; j < GRID; j++)
+#ifdef USE_CUTOFF
+            unsigned int flags = cSim.pInteractionFlag[pos + (blockIdx.x*workUnits)/gridDim.x];
+            if (flags == 0)
+            {
+                // No interactions in this block.
+            }
+            else if (flags == 0xFFFFFFFF)
+#endif
            {
-                dx                      = psA[tj].x - apos.x;
-                dy                      = psA[tj].y - apos.y;
-                dz                      = psA[tj].z - apos.z;
+                // Compute all interactions within this block.
+
+                for (unsigned int j = 0; j < GRID; j++)
+                {
+                    dx                      = psA[tj].x - apos.x;
+                    dy                      = psA[tj].y - apos.y;
+                    dz                      = psA[tj].z - apos.z;
 #ifdef USE_PERIODIC
-                dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
-                dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
-                dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
+                    dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
+                    dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
+                    dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
 #endif
-                r2                      = dx * dx + dy * dy + dz * dz;
+                    r2                      = dx * dx + dy * dy + dz * dz;
 #ifdef USE_PERIODIC
-                if (i < cSim.atoms && y+tj < cSim.atoms && r2 < cSim.nonbondedCutoffSqr)
+                    if (i < cSim.atoms && y+tj < cSim.atoms && r2 < cSim.nonbondedCutoffSqr)
 #elif defined USE_CUTOFF
-                if (r2 < cSim.nonbondedCutoffSqr)
+                    if (r2 < cSim.nonbondedCutoffSqr)
 #endif
-                {
-                    r                       = sqrt(r2);
-                    float rInverse          = 1.0f / r;
-                    float rScaledRadiusJ    = r + psA[tj].sr;
-                    if (ar.x < rScaledRadiusJ)
                    {
-                        float l_ij     = 1.0f / max(ar.x, fabs(r - psA[tj].sr));
-                        float u_ij     = 1.0f / rScaledRadiusJ;
-                        float l_ij2    = l_ij * l_ij;
-                        float u_ij2    = u_ij * u_ij;
-                        float ratio    = log(u_ij / l_ij);
-                        float term     = l_ij -
-                                         u_ij +
-                                         0.25f * r * (u_ij2 - l_ij2) +
-                                         (0.50f * rInverse * ratio) +
-                                         (0.25f * psA[tj].sr * psA[tj].sr * rInverse) *
-                                         (l_ij2 - u_ij2);
-                        if (ar.x < (psA[tj].sr - r))
+                        r                       = sqrt(r2);
+                        float rInverse          = 1.0f / r;
+                        float rScaledRadiusJ    = r + psA[tj].sr;
+                        if (ar.x < rScaledRadiusJ)
                        {
-                            term += 2.0f * ((1.0f / ar.x) - l_ij);
+                            float l_ij     = 1.0f / max(ar.x, fabs(r - psA[tj].sr));
+                            float u_ij     = 1.0f / rScaledRadiusJ;
+                            float l_ij2    = l_ij * l_ij;
+                            float u_ij2    = u_ij * u_ij;
+                            float ratio    = log(u_ij / l_ij);
+                            float term     = l_ij -
+                                             u_ij +
+                                             0.25f * r * (u_ij2 - l_ij2) +
+                                             (0.50f * rInverse * ratio) +
+                                             (0.25f * psA[tj].sr * psA[tj].sr * rInverse) *
+                                             (l_ij2 - u_ij2);
+                            float srj = psA[tj].sr;
+                            if (ar.x < (srj - r))
+                            {
+                                term += 2.0f * ((1.0f / ar.x) - l_ij);
+                            }
+                            apos.w        += term;
+                        }
+                        float rScaledRadiusI    = r + ar.y;
+                        if (psA[tj].r < rScaledRadiusI)
+                        {
+                            float l_ij     = 1.0f / max(psA[tj].r, fabs(r - ar.y));
+                            float u_ij     = 1.0f / rScaledRadiusI;
+                            float l_ij2    = l_ij * l_ij;
+                            float u_ij2    = u_ij * u_ij;
+                            float ratio    = log(u_ij / l_ij);
+                            float term     = l_ij -
+                                             u_ij +
+                                             0.25f * r * (u_ij2 - l_ij2) +
+                                             (0.50f * rInverse * ratio) +
+                                             (0.25f * ar.y * ar.y * rInverse) *
+                                             (l_ij2 - u_ij2);
+                            float rj = psA[tj].r;
+                            if (rj < (ar.y - r))
+                            {
+                                term += 2.0f * ((1.0f / psA[tj].r) - l_ij);
+                            }
+                            psA[tj].sum    += term;
                        }
-                        apos.w        += term;
                    }
-                    float rScaledRadiusI    = r + ar.y;
-                    if (psA[tj].r < rScaledRadiusI)
-                    {
-                        float l_ij     = 1.0f / max(psA[tj].r, fabs(r - ar.y));
-                        float u_ij     = 1.0f / rScaledRadiusI;
-                        float l_ij2    = l_ij * l_ij;
-                        float u_ij2    = u_ij * u_ij;
-                        float ratio    = log(u_ij / l_ij);
-                        float term     = l_ij -
-                                         u_ij +
-                                         0.25f * r * (u_ij2 - l_ij2) +
-                                         (0.50f * rInverse * ratio) +
-                                         (0.25f * ar.y * ar.y * rInverse) *
-                                         (l_ij2 - u_ij2);
+                    tj = (tj - 1) & (GRID - 1);
+                }
+            }
+#ifdef USE_CUTOFF
+            else
+            {
+                // Compute only a subset of the interactions in this block.

-                        if (psA[tj].r < (ar.y - r))
+                for (unsigned int j = 0; j < GRID; j++)
+                {
+                    if ((flags&(1<<j)) != 0)
+                    {
+                        tempBuffer[threadIdx.x] = 0.0f;
+                        dx                      = psA[j].x - apos.x;
+                        dy                      = psA[j].y - apos.y;
+                        dz                      = psA[j].z - apos.z;
+#ifdef USE_PERIODIC
+                        dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
+                        dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
+                        dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
+#endif
+                        r2                      = dx * dx + dy * dy + dz * dz;
+#ifdef USE_PERIODIC
+                        if (i < cSim.atoms && y+j < cSim.atoms && r2 < cSim.nonbondedCutoffSqr)
+#elif defined USE_CUTOFF
+                        if (r2 < cSim.nonbondedCutoffSqr)
+#endif
                        {
-                            term += 2.0f * ((1.0f / psA[tj].r) - l_ij);
+                            r                       = sqrt(r2);
+                            float rInverse          = 1.0f / r;
+                            float rScaledRadiusJ    = r + psA[j].sr;
+                            if (ar.x < rScaledRadiusJ)
+                            {
+                                float l_ij     = 1.0f / max(ar.x, fabs(r - psA[j].sr));
+                                float u_ij     = 1.0f / rScaledRadiusJ;
+                                float l_ij2    = l_ij * l_ij;
+                                float u_ij2    = u_ij * u_ij;
+                                float ratio    = log(u_ij / l_ij);
+                                float term     = l_ij -
+                                                 u_ij +
+                                                 0.25f * r * (u_ij2 - l_ij2) +
+                                                 (0.50f * rInverse * ratio) +
+                                                 (0.25f * psA[j].sr * psA[j].sr * rInverse) *
+                                                 (l_ij2 - u_ij2);
+                                float srj = psA[j].sr;
+                                if (ar.x < (srj - r))
+                                {
+                                    term += 2.0f * ((1.0f / ar.x) - l_ij);
+                                }
+                                apos.w        += term;
+                            }
+                            float rScaledRadiusI    = r + ar.y;
+                            if (psA[j].r < rScaledRadiusI)
+                            {
+                                float l_ij     = 1.0f / max(psA[j].r, fabs(r - ar.y));
+                                float u_ij     = 1.0f / rScaledRadiusI;
+                                float l_ij2    = l_ij * l_ij;
+                                float u_ij2    = u_ij * u_ij;
+                                float ratio    = log(u_ij / l_ij);
+                                float term     = l_ij -
+                                                 u_ij +
+                                                 0.25f * r * (u_ij2 - l_ij2) +
+                                                 (0.50f * rInverse * ratio) +
+                                                 (0.25f * ar.y * ar.y * rInverse) *
+                                                 (l_ij2 - u_ij2);
+                                float rj = psA[j].r;
+                                if (rj < (ar.y - r))
+                                {
+                                    term += 2.0f * ((1.0f / psA[j].r) - l_ij);
+                                }
+                                tempBuffer[threadIdx.x] = term;
+                            }
                        }
-                        psA[tj].sum    += term;
+
+                        // Sum the terms.
+
+                        if (tgx % 2 == 0)
+                            tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+1];
+                        if (tgx % 4 == 0)
+                            tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+2];
+                        if (tgx % 8 == 0)
+                            tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+4];
+                        if (tgx % 16 == 0)
+                            tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+8];
+                        if (tgx == 0)
+                            psA[j].sum += tempBuffer[threadIdx.x] + tempBuffer[threadIdx.x+16];
                    }
                }
-                tj = (tj - 1) & (GRID - 1);
            }
+#endif

            // Write results
 #ifdef USE_OUTPUT_BUFFER_PER_WARP

--- a/platforms/cuda/src/kernels/kCalculateObcGbsaForces2.cu
+++ b/platforms/cuda/src/kernels/kCalculateObcGbsaForces2.cu
@@ -47,7 +47,6 @@ struct Atom {
    float z;
    float r;
    float sr;
-    float sr2;
    float fx;
    float fy;
    float fz;
@@ -123,19 +122,19 @@ void kCalculateObcGbsaForces2(gpuContext gpu)
            numWithInteractions = gpu->psInteractionCount->_pSysData[0];
            if (gpu->bOutputBufferPerWarp)
                kCalculateObcGbsaCutoffByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
-                        sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
+                        (sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
            else
                kCalculateObcGbsaCutoffForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
-                        sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
+                        (sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
            break;
        case PERIODIC:
            numWithInteractions = gpu->psInteractionCount->_pSysData[0];
            if (gpu->bOutputBufferPerWarp)
                kCalculateObcGbsaPeriodicByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
-                        sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
+                        (sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
            else
                kCalculateObcGbsaPeriodicForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
-                        sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
+                        (sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
            break;
    }
    LAUNCHERROR("kCalculateObcGbsaForces2");

--- a/platforms/cuda/src/kernels/kCalculateObcGbsaForces2.h
+++ b/platforms/cuda/src/kernels/kCalculateObcGbsaForces2.h
@@ -38,10 +38,13 @@
 __global__ void METHOD_NAME(kCalculateObcGbsa, Forces2_kernel)(unsigned int* workUnit, int numWorkUnits)
 {
    extern __shared__ Atom sA[];
-    unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
+    unsigned int totalWarps = cSim.bornForce2_blocks*cSim.bornForce2_threads_per_block/GRID;
    unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
    int pos = warp*numWorkUnits/totalWarps;
    int end = (warp+1)*numWorkUnits/totalWarps;
+#ifdef USE_CUTOFF
+    float3* tempBuffer = (float3*) &sA[cSim.bornForce2_threads_per_block];
+#endif

    int lasty = -1;
    while (pos < end)
@@ -72,7 +75,6 @@ __global__ void METHOD_NAME(kCalculateObcGbsa, Forces2_kernel)(unsigned int* wor
            sA[threadIdx.x].z           = apos.z;
            sA[threadIdx.x].r           = a.x;
            sA[threadIdx.x].sr          = a.y;
-            sA[threadIdx.x].sr2         = a.y * a.y;
            sA[threadIdx.x].fb          = fb;

            for (unsigned int j = (tgx+1)&(GRID-1); j != tgx; j = (j+1)&(GRID-1))
@@ -105,7 +107,7 @@ __global__ void METHOD_NAME(kCalculateObcGbsa, Forces2_kernel)(unsigned int* wor

                // Born Forces term
                float term          =  0.125f *
-                                      (1.000f + psA[j].sr2 * r2Inverse) * t3 +
+                                      (1.000f + psA[j].sr * psA[j].sr * r2Inverse) * t3 +
                                       0.250f * t1 * r2Inverse;
                float dE            = fb * term;

@@ -162,98 +164,242 @@ __global__ void METHOD_NAME(kCalculateObcGbsa, Forces2_kernel)(unsigned int* wor
                sA[threadIdx.x].z           = temp.z;
                sA[threadIdx.x].r           = temp1.x;
                sA[threadIdx.x].sr          = temp1.y;
-                sA[threadIdx.x].sr2         = temp1.y * temp1.y;
            }
            float sr2                   = a.y * a.y;
-            for (int j = 0; j < GRID; j++)
+#ifdef USE_CUTOFF
+            unsigned int flags = cSim.pInteractionFlag[pos];
+            if (flags == 0)
+            {
+                // No interactions in this block.
+            }
+            else if (flags == 0xFFFFFFFF)
+#endif
            {
-                float dx                = psA[tj].x - apos.x;
-                float dy                = psA[tj].y - apos.y;
-                float dz                = psA[tj].z - apos.z;
+                // Compute all interactions within this block.
+
+                for (int j = 0; j < GRID; j++)
+                {
+                    float dx                = psA[tj].x - apos.x;
+                    float dy                = psA[tj].y - apos.y;
+                    float dz                = psA[tj].z - apos.z;
 #ifdef USE_PERIODIC
-                dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
-                dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
-                dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
+                    dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
+                    dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
+                    dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
 #endif
-                float r2                = dx * dx + dy * dy + dz * dz;
-                float r                 = sqrt(r2);
+                    float r2                = dx * dx + dy * dy + dz * dz;
+                    float r                 = sqrt(r2);

-                // Interleaved Atom I and J Born Forces and sum components
-                float r2Inverse         = 1.0f / r2;
-                float rScaledRadiusJ    = r + psA[tj].sr;
-                float rScaledRadiusI    = r + a.y;
-                float rInverse          = 1.0f / r;
-                float l_ijJ             = 1.0f / max(a.x, fabs(r - psA[tj].sr));
-                float l_ijI             = 1.0f / max(psA[tj].r, fabs(r - a.y));
-                float u_ijJ             = 1.0f / rScaledRadiusJ;
-                float u_ijI             = 1.0f / rScaledRadiusI;
-                float l_ij2J            = l_ijJ * l_ijJ;
-                float l_ij2I            = l_ijI * l_ijI;
-                float u_ij2J            = u_ijJ * u_ijJ;
-                float u_ij2I            = u_ijI * u_ijI;
-                float t1J               = log (u_ijJ / l_ijJ);
-                float t1I               = log (u_ijI / l_ijI);
-                float t2J               = (l_ij2J - u_ij2J);
-                float t2I               = (l_ij2I - u_ij2I);
-                float t3J               = t2J * rInverse;
-                float t3I               = t2I * rInverse;
-                t1J                    *= rInverse;
-                t1I                    *= rInverse;
+                    // Interleaved Atom I and J Born Forces and sum components
+                    float r2Inverse         = 1.0f / r2;
+                    float rScaledRadiusJ    = r + psA[tj].sr;
+                    float rScaledRadiusI    = r + a.y;
+                    float rInverse          = 1.0f / r;
+                    float l_ijJ             = 1.0f / max(a.x, fabs(r - psA[tj].sr));
+                    float l_ijI             = 1.0f / max(psA[tj].r, fabs(r - a.y));
+                    float u_ijJ             = 1.0f / rScaledRadiusJ;
+                    float u_ijI             = 1.0f / rScaledRadiusI;
+                    float l_ij2J            = l_ijJ * l_ijJ;
+                    float l_ij2I            = l_ijI * l_ijI;
+                    float u_ij2J            = u_ijJ * u_ijJ;
+                    float u_ij2I            = u_ijI * u_ijI;
+                    float t1J               = log (u_ijJ / l_ijJ);
+                    float t1I               = log (u_ijI / l_ijI);
+                    float t2J               = (l_ij2J - u_ij2J);
+                    float t2I               = (l_ij2I - u_ij2I);
+                    float t3J               = t2J * rInverse;
+                    float t3I               = t2I * rInverse;
+                    t1J                    *= rInverse;
+                    t1I                    *= rInverse;

-                // Born Forces term
-                float term              =  0.125f *
-                                          (1.000f + psA[tj].sr2 * r2Inverse) * t3J +
-                                           0.250f * t1J * r2Inverse;
-                float dE                = fb * term;
+                    // Born Forces term
+                    float term              =  0.125f *
+                                              (1.000f + psA[tj].sr * psA[tj].sr * r2Inverse) * t3J +
+                                               0.250f * t1J * r2Inverse;
+                    float dE                = fb * term;

 #if defined USE_PERIODIC
-                if (a.x >= rScaledRadiusJ || i >= cSim.atoms || y+tj >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
+                    if (a.x >= rScaledRadiusJ || i >= cSim.atoms || y+tj >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
 #elif defined USE_CUTOFF
-                if (a.x >= rScaledRadiusJ || r2 > cSim.nonbondedCutoffSqr)
+                    if (a.x >= rScaledRadiusJ || r2 > cSim.nonbondedCutoffSqr)
 #else
-                if (a.x >= rScaledRadiusJ)
+                    if (a.x >= rScaledRadiusJ)
 #endif
-                {
-                    dE                  = 0.0f;
-                }
+                    {
+                        dE                  = 0.0f;
+                    }

-                float d                 = dx * dE;
-                af.x                   -= d;
-                psA[tj].fx             += d;
-                d                       = dy * dE;
-                af.y                   -= d;
-                psA[tj].fy             += d;
-                d                       = dz * dE;
-                af.z                   -= d;
-                psA[tj].fz             += d;
+                    float d                 = dx * dE;
+                    af.x                   -= d;
+                    psA[tj].fx             += d;
+                    d                       = dy * dE;
+                    af.y                   -= d;
+                    psA[tj].fy             += d;
+                    d                       = dz * dE;
+                    af.z                   -= d;
+                    psA[tj].fz             += d;

-                // Atom J Born sum term
-                term                    =  0.125f *
-                                          (1.000f + sr2 * r2Inverse) * t3I +
-                                           0.250f * t1I * r2Inverse;
-                dE                      = psA[tj].fb * term;
+                    // Atom J Born sum term
+                    term                    =  0.125f *
+                                              (1.000f + sr2 * r2Inverse) * t3I +
+                                               0.250f * t1I * r2Inverse;
+                    dE                      = psA[tj].fb * term;

+                    float rj = psA[tj].r;
 #ifdef USE_PERIODIC
-                if (psA[tj].r >= rScaledRadiusI || i >= cSim.atoms || y+tj >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
+                    if (rj >= rScaledRadiusI || i >= cSim.atoms || y+tj >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
 #elif defined USE_CUTOFF
-                if (psA[tj].r >= rScaledRadiusI || r2 > cSim.nonbondedCutoffSqr)
+                    if (rj >= rScaledRadiusI || r2 > cSim.nonbondedCutoffSqr)
 #else
-                if (psA[tj].r >= rScaledRadiusI)
+                    if (rj >= rScaledRadiusI)
 #endif
+                    {
+                        dE                  = 0.0f;
+                    }
+                    dx                     *= dE;
+                    dy                     *= dE;
+                    dz                     *= dE;
+                    psA[tj].fx             += dx;
+                    psA[tj].fy             += dy;
+                    psA[tj].fz             += dz;
+                    af.x                   -= dx;
+                    af.y                   -= dy;
+                    af.z                   -= dz;
+                    tj                      = (tj + 1) & (GRID - 1);
+                }
+            }
+#ifdef USE_CUTOFF
+            else
+            {
+                // Compute only a subset of the interactions in this block.
+
+                for (int j = 0; j < GRID; j++)
                {
-                    dE                  = 0.0f;
+                    if ((flags&(1<<j)) != 0)
+                    {
+                        float dx                = psA[j].x - apos.x;
+                        float dy                = psA[j].y - apos.y;
+                        float dz                = psA[j].z - apos.z;
+    #ifdef USE_PERIODIC
+                        dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
+                        dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
+                        dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
+    #endif
+                        float r2                = dx * dx + dy * dy + dz * dz;
+                        float r                 = sqrt(r2);
+
+                        // Interleaved Atom I and J Born Forces and sum components
+                        float r2Inverse         = 1.0f / r2;
+                        float rScaledRadiusJ    = r + psA[j].sr;
+                        float rScaledRadiusI    = r + a.y;
+                        float rInverse          = 1.0f / r;
+                        float l_ijJ             = 1.0f / max(a.x, fabs(r - psA[j].sr));
+                        float l_ijI             = 1.0f / max(psA[j].r, fabs(r - a.y));
+                        float u_ijJ             = 1.0f / rScaledRadiusJ;
+                        float u_ijI             = 1.0f / rScaledRadiusI;
+                        float l_ij2J            = l_ijJ * l_ijJ;
+                        float l_ij2I            = l_ijI * l_ijI;
+                        float u_ij2J            = u_ijJ * u_ijJ;
+                        float u_ij2I            = u_ijI * u_ijI;
+                        float t1J               = log (u_ijJ / l_ijJ);
+                        float t1I               = log (u_ijI / l_ijI);
+                        float t2J               = (l_ij2J - u_ij2J);
+                        float t2I               = (l_ij2I - u_ij2I);
+                        float t3J               = t2J * rInverse;
+                        float t3I               = t2I * rInverse;
+                        t1J                    *= rInverse;
+                        t1I                    *= rInverse;
+
+                        // Born Forces term
+                        float term              =  0.125f *
+                                                  (1.000f + psA[j].sr * psA[j].sr * r2Inverse) * t3J +
+                                                   0.250f * t1J * r2Inverse;
+                        float dE                = fb * term;
+
+    #if defined USE_PERIODIC
+                        if (a.x >= rScaledRadiusJ || i >= cSim.atoms || y+j >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
+    #elif defined USE_CUTOFF
+                        if (a.x >= rScaledRadiusJ || r2 > cSim.nonbondedCutoffSqr)
+    #else
+                        if (a.x >= rScaledRadiusJ)
+    #endif
+                        {
+                            dE                  = 0.0f;
+                        }
+
+                        float d                 = dx * dE;
+                        af.x                   -= d;
+                        tempBuffer[threadIdx.x].x = d;
+                        d                       = dy * dE;
+                        af.y                   -= d;
+                        tempBuffer[threadIdx.x].y = d;
+                        d                       = dz * dE;
+                        af.z                   -= d;
+                        tempBuffer[threadIdx.x].z = d;
+
+                        // Atom J Born sum term
+                        term                    =  0.125f *
+                                                  (1.000f + sr2 * r2Inverse) * t3I +
+                                                   0.250f * t1I * r2Inverse;
+                        dE                      = psA[j].fb * term;
+
+                        float rj = psA[j].r;
+    #ifdef USE_PERIODIC
+                        if (rj >= rScaledRadiusI || i >= cSim.atoms || y+j >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
+    #elif defined USE_CUTOFF
+                        if (rj >= rScaledRadiusI || r2 > cSim.nonbondedCutoffSqr)
+    #else
+                        if (rj >= rScaledRadiusI)
+    #endif
+                        {
+                            dE                  = 0.0f;
+                        }
+                        dx                     *= dE;
+                        dy                     *= dE;
+                        dz                     *= dE;
+                        tempBuffer[threadIdx.x].x += dx;
+                        tempBuffer[threadIdx.x].y += dy;
+                        tempBuffer[threadIdx.x].z += dz;
+                        af.x                   -= dx;
+                        af.y                   -= dy;
+                        af.z                   -= dz;
+
+                        // Sum the forces on atom j.
+
+                        if (tgx % 2 == 0)
+                        {
+                            tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+1].x;
+                            tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+1].y;
+                            tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+1].z;
+                        }
+                        if (tgx % 4 == 0)
+                        {
+                            tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+2].x;
+                            tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+2].y;
+                            tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+2].z;
+                        }
+                        if (tgx % 8 == 0)
+                        {
+                            tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+4].x;
+                            tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+4].y;
+                            tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+4].z;
+                        }
+                        if (tgx % 16 == 0)
+                        {
+                            tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+8].x;
+                            tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+8].y;
+                            tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+8].z;
+                        }
+                        if (tgx == 0)
+                        {
+                            psA[j].fx += tempBuffer[threadIdx.x].x + tempBuffer[threadIdx.x+16].x;
+                            psA[j].fy += tempBuffer[threadIdx.x].y + tempBuffer[threadIdx.x+16].y;
+                            psA[j].fz += tempBuffer[threadIdx.x].z + tempBuffer[threadIdx.x+16].z;
+                        }
+                    }
                }
-                dx                     *= dE;
-                dy                     *= dE;
-                dz                     *= dE;
-                psA[tj].fx             += dx;
-                psA[tj].fy             += dy;
-                psA[tj].fz             += dz;
-                af.x                   -= dx;
-                af.y                   -= dy;
-                af.z                   -= dz;
-                tj                      = (tj + 1) & (GRID - 1);
            }
+#endif

            // Write results
            float4 of;

--- a/platforms/cuda/src/kernels/kFindInteractingBlocks.h
+++ b/platforms/cuda/src/kernels/kFindInteractingBlocks.h
@@ -112,3 +112,88 @@ __global__ void METHOD_NAME(kFindBlocksWithInteractions, _kernel)()
        cSim.pInteractionFlag[pos] = (dx*dx+dy*dy+dz*dz > cSim.nonbondedCutoffSqr ? 0 : 1);
    }
 }
+
+/**
+ * Compare each atom in one block to the bounding box of another block, and set
+ * flags for which ones are interacting.
+ */
+__global__ void METHOD_NAME(kFindInteractionsWithinBlocks, _kernel)(unsigned int* workUnit, int numWorkUnits)
+{
+    extern __shared__ unsigned int flags[];
+    unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
+    unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
+    int pos = warp*numWorkUnits/totalWarps;
+    int end = (warp+1)*numWorkUnits/totalWarps;
+    unsigned int index = threadIdx.x & (GRID - 1);
+
+    int lasty = -1;
+    float4 apos;
+    while (pos < end)
+    {
+        // Extract cell coordinates from appropriate work unit
+        unsigned int x = workUnit[pos];
+        unsigned int y = ((x >> 2) & 0x7fff);
+        bool bExclusionFlag = (x & 0x1);
+        x = (x >> 17);
+        if (x == y || bExclusionFlag)
+        {
+            // Assume this block will be dense.
+
+            if (index == 0)
+                cSim.pInteractionFlag[pos] = 0xFFFFFFFF;
+        }
+        else
+        {
+            // Load the bounding box for x and the atom positions for y.
+
+            float4 center = cSim.pGridCenter[x];
+            float4 boxSize = cSim.pGridBoundingBox[x];
+            if (y != lasty)
+            {
+                apos = cSim.pPosq[(y<<GRIDBITS)+index];
+            }
+
+            // Find the distance of the atom from the bounding box.
+
+            float dx = apos.x-center.x;
+            float dy = apos.y-center.y;
+            float dz = apos.z-center.z;
+#ifdef USE_PERIODIC
+            dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
+            dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
+            dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
+#endif
+            dx = max(0.0f, abs(dx)-boxSize.x);
+            dy = max(0.0f, abs(dy)-boxSize.y);
+            dz = max(0.0f, abs(dz)-boxSize.z);
+            flags[threadIdx.x] = (dx*dx+dy*dy+dz*dz > cSim.nonbondedCutoffSqr ? 0 : 1 << index);
+
+            // Sum the flags.
+
+            if (index % 2 == 0)
+                flags[threadIdx.x] += flags[threadIdx.x+1];
+            if (index % 4 == 0)
+                flags[threadIdx.x] += flags[threadIdx.x+2];
+            if (index % 8 == 0)
+                flags[threadIdx.x] += flags[threadIdx.x+4];
+            if (index % 16 == 0)
+                flags[threadIdx.x] += flags[threadIdx.x+8];
+            if (index == 0)
+            {
+                unsigned int allFlags = flags[threadIdx.x] + flags[threadIdx.x+16];
+
+                // Count how many flags are set, and based on that decide whether to compute all interactions
+                // or only a fraction of them.
+
+                unsigned int bits = (allFlags&0x55555555) + ((allFlags>>1)&0x55555555);
+                bits = (bits&0x33333333) + ((bits>>2)&0x33333333);
+                bits = (bits&0x0F0F0F0F) + ((bits>>4)&0x0F0F0F0F);
+                bits = (bits&0x00FF00FF) + ((bits>>8)&0x00FF00FF);
+                bits = (bits&0x0000FFFF) + ((bits>>16)&0x0000FFFF);
+                cSim.pInteractionFlag[pos] = (bits > 12 ? 0xFFFFFFFF : allFlags);
+            }
+            lasty = y;
+        }
+        pos++;
+    }
+}
--- a/platforms/cuda/tests/TestCudaNonbondedForce.cpp
+++ b/platforms/cuda/tests/TestCudaNonbondedForce.cpp
@@ -497,6 +497,36 @@ void testBlockInteractions(bool periodic) {
        dy = max(0.0f, abs(dy)-gridSize1.y-gridSize2.y);
        dz = max(0.0f, abs(dz)-gridSize1.z-gridSize2.z);
        ASSERT(sqrt(dx*dx+dy*dy+dz*dz) < cutoff+TOL);
+
+        // Check the interaction flags.
+
+        unsigned int flags = data.gpu->psInteractionFlag->_pSysData[i];
+        for (int atom2 = 0; atom2 < 32; atom2++) {
+            if ((flags & 1) == 0) {
+                float4 pos2 = data.gpu->psPosq4->_pSysData[y*blockSize+atom2];
+                for (int atom1 = 0; atom1 < blockSize; ++atom1) {
+                    float4 pos1 = data.gpu->psPosq4->_pSysData[x*blockSize+atom1];
+                    float dx = pos2.x-pos1.x;
+                    float dy = pos2.y-pos1.y;
+                    float dz = pos2.z-pos1.z;
+                    if (periodic) {
+                        dx -= floor(0.5+dx/boxSize)*boxSize;
+                        dy -= floor(0.5+dy/boxSize)*boxSize;
+                        dz -= floor(0.5+dz/boxSize)*boxSize;
+                    }
+                    if (dx*dx+dy*dy+dz*dz < cutoff*cutoff) {
+                        dx = pos2.x-center1.x;
+                        dy = pos2.y-center1.y;
+                        dz = pos2.z-center1.z;
+                        dx = max(0.0f, abs(dx)-gridSize1.x);
+                        dy = max(0.0f, abs(dy)-gridSize1.y);
+                        dz = max(0.0f, abs(dz)-gridSize1.z);
+                    }
+                    ASSERT(dx*dx+dy*dy+dz*dz > cutoff*cutoff);
+                }
+            }
+            flags >>= 1;
+        }
    }

    // Check the tiles that did not have interactions to make sure all atoms are beyond the cutoff.