Threads per block for GB/VI kernels take into account required shared memory

Padded atom entries in Born radii memory now initialized to zero launch bounds guards now included for several kernels

Threads per block for GB/VI kernels take into account required shared memory
Padded atom entries in Born radii memory now initialized to zero launch bounds guards now included for several kernels
09c1460d · Mark Friedrichs · dbea5152 · 09c1460d · 09c1460d · 09c1460d
Commit 09c1460d authored Oct 25, 2011 by Mark Friedrichs
12 changed files
--- a/plugins/freeEnergy/platforms/cuda/src/kernels/freeEnergyCudaGpu.cpp
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/freeEnergyCudaGpu.cpp
@@ -255,6 +255,29 @@ void freeEnergyGpuSetConstants( freeEnergyGpuContext freeEnergyGpu ){

 }

+/**---------------------------------------------------------------------------------------
+
+   Get threads/block
+
+   @param amoebaGpu              amoebaGpuContext
+   @param sharedMemoryPerThread  shared memory/thread
+   @param sharedMemoryPerBlock   shared memory/block
+
+   @return threadsPerBlock
+
+   --------------------------------------------------------------------------------------- */
+
+extern "C"
+unsigned int getThreadsPerBlockFEP( freeEnergyGpuContext freeEnergyGpu, unsigned int sharedMemoryPerThread, unsigned int sharedMemoryPerBlock )
+{
+    unsigned int grid               = freeEnergyGpu->gpuContext->grid;
+    unsigned int threadsPerBlock    = (sharedMemoryPerBlock + grid -1)/(grid*sharedMemoryPerThread);
+    threadsPerBlock                 = threadsPerBlock < 1 ? 1 : threadsPerBlock;
+    threadsPerBlock                *= grid;
+
+   return threadsPerBlock;
+}
+
 static int decodeCell( int cellCode, unsigned int* x, unsigned int* y, unsigned int* exclusion ){
    *x         =  cellCode >> 17; 
    *y         = (cellCode >> 2 ) & 0x7FFF;
@@ -275,8 +298,9 @@ void showWorkUnitsFreeEnergy( freeEnergyGpuContext freeEnergyGpu, int interactin
    //unsigned int numWorkUnits    = cSim.pInteractionCount[0];
    unsigned int numWorkUnits    = gpu->psInteractionCount->_pSysData[0];

-    (void) fprintf( stderr, "Total warps=%u blocks=%u threads=%u GRID=%u wus=%u\n",
-                    totalWarps, gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, GRID, numWorkUnits );
+    (void) fprintf( stderr, "Total warps=%u blocks=%u threads=%u GRID=%u wus=%u warpOutput=%d\n",
+                    totalWarps, gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, GRID, 
+                    numWorkUnits, gpu->bOutputBufferPerWarp );

    unsigned int maxPrint = 3;
    std::stringstream message;
@@ -309,8 +333,13 @@ void showWorkUnitsFreeEnergy( freeEnergyGpuContext freeEnergyGpu, int interactin

                x                   *= GRID;
                y                   *= GRID;
+                unsigned int bufferX = gpu->bOutputBufferPerWarp ? warp*gpu->sim.stride : (x >> GRIDBITS) * gpu->sim.stride;
+                unsigned int bufferY = gpu->bOutputBufferPerWarp ? warp*gpu->sim.stride : (y >> GRIDBITS) * gpu->sim.stride;
+                bufferX             /= gpu->sim.paddedNumberOfAtoms;
+                bufferY             /= gpu->sim.paddedNumberOfAtoms;
                if( jj == 1 ){
-                     (void) sprintf( buffer, "Block %4u thread %4u warp=%4u pos[%4u %4u] ", ii, jj, warp, pos, end );
+                //if( bufferX >= 62 || bufferY >= 62 ){
+                     (void) sprintf( buffer, "Block %4u thread %4u warp=%4u pos[%4u %4u] bufXY[%4u %4u]", ii, jj, warp, pos, end, bufferX, bufferY );
                     message << buffer;
                     (void) sprintf( buffer, "    x[%4u %4u] y[%4u %4u] excl=%u", x, x+32, y, y+32, exclusion );
                     message << buffer;

--- a/plugins/freeEnergy/platforms/cuda/src/kernels/freeEnergyGpuTypes.h
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/freeEnergyGpuTypes.h
@@ -56,5 +56,6 @@ typedef struct _freeEnergyGpuContext *freeEnergyGpuContext;
 extern "C" freeEnergyGpuContext freeEnergyGpuInit( _gpuContext* gpu );
 extern "C" void freeEnergyGpuShutDown(freeEnergyGpuContext gpu);
 extern "C" void freeEnergyGpuSetConstants(freeEnergyGpuContext gpu);
+extern "C" unsigned int getThreadsPerBlockFEP( freeEnergyGpuContext freeEnergyGpu, unsigned int sharedMemoryPerThread, unsigned int sharedMemoryPerBlock );

 #endif // __FREE_ENERGY_GPUTYPES_H__
--- a/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateCDLJObcGbsaSoftcoreForces1.cu
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateCDLJObcGbsaSoftcoreForces1.cu
@@ -28,6 +28,7 @@
 #include "kernels/cudatypes.h"
 #include "kernels/cudaKernels.h"
 #include "GpuFreeEnergyCudaKernels.h"
+#include "openmm/OpenMMException.h"

 #include <stdio.h>
 #include <cuda.h>
@@ -36,6 +37,7 @@
 #include <sstream>

 #define USE_SOFTCORE_LJ
+//#define DEBUG

 struct Atom {
    float x;
@@ -104,6 +106,7 @@ void SetCalculateCDLJObcGbsaSoftcoreGpu1Sim( freeEnergyGpuContext freeEnergyGpu
 #define METHOD_NAME(a, b) a##PeriodicByWarp##b
 #include "kCalculateCDLJObcGbsaSoftcoreForces1.h"

+
 /**
 * 
 * Calculate Born radii and first GBSA loop forces/energy
@@ -113,16 +116,29 @@ void SetCalculateCDLJObcGbsaSoftcoreGpu1Sim( freeEnergyGpuContext freeEnergyGpu
 */
 void kCalculateCDLJObcGbsaSoftcoreForces1( freeEnergyGpuContext freeEnergyGpu )
 {
-//    printf("kCalculateCDLJObcGbsaForces1\n");
+    unsigned int threadsPerBlock;
+    static unsigned int threadsPerBlockPerMethod[3] = { 0, 0, 0 };
+    static unsigned int natoms[3]                   = { 0, 0, 0 };
    gpuContext gpu                                  = freeEnergyGpu->gpuContext;
+    unsigned int methodIndex                        = static_cast<unsigned int>(freeEnergyGpu->freeEnergySim.nonbondedMethod);
+
+    if( methodIndex > 2 ){
+        throw OpenMM::OpenMMException( "kCalculateCDLJObcGbsaSoftcoreForces1 method index invalid." );
+    }   
+
+    if( natoms[methodIndex] != gpu->natoms ){
+        unsigned int extra                    = methodIndex == 0 ? 0 : sizeof(float);
+        threadsPerBlockPerMethod[methodIndex] = std::min(getThreadsPerBlockFEP( freeEnergyGpu, (sizeof(Atom) + extra), gpu->sharedMemoryPerBlock ), gpu->sim.nonbond_threads_per_block );
+        natoms[methodIndex]                   = gpu->natoms;
+    }   
+    threadsPerBlock = threadsPerBlockPerMethod[methodIndex];

-//fprintf( stderr, "kCalculateCDLJObcGbsaSoftcoreForces1 cutoff=%15.7e blks=%u thread/.block=%u nbMethod==%d warp=%u\n",
-//         gpu->sim.nonbondedCutoffSqr, gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, 
-//         freeEnergyGpu->freeEnergySim.nonbondedMethod, gpu->bOutputBufferPerWarp);

-//#define DEBUG
 #ifdef DEBUG 
-fprintf( stderr, "kCalculateCDLJObcGbsaSoftcoreForces1 cutoff=%15.7e\n", gpu->sim.nonbondedCutoffSqr );
+fprintf( stderr, "kCalculateCDLJObcGbsaSoftcoreForces1 blks=%u thread/block=%u %u shMem=%u nbMethod==%d warp=%u\n",
+         gpu->sim.nonbond_blocks, threadsPerBlock, gpu->sim.nonbond_threads_per_block, sizeof(Atom)*threadsPerBlock,
+         freeEnergyGpu->freeEnergySim.nonbondedMethod, gpu->bOutputBufferPerWarp);
+
 int psize = gpu->sim.paddedNumberOfAtoms;
 CUDAStream<float4>* pdE1 = new CUDAStream<float4>( psize, 1, "pdE");
 CUDAStream<float4>* pdE2 = new CUDAStream<float4>( psize, 1, "pdE");
@@ -133,12 +149,12 @@ float atomicRadii;
 showWorkUnitsFreeEnergy( freeEnergyGpu, 1 );
 for( int ii = 0; ii < psize; ii++ ){

-pdE1->_pSysData[ii].x = 0.001f;
+pdE1->_pSysData[ii].x = 0.0f;
 pdE1->_pSysData[ii].y = 0.001f;
 pdE1->_pSysData[ii].z = 0.001f;
 pdE1->_pSysData[ii].w = 0.001f;

-pdE2->_pSysData[ii].x = 0.001f;
+pdE2->_pSysData[ii].x = 0.0f;
 pdE2->_pSysData[ii].y = 0.001f;
 pdE2->_pSysData[ii].z = 0.001f;
 pdE2->_pSysData[ii].w = 0.001f;
@@ -153,19 +169,23 @@ pdE2->Upload();

            // use softcore LJ potential
 #ifdef DEBUG
+            (void) fprintf( stderr, "kCalculateCDLJObcGbsaSoftcoreForces1 ver=%u blks=%u threadsPerBlock=%u shMem=%u %u wrp=%u\n", gpu->sm_version,
+                            gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(Atom)*threadsPerBlock, gpu->sharedMemoryPerBlock, gpu->bOutputBufferPerWarp ); fflush( stderr );
+
            if (gpu->bOutputBufferPerWarp)
-                   kCalculateCDLJObcGbsaSoftcoreN2ByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                           sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit,pdE1->_pDevData, pdE2->_pDevData);
+                   kCalculateCDLJObcGbsaSoftcoreN2ByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
+                           sizeof(Atom)*threadsPerBlock>>>(gpu->sim.pWorkUnit,pdE1->_pDevData, pdE2->_pDevData);
            else
-                   kCalculateCDLJObcGbsaSoftcoreN2Forces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                           sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit, pdE1->_pDevData, pdE2->_pDevData);
+                   kCalculateCDLJObcGbsaSoftcoreN2Forces1_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
+                           sizeof(Atom)*threadsPerBlock>>>(gpu->sim.pWorkUnit, pdE1->_pDevData, pdE2->_pDevData);
 #else   
            if (gpu->bOutputBufferPerWarp)
-                   kCalculateCDLJObcGbsaSoftcoreN2ByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                           sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit );
+                   kCalculateCDLJObcGbsaSoftcoreN2ByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
+                           sizeof(Atom)*threadsPerBlock>>>(gpu->sim.pWorkUnit );
            else
-                   kCalculateCDLJObcGbsaSoftcoreN2Forces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                           sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit );
+                   kCalculateCDLJObcGbsaSoftcoreN2Forces1_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
+                           sizeof(Atom)*threadsPerBlock>>>(gpu->sim.pWorkUnit );
+   

 #endif
            LAUNCHERROR("kCalculateCDLJObcGbsaSoftcoreForces1");
@@ -174,52 +194,102 @@ pdE2->Upload();
        case FREE_ENERGY_CUTOFF:

 #ifdef DEBUG
+            (void) fprintf( stderr, "kCalculateCDLJObcGbsaSoftcoreCutoffForces1 %6d blks=%u nonbond_threads_per_block=%5u shMem=%5u\n",
+                            gpu->natoms, gpu->sim.nonbond_blocks, threadsPerBlock, (sizeof(Atom)+sizeof(float))*threadsPerBlock);
+            (void) fflush( stderr );
+

            if (gpu->bOutputBufferPerWarp)
-                kCalculateCDLJObcGbsaSoftcoreCutoffByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, pdE1->_pDevData, pdE2->_pDevData);
+                kCalculateCDLJObcGbsaSoftcoreCutoffByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
+                        (sizeof(Atom)+sizeof(float))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit, pdE1->_pDevData, pdE2->_pDevData);
            else
-                kCalculateCDLJObcGbsaSoftcoreCutoffForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, pdE1->_pDevData, pdE2->_pDevData);
+                kCalculateCDLJObcGbsaSoftcoreCutoffForces1_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
+                        (sizeof(Atom)+sizeof(float))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit, pdE1->_pDevData, pdE2->_pDevData);
 #else
            if (gpu->bOutputBufferPerWarp)
-                kCalculateCDLJObcGbsaSoftcoreCutoffByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit );
+                kCalculateCDLJObcGbsaSoftcoreCutoffByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
+                        (sizeof(Atom)+sizeof(float))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit );
            else
-                kCalculateCDLJObcGbsaSoftcoreCutoffForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit );
+                kCalculateCDLJObcGbsaSoftcoreCutoffForces1_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
+                        (sizeof(Atom)+sizeof(float))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit );
 #endif

+            LAUNCHERROR("kCalculateCDLJObcGbsaSoftcoreCutoffForces1");
+
            break;

        case FREE_ENERGY_PERIODIC:

+#ifdef DEBUG
            if (gpu->bOutputBufferPerWarp)
-                kCalculateCDLJObcGbsaSoftcorePeriodicByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
+                kCalculateCDLJObcGbsaSoftcorePeriodicByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
+                        (sizeof(Atom)+sizeof(float))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit, pdE1->_pDevData, pdE2->_pDevData);
            else
-                kCalculateCDLJObcGbsaSoftcorePeriodicForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
+                kCalculateCDLJObcGbsaSoftcorePeriodicForces1_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
+                        (sizeof(Atom)+sizeof(float))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit, pdE1->_pDevData, pdE2->_pDevData);
+
+#else
+            if (gpu->bOutputBufferPerWarp)
+                kCalculateCDLJObcGbsaSoftcorePeriodicByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
+                        (sizeof(Atom)+sizeof(float))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit);
+            else
+                kCalculateCDLJObcGbsaSoftcorePeriodicForces1_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
+                        (sizeof(Atom)+sizeof(float))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit);
+#endif

            LAUNCHERROR("kCalculateCDLJObcGbsaSoftcorePeriodicForces1");
            break;
    }

+
+
 #ifdef DEBUG 
+/*
+gpu->psBornForce->Download();
+freeEnergyGpu->psSwitchDerivative->Download();
+gpu->psBornRadii->Download();
+fprintf( stderr, "XX bR=%15.7e swd=%15.7e\n", gpu->psBornRadii->_pSysData[0], freeEnergyGpu->psSwitchDerivative->_pSysData[0] );
+for( int ii = 0; ii < gpu->sim.nonbondOutputBuffers; ii++ ){
+    fprintf( stderr, "strx %4d %15.7e %15.7e %15.7e %15.7e\n", ii, 
+             gpu->psBornForce->_pSysStream[ii][0],
+             gpu->psBornForce->_pSysStream[ii][1],
+             gpu->psBornForce->_pSysStream[ii][2],
+             gpu->psBornForce->_pSysStream[ii][3] );
+    if(  gpu->natoms > 1984 ){
+    int idx = 1983;
+    fprintf( stderr, "stry %4d %15.7e %15.7e %15.7e %15.7e %5d\n", ii, 
+             gpu->psBornForce->_pSysStream[ii][idx+0],
+             gpu->psBornForce->_pSysStream[ii][idx+1],
+             gpu->psBornForce->_pSysStream[ii][idx+2],
+             gpu->psBornForce->_pSysStream[ii][idx+3], idx );
+    }
+}
+int bufferI = 62;
+if(  bufferI < gpu->sim.nonbondOutputBuffers ){
+    fprintf( stderr, "BufferI %4d \n", bufferI );
+for( int ii = 0; ii < gpu->sim.paddedNumberOfAtoms; ii++ ){
+    fprintf( stderr, "strz %4d %15.7e\n",  ii, gpu->psBornForce->_pSysStream[bufferI][ii] );
+}
+}
+*/
 pdE1->Download();
 pdE2->Download();
 gpu->psPosq4->Download();
 gpu->psGBVIData->Download();
 gpu->psBornRadii->Download();
+gpu->psBornForce->Download();
 freeEnergyGpu->psSwitchDerivative->Download();
 fprintf( stderr, "PdeCud %d\n", TARGET );
 bF = 0.0;
-for( int ii = 0; ii < gpu->natoms; ii++ ){
+int count =0;
+for( int ii = 0; ii < psize; ii++ ){
 bF += pdE1->_pSysData[ii].x;
-if( fabs( pdE1->_pSysData[ii].w ) > 1.0e-03 ){
-fprintf( stderr, "%4d %15.7e %15.7e %15.7e %15.7e    %15.7e %15.7e %15.7e %15.7e\n", ii, 
+if( fabs( pdE1->_pSysData[ii].w ) > 1.0e-03 && fabs( pdE1->_pSysData[ii].x ) > 0.0 ){
+count++;
+fprintf( stderr, "%4d %4d %15.7e %15.7e %15.7e %15.7e    %15.7e %15.7e %15.7e %15.7e bF=%15.7e swd=%15.7e\n", count, ii, 
         pdE1->_pSysData[ii].x, pdE1->_pSysData[ii].y, pdE1->_pSysData[ii].z, pdE1->_pSysData[ii].w,
-         pdE2->_pSysData[ii].x, pdE2->_pSysData[ii].y, pdE2->_pSysData[ii].z, pdE2->_pSysData[ii].w );
+         pdE2->_pSysData[ii].x, pdE2->_pSysData[ii].y, pdE2->_pSysData[ii].z, pdE2->_pSysData[ii].w, gpu->psBornForce->_pSysData[ii],
+         freeEnergyGpu->psSwitchDerivative->_pSysData[ii] );
 }
 }
 bR      = gpu->psBornRadii->_pSysData[TARGET];
@@ -228,7 +298,7 @@ ratio   = (atomicRadii/bR);
 bF1     = bF + (3.0f*gpu->psGBVIData->_pSysData[TARGET].z*ratio*ratio*ratio)/bR; 
 b2      = bR*bR;
 bF1     *= (1.0f/3.0f)*b2*b2;
-fprintf( stderr, "sumbF Cud %6d %15.7e %15.7e %15.7e\n", TARGET, bF, bF1, bR);
+fprintf( stderr, "sumbF Cud %6d count=%d %15.7e %15.7e %15.7e\n", TARGET, count, bF, bF1, bR);
 #endif

 }
--- a/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateCDLJObcGbsaSoftcoreForces1.h
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateCDLJObcGbsaSoftcoreForces1.h
@@ -37,7 +37,7 @@
 #endif

 #undef TARGET
-//#define TARGET 0
+#define TARGET 1926

 __global__ 
 #if (__CUDA_ARCH__ >= 200)
@@ -47,12 +47,18 @@ __launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1)
 #else
 __launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
 #endif
+#ifdef DEBUG
+void METHOD_NAME(kCalculateCDLJObcGbsaSoftcore, Forces1_kernel)(unsigned int* workUnit, float4* pdE1, float4* pdE2 )
+#else
 void METHOD_NAME(kCalculateCDLJObcGbsaSoftcore, Forces1_kernel)(unsigned int* workUnit )
+#endif
 {
-//void METHOD_NAME(kCalculateCDLJObcGbsaSoftcore, Forces1_kernel)(unsigned int* workUnit, float4* pdE1, float4* pdE2 )
    extern __shared__ Atom sA[];
-    unsigned int totalWarps        = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
+    unsigned int totalWarps        = gridDim.x*blockDim.x/GRID;
    unsigned int warp              = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
+
+    //unsigned int totalWarps        = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
+    //unsigned int warp              = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
    unsigned int numWorkUnits      = cSim.pInteractionCount[0];
    unsigned int pos               = warp*numWorkUnits/totalWarps;
    unsigned int end               = (warp+1)*numWorkUnits/totalWarps;
@@ -166,23 +172,22 @@ void METHOD_NAME(kCalculateCDLJObcGbsaSoftcore, Forces1_kernel)(unsigned int* wo
                    af.w                   += dGpol_dalpha2_ij * psA[j].br;


-/*
+#ifdef DEBUG
 int jIdx = j;
 if( i == TARGET ){
 int tjj     = y+jIdx;
-pdE1[tjj].x = dGpol_dalpha2_ij * psA[j].br;
+pdE1[tjj].x = dGpol_dalpha2_ij * psA[jIdx].br;
 pdE1[tjj].y = sqrt(r2);
-pdE1[tjj].z =  (q2 * psA[j].q) / denominator;
 pdE1[tjj].w = 1.0f;
 }

 if( (y+jIdx) == TARGET ){
 int tjj     = i;
-pdE1[tjj].x = dEdR - Gpol * (1.0f - 0.25f * expTerm);
-pdE1[tjj].y = r2;
-pdE1[tjj].z = CDLJObcGbsa_energy - (q2 * psA[jIdx].q) / denominator;
+pdE1[tjj].x = dGpol_dalpha2_ij * psA[jIdx].br;
+pdE1[tjj].y = sqrt(r2);
 pdE1[tjj].w = -1.0f;
-} */
+}
+#endif
                    energy                 += 0.5f*CDLJObcGbsa_energy;

                    // Add Forces
@@ -248,7 +253,6 @@ pdE1[tjj].w = -1.0f;
                        dEdR                = 0.0f;
                        CDLJObcGbsa_energy  = 0.0f;
                    }
-//float dEdRx = dEdR;

                    // ObcGbsaForce1 part

@@ -271,28 +275,23 @@ pdE1[tjj].w = -1.0f;
                        dEdR               = 0.0f;
 		                  CDLJObcGbsa_energy = 0.0f;
                        dGpol_dalpha2_ij   = 0.0f;
-//dEdRx = 0.0f;
                    }

-/*
+#ifdef DEBUG
 int jIdx    = j;
 if( i == TARGET ){
 int tjj     =  (y+jIdx);
-pdE1[tjj].x = dGpol_dalpha2_ij * psA[j].br;
-pdE1[tjj].y = sqrt(r2);
-pdE1[tjj].z =  (q2 * psA[j].q) / denominator;
-pdE1[tjj].x = dEdRx;
+pdE1[tjj].x = dGpol_dalpha2_ij * psA[jIdx].br;
 pdE1[tjj].y = sqrt(r2);
-pdE1[tjj].z = CDLJObcGbsa_energy - (q2 * psA[jIdx].q) / denominator;
 pdE1[tjj].w = 2.0f;
 }
 if( (y+jIdx) == TARGET ){
 int tjj     = i;
-pdE1[tjj].x = dGpol_dalpha2_ij * psA[j].br;
+pdE1[tjj].x = dGpol_dalpha2_ij * psA[jIdx].br;
 pdE1[tjj].y = sqrt(r2);
-pdE1[tjj].z =  (q2 * psA[j].q) / denominator;
 pdE1[tjj].w = -2.0f;
-} */
+}
+#endif

                    af.w                  += dGpol_dalpha2_ij * psA[j].br;
                    energy                += 0.5f*CDLJObcGbsa_energy;
@@ -361,7 +360,8 @@ pdE1[tjj].w = -2.0f;
                {
                    // No interactions in this block.
                }
-                else if (flags == 0xFFFFFFFF)
+                //else if (flags == 0xFFFFFFFF)
+                else if (flags)
 #endif
                {
                    // Compute all interactions within this block.
@@ -401,7 +401,6 @@ pdE1[tjj].w = -2.0f;
                        CDLJObcGbsa_energy     += factorX;
 #endif
                        dEdR                   *= invR * invR;
-//float dEdRx = dEdR;

                        // ObcGbsaForce1 part

@@ -423,7 +422,6 @@ pdE1[tjj].w = -2.0f;
                            dEdR               = 0.0f;
       			             CDLJObcGbsa_energy = 0.0f;
                            dGpol_dalpha2_ij   = 0.0f;
-//dEdRx = 0.0f;
                        }
                        psA[tj].fb             += dGpol_dalpha2_ij * br;
                        af.w                   += dGpol_dalpha2_ij * psA[tj].br;
@@ -431,22 +429,21 @@ pdE1[tjj].w = -2.0f;



-/*
+#ifdef DEBUG
 int jIdx = tj;
 if( i == TARGET ){
 int tjj     = y+jIdx;
-pdE1[tjj].x = dEdRx;
+pdE1[tjj].x = dGpol_dalpha2_ij * psA[jIdx].br;
 pdE1[tjj].y = sqrt(r2);
-pdE1[tjj].z = -dEdRx*dz;
 pdE1[tjj].w = 3.0f;
 }
 if( (y+jIdx) == TARGET ){
 int tjj     = i;
-pdE1[tjj].x = dEdRx;
+pdE1[tjj].x = dGpol_dalpha2_ij * br;
 pdE1[tjj].y = sqrt(r2);
-pdE1[tjj].z = dEdRx*dz;
 pdE1[tjj].w = -3.0f;
-} */
+}
+#endif



@@ -547,22 +544,21 @@ pdE1[tjj].w = -3.0f;
                            if (tgx == 0)
                                psA[j].fb += tempBuffer[threadIdx.x] + tempBuffer[threadIdx.x+16];

-/*
+#ifdef DEBUG
 int jIdx = j;
 if( i == TARGET ){
 int tjj     = y+jIdx;
-pdE1[tjj].x = dEdR;
-pdE1[tjj].y = r2;
-pdE1[tjj].z = CDLJObcGbsa_energy - (q2 * psA[jIdx].q) / denominator;
-pdE1[tjj].w = -4.7f;
+pdE1[tjj].x = dGpol_dalpha2_ij * psA[j].br;
+pdE1[tjj].y = sqrt(r2);
+pdE1[tjj].w = 4.0f;
 }
 if( (y+jIdx) == TARGET ){
 int tjj     = i;
-pdE1[tjj].x = dEdR;
-pdE1[tjj].y = r2;
-pdE1[tjj].z = CDLJObcGbsa_energy - (q2 * psA[jIdx].q) / denominator;
-pdE1[tjj].w = -4.7f;
-} */
+pdE1[tjj].x = tempBuffer[threadIdx.x] + tempBuffer[threadIdx.x+16];
+pdE1[tjj].y = sqrt(r2);
+pdE1[tjj].w = -4.0f;
+} 
+#endif



@@ -658,7 +654,7 @@ pdE1[tjj].w = -4.7f;
                        dEdR               = 0.0f;
 			               CDLJObcGbsa_energy = 0.0f;
                    }
-//float dEdRx = dEdR;
+
                    // ObcGbsaForce1 part

                    float alpha2_ij         = br * psA[tj].br;
@@ -679,25 +675,25 @@ pdE1[tjj].w = -4.7f;
                        dEdR               = 0.0f;
 			               CDLJObcGbsa_energy = 0.0f;
                        dGpol_dalpha2_ij   = 0.0f;
-//dEdRx = 0.0;
                    }

-/*
+#ifdef DEBUG
 int jIdx = tj;
 if( i == TARGET ){
 int tjj     = y+jIdx;
-pdE1[tjj].x = dEdRx;
+pdE1[tjj].x = dGpol_dalpha2_ij * psA[tj].br;
 pdE1[tjj].y = sqrt(r2);
-pdE1[tjj].z = dEdRx*dz;
+pdE1[tjj].z = dGpol_dalpha2_ij;
 pdE1[tjj].w = 6.0f;
 }
 if( (y+jIdx) == TARGET ){
 int tjj     = i;
-pdE1[tjj].x = dEdRx;
+pdE1[tjj].x = dGpol_dalpha2_ij * br;
 pdE1[tjj].y = sqrt(r2);
-pdE1[tjj].z = dEdRx*dz;
+pdE1[tjj].z = dGpol_dalpha2_ij;
 pdE1[tjj].w = -6.0f;
-}  */
+} 
+#endif




--- a/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateGBVISoftcoreBornSum.cu
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateGBVISoftcoreBornSum.cu
@@ -67,7 +67,7 @@ void gpuSetGBVISoftcoreParameters( freeEnergyGpuContext freeEnergyGpu, float inn

    static const float electricConstant               = -166.02691f;
    double tau                                        = ((1.0f/innerDielectric)-(1.0f/solventDielectric)); 
-    freeEnergyGpu->psSwitchDerivative                 = new CUDAStream<float>( numberOfParticles, 1, "SwitchDerivative");
+    freeEnergyGpu->psSwitchDerivative                 = new CUDAStream<float>( gpu->sim.paddedNumberOfAtoms, 1, "SwitchDerivative");
    freeEnergyGpu->freeEnergySim.pSwitchDerivative    = freeEnergyGpu->psSwitchDerivative->_pDevData;

    // create gpuGBVISoftcore, load parameters, and track minimum softcore value
@@ -92,6 +92,8 @@ void gpuSetGBVISoftcoreParameters( freeEnergyGpuContext freeEnergyGpu, float inn
            (*gpu->psGBVIData)[ii].y                 = scaledRadii[ii];
            (*gpu->psGBVIData)[ii].z                 = tau*gamma[ii];
            (*gpu->psGBVIData)[ii].w                 = bornRadiusScaleFactors[ii];
+            (*gpu->psBornRadii)[ii]                  = 0.0f;
+            (*freeEnergyGpu->psSwitchDerivative)[ii] = 0.0f;
    }

    // Dummy out extra atom data
@@ -101,6 +103,8 @@ void gpuSetGBVISoftcoreParameters( freeEnergyGpuContext freeEnergyGpu, float inn
        (*gpu->psGBVIData)[ii].y                 = 0.01f;
        (*gpu->psGBVIData)[ii].z                 = 0.0f;
        (*gpu->psGBVIData)[ii].w                 = 0.0f;
+        (*gpu->psBornRadii)[ii]                  = 0.0f;
+        (*freeEnergyGpu->psSwitchDerivative)[ii] = 0.0f;
    }

    gpu->sim.preFactor               = 2.0f*electricConstant*((1.0f/innerDielectric)-(1.0f/solventDielectric))*gpu->sim.forceConversionFactor;
@@ -141,6 +145,8 @@ void gpuSetGBVISoftcoreParameters( freeEnergyGpuContext freeEnergyGpu, float inn
    }

    gpu->psGBVIData->Upload();
+    gpu->psBornRadii->Upload();
+    freeEnergyGpu->psSwitchDerivative->Upload();

    return;
 }
@@ -170,7 +176,15 @@ void kClearGBVISoftcoreBornSum(gpuContext gpu) {
    kClearGBVISoftcoreBornSum_kernel<<<gpu->sim.blocks, 384>>>();
 }

-__global__ void kReduceGBVISoftcoreBornForces_kernel()
+__global__ 
+#if (__CUDA_ARCH__ >= 200)
+__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
+#elif (__CUDA_ARCH__ >= 120)
+__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
+#else
+__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
+#endif
+void kReduceGBVISoftcoreBornForces_kernel()
 {
    unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
    float energy     = 0.0f;
@@ -233,7 +247,15 @@ void kReduceGBVISoftcoreBornForces( freeEnergyGpuContext freeEnergyGpu )

 }

-__global__ void kReduceGBVISoftcoreBornSum_kernel()
+__global__ 
+#if (__CUDA_ARCH__ >= 200)
+__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
+#elif (__CUDA_ARCH__ >= 120)
+__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
+#else
+__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
+#endif
+void kReduceGBVISoftcoreBornSum_kernel()
 {
    unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
    
@@ -362,7 +384,15 @@ void kReduceGBVIBornSumQuinticScaling( freeEnergyGpuContext freeEnergyGpu )
    LAUNCHERROR("kReduceGBVIBornSumQuinticScaling_kernel");
 }

-__global__ void kReduceGBVIBornForcesQuinticScaling_kernel()
+__global__ 
+#if (__CUDA_ARCH__ >= 200)
+__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
+#elif (__CUDA_ARCH__ >= 120)
+__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
+#else
+__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
+#endif
+void kReduceGBVIBornForcesQuinticScaling_kernel()
 {
    unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
    float energy     = 0.0f;
@@ -418,9 +448,12 @@ __global__ void kReduceGBVIBornForcesQuinticScaling_kernel()

 void kReduceGBVIBornForcesQuinticScaling( freeEnergyGpuContext freeEnergyGpu )
 {
-    //printf("kReduceObcGbsaBornForces\n");
    gpuContext gpu = freeEnergyGpu->gpuContext;
-    kReduceGBVIBornForcesQuinticScaling_kernel<<<gpu->sim.blocks, gpu->sim.bf_reduce_threads_per_block>>>();
+
+    //(void) fprintf( stderr, "kReduceObcGbsaBornForces %6d blks=%u bsf_reduce_threads_per_block=%5u %5u   %5u %5u %5u\n",
+    //                 gpu->natoms, gpu->sim.blocks, gpu->sim.bsf_reduce_threads_per_block, gpu->sim.bf_reduce_threads_per_block,
+    //                 GF1XX_THREADS_PER_BLOCK, GT2XX_THREADS_PER_BLOCK, G8X_THREADS_PER_BLOCK); fflush( stderr );
+    kReduceGBVIBornForcesQuinticScaling_kernel<<<gpu->sim.blocks, gpu->sim.bsf_reduce_threads_per_block>>>();
    LAUNCHERROR("kReduceGBVIBornForcesQuinticScaling");
 }

@@ -441,7 +474,7 @@ void kPrintGBVISoftcore( freeEnergyGpuContext freeEnergyGpu, std::string callId,
    sigEps4->Download();

    (void) fprintf( log, "kPrintGBViSoftcore Cuda comp bR bF swd   prm    sigeps4\n" );
-    for( int ii = 0; ii < gpu->natoms; ii++ ){
+    for( int ii = 0; ii < gpu->sim.paddedNumberOfAtoms; ii++ ){
        (void) fprintf( log, "%6d %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e \n",
                        ii, 
                        gpu->psBornRadii->_pSysData[ii],
@@ -474,10 +507,28 @@ extern __global__ void kFindInteractionsWithinBlocksPeriodic_kernel(unsigned int
 void kCalculateGBVISoftcoreBornSum( freeEnergyGpuContext freeEnergyGpu )
 {

+    unsigned int threadsPerBlock;
+    static unsigned int threadsPerBlockPerMethod[3] = { 0, 0, 0 };
+    static unsigned int natoms[3]                   = { 0, 0, 0 };
    gpuContext gpu                                  = freeEnergyGpu->gpuContext;
+    unsigned int methodIndex                        = static_cast<unsigned int>(freeEnergyGpu->freeEnergySim.nonbondedMethod);
+
+    if( methodIndex > 2 ){
+        throw OpenMM::OpenMMException( "kCalculateGBVISoftcoreBornSum method index invalid." );
+    }
+
+    if( natoms[methodIndex] != gpu->natoms ){
+        unsigned int extra                    = methodIndex == 0 ? 0 : sizeof(float);
+        threadsPerBlockPerMethod[methodIndex] = std::min(getThreadsPerBlockFEP( freeEnergyGpu, (sizeof(Atom) + extra), gpu->sharedMemoryPerBlock ), gpu->sim.nonbond_threads_per_block );
+        natoms[methodIndex]                   = gpu->natoms;
+    }   
+    threadsPerBlock = threadsPerBlockPerMethod[methodIndex];
+

 #ifdef DEBUG
-fprintf( stderr, "kCalculateCDLJObcGbsaSoftcoreForces1 cutoff=%15.7e\n", gpu->sim.nonbondedCutoffSqr );
+fprintf( stderr, "kCalculateGBVISoftcoreBornSum blks=%u threadsPerBlock=%u %u  shMem=%u\n",
+         gpu->sim.nonbond_blocks, threadsPerBlock, gpu->sim.nonbond_threads_per_block, (sizeof(Atom)+sizeof(float))*threadsPerBlock ); fflush( stderr );
+
 int psize = gpu->sim.paddedNumberOfAtoms;
 CUDAStream<float4>* pdE1 = new CUDAStream<float4>( psize, 1, "pdE");
 CUDAStream<float4>* pdE2 = new CUDAStream<float4>( psize, 1, "pdE");
@@ -511,19 +562,19 @@ pdE2->Upload();

 #ifdef DEBUG
            if (gpu->bOutputBufferPerWarp){
-                kCalculateGBVISoftcoreN2ByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit, pdE1->_pDevData, pdE2->_pDevData);
+                kCalculateGBVISoftcoreN2ByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
+                        sizeof(Atom)*threadsPerBlock>>>(gpu->sim.pWorkUnit, pdE1->_pDevData, pdE2->_pDevData);
            } else {
-                kCalculateGBVISoftcoreN2BornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit, pdE1->_pDevData, pdE2->_pDevData);
+                kCalculateGBVISoftcoreN2BornSum_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
+                        sizeof(Atom)*threadsPerBlock>>>(gpu->sim.pWorkUnit, pdE1->_pDevData, pdE2->_pDevData);
            }
 #else
            if (gpu->bOutputBufferPerWarp){
-                kCalculateGBVISoftcoreN2ByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit);
+                kCalculateGBVISoftcoreN2ByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
+                        sizeof(Atom)*threadsPerBlock>>>(gpu->sim.pWorkUnit);
            } else {
-                kCalculateGBVISoftcoreN2BornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit);
+                kCalculateGBVISoftcoreN2BornSum_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
+                        sizeof(Atom)*threadsPerBlock>>>(gpu->sim.pWorkUnit);
            }
 #endif

@@ -536,31 +587,31 @@ pdE2->Upload();
            kFindBlocksWithInteractionsCutoff_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
            LAUNCHERROR("kFindBlocksWithInteractionsCutoff");
            compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
-            kFindInteractionsWithinBlocksCutoff_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                    sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
+            kFindInteractionsWithinBlocksCutoff_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
+                    sizeof(unsigned int)*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit);

 #ifdef DEBUG
            (void) fprintf( stderr, "kCalculateGBVISoftcoreBornSum cutoff=%15.7e warp=%u GridBoundingBox.length=%u interaction_blocks=%u interaction_threads_per_block=%u nonbond_blocks=%u nonbond_threads_per_block=%u\n",
                            gpu->sim.nonbondedCutoffSqr, gpu->bOutputBufferPerWarp, gpu->psGridBoundingBox->_length, gpu->sim.interaction_blocks,
-                            gpu->sim.interaction_threads_per_block, gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block ); fflush( stderr );
+                            gpu->sim.interaction_threads_per_block, gpu->sim.nonbond_blocks, threadsPerBlock ); fflush( stderr );


            if (gpu->bOutputBufferPerWarp)
-                kCalculateGBVISoftcoreCutoffByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, pdE1->_pDevData, pdE2->_pDevData);
+                kCalculateGBVISoftcoreCutoffByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
+                        (sizeof(Atom)+sizeof(float))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit, pdE1->_pDevData, pdE2->_pDevData);
            else
-                kCalculateGBVISoftcoreCutoffBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, pdE1->_pDevData, pdE2->_pDevData );
+                kCalculateGBVISoftcoreCutoffBornSum_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
+                        (sizeof(Atom)+sizeof(float))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit, pdE1->_pDevData, pdE2->_pDevData );

 #else
            if (gpu->bOutputBufferPerWarp)
-                kCalculateGBVISoftcoreCutoffByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
+                kCalculateGBVISoftcoreCutoffByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
+                        (sizeof(Atom)+sizeof(float))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit);
            else
-                kCalculateGBVISoftcoreCutoffBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit );
-            break;
+                kCalculateGBVISoftcoreCutoffBornSum_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
+                        (sizeof(Atom)+sizeof(float))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit );
 #endif
+            break;

        case FREE_ENERGY_PERIODIC:

@@ -569,23 +620,23 @@ pdE2->Upload();
            kFindBlocksWithInteractionsPeriodic_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
            LAUNCHERROR("kFindBlocksWithInteractionsPeriodic");
            compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
-            kFindInteractionsWithinBlocksPeriodic_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                    sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
+            kFindInteractionsWithinBlocksPeriodic_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
+                    sizeof(unsigned int)*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit);

 #ifdef DEBUG
            if (gpu->bOutputBufferPerWarp)
-                kCalculateGBVISoftcorePeriodicByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, pdE1->_pDevData, pdE2->_pDevData  );
+                kCalculateGBVISoftcorePeriodicByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
+                        (sizeof(Atom)+sizeof(float))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit, pdE1->_pDevData, pdE2->_pDevData  );
            else
-                kCalculateGBVISoftcorePeriodicBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, pdE1->_pDevData, pdE2->_pDevData  );
+                kCalculateGBVISoftcorePeriodicBornSum_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
+                        (sizeof(Atom)+sizeof(float))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit, pdE1->_pDevData, pdE2->_pDevData  );
 #else
            if (gpu->bOutputBufferPerWarp)
-                kCalculateGBVISoftcorePeriodicByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit );
+                kCalculateGBVISoftcorePeriodicByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
+                        (sizeof(Atom)+sizeof(float))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit );
            else
-                kCalculateGBVISoftcorePeriodicBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit );
+                kCalculateGBVISoftcorePeriodicBornSum_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
+                        (sizeof(Atom)+sizeof(float))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit );
 #endif
            break;


--- a/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateGBVISoftcoreBornSum.h
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateGBVISoftcoreBornSum.h
@@ -38,7 +38,7 @@
 #include "kCalculateGBVIAux.h"

 #undef TARGET
-//#define TARGET 5443
+//#define TARGET 39

 __global__ 
 #if (__CUDA_ARCH__ >= 200)

--- a/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateGBVISoftcoreForces2.cu
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateGBVISoftcoreForces2.cu
@@ -29,6 +29,10 @@
 * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
 * -------------------------------------------------------------------------- */

+#include "gputypes.h"
+#include "GpuFreeEnergyCudaKernels.h"
+#include "openmm/OpenMMException.h"
+
 #include <stdio.h>
 #include <cuda.h>
 #include <vector_functions.h>
@@ -36,10 +40,8 @@
 #include <string>
 #include <iostream>
 #include <fstream>
-using namespace std;

-#include "gputypes.h"
-#include "GpuFreeEnergyCudaKernels.h"
+using namespace std;

 struct Atom {
    float x;
@@ -126,8 +128,6 @@ __global__ void kCalculateGBVISoftcoreForces2a_kernel()

 }

-#define TARGET 0
-
 // Include versions of the kernels for N^2 calculations.

 #define METHOD_NAME(a, b) a##N2##b
@@ -163,18 +163,28 @@ __global__ void kCalculateGBVISoftcoreForces2a_kernel()

 void kCalculateGBVISoftcoreForces2( freeEnergyGpuContext freeEnergyGpu )
 {
+    unsigned int threadsPerBlock;
+    static unsigned int threadsPerBlockPerMethod[3] = { 0, 0, 0 };
+    static unsigned int natoms[3]                   = { 0, 0, 0 };
    gpuContext gpu                                  = freeEnergyGpu->gpuContext;
+    unsigned int methodIndex                        = static_cast<unsigned int>(freeEnergyGpu->freeEnergySim.nonbondedMethod);

-    /*fprintf( stderr,"kCalculateGBVISoftcoreForces2 nonbondedMethod=%d bornForce2_blocks=%d bornForce2_threads_per_block=%d\n",
-             freeEnergyGpu->freeEnergySim.nonbondedMethod,
-             gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block, gpu->psInteractionCount->_pSysData[0] ); fflush( stderr );
-    */
+    if( methodIndex > 2 ){
+        throw OpenMM::OpenMMException( "kCalculateGBVISoftcoreForces2 method index invalid." );
+    }   

-    switch (freeEnergyGpu->freeEnergySim.nonbondedMethod)
-    {
-        case FREE_ENERGY_NO_CUTOFF:
+    if( natoms[methodIndex] != gpu->natoms ){
+        unsigned int extra                    = methodIndex == 0 ? 0 : sizeof(float3);
+        threadsPerBlockPerMethod[methodIndex] = std::min(getThreadsPerBlockFEP( freeEnergyGpu, (sizeof(Atom) + extra), gpu->sharedMemoryPerBlock ), gpu->sim.nonbond_threads_per_block );
+        natoms[methodIndex]                   = gpu->natoms;
+    }
+    threadsPerBlock = threadsPerBlockPerMethod[methodIndex];

 #ifdef DEBUG
+    fprintf( stderr,"kCalculateGBVISoftcoreForces2 nonbondedMethod=%d bornForce2_blocks=%u threadsPerBlock=%u shMem=%u\n",
+             freeEnergyGpu->freeEnergySim.nonbondedMethod,
+             gpu->sim.bornForce2_blocks, threadsPerBlock, (sizeof(Atom)+sizeof(float3))*threadsPerBlock ); fflush( stderr );
+    
 int psize = 64;
 CUDAStream<float4>* pdE1 = new CUDAStream<float4>( psize, 1, "pdE");
 CUDAStream<float4>* pdE2 = new CUDAStream<float4>( psize, 1, "pdE");
@@ -194,11 +204,11 @@ pdE1->Upload();
 pdE2->Upload();

            if (gpu->bOutputBufferPerWarp)
-                kCalculateGBVISoftcoreN2ByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
-                        sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits, pdE1->_pDevData, pdE2->_pDevData);
+                kCalculateGBVISoftcoreN2ByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, threadsPerBlock,
+                        sizeof(Atom)*threadsPerBlock>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits, pdE1->_pDevData, pdE2->_pDevData);
            else
-                kCalculateGBVISoftcoreN2Forces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
-                        sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits, pdE1->_pDevData, pdE2->_pDevData);
+                kCalculateGBVISoftcoreN2Forces2_kernel<<<gpu->sim.bornForce2_blocks, threadsPerBlock,
+                        sizeof(Atom)*threadsPerBlock>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits, pdE1->_pDevData, pdE2->_pDevData);
 pdE1->Download();
 pdE2->Download();
 fprintf( stderr, "Pde\n" );
@@ -210,32 +220,37 @@ fprintf( stderr, "%4d %15.7e %15.7e %15.7e %15.7e    %15.7e %15.7e %15.7e %15.7e
            break;
 #endif

+    switch (freeEnergyGpu->freeEnergySim.nonbondedMethod)
+    {
+        case FREE_ENERGY_NO_CUTOFF:
+
+
            if (gpu->bOutputBufferPerWarp)
-                kCalculateGBVISoftcoreN2ByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
-                        sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pWorkUnit);
+                kCalculateGBVISoftcoreN2ByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, threadsPerBlock,
+                        sizeof(Atom)*threadsPerBlock>>>(gpu->sim.pWorkUnit);
            else
-                kCalculateGBVISoftcoreN2Forces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
-                        sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pWorkUnit);
+                kCalculateGBVISoftcoreN2Forces2_kernel<<<gpu->sim.bornForce2_blocks, threadsPerBlock,
+                        sizeof(Atom)*threadsPerBlock>>>(gpu->sim.pWorkUnit);
            break;

        case FREE_ENERGY_CUTOFF:

            if (gpu->bOutputBufferPerWarp)
-                kCalculateGBVISoftcoreCutoffByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
-                        (sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit );
+                kCalculateGBVISoftcoreCutoffByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, threadsPerBlock,
+                        (sizeof(Atom)+sizeof(float3))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit );
            else
-                kCalculateGBVISoftcoreCutoffForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
-                        (sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit );
+                kCalculateGBVISoftcoreCutoffForces2_kernel<<<gpu->sim.bornForce2_blocks, threadsPerBlock,
+                        (sizeof(Atom)+sizeof(float3))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit );
            break;

        case FREE_ENERGY_PERIODIC:

            if (gpu->bOutputBufferPerWarp)
-                kCalculateGBVISoftcorePeriodicByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
-                        (sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit );
+                kCalculateGBVISoftcorePeriodicByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, threadsPerBlock,
+                        (sizeof(Atom)+sizeof(float3))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit );
            else
-                kCalculateGBVISoftcorePeriodicForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
-                        (sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit );
+                kCalculateGBVISoftcorePeriodicForces2_kernel<<<gpu->sim.bornForce2_blocks, threadsPerBlock,
+                        (sizeof(Atom)+sizeof(float3))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit );
            break;

    }

--- a/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateGBVISoftcoreForces2.h
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateGBVISoftcoreForces2.h
@@ -49,8 +49,11 @@ void METHOD_NAME(kCalculateGBVISoftcore, Forces2_kernel)(unsigned int* workUnit
 {
 //METHOD_NAME(kCalculateGBVISoftcore, Forces2_kernel)(unsigned int* workUnit, float4* pdE1, float4* pdE2 )
    extern __shared__ Atom sA[];
-    unsigned int totalWarps   = cSim.bornForce2_blocks*cSim.bornForce2_threads_per_block/GRID;
+    unsigned int totalWarps   = gridDim.x*blockDim.x/GRID;
    unsigned int warp         = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
+
+    //unsigned int totalWarps   = cSim.bornForce2_blocks*cSim.bornForce2_threads_per_block/GRID;
+    //unsigned int warp         = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
    unsigned int numWorkUnits = cSim.pInteractionCount[0];
    unsigned int pos          = warp*numWorkUnits/totalWarps;
    unsigned int end          = (warp+1)*numWorkUnits/totalWarps;
@@ -191,7 +194,8 @@ if( (x+j) == TARGET ){
            {
                // No interactions in this block.
            }
-            else if (flags == 0xFFFFFFFF)
+            //else if (flags == 0xFFFFFFFF)
+            else if (flags)
 #endif
            {
                // Compute all interactions within this block.

--- a/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateObcGbsaSoftcoreBornSum.cu
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateObcGbsaSoftcoreBornSum.cu
@@ -275,6 +275,7 @@ void  gpuSetObcSoftcoreParameters( freeEnergyGpuContext freeEnergyGpu, float inn
        (*gpu->psObcData)[ii].x                        = radius[ii] - dielectricOffset;
        (*gpu->psObcData)[ii].y                        = scale[ii] * (*gpu->psObcData)[ii].x;
        (*gpu->psPosq4)[ii].w                          = charge[ii];
+        (*gpu->psBornRadii)[ii]                        = 0.0f;
        (*freeEnergyGpu->psNonPolarScalingFactors)[ii] = nonPolarScalingFactors[ii];
    }

@@ -302,12 +303,14 @@ void  gpuSetObcSoftcoreParameters( freeEnergyGpuContext freeEnergyGpu, float inn
        (*gpu->psObcData)[ii].x                         = 0.01f;
        (*gpu->psObcData)[ii].y                         = 0.01f;
        (*freeEnergyGpu->psNonPolarScalingFactors)[ii]  = 0.0f;
+        (*gpu->psBornRadii)[ii]                         = 0.0f;
    }

    // load data to board

    gpu->psObcData->Upload();
    gpu->psPosq4->Upload();
+    gpu->psBornRadii->Upload();
    freeEnergyGpu->psNonPolarScalingFactors->Upload();

    return;
@@ -330,7 +333,7 @@ void kPrintObcGbsaSoftcore( freeEnergyGpuContext freeEnergyGpu, std::string call
    sigEps4->Download();

    (void) fprintf( log, "BornSum Born radii & params\n" );
-    for( int ii = 0; ii < gpu->natoms; ii++ ){
+    for( int ii = 0; ii < gpu->sim.paddedNumberOfAtoms; ii++ ){
        //(void) fprintf( log, "%6d prm[%15.7e %15.7e %15.7e %15.7e] [%15.7e %15.7e %15.7e %15.7e] bR=%15.7e bF=%15.7e swDrv=%3.1f x[%8.3f %8.3f %8.3f %15.7f]\n",
        (void) fprintf( log, "%6d prm[%15.7e %15.7e %15.7e] sig/eps4[%15.7e %15.7e %15.7e %15.7e] bR=%15.7e bF=%15.7e\n",
                        ii,

--- a/plugins/freeEnergy/platforms/cuda/tests/TestCudaGBVISoftcoreForce.cpp
+++ b/plugins/freeEnergy/platforms/cuda/tests/TestCudaGBVISoftcoreForce.cpp
@@ -799,8 +799,8 @@ int main() {
 #ifdef USE_SOFTCORE
        DoubleVector lamda2;
        lamda2.push_back( 1.0 );
-        lamda2.push_back( 0.5 );
-        lamda2.push_back( 0.0 );
+        //lamda2.push_back( 0.5 );
+        //lamda2.push_back( 0.0 );
        if( lamda2.size() > 0 ){
            generativeArgumentMaps["lambda2"] = lamda2;
            inputArgumentMap["lambda2"]       = lamda2[0];

--- a/plugins/freeEnergy/platforms/reference/src/gbsa/CpuGBVISoftcore.cpp
+++ b/plugins/freeEnergy/platforms/reference/src/gbsa/CpuGBVISoftcore.cpp
@@ -23,6 +23,7 @@
 */

 #include <string.h>
+#include <stdio.h>
 #include <math.h>
 #include <sstream>
 #include <vector>
@@ -534,7 +535,6 @@ RealOpenMM CpuGBVISoftcore::computeBornEnergy( const vector<RealOpenMM>& bornRad

   --------------------------------------------------------------------------------------- */

-
 void CpuGBVISoftcore::computeBornForces( const vector<RealOpenMM>& bornRadii, vector<RealVec>& atomCoordinates,
                                         const RealOpenMM* partialCharges, vector<RealVec>& inputForces ){

@@ -569,6 +569,10 @@ void CpuGBVISoftcore::computeBornForces( const vector<RealOpenMM>& bornRadii, ve

    // first main loop

+#undef TARGET
+#define TARGET -1926
+RealOpenMMVector bornForcesT( numberOfAtoms, 0.0 );
+
    for( int atomI = 0; atomI < numberOfAtoms; atomI++ ){
 
        // partial of polar term wrt Born radius
@@ -606,6 +610,10 @@ void CpuGBVISoftcore::computeBornForces( const vector<RealOpenMM>& bornRadii, ve

                bornForces[atomJ] += dGpol_dalpha2_ij*bornRadii[atomI];

+if( atomJ == TARGET ){
+bornForcesT[atomI] = dGpol_dalpha2_ij*bornRadii[atomI];
+}
+
                deltaX            *= dGpol_dr;
                deltaY            *= dGpol_dr;
                deltaZ            *= dGpol_dr;
@@ -620,6 +628,9 @@ void CpuGBVISoftcore::computeBornForces( const vector<RealOpenMM>& bornRadii, ve

           }
           bornForces[atomI] += dGpol_dalpha2_ij*bornRadii[atomJ];
+if( atomI == TARGET ){
+bornForcesT[atomJ] = dGpol_dalpha2_ij*bornRadii[atomJ];
+}

       }
    }
@@ -634,6 +645,25 @@ void CpuGBVISoftcore::computeBornForces( const vector<RealOpenMM>& bornRadii, ve
    RealOpenMMVector& switchDeriviative                 = getSwitchDeriviative();
    const RealOpenMMVector& bornRadiusScaleFactors      = gbviParameters->getBornRadiusScaleFactors();

+if( 0 ){
+double bF = 0.0;
+int count = 0;
+fprintf( stderr, "PdeRef %d\n", TARGET );
+RealOpenMM tau      = static_cast<RealOpenMM>(gbviParameters->getTau());
+for( int atomI = 0; atomI < numberOfAtoms; atomI++ ){
+    bF += bornForcesT[atomI];
+    if( fabs( bornForcesT[atomI] ) > 0.0 ){
+        count++;
+        fprintf( stderr, "%6d %6d bFAdd=%15.7e bR=%15.7e\n", atomI, count, tau*bornForcesT[atomI], bornRadii[atomI] );
+    }
+}
+RealOpenMM  ratio   = (atomicRadii[TARGET]/bornRadii[TARGET]);
+RealOpenMM    bF1   = bF + (three*gammaParameters[TARGET]*ratio*ratio*ratio)/bornRadii[TARGET];
+RealOpenMM b2       = bornRadii[TARGET]*bornRadii[TARGET];
+bF1                *= switchDeriviative[TARGET]*oneThird*b2*b2;
+fprintf( stderr, "sumbF Ref %6d %15.7e %15.7e %15.7e %15.7e\n", TARGET, bF, tau*bF1, bornRadii[TARGET], switchDeriviative[TARGET] );
+}
+
    // ---------------------------------------------------------------------------------------

    for( int atomI = 0; atomI < numberOfAtoms; atomI++ ){
@@ -708,7 +738,7 @@ void CpuGBVISoftcore::computeBornForces( const vector<RealOpenMM>& bornRadii, ve

    }

-    //printGbvi( atomCoordinates, partialCharges, bornRadii,bornForces, forces, "GBVI softcore post loop2", stderr );
+    //printGbvi( atomCoordinates, partialCharges, bornRadii,bornForces, forces, "Reference: GBVI softcore post loop2", stderr );

    // apply prefactor tau = (1/diel_solute - 1/diel_solvent)

@@ -735,7 +765,8 @@ void CpuGBVISoftcore::computeBornForces( const vector<RealOpenMM>& bornRadii, ve

    --------------------------------------------------------------------------------------- */

-void CpuGBVISoftcore::printGbvi( const std::vector<OpenMM::RealVec>& atomCoordinates, const RealOpenMMVector& partialCharges,
+void CpuGBVISoftcore::printGbvi( const std::vector<OpenMM::RealVec>& atomCoordinates,
+                                 const RealOpenMM* partialCharges,
                                 const RealOpenMMVector& bornRadii,
                                 const RealOpenMMVector& bornForces,
                                 const std::vector<OpenMM::RealVec>& forces,

--- a/plugins/freeEnergy/platforms/reference/src/gbsa/CpuGBVISoftcore.h
+++ b/plugins/freeEnergy/platforms/reference/src/gbsa/CpuGBVISoftcore.h
@@ -317,7 +317,7 @@ class CpuGBVISoftcore {
    
        --------------------------------------------------------------------------------------- */
    
-    void printGbvi( const std::vector<OpenMM::RealVec>& atomCoordinates, const RealOpenMMVector& partialCharges,
+    void printGbvi( const std::vector<OpenMM::RealVec>& atomCoordinates, const RealOpenMM* partialCharges,
                    const RealOpenMMVector& bornRadii,
                    const RealOpenMMVector& bornForces,
                    const std::vector<OpenMM::RealVec>& forces,