Extened Free energy plugin to allow cutoffs; cleaned up code and added tests

bc85b9f0 · Mark Friedrichs · d4441c15 · bc85b9f0 · d4441c15 · bc85b9f0
Commit bc85b9f0 authored Oct 24, 2011 by Mark Friedrichs
20 changed files
--- a/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateCDLJObcGbsaSoftcoreForces1.h
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateCDLJObcGbsaSoftcoreForces1.h
--- a/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateGBVISoftcoreAux.h
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateGBVISoftcoreAux.h
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Mark Friedrichs                                                   *
- * Contributors:                                                              *
- *                                                                            *
- * Permission is hereby granted, free of charge, to any person obtaining a    *
- * copy of this software and associated documentation files (the "Software"), *
- * to deal in the Software without restriction, including without limitation  *
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
- * and/or sell copies of the Software, and to permit persons to whom the      *
- * Software is furnished to do so, subject to the following conditions:       *
- *                                                                            *
- * The above copyright notice and this permission notice shall be included in *
- * all copies or substantial portions of the Software.                        *
- *                                                                            *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
- * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
- * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
- * -------------------------------------------------------------------------- */
-
-#ifndef __Gpu_GBVI_SOFTCORE_AUX_H__
-#define __Gpu_GBVI_SOFTCORE_AUX_H__
-
-/**
- * This file contains subroutines used in evaluating quantities associated w/ the GB/VI function
- */
-
-__device__ float getGBVI_L( float r, float x, float S )
-{
-
-   float rInv   = 1.0f/r;
-
-   float xInv   = 1.0f/x;
-   float xInv2  = xInv*xInv;
-   float diff2  = (r + S)*(r - S);
-
-   return (1.5f*xInv2)*( (0.25f*rInv) - (xInv/3.0f) + (0.125f*diff2*xInv2*rInv) );
-}
-
-__device__ float getGBVI_Volume( float r_ij, float R, float S )
-{
-
-     float upperBound        = r_ij + S; 
-     float rdiffS            = r_ij - S; 
-     float lowerBound        = R > rdiffS ? R : rdiffS;
-     float L_upper           = getGBVI_L( r_ij, upperBound, S );
-     float L_lower           = getGBVI_L( r_ij, lowerBound, S );
-     float mask              = r_ij < (R - S) ? 0.0f : 1.0f;  
-     float addOn             = r_ij < (S - R) ? (1.0f/(R*R*R)) : 0.0f;  
-
-     return (mask*( L_upper - L_lower ) + addOn);
-
-}
-
-__device__ float getGBVI_dL_dr( float r, float x, float S )
-{
-
-   float rInv   = 1.0f/r;
-   float rInv2  = rInv*rInv;
-
-   float xInv   = 1.0f/x;
-   float xInv2  = xInv*xInv;
-   float xInv3  = xInv2*xInv;
-   
-   float diff2  = (r + S)*(r - S);
-
-   return ( (-1.5f*xInv2*rInv2)*( 0.25f + 0.125f*diff2*xInv2 ) + 0.375f*xInv3*xInv );
-   //return 0.0f;
-
-}
-
-__device__ float getGBVI_dL_dx( float r, float x, float S )
-{
-
-   float rInv   = 1.0f/r;
-
-   float xInv   = 1.0f/x;
-   float xInv2  = xInv*xInv;
-   float xInv3  = xInv2*xInv;
-
-   float diff   = (r + S)*(r - S);
-
-   return ( (-1.5f*xInv3)*( (0.5f*rInv) - xInv + (0.5f*diff*xInv2*rInv) ));
-
-
-}
-
-__device__ float getGBVI_dE2( float r, float R, float S, float bornForce )
-{
-
-    float diff              = S - R;
-    float absDiff           = fabsf( S - R );
-    float dE                = getGBVI_dL_dr( r, r+S, S ) + getGBVI_dL_dx( r, r+S, S );
-    float mask;
-    float lowerBound;
-    if( (R > (r - S)) && (absDiff < r) ){
-        mask       = 0.0f;
-        lowerBound = R;
-    } else {
-        mask       = 1.0f;
-        lowerBound = (r - S);  
-    }   
-    dE                    -= getGBVI_dL_dr( r, lowerBound, S ) + mask*getGBVI_dL_dx( r, lowerBound, S );
-    dE                     = (absDiff >= r) && r >= diff ? 0.0f : dE; 
-    dE                    *= ( (r > 1.0e-08f) ? (bornForce/r) : 0.0f);
-
-    return (-dE);
-
-}
-
-__device__ float getGBVIBornForce2( float bornRadius, float R, float bornForce, float gamma )
-{ 
-    float ratio                     = (R/bornRadius);
-    float returnBornForce           = bornForce + (3.0f*gamma*ratio*ratio*ratio)/bornRadius; // 'cavity' term
-    float br2                       = bornRadius*bornRadius;
-          returnBornForce          *= (1.0f/3.0f)*br2*br2;
-
-   return returnBornForce;
-
-}
-
-#endif // __Gpu_GBVI_SOFTCORE_AUX_H__
--- a/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateGBVISoftcoreBornSum.cu
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateGBVISoftcoreBornSum.cu
--- a/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateGBVISoftcoreBornSum.h
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateGBVISoftcoreBornSum.h
@@ -37,7 +37,22 @@

 #include "kCalculateGBVIAux.h"

-__global__ void METHOD_NAME(kCalculateGBVISoftcore, BornSum_kernel)(unsigned int* workUnit)
+#undef TARGET
+//#define TARGET 5443
+
+__global__ 
+#if (__CUDA_ARCH__ >= 200)
+__launch_bounds__(GF1XX_NONBOND_THREADS_PER_BLOCK, 1)
+#elif (__CUDA_ARCH__ >= 120)
+__launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1)
+#else
+__launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
+#endif
+#ifdef DEBUG
+void METHOD_NAME(kCalculateGBVISoftcore, BornSum_kernel)(unsigned int* workUnit, float4* pdE1, float4* pdE2 )
+#else
+void METHOD_NAME(kCalculateGBVISoftcore, BornSum_kernel)(unsigned int* workUnit)
+#endif
 {
    extern __shared__ Atom sA[];

@@ -47,8 +62,6 @@ __global__ void METHOD_NAME(kCalculateGBVISoftcore, BornSum_kernel)(unsigned int
    unsigned int pos          = warp*numWorkUnits/totalWarps;
    unsigned int end          = (warp+1)*numWorkUnits/totalWarps;

-//    int end = workUnits / gridDim.x;
-//    int pos = end - (threadIdx.x >> GRIDBITS) - 1;
 #ifdef USE_CUTOFF
    float* tempBuffer         = (float*) &sA[cSim.nonbond_threads_per_block];
 #endif
@@ -85,7 +98,7 @@ __global__ void METHOD_NAME(kCalculateGBVISoftcore, BornSum_kernel)(unsigned int
            sA[threadIdx.x].r                       = ar.x;
            sA[threadIdx.x].sr                      = ar.y;
            sA[threadIdx.x].bornRadiusScaleFactor   = ar.w;
-            apos.w                                  = 0.0f;
+            float bSum                              = 0.0f;

            for (unsigned int j = 0; j < GRID; j++)
            {
@@ -98,55 +111,80 @@ __global__ void METHOD_NAME(kCalculateGBVISoftcore, BornSum_kernel)(unsigned int
                dz                                 -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
 #endif
                r2                                  = dx * dx + dy * dy + dz * dz;
-#if defined USE_PERIODIC
-                if (i < cSim.atoms && x+j < cSim.atoms && r2 < cSim.nonbondedCutoffSqr)
-#elif defined USE_CUTOFF
-                if (r2 < cSim.nonbondedCutoffSqr)
+#if defined USE_CUTOFF
+                if (i < cSim.atoms && x+j < cSim.atoms && r2 < cSim.nonbondedCutoffSqr && j != tgx)
+#else
+                if (i < cSim.atoms && x+j < cSim.atoms && j != tgx )
 #endif
                {
-                    r                       = sqrt(r2);
-                    if ((j != tgx) )
-                    {
-                        apos.w             += psA[j].bornRadiusScaleFactor*getGBVI_Volume( r, ar.x, psA[j].sr );
-                    }
+                    bSum  += psA[j].bornRadiusScaleFactor*getGBVI_Volume( sqrt(r2), ar.x, psA[j].sr );
+
+#ifdef DEBUG
+int jIdx = j;
+if( i == TARGET ){
+int tjj     = y+jIdx;
+pdE1[tjj].x = psA[jIdx].bornRadiusScaleFactor*getGBVI_Volume( sqrt(r2), ar.x, psA[jIdx].sr );
+pdE1[tjj].y = psA[jIdx].bornRadiusScaleFactor;
+pdE1[tjj].z = ar.x;
+pdE1[tjj].w = 1.0f;
+pdE2[tjj].x = sqrt(r2);
+pdE2[tjj].y = psA[jIdx].sr;
+pdE2[tjj].z = ar.x;
+pdE2[tjj].w = 1.0f;
+}
+if( (y+jIdx) == TARGET ){
+int tjj     = i;
+pdE1[tjj].x =  psA[jIdx].bornRadiusScaleFactor*getGBVI_Volume( sqrt(r2), ar.x, psA[jIdx].sr );
+pdE1[tjj].y =  psA[jIdx].bornRadiusScaleFactor;
+pdE1[tjj].z = ar.x;
+pdE1[tjj].w = -1.0f;
+} 
+#endif
+
                }
            }

            // Write results
 #ifdef USE_OUTPUT_BUFFER_PER_WARP
            unsigned int offset = x + tgx + warp*cSim.stride;
-            cSim.pBornSum[offset] += apos.w;
+            cSim.pBornSum[offset] += bSum;
 #else
            unsigned int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
-            cSim.pBornSum[offset] = apos.w;
+            cSim.pBornSum[offset] = bSum;
 #endif
-        }
-        else        // 100% utilization
-        {
+
+
+        } else {
+
            // Read fixed atom data into registers and GRF
            unsigned int j                              = y + tgx;
            unsigned int i                              = x + tgx;

            float4 temp                                 = cSim.pPosq[j];
            float4 temp1                                = cSim.pGBVIData[j];
+
            float4 apos                                 = cSim.pPosq[i];        // Local atom x, y, z, sum
            float4 ar                                   = cSim.pGBVIData[i];    // Local atom vr, sr
+
            sA[threadIdx.x].x                           = temp.x;
            sA[threadIdx.x].y                           = temp.y;
            sA[threadIdx.x].z                           = temp.z;
+
            sA[threadIdx.x].r                           = temp1.x;
            sA[threadIdx.x].sr                          = temp1.y;
            sA[threadIdx.x].bornRadiusScaleFactor       = temp1.w;
-            sA[threadIdx.x].sum             = apos.w    = 0.0f;
+
+            sA[threadIdx.x].sum                         = 0.0f;
+            apos.w                                      = 0.0f;

 #ifdef USE_CUTOFF
-            //unsigned int flags = cSim.pInteractionFlag[pos + (blockIdx.x*workUnits)/gridDim.x];
            unsigned int flags = cSim.pInteractionFlag[pos];
            if (flags == 0)
            {
                // No interactions in this block.
            }
            else if (flags == 0xFFFFFFFF)
+            //else if (flags )
 #endif
            {
                // Compute all interactions within this block.
@@ -162,10 +200,10 @@ __global__ void METHOD_NAME(kCalculateGBVISoftcore, BornSum_kernel)(unsigned int
                    dz                     -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
 #endif
                    r2                      = dx * dx + dy * dy + dz * dz;
-#ifdef USE_PERIODIC
+#ifdef USE_CUTOFF
                    if (i < cSim.atoms && y+tj < cSim.atoms && r2 < cSim.nonbondedCutoffSqr)
-#elif defined USE_CUTOFF
-                    if (r2 < cSim.nonbondedCutoffSqr)
+#else
+                    if (i < cSim.atoms && y+tj < cSim.atoms )
 #endif
                    {
                        r                       = sqrt(r2);
@@ -175,9 +213,37 @@ __global__ void METHOD_NAME(kCalculateGBVISoftcore, BornSum_kernel)(unsigned int

                        apos.w                 += psA[tj].bornRadiusScaleFactor*getGBVI_Volume( r, ar.x, psA[tj].sr );
                        psA[tj].sum            += ar.w*getGBVI_Volume( r, psA[tj].r, ar.y );
+
+#ifdef DEBUG
+int jIdx = tj;
+if( i == TARGET ){
+
+int tjj     = y+jIdx;
+pdE1[tjj].x = psA[jIdx].bornRadiusScaleFactor*getGBVI_Volume( r, ar.x, psA[jIdx].sr );
+pdE1[tjj].y = psA[jIdx].bornRadiusScaleFactor;
+pdE1[tjj].z = ar.x;
+pdE1[tjj].w = 2.0f;
+
+float R =  ar.x;
+float S =  psA[tj].sr;
+pdE2[tjj].x = getGBVI_L( r, (r + S), S );
+pdE2[tjj].y = -getGBVI_L( r, (r - S), S );
+pdE2[tjj].z = -getGBVI_L( r, R, S );
+pdE2[tjj].w = (1.0f/(R*R*R));
+
+}
+if( (y+jIdx) == TARGET ){
+int tjj     = i;
+pdE1[tjj].x = ar.w*getGBVI_Volume( r, psA[jIdx].r, ar.y );
+pdE1[tjj].y = ar.w;
+pdE1[tjj].z = psA[jIdx].r;
+pdE1[tjj].w = -2.0f;
+}
+#endif
                    }
                    tj = (tj - 1) & (GRID - 1);
                }
+
            }
 #ifdef USE_CUTOFF
            else
@@ -198,14 +264,15 @@ __global__ void METHOD_NAME(kCalculateGBVISoftcore, BornSum_kernel)(unsigned int
                        dz                     -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
 #endif
                        r2                      = dx * dx + dy * dy + dz * dz;
-#ifdef USE_PERIODIC
+#ifdef USE_CUTOFF
                        if (i < cSim.atoms && y+j < cSim.atoms && r2 < cSim.nonbondedCutoffSqr)
-#elif defined USE_CUTOFF
-                        if (r2 < cSim.nonbondedCutoffSqr)
+#else
+                        if (i < cSim.atoms && y+j < cSim.atoms)
 #endif
                        {
                            r                       = sqrt(r2);
-                            tempBuffer[threadIdx.x] = ar.w*getGBVI_Volume( r, psA[tj].r, ar.y );
+                            tempBuffer[threadIdx.x] = ar.w*getGBVI_Volume( r, psA[j].r, ar.y );
+                            apos.w                 += psA[j].bornRadiusScaleFactor*getGBVI_Volume( r, ar.x, psA[j].sr );
                        }

                        // Sum the terms.
@@ -226,6 +293,7 @@ __global__ void METHOD_NAME(kCalculateGBVISoftcore, BornSum_kernel)(unsigned int
 #endif

            // Write results
+
 #ifdef USE_OUTPUT_BUFFER_PER_WARP
            unsigned int offset = x + tgx + warp*cSim.stride;
            cSim.pBornSum[offset] += apos.w;
@@ -237,6 +305,7 @@ __global__ void METHOD_NAME(kCalculateGBVISoftcore, BornSum_kernel)(unsigned int
            offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
            cSim.pBornSum[offset] = sA[threadIdx.x].sum;
 #endif
+
        }

        pos++;

--- a/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateGBVISoftcoreForces2.cu
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateGBVISoftcoreForces2.cu
@@ -57,22 +57,14 @@ struct Atom {

 static __constant__ cudaGmxSimulation cSim;

-void SetCalculateGBVISoftcoreForces2Sim(gpuContext gpu)
+void SetCalculateGBVISoftcoreForces2Sim( freeEnergyGpuContext gpu)
 {
    cudaError_t status;
-    status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));     
-    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
-    //(void) fprintf( stderr, "SetCalculateGBVISoftcoreForces2Sim called.\n" );
+    status = cudaMemcpyToSymbol(cSim, &gpu->gpuContext->sim, sizeof(cudaGmxSimulation));     
+    RTERROR(status, "cudaMemcpyToSymbol: SetCalculateGBVISoftcoreForces2Sim copy to cSim failed");
 }

-void GetCalculateGBVISoftcoreForces2Sim(gpuContext gpu)
-{
-    cudaError_t status;
-    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));     
-    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
-}
-
-#include "kCalculateGBVISoftcoreAux.h"
+#include "kCalculateGBVIAux.h"

 /**
 * This file contains the kernel for evalauating the second stage of GBSA.  It is included
@@ -134,6 +126,8 @@ __global__ void kCalculateGBVISoftcoreForces2a_kernel()

 }

+#define TARGET 0
+
 // Include versions of the kernels for N^2 calculations.

 #define METHOD_NAME(a, b) a##N2##b
@@ -167,78 +161,83 @@ __global__ void kCalculateGBVISoftcoreForces2a_kernel()
 #define METHOD_NAME(a, b) a##PeriodicByWarp##b
 #include "kCalculateGBVISoftcoreForces2.h"

-void kCalculateGBVISoftcoreForces2(gpuContext gpu)
+void kCalculateGBVISoftcoreForces2( freeEnergyGpuContext freeEnergyGpu )
 {
-    //printf("kCalculateGBVISoftcoreForces2\n");
-    size_t numWithInteractions;
-
-#if 0
-    kClearForces(gpu);
-    (void) fprintf( stderr, "\nkCalculateGBVISoftcoreForces2: cleared force prior loop2\n" ); (void) fflush( stderr );
-    kCalculateGBVISoftcoreForces2a_kernel<<<gpu->sim.blocks, 384>>>();
-    (void) fprintf( stderr, "\ncalled kCalculateGBVISoftcoreForces2a\n" ); (void) fflush( stderr );
-    return;
-#endif
+    gpuContext gpu = freeEnergyGpu->gpuContext;

-    switch (gpu->sim.nonbondedMethod)
+    /*fprintf( stderr,"kCalculateGBVISoftcoreForces2 nonbondedMethod=%d bornForce2_blocks=%d bornForce2_threads_per_block=%d\n",
+             freeEnergyGpu->freeEnergySim.nonbondedMethod,
+             gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block, gpu->psInteractionCount->_pSysData[0] ); fflush( stderr );
+    */
+
+    switch (freeEnergyGpu->freeEnergySim.nonbondedMethod)
    {
-        case NO_CUTOFF:
+        case FREE_ENERGY_NO_CUTOFF:
+
+#ifdef DEBUG
+int psize = 64;
+CUDAStream<float4>* pdE1 = new CUDAStream<float4>( psize, 1, "pdE");
+CUDAStream<float4>* pdE2 = new CUDAStream<float4>( psize, 1, "pdE");
+for( int ii = 0; ii < 32; ii++ ){
+
+pdE1->_pSysData[ii].x = 0.0f;
+pdE1->_pSysData[ii].y = 0.0f;
+pdE1->_pSysData[ii].z = 0.0f;
+pdE1->_pSysData[ii].w = 0.0f;
+
+pdE2->_pSysData[ii].x = 0.0f;
+pdE2->_pSysData[ii].y = 0.0f;
+pdE2->_pSysData[ii].z = 0.0f;
+pdE2->_pSysData[ii].w = 0.0f;
+}
+pdE1->Upload();
+pdE2->Upload();
+
            if (gpu->bOutputBufferPerWarp)
                kCalculateGBVISoftcoreN2ByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
-                        sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits);
+                        sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits, pdE1->_pDevData, pdE2->_pDevData);
            else
                kCalculateGBVISoftcoreN2Forces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
-                        sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits);
-//(void) fprintf( stderr, "\nkCalculateGBVIForces2: Born radii/force forces warp=%u\n", gpu->bOutputBufferPerWarp ); (void) fflush( stderr );
-#define GBVI_DEBUG 0
-#if ( GBVI_DEBUG == 1 )
-                (void) fprintf( stderr, "\nkCalculateGBVISoftcoreForces2: Born radii/force forces:\n" ); (void) fflush( stderr );
-                gpu->psBornForce->Download();
-                gpu->psForce4->Download();
-                for( int ii = 0; ii < gpu->natoms; ii++ ){
-                    (void) fprintf( stderr, "%d bF=%14.6e Fa[%14.6e %14.6e %14.6e] Fb[%14.6e %14.6e %14.6e]\n",
-                                    ii,
-                                    gpu->psBornForce->_pSysStream[0][ii],
-                                    gpu->psForce4->_pSysStream[0][ii].x,
-                                    gpu->psForce4->_pSysStream[0][ii].y,
-                                    gpu->psForce4->_pSysStream[0][ii].z,
-                                    gpu->psForce4->_pSysStream[1][ii].x,
-                                    gpu->psForce4->_pSysStream[1][ii].y,
-                                    gpu->psForce4->_pSysStream[1][ii].z
-                                  );  
-                }   
-                for( int ii = 0; ii < gpu->sim.paddedNumberOfAtoms*2; ii++ ){
-                    (void) fprintf( stderr, "%d bF=%14.6e Fa[%14.6e %14.6e %14.6e %14.6e]\n",
-                                    ii,
-                                    gpu->psBornForce->_pSysStream[0][ii],
-                                    gpu->psForce4->_pSysStream[0][ii].x,
-                                    gpu->psForce4->_pSysStream[0][ii].y,
-                                    gpu->psForce4->_pSysStream[0][ii].z,
-                                    gpu->psForce4->_pSysStream[0][ii].w
-                                  );  
-                }   
+                        sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits, pdE1->_pDevData, pdE2->_pDevData);
+pdE1->Download();
+pdE2->Download();
+fprintf( stderr, "Pde\n" );
+for( int ii = 0; ii < 32; ii++ ){
+fprintf( stderr, "%4d %15.7e %15.7e %15.7e %15.7e    %15.7e %15.7e %15.7e %15.7e\n", ii, 
+         pdE1->_pSysData[ii].x, pdE1->_pSysData[ii].y, pdE1->_pSysData[ii].z, pdE1->_pSysData[ii].w,
+         pdE2->_pSysData[ii].x, pdE2->_pSysData[ii].y, pdE2->_pSysData[ii].z, pdE2->_pSysData[ii].w );
+}
+            break;
 #endif
-#undef GBVI_DEBUG

+            if (gpu->bOutputBufferPerWarp)
+                kCalculateGBVISoftcoreN2ByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
+                        sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pWorkUnit);
+            else
+                kCalculateGBVISoftcoreN2Forces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
+                        sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pWorkUnit);
            break;
-        case CUTOFF:
-            numWithInteractions = gpu->psInteractionCount->_pSysData[0];
+
+        case FREE_ENERGY_CUTOFF:
+
            if (gpu->bOutputBufferPerWarp)
                kCalculateGBVISoftcoreCutoffByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
-                        (sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
+                        (sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit );
            else
                kCalculateGBVISoftcoreCutoffForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
-                        (sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
+                        (sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit );
            break;
-        case PERIODIC:
-            numWithInteractions = gpu->psInteractionCount->_pSysData[0];
+
+        case FREE_ENERGY_PERIODIC:
+
            if (gpu->bOutputBufferPerWarp)
                kCalculateGBVISoftcorePeriodicByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
-                        (sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
+                        (sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit );
            else
                kCalculateGBVISoftcorePeriodicForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
-                        (sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
+                        (sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit );
            break;
+
    }
    LAUNCHERROR("kCalculateGBVISoftcoreForces2");
 }
--- a/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateGBVISoftcoreForces2.h
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateGBVISoftcoreForces2.h
@@ -29,19 +29,29 @@
 * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
 * -------------------------------------------------------------------------- */

-#include "kCalculateGBVISoftcoreAux.h"
+#include "kCalculateGBVIAux.h"

 /**
- * This file contains the kernel for evalauating the second stage of GBSA.  It is included
+ * This file contains the kernel for evaluating the second stage of GB/VI.  It is included
 * several times in kCalculateGBVIForces2.cu with different #defines to generate
 * different versions of the kernels.
 */

-__global__ void METHOD_NAME(kCalculateGBVISoftcore, Forces2_kernel)(unsigned int* workUnit, unsigned int numWorkUnits)
+__global__
+#if (__CUDA_ARCH__ >= 200)
+__launch_bounds__(GF1XX_BORNFORCE2_THREADS_PER_BLOCK, 1)
+#elif (__CUDA_ARCH__ >= 120)
+__launch_bounds__(GT2XX_BORNFORCE2_THREADS_PER_BLOCK, 1)
+#else
+__launch_bounds__(G8X_BORNFORCE2_THREADS_PER_BLOCK, 1)
+#endif
+void METHOD_NAME(kCalculateGBVISoftcore, Forces2_kernel)(unsigned int* workUnit )
 {
+//METHOD_NAME(kCalculateGBVISoftcore, Forces2_kernel)(unsigned int* workUnit, float4* pdE1, float4* pdE2 )
    extern __shared__ Atom sA[];
    unsigned int totalWarps   = cSim.bornForce2_blocks*cSim.bornForce2_threads_per_block/GRID;
    unsigned int warp         = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
+    unsigned int numWorkUnits = cSim.pInteractionCount[0];
    unsigned int pos          = warp*numWorkUnits/totalWarps;
    unsigned int end          = (warp+1)*numWorkUnits/totalWarps;
 #ifdef USE_CUTOFF
@@ -63,11 +73,17 @@ __global__ void METHOD_NAME(kCalculateGBVISoftcore, Forces2_kernel)(unsigned int
        float fb                        = cSim.pBornForce[i];
        unsigned int tbx                = threadIdx.x - tgx;
        unsigned int tj                 = tgx;
+
        Atom* psA                       = &sA[tbx];
+        sA[threadIdx.x].fx              = 0.0f;
+        sA[threadIdx.x].fy              = 0.0f;
+        sA[threadIdx.x].fz              = 0.0f;
+
        float3 af;
-        sA[threadIdx.x].fx = af.x   = 0.0f;
-        sA[threadIdx.x].fy = af.y   = 0.0f;
-        sA[threadIdx.x].fz = af.z   = 0.0f;
+        af.x                            = 0.0f;
+        af.y                            = 0.0f;
+        af.z                            = 0.0f;
+
        if (x == y) // Handle diagonals uniquely at 50% efficiency
        {
            // Read fixed atom data into registers and GRF
@@ -82,9 +98,11 @@ __global__ void METHOD_NAME(kCalculateGBVISoftcore, Forces2_kernel)(unsigned int

            for (unsigned int j = (tgx+1)&(GRID-1); j != tgx; j = (j+1)&(GRID-1))
            {
+
                float dx                = psA[j].x - apos.x;
                float dy                = psA[j].y - apos.y;
                float dz                = psA[j].z - apos.z;
+
 #ifdef USE_PERIODIC
                dx                     -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
                dy                     -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
@@ -95,31 +113,43 @@ __global__ void METHOD_NAME(kCalculateGBVISoftcore, Forces2_kernel)(unsigned int

                // Atom I Born forces and sum
                float dE                = psA[j].bornRadiusScaleFactor*getGBVI_dE2( r, ar.x, psA[j].sr, fb );
-               
-#if defined USE_PERIODIC
-                if (i >= cSim.atoms || x+j >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
-                {
-                    dE              = 0.0f;
-                }
-#endif
 #if defined USE_CUTOFF
-                if (r2 > cSim.nonbondedCutoffSqr)
+                if (i >= cSim.atoms || x+j >= cSim.atoms || (i == (x+j)) || r2 > cSim.nonbondedCutoffSqr)
+#else
+                if(i >= cSim.atoms || x+j >= cSim.atoms || (i == (x+j)) )
+#endif
                {
                    dE              = 0.0f;
                }
-#endif
+
+/*
+if( i == TARGET ){
+    pdE1[x+j].x = dE;
+    pdE1[x+j].y = psA[j].bornRadiusScaleFactor;
+    pdE1[x+j].z = r;
+    pdE1[x+j].w = dE1;
+}
+if( (x+j) == TARGET ){
+    pdE2[i].x = dE;
+    pdE2[i].y = psA[j].bornRadiusScaleFactor;
+    pdE2[i].z = r;
+    pdE2[i].w = psA[j].sr-ar.x;
+}*/
                float d             = dx * dE;
                af.x               -= d;
                psA[j].fx          += d;
+
                d                   = dy * dE;
                af.y               -= d;
                psA[j].fy          += d;
+
                d                   = dz * dE;
                af.z               -= d;
                psA[j].fz          += d;
            }

            // Write results
+
            float4 of;
 #ifdef USE_OUTPUT_BUFFER_PER_WARP
            unsigned int offset         = x + tgx + warp*cSim.stride;
@@ -181,18 +211,15 @@ __global__ void METHOD_NAME(kCalculateGBVISoftcore, Forces2_kernel)(unsigned int

                    float dE                = psA[tj].bornRadiusScaleFactor*getGBVI_dE2( r, ar.x, psA[tj].sr, fb );

-#if defined USE_PERIODIC
+#if defined USE_CUTOFF
                    if (i >= cSim.atoms || y+tj >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
-                    {
-                        dE                  = 0.0f;
-                    }
+#else
+                    if (i >= cSim.atoms || y+tj >= cSim.atoms )
 #endif
-#if defined USE_CUTOFF
-                    if (r2 > cSim.nonbondedCutoffSqr)
                    {
                        dE                  = 0.0f;
                    }
-#endif
+

                    float d                 = dx * dE;
                    af.x                   -= d;
@@ -207,18 +234,15 @@ __global__ void METHOD_NAME(kCalculateGBVISoftcore, Forces2_kernel)(unsigned int
                    // Atom J Born sum term
                    dE                      = ar.w*getGBVI_dE2( r, psA[tj].r, ar.y, psA[tj].fb );

-#ifdef USE_PERIODIC
+#if defined USE_CUTOFF
                    if (i >= cSim.atoms || y+tj >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
-                    {
-                        dE                  = 0.0f;
-                    }
+#else
+                    if (i >= cSim.atoms || y+tj >= cSim.atoms )
 #endif
-#if defined USE_CUTOFF
-                    if (r2 > cSim.nonbondedCutoffSqr)
                    {
                        dE                  = 0.0f;
                    }
-#endif
+
                    dx                     *= dE;
                    dy                     *= dE;
                    dz                     *= dE;
@@ -254,18 +278,14 @@ __global__ void METHOD_NAME(kCalculateGBVISoftcore, Forces2_kernel)(unsigned int
                        // Interleaved Atom I and J Born Forces and sum components
                        float dE                = psA[j].bornRadiusScaleFactor*getGBVI_dE2( r, ar.x, psA[j].sr, fb );

-#if defined USE_PERIODIC
+#if defined USE_CUTOFF
                        if (i >= cSim.atoms || y+j >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
-                        {
-                            dE                  = 0.0f;
-                        }
+#else
+                        if (i >= cSim.atoms || y+j >= cSim.atoms )
 #endif
-#if defined USE_CUTOFF
-                        if (r2 > cSim.nonbondedCutoffSqr)
                        {
                            dE                  = 0.0f;
                        }
-#endif

                        float d                 = dx * dE;
                        af.x                   -= d;
@@ -280,18 +300,15 @@ __global__ void METHOD_NAME(kCalculateGBVISoftcore, Forces2_kernel)(unsigned int
                        // Atom J Born sum term
                        dE                      = ar.w*getGBVI_dE2( r, psA[j].r, ar.y, psA[j].fb );

-#ifdef USE_PERIODIC
+#if defined USE_CUTOFF
                        if (i >= cSim.atoms || y+j >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
-                        {
-                            dE                  = 0.0f;
-                        }
+#else
+                        if (i >= cSim.atoms || y+j >= cSim.atoms )
 #endif
-#if defined USE_CUTOFF
-                        if (r2 > cSim.nonbondedCutoffSqr)
                        {
                            dE                  = 0.0f;
                        }
-#endif
+
                        dx                     *= dE;
                        dy                     *= dE;
                        dz                     *= dE;

--- a/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateLocalSoftcoreForces.cu
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateLocalSoftcoreForces.cu
--- a/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateNonbondedSoftcore.cu
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateNonbondedSoftcore.cu
--- a/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateNonbondedSoftcore.h
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateNonbondedSoftcore.h
--- a/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateObcGbsaSoftcoreBornSum.cu
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateObcGbsaSoftcoreBornSum.cu
--- a/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateObcGbsaSoftcoreBornSum.h
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateObcGbsaSoftcoreBornSum.h
--- a/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateObcGbsaSoftcoreForces2.cu
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateObcGbsaSoftcoreForces2.cu
@@ -34,7 +34,7 @@
 using namespace std;

 #include "gputypes.h"
-#include "GpuObcGbsaSoftcore.h"
+#include "freeEnergyGpuTypes.h"

 struct Atom {
    float x;
@@ -49,38 +49,19 @@ struct Atom {
    float fb;
 };

-struct cudaFreeEnergySimulationObcGbsaSoftcore {
-    float* pNonPolarScalingFactors;
-};
-struct cudaFreeEnergySimulationObcGbsaSoftcore gbsaSimObc2;
-
 static __constant__ cudaGmxSimulation cSim;
-static __constant__ cudaFreeEnergySimulationObcGbsaSoftcore gbsaSimDev;
+static __constant__ cudaFreeEnergyGmxSimulation feSimDev;

 extern "C"
-void SetCalculateObcGbsaSoftcoreForces2Sim(gpuContext gpu)
+void SetCalculateObcGbsaSoftcoreForces2Sim( freeEnergyGpuContext freeEnergyGpu )
 {
    cudaError_t status;
-    status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));     
-    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
-}
+    status = cudaMemcpyToSymbol(cSim, &freeEnergyGpu->gpuContext->sim, sizeof(cudaGmxSimulation));
+    RTERROR(status, "cudaMemcpyToSymbol: SetCalculateObcGbsaSoftcoreForces2Sim copy to cSim failed");

-extern "C" 
-void SetCalculateObcGbsaSoftcoreNonPolarScalingFactorsObc2Sim( float* nonPolarScalingFactors )
-{
-    cudaError_t status;
-    gbsaSimObc2.pNonPolarScalingFactors = nonPolarScalingFactors;
-    status                              = cudaMemcpyToSymbol(gbsaSimDev, &gbsaSimObc2, sizeof(cudaFreeEnergySimulationObcGbsaSoftcore));
-    RTERROR(status, "cudaMemcpyToSymbol: SetCalculateObcGbsaSoftcoreNonPolarScalingFactorsObc2Sim");
+    status = cudaMemcpyToSymbol( feSimDev, &freeEnergyGpu->freeEnergySim, sizeof(cudaFreeEnergyGmxSimulation));
+    RTERROR(status, "cudaMemcpyToSymbol: SetCalculateObcGbsaSoftcoreForces2Sim copy to feSimDev failed");

-    //(void) fprintf( stderr, "In SetCalculateObcGbsaSoftcoreNonPolarScalingFactorsObc2Sim\n" );
-}
-
-void GetCalculateObcGbsaSoftcoreForces2Sim(gpuContext gpu)
-{
-    cudaError_t status;
-    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));     
-    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
 }

 // Include versions of the kernels for N^2 calculations.
@@ -116,15 +97,14 @@ void GetCalculateObcGbsaSoftcoreForces2Sim(gpuContext gpu)
 #define METHOD_NAME(a, b) a##PeriodicByWarp##b
 #include "kCalculateObcGbsaSoftcoreForces2.h"

-void kCalculateObcGbsaSoftcoreForces2(gpuContext gpu)
+void kCalculateObcGbsaSoftcoreForces2( freeEnergyGpuContext freeEnergyGpu )
 {
    //printf("kCalculateObcGbsaSoftcoreForces2\n");
-    //fprintf( stderr, "kCalculateObcGbsaSoftcoreForces2 nonbondedMethod=%d warp=%d\n", gpu->sim.nonbondedMethod, gpu->bOutputBufferPerWarp);
-//fprintf( stderr, "kCalculateObcGbsaSoftcoreForces2 nonbondedMethod=%d calling kReduceForces\n", gpu->sim.nonbondedMethod);
-//kReduceForces(gpu);
-    switch (gpu->sim.nonbondedMethod)
+    gpuContext gpu                     = freeEnergyGpu->gpuContext;
+    switch (freeEnergyGpu->freeEnergySim.nonbondedMethod)
    {
-        case NO_CUTOFF:
+        case FREE_ENERGY_NO_CUTOFF:
+
            if (gpu->bOutputBufferPerWarp)
                kCalculateObcGbsaSoftcoreN2ByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
                        sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pWorkUnit);
@@ -132,7 +112,9 @@ void kCalculateObcGbsaSoftcoreForces2(gpuContext gpu)
                kCalculateObcGbsaSoftcoreN2Forces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
                        sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pWorkUnit);
            break;
-        case CUTOFF:
+
+        case FREE_ENERGY_CUTOFF:
+
            if (gpu->bOutputBufferPerWarp)
                kCalculateObcGbsaSoftcoreCutoffByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
                        (sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
@@ -140,7 +122,9 @@ void kCalculateObcGbsaSoftcoreForces2(gpuContext gpu)
                kCalculateObcGbsaSoftcoreCutoffForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
                        (sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
            break;
-        case PERIODIC:
+
+        case FREE_ENERGY_PERIODIC:
+
            if (gpu->bOutputBufferPerWarp)
                kCalculateObcGbsaSoftcorePeriodicByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
                        (sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);

--- a/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateObcGbsaSoftcoreForces2.h
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateObcGbsaSoftcoreForces2.h
--- a/plugins/freeEnergy/platforms/cuda/src/kernels/kSoftcoreLJ.h
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/kSoftcoreLJ.h
--- a/plugins/freeEnergy/platforms/cuda/tests/CMakeLists.txt
+++ b/plugins/freeEnergy/platforms/cuda/tests/CMakeLists.txt
--- a/plugins/freeEnergy/platforms/cuda/tests/TestCudaGBVISoftcoreForce.cpp
+++ b/plugins/freeEnergy/platforms/cuda/tests/TestCudaGBVISoftcoreForce.cpp
--- a/plugins/freeEnergy/platforms/cuda/tests/TestCudaLJSoftcoreForce.cpp
+++ b/plugins/freeEnergy/platforms/cuda/tests/TestCudaLJSoftcoreForce.cpp
--- a/plugins/freeEnergy/platforms/cuda/tests/TestCudaOBCSoftcoreForce.cpp
+++ b/plugins/freeEnergy/platforms/cuda/tests/TestCudaOBCSoftcoreForce.cpp
--- a/plugins/freeEnergy/platforms/cuda/tests/TestCudaSoftcoreForce.h
+++ b/plugins/freeEnergy/platforms/cuda/tests/TestCudaSoftcoreForce.h
--- a/plugins/freeEnergy/platforms/cuda/tests/TstFreeEnergyCudaUsingParameterFile.cpp
+++ b/plugins/freeEnergy/platforms/cuda/tests/TstFreeEnergyCudaUsingParameterFile.cpp