Mods to remove requirement that nonbonded force buffer size should be doubled for GB/VI and OBC

7b89b1bf · Mark Friedrichs · 67b261f6 · 7b89b1bf · 7b89b1bf · 7b89b1bf
Commit 7b89b1bf authored Nov 11, 2009 by Mark Friedrichs
7 changed files
--- a/plugins/freeEnergy/platforms/cuda/src/CudaFreeEnergyKernels.cpp
+++ b/plugins/freeEnergy/platforms/cuda/src/CudaFreeEnergyKernels.cpp
@@ -386,6 +386,8 @@ void CudaFreeEnergyCalcNonbondedSoftcoreForceKernel::executeForces(ContextImpl&
 // ---------------------------------------------------------------------------------------

    _gpuContext* gpu = data.gpu;
+//    static int call  = 0;
+//    call++;

    // write array, ... address's to board

@@ -398,7 +400,6 @@ void CudaFreeEnergyCalcNonbondedSoftcoreForceKernel::executeForces(ContextImpl&
        }
        SetCalculateLocalSoftcoreGpuSim( gpu );
        SetCalculateCDLJSoftcoreGpuSim( gpu );
-// (void) fprintf( log, "Calling SetCalculateLocalSoftcoreGpuSim\n" ); fflush( stderr );

        // flip strides (unsure if this is needed)

@@ -413,20 +414,20 @@ void CudaFreeEnergyCalcNonbondedSoftcoreForceKernel::executeForces(ContextImpl&
            }
        }
 #endif
+
    }
 
    // calculate nonbonded ixns here, only if implicit solvent is inactive

    if ( !getIncludeGBSA() && !getIncludeGBVI() ) {
-//kClearForces(gpu);
        kCalculateCDLJSoftcoreForces(gpu);
    }
  
    // local LJ-14 forces

    kCalculateLocalSoftcoreForces(gpu);
+//kPrintForces(gpu, "Post kCalculateLocalSoftcoreForces", call );
 //kReduceForces(gpu);
-//exit(0);
 }

 double CudaFreeEnergyCalcNonbondedSoftcoreForceKernel::executeEnergy(ContextImpl& context) {
@@ -527,6 +528,7 @@ void CudaFreeEnergyCalcGBSAOBCSoftcoreForceKernel::executeForces(ContextImpl& co

    _gpuContext* gpu = data.gpu;
    int debug        = 1;
+    static int call  = 0;

    // send address's of arrays, ... to device on first call
    // required since force/energy buffers not set when CudaFreeEnergyCalcGBSAOBCSoftcoreForceKernel::initialize() was called
@@ -535,8 +537,6 @@ void CudaFreeEnergyCalcGBSAOBCSoftcoreForceKernel::executeForces(ContextImpl& co
       setSim++;
       SetCalculateObcGbsaSoftcoreBornSumSim( gpu );
       SetCalculateCDLJObcGbsaSoftcoreGpu1Sim( gpu );
-
-       //SetCalculateObcGbsaForces2Sim( gpu );
       SetCalculateObcGbsaSoftcoreForces2Sim( gpu );
    }

@@ -546,8 +546,11 @@ void CudaFreeEnergyCalcGBSAOBCSoftcoreForceKernel::executeForces(ContextImpl& co
    // calculate Born radii and first loop of Obc forces

    if( debug && log ){
-        (void) fprintf( stderr, "\n%s: calling kCalculateCDLJObcGbsaSoftcoreForces1\n", methodName.c_str() );
-        (void) fflush( stderr );
+        call++;
+        if( log ){
+            (void) fprintf( log, "\n%s: calling kCalculateCDLJObcGbsaSoftcoreForces1\n", methodName.c_str() );
+            (void) fflush( log );
+        }
    }

    kClearBornForces(gpu);
@@ -555,9 +558,10 @@ void CudaFreeEnergyCalcGBSAOBCSoftcoreForceKernel::executeForces(ContextImpl& co
    kReduceObcGbsaBornSum(gpu);
    kCalculateCDLJObcGbsaSoftcoreForces1(gpu);

+//kPrintForces(gpu, "Post kCalculateCDLJObcGbsaSoftcoreForces1", call );
    if( debug && log ){
-        (void) fprintf( stderr, "\n%s: calling kReduceObcGbsaBornForces\n", methodName.c_str()  );
-        (void) fflush( stderr );
+        (void) fprintf( log, "\n%s: calling kReduceObcGbsaBornForces\n", methodName.c_str()  );
+        (void) fflush( log );
    }

    // compute Born forces
@@ -565,13 +569,12 @@ void CudaFreeEnergyCalcGBSAOBCSoftcoreForceKernel::executeForces(ContextImpl& co
    kReduceObcGbsaSoftcoreBornForces(gpu);

    if( debug && log ){
-        (void) fprintf( stderr, "\n%s calling kCalculateObcGbsaForces2\n", methodName.c_str() );
-        (void) fflush( stderr );
+        (void) fprintf( log, "\n%s calling kCalculateObcGbsaForces2\n", methodName.c_str() );
+        (void) fflush( log );
    }

    // second loop of Obc GBSA forces

-    //kCalculateObcGbsaForces2(gpu);
    kCalculateObcGbsaSoftcoreForces2(gpu);
 }

@@ -668,6 +671,7 @@ void CudaFreeEnergyCalcGBVISoftcoreForceKernel::executeForces(ContextImpl& conte

    _gpuContext* gpu = data.gpu;
    int debug        = 1;
+    static int call  = 0;

    // send address's of arrays, ... to device on first call
    // required since force/energy buffers not set when CudaFreeEnergyCalcGBVISoftcoreForceKernel::initialize() was called
@@ -676,36 +680,40 @@ void CudaFreeEnergyCalcGBVISoftcoreForceKernel::executeForces(ContextImpl& conte
       setSim++;
       SetCalculateGBVISoftcoreBornSumGpuSim( gpu );
       SetCalculateCDLJObcGbsaSoftcoreGpu1Sim( gpu );
-       SetCalculateGBVIForces2Sim( gpu );
+       SetCalculateGBVISoftcoreForces2Sim( gpu );
    }

-    // required!!
+    // required?
+
    gpu->bRecalculateBornRadii = true; // fixed

-//kClearForces(gpu);
    // calculate Born radii and first loop of GB/VI forces

    if( debug && log ){
-        (void) fprintf( stderr, "\n%s: calling kCalculateCDLJObcGbsaSoftcoreForces1 & %s\n", methodName.c_str(),
-                        getQuinticScaling() ? "kReduceGBVIBornSumQuinticScaling" : "kReduceGBVIBornSum" );
-        (void) fflush( stderr );
+        call++;
+        if( log ){
+            (void) fprintf( log, "\n%s: calling kCalculateCDLJObcGbsaSoftcoreForces1 & %s\n", methodName.c_str(),
+                            getQuinticScaling() ? "kReduceGBVIBornSumQuinticScaling" : "kReduceGBVIBornSum" );
+            (void) fflush( log );
+        }
    }

    kClearBornForces(gpu);
    kCalculateGBVISoftcoreBornSum(gpu);

    if( getQuinticScaling() ){
-//(void) fprintf( stderr, "\n%s: calling kReduceGBVIBornSumQuinticScaling\n", methodName.c_str() ); fflush( stderr );
        kReduceGBVIBornSumQuinticScaling(gpu, gpuGBVISoftcore );
    } else {
        kReduceGBVIBornSum(gpu);
    }
    kCalculateCDLJObcGbsaSoftcoreForces1(gpu);

+//kPrintForces(gpu, "Post kCalculateCDLJObcGbsaSoftcoreForces1", call );
+
    if( debug && log ){
-        (void) fprintf( stderr, "\n%s: calling %s\n", methodName.c_str(),
+        (void) fprintf( log, "\n%s: calling %s\n", methodName.c_str(),
                        getQuinticScaling() ? "kReduceGBVIBornForcesQuinticScaling" : "kReduceObcGbsaBornForces" );
-        (void) fflush( stderr );
+        (void) fflush( log );
    }

    // compute Born forces
@@ -719,14 +727,13 @@ void CudaFreeEnergyCalcGBVISoftcoreForceKernel::executeForces(ContextImpl& conte
    }

    if( debug && log ){
-        (void) fprintf( stderr, "\n%s: calling kCalculateGBVIForces2\n", methodName.c_str() );
-        (void) fflush( stderr );
+        (void) fprintf( log, "\n%s: calling kCalculateGBVIForces2\n", methodName.c_str() );
+        (void) fflush( log );
    }

    // second loop of GB/VI forces

-    kCalculateGBVIForces2(gpu);
-//kReduceForces(gpu);
+    kCalculateGBVISoftcoreForces2(gpu);
 }

 double CudaFreeEnergyCalcGBVISoftcoreForceKernel::executeEnergy(ContextImpl& context) {

--- a/plugins/freeEnergy/platforms/cuda/src/kernels/GpuFreeEnergyCudaKernels.h
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/GpuFreeEnergyCudaKernels.h
@@ -103,11 +103,18 @@ void SetCalculateGBVISoftcoreBornSumGpuSim( gpuContext gpu);
 extern "C"
 void SetCalculateGBVISoftcoreSupplementarySim( GpuGBVISoftcore* gpuGBVISoftcore );

+extern "C"
+void SetCalculateGBVISoftcoreForces2Sim(gpuContext gpu);
+
+extern "C"
+void GetCalculateGBVISoftcoreForces2Sim(gpuContext gpu);
+
 // kernel calls to device

 extern void kReduceGBVIBornSumQuinticScaling( gpuContext gpu, GpuGBVISoftcore* gpuGBVISoftcore );
 extern void kCalculateGBVISoftcoreBornSum( gpuContext gpu );
 extern void kReduceGBVIBornForcesQuinticScaling( gpuContext gpu );
+extern void kCalculateGBVISoftcoreForces2( gpuContext gpu );

 // Obc softcore

@@ -156,6 +163,7 @@ extern void kCalculateObcGbsaSoftcoreBornSum( gpuContext gpu );

 // this method is not needed; the OpenMM version can be used
 extern void kCalculateObcGbsaSoftcoreForces2( gpuContext gpu );
+extern void kPrintForces( gpuContext gpu, std::string idString, int call );


 // shared

--- a/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateGBVISoftcoreAux.h
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateGBVISoftcoreAux.h
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Mark Friedrichs                                                   *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#ifndef __GpuGBVIAUX_H__
+#define __GpuGBVIAUX_H__
+
+/**
+ * This file contains subroutines used in evaluating quantities associated w/ the GB/VI function
+ */
+
+__device__ float getGBVI_L( float r, float x, float S )
+{
+
+   float rInv   = 1.0f/r;
+
+   float xInv   = 1.0f/x;
+   float xInv2  = xInv*xInv;
+   float diff2  = (r + S)*(r - S);
+
+   return (1.5f*xInv2)*( (0.25f*rInv) - (xInv/3.0f) + (0.125f*diff2*xInv2*rInv) );
+}
+
+__device__ float getGBVI_Volume( float r_ij, float R, float S )
+{
+
+     float upperBound        = r_ij + S; 
+     float rdiffS            = r_ij - S; 
+     float lowerBound        = R > rdiffS ? R : rdiffS;
+     float L_upper           = getGBVI_L( r_ij, upperBound, S );
+     float L_lower           = getGBVI_L( r_ij, lowerBound, S );
+     float mask              = r_ij < (R - S) ? 0.0f : 1.0f;  
+     float addOn             = r_ij < (S - R) ? (1.0f/(R*R*R)) : 0.0f;  
+
+     return (mask*( L_upper - L_lower ) + addOn);
+
+}
+
+__device__ float getGBVI_dL_dr( float r, float x, float S )
+{
+
+   float rInv   = 1.0f/r;
+   float rInv2  = rInv*rInv;
+
+   float xInv   = 1.0f/x;
+   float xInv2  = xInv*xInv;
+   float xInv3  = xInv2*xInv;
+   
+   float diff2  = (r + S)*(r - S);
+
+   return ( (-1.5f*xInv2*rInv2)*( 0.25f + 0.125f*diff2*xInv2 ) + 0.375f*xInv3*xInv );
+   //return 0.0f;
+
+}
+
+__device__ float getGBVI_dL_dx( float r, float x, float S )
+{
+
+   float rInv   = 1.0f/r;
+
+   float xInv   = 1.0f/x;
+   float xInv2  = xInv*xInv;
+   float xInv3  = xInv2*xInv;
+
+   float diff   = (r + S)*(r - S);
+
+   return ( (-1.5f*xInv3)*( (0.5f*rInv) - xInv + (0.5f*diff*xInv2*rInv) ));
+
+
+}
+
+__device__ float getGBVI_dE2( float r, float R, float S, float bornForce )
+{
+
+    float diff              = S - R;
+    float absDiff           = fabsf( S - R );
+    float dE                = getGBVI_dL_dr( r, r+S, S ) + getGBVI_dL_dx( r, r+S, S );
+    float mask;
+    float lowerBound;
+    if( (R > (r - S)) && (absDiff < r) ){
+        mask       = 0.0f;
+        lowerBound = R;
+    } else {
+        mask       = 1.0f;
+        lowerBound = (r - S);  
+    }   
+    dE                    -= getGBVI_dL_dr( r, lowerBound, S ) + mask*getGBVI_dL_dx( r, lowerBound, S );
+    dE                     = (absDiff >= r) && r >= diff ? 0.0f : dE; 
+    dE                    *= ( (r > 1.0e-08f) ? (bornForce/r) : 0.0f);
+
+    return (-dE);
+
+}
+
+__device__ float getGBVIBornForce2( float bornRadius, float R, float bornForce, float gamma )
+{ 
+    float ratio                     = (R/bornRadius);
+    float returnBornForce           = bornForce + (3.0f*gamma*ratio*ratio*ratio)/bornRadius; // 'cavity' term
+    float br2                       = bornRadius*bornRadius;
+          returnBornForce          *= (1.0f/3.0f)*br2*br2;
+
+   return returnBornForce;
+
+}
+
+#endif // __GpuGBVIAUX_H__
--- a/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateGBVISoftcoreForces2.cu
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateGBVISoftcoreForces2.cu
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#include <cuda.h>
+#include <vector_functions.h>
+#include <cstdlib>
+#include <string>
+#include <iostream>
+#include <fstream>
+using namespace std;
+
+#include "gputypes.h"
+#include "cudaKernels.h"
+#include "GpuFreeEnergyCudaKernels.h"
+
+struct Atom {
+    float x;
+    float y;
+    float z;
+    float r;
+    float sr;
+    float fx;
+    float fy;
+    float fz;
+    float fb;
+};
+
+
+static __constant__ cudaGmxSimulation cSim;
+
+void SetCalculateGBVISoftcoreForces2Sim(gpuContext gpu)
+{
+    cudaError_t status;
+    status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));     
+    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
+}
+
+void GetCalculateGBVISoftcoreForces2Sim(gpuContext gpu)
+{
+    cudaError_t status;
+    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));     
+    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
+}
+
+#include "kCalculateGBVISoftcoreAux.h"
+
+/**
+ * This file contains the kernel for evalauating the second stage of GBSA.  It is included
+ * several times in kCalculateGBVIForces2.cu with different #defines to generate
+ * different versions of the kernels.
+ */
+
+__global__ void kCalculateGBVISoftcoreForces2a_kernel()
+{
+    unsigned int pos  = (blockIdx.x * blockDim.x + threadIdx.x);
+    if( pos >=  cSim.atoms )return;
+
+    float4 apos                     = cSim.pPosq[pos];
+    float4 ar                       = cSim.pGBVIData[pos];
+    float fb                        = cSim.pBornForce[pos];
+    unsigned int posJ               = 0;
+    float4 force;
+    force.x = force.y = force.z = force.w = 0.0f;
+    while ( posJ < cSim.atoms )
+    {
+
+        float4 aposJ                = cSim.pPosq[posJ];
+        float4 arJ                  = cSim.pGBVIData[posJ];
+        float fbJ                   = cSim.pBornForce[posJ];
+
+        float dx                    = aposJ.x - apos.x;
+        float dy                    = aposJ.y - apos.y;
+        float dz                    = aposJ.z - apos.z;
+
+        float r2                    = dx * dx + dy * dy + dz * dz;
+        float r                     = sqrt(r2);
+
+        float dE                    = getGBVI_dE2( r, ar.x, arJ.y, fb );
+        dE                          = r > 1.0e-08f ? dE : 0.0f;
+
+//dx = dy = dz = 1.0f;
+        float d                     = dx*dE;
+        force.x                    -= d;
+        d                           = dy*dE;
+        force.y                    -= d;
+        d                           = dz*dE;
+        force.z                    -= d;
+#if 1
+        dE                          = getGBVI_dE2( r, arJ.x, ar.y, fbJ );
+        dE                          = r > 1.0e-08f ? dE : 0.0f;
+        d                           = dx*dE;
+        force.x                    -= d;
+        d                           = dy*dE;
+        force.y                    -= d;
+        d                           = dz*dE;
+        force.z                    -= d;
+#endif
+
+        posJ                       += 1;
+    }
+
+    // Write results
+    cSim.pForce4a[pos]             = force;
+
+}
+
+// Include versions of the kernels for N^2 calculations.
+
+#define METHOD_NAME(a, b) a##N2##b
+#include "kCalculateGBVISoftcoreForces2.h"
+#define USE_OUTPUT_BUFFER_PER_WARP
+#undef METHOD_NAME
+#define METHOD_NAME(a, b) a##N2ByWarp##b
+#include "kCalculateGBVISoftcoreForces2.h"
+
+// Include versions of the kernels with cutoffs.
+
+#undef METHOD_NAME
+#undef USE_OUTPUT_BUFFER_PER_WARP
+#define USE_CUTOFF
+#define METHOD_NAME(a, b) a##Cutoff##b
+#include "kCalculateGBVISoftcoreForces2.h"
+#define USE_OUTPUT_BUFFER_PER_WARP
+#undef METHOD_NAME
+#define METHOD_NAME(a, b) a##CutoffByWarp##b
+#include "kCalculateGBVISoftcoreForces2.h"
+
+// Include versions of the kernels with periodic boundary conditions.
+
+#undef METHOD_NAME
+#undef USE_OUTPUT_BUFFER_PER_WARP
+#define USE_PERIODIC
+#define METHOD_NAME(a, b) a##Periodic##b
+#include "kCalculateGBVISoftcoreForces2.h"
+#define USE_OUTPUT_BUFFER_PER_WARP
+#undef METHOD_NAME
+#define METHOD_NAME(a, b) a##PeriodicByWarp##b
+#include "kCalculateGBVISoftcoreForces2.h"
+
+void kCalculateGBVISoftcoreForces2(gpuContext gpu)
+{
+    //printf("kCalculateGBVISoftcoreForces2\n");
+    size_t numWithInteractions;
+
+#if 0
+    kClearForces(gpu);
+    (void) fprintf( stderr, "\nkCalculateGBVISoftcoreForces2: cleared force prior loop2\n" ); (void) fflush( stderr );
+    kCalculateGBVISoftcoreForces2a_kernel<<<gpu->sim.blocks, 384>>>();
+    (void) fprintf( stderr, "\ncalled kCalculateGBVISoftcoreForces2a\n" ); (void) fflush( stderr );
+    return;
+#endif
+
+    switch (gpu->sim.nonbondedMethod)
+    {
+        case NO_CUTOFF:
+            if (gpu->bOutputBufferPerWarp)
+                kCalculateGBVISoftcoreN2ByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
+                        sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits);
+            else
+                kCalculateGBVISoftcoreN2Forces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
+                        sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits);
+//(void) fprintf( stderr, "\nkCalculateGBVIForces2: Born radii/force forces warp=%u\n", gpu->bOutputBufferPerWarp ); (void) fflush( stderr );
+#define GBVI_DEBUG 0
+#if ( GBVI_DEBUG == 1 )
+                (void) fprintf( stderr, "\nkCalculateGBVISoftcoreForces2: Born radii/force forces:\n" ); (void) fflush( stderr );
+                gpu->psBornForce->Download();
+                gpu->psForce4->Download();
+                for( int ii = 0; ii < gpu->natoms; ii++ ){
+                    (void) fprintf( stderr, "%d bF=%14.6e Fa[%14.6e %14.6e %14.6e] Fb[%14.6e %14.6e %14.6e]\n",
+                                    ii,
+                                    gpu->psBornForce->_pSysStream[0][ii],
+                                    gpu->psForce4->_pSysStream[0][ii].x,
+                                    gpu->psForce4->_pSysStream[0][ii].y,
+                                    gpu->psForce4->_pSysStream[0][ii].z,
+                                    gpu->psForce4->_pSysStream[1][ii].x,
+                                    gpu->psForce4->_pSysStream[1][ii].y,
+                                    gpu->psForce4->_pSysStream[1][ii].z
+                                  );  
+                }   
+                for( int ii = 0; ii < gpu->sim.paddedNumberOfAtoms*2; ii++ ){
+                    (void) fprintf( stderr, "%d bF=%14.6e Fa[%14.6e %14.6e %14.6e %14.6e]\n",
+                                    ii,
+                                    gpu->psBornForce->_pSysStream[0][ii],
+                                    gpu->psForce4->_pSysStream[0][ii].x,
+                                    gpu->psForce4->_pSysStream[0][ii].y,
+                                    gpu->psForce4->_pSysStream[0][ii].z,
+                                    gpu->psForce4->_pSysStream[0][ii].w
+                                  );  
+                }   
+#endif
+#undef GBVI_DEBUG
+
+            break;
+        case CUTOFF:
+            numWithInteractions = gpu->psInteractionCount->_pSysData[0];
+            if (gpu->bOutputBufferPerWarp)
+                kCalculateGBVISoftcoreCutoffByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
+                        (sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
+            else
+                kCalculateGBVISoftcoreCutoffForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
+                        (sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
+            break;
+        case PERIODIC:
+            numWithInteractions = gpu->psInteractionCount->_pSysData[0];
+            if (gpu->bOutputBufferPerWarp)
+                kCalculateGBVISoftcorePeriodicByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
+                        (sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
+            else
+                kCalculateGBVISoftcorePeriodicForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
+                        (sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
+            break;
+    }
+    LAUNCHERROR("kCalculateGBVISoftcoreForces2");
+}
--- a/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateGBVISoftcoreForces2.h
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateGBVISoftcoreForces2.h
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "kCalculateGBVIAux.h"
+
+/**
+ * This file contains the kernel for evalauating the second stage of GBSA.  It is included
+ * several times in kCalculateGBVIForces2.cu with different #defines to generate
+ * different versions of the kernels.
+ */
+
+__global__ void METHOD_NAME(kCalculateGBVISoftcore, Forces2_kernel)(unsigned int* workUnit, unsigned int numWorkUnits)
+{
+    extern __shared__ Atom sA[];
+    unsigned int totalWarps  = cSim.bornForce2_blocks*cSim.bornForce2_threads_per_block/GRID;
+    unsigned int warp        = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
+    unsigned int pos         = warp*numWorkUnits/totalWarps;
+    unsigned int end         = (warp+1)*numWorkUnits/totalWarps;
+#ifdef USE_CUTOFF
+    float3* tempBuffer       = (float3*) &sA[cSim.bornForce2_threads_per_block];
+#endif
+
+    unsigned int lasty = -0xFFFFFFFF;
+    while (pos < end)
+    {
+
+        // Extract cell coordinates from appropriate work unit
+        unsigned int x                  = workUnit[pos];
+        unsigned int y                  = ((x >> 2) & 0x7fff) << GRIDBITS;
+        x                               = (x >> 17) << GRIDBITS;
+        unsigned int tgx                = threadIdx.x & (GRID - 1);
+        unsigned int i                  = x + tgx;
+        float4 apos                     = cSim.pPosq[i];
+        float4 ar                       = cSim.pGBVIData[i];
+        float fb                        = cSim.pBornForce[i];
+        unsigned int tbx                = threadIdx.x - tgx;
+        unsigned int tj                 = tgx;
+        Atom* psA                       = &sA[tbx];
+        float3 af;
+        sA[threadIdx.x].fx = af.x   = 0.0f;
+        sA[threadIdx.x].fy = af.y   = 0.0f;
+        sA[threadIdx.x].fz = af.z   = 0.0f;
+        if (x == y) // Handle diagonals uniquely at 50% efficiency
+        {
+            // Read fixed atom data into registers and GRF
+
+            sA[threadIdx.x].x           = apos.x;
+            sA[threadIdx.x].y           = apos.y;
+            sA[threadIdx.x].z           = apos.z;
+            sA[threadIdx.x].r           = ar.x;
+            sA[threadIdx.x].sr          = ar.y;
+            sA[threadIdx.x].fb          = fb;
+
+            for (unsigned int j = (tgx+1)&(GRID-1); j != tgx; j = (j+1)&(GRID-1))
+            {
+                float dx                = psA[j].x - apos.x;
+                float dy                = psA[j].y - apos.y;
+                float dz                = psA[j].z - apos.z;
+#ifdef USE_PERIODIC
+                dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
+                dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
+                dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
+#endif
+                float r2                = dx * dx + dy * dy + dz * dz;
+                float r                 = sqrt(r2);
+
+                // Atom I Born forces and sum
+                float dE                = getGBVI_dE2( r, ar.x, psA[j].sr, fb );
+               
+#if defined USE_PERIODIC
+                if (i >= cSim.atoms || x+j >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
+                {
+                    dE              = 0.0f;
+                }
+#endif
+#if defined USE_CUTOFF
+                if (r2 > cSim.nonbondedCutoffSqr)
+                {
+                    dE              = 0.0f;
+                }
+#endif
+                float d             = dx * dE;
+                af.x               -= d;
+                psA[j].fx          += d;
+                d                   = dy * dE;
+                af.y               -= d;
+                psA[j].fy          += d;
+                d                   = dz * dE;
+                af.z               -= d;
+                psA[j].fz          += d;
+            }
+
+            // Write results
+            float4 of;
+#ifdef USE_OUTPUT_BUFFER_PER_WARP
+            unsigned int offset         = x + tgx + warp*cSim.stride;
+            of                          = cSim.pForce4[offset];
+            of.x                       += af.x + sA[threadIdx.x].fx;
+            of.y                       += af.y + sA[threadIdx.x].fy;
+            of.z                       += af.z + sA[threadIdx.x].fz;
+            cSim.pForce4[offset]        = of;
+#else
+            unsigned int offset         = x + tgx + (x >> GRIDBITS) * cSim.stride;
+            of                          = cSim.pForce4[offset];
+            of.x                       += af.x + sA[threadIdx.x].fx;
+            of.y                       += af.y + sA[threadIdx.x].fy;
+            of.z                       += af.z + sA[threadIdx.x].fz;
+            of.w                        = 0.0f;
+            cSim.pForce4[offset]        = of;
+#endif
+        }
+        else
+        {
+            // Read fixed atom data into registers and GRF
+            if (lasty != y)
+            {
+                unsigned int j              = y + tgx;
+                float4 temp                 = cSim.pPosq[j];
+                float4 temp1                = cSim.pGBVIData[j];
+                float fb                    = cSim.pBornForce[j];
+                sA[threadIdx.x].fb          = fb;
+                sA[threadIdx.x].x           = temp.x;
+                sA[threadIdx.x].y           = temp.y;
+                sA[threadIdx.x].z           = temp.z;
+                sA[threadIdx.x].r           = temp1.x;
+                sA[threadIdx.x].sr          = temp1.y;
+            }
+#ifdef USE_CUTOFF
+            unsigned int flags = cSim.pInteractionFlag[pos];
+            if (flags == 0)
+            {
+                // No interactions in this block.
+            }
+            else if (flags == 0xFFFFFFFF)
+#endif
+            {
+                // Compute all interactions within this block.
+
+                for (unsigned int j = 0; j < GRID; j++)
+                {
+                    float dx                = psA[tj].x - apos.x;
+                    float dy                = psA[tj].y - apos.y;
+                    float dz                = psA[tj].z - apos.z;
+#ifdef USE_PERIODIC
+                    dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
+                    dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
+                    dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
+#endif
+                    float r2                = dx * dx + dy * dy + dz * dz;
+                    float r                 = sqrt(r2);
+
+                    float dE                = getGBVI_dE2( r, ar.x, psA[tj].sr, fb );
+
+#if defined USE_PERIODIC
+                    if (i >= cSim.atoms || y+tj >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
+                    {
+                        dE                  = 0.0f;
+                    }
+#endif
+#if defined USE_CUTOFF
+                    if (r2 > cSim.nonbondedCutoffSqr)
+                    {
+                        dE                  = 0.0f;
+                    }
+#endif
+
+                    float d                 = dx * dE;
+                    af.x                   -= d;
+                    psA[tj].fx             += d;
+                    d                       = dy * dE;
+                    af.y                   -= d;
+                    psA[tj].fy             += d;
+                    d                       = dz * dE;
+                    af.z                   -= d;
+                    psA[tj].fz             += d;
+
+                    // Atom J Born sum term
+                    dE                      = getGBVI_dE2( r, psA[tj].r, ar.y, psA[tj].fb );
+
+#ifdef USE_PERIODIC
+                    if (i >= cSim.atoms || y+tj >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
+                    {
+                        dE                  = 0.0f;
+                    }
+#endif
+#if defined USE_CUTOFF
+                    if (r2 > cSim.nonbondedCutoffSqr)
+                    {
+                        dE                  = 0.0f;
+                    }
+#endif
+                    dx                     *= dE;
+                    dy                     *= dE;
+                    dz                     *= dE;
+                    psA[tj].fx             += dx;
+                    psA[tj].fy             += dy;
+                    psA[tj].fz             += dz;
+                    af.x                   -= dx;
+                    af.y                   -= dy;
+                    af.z                   -= dz;
+                    tj                      = (tj + 1) & (GRID - 1);
+                }
+            }
+#ifdef USE_CUTOFF
+            else
+            {
+                // Compute only a subset of the interactions in this block.
+
+                for (unsigned int j = 0; j < GRID; j++)
+                {
+                    if ((flags&(1<<j)) != 0)
+                    {
+                        float dx                = psA[j].x - apos.x;
+                        float dy                = psA[j].y - apos.y;
+                        float dz                = psA[j].z - apos.z;
+#ifdef USE_PERIODIC
+                        dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
+                        dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
+                        dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
+#endif
+                        float r2                = dx * dx + dy * dy + dz * dz;
+                        float r                 = sqrt(r2);
+
+                        // Interleaved Atom I and J Born Forces and sum components
+                        float dE                = getGBVI_dE2( r, ar.x, psA[j].sr, fb );
+
+#if defined USE_PERIODIC
+                        if (i >= cSim.atoms || y+j >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
+                        {
+                            dE                  = 0.0f;
+                        }
+#endif
+#if defined USE_CUTOFF
+                        if (r2 > cSim.nonbondedCutoffSqr)
+                        {
+                            dE                  = 0.0f;
+                        }
+#endif
+
+                        float d                 = dx * dE;
+                        af.x                   -= d;
+                        tempBuffer[threadIdx.x].x = d;
+                        d                       = dy * dE;
+                        af.y                   -= d;
+                        tempBuffer[threadIdx.x].y = d;
+                        d                       = dz * dE;
+                        af.z                   -= d;
+                        tempBuffer[threadIdx.x].z = d;
+
+                        // Atom J Born sum term
+                        dE                      = getGBVI_dE2( r, psA[j].r, ar.y, psA[j].fb );
+
+#ifdef USE_PERIODIC
+                        if (i >= cSim.atoms || y+j >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
+                        {
+                            dE                  = 0.0f;
+                        }
+#endif
+#if defined USE_CUTOFF
+                        if (r2 > cSim.nonbondedCutoffSqr)
+                        {
+                            dE                  = 0.0f;
+                        }
+#endif
+                        dx                     *= dE;
+                        dy                     *= dE;
+                        dz                     *= dE;
+                        tempBuffer[threadIdx.x].x += dx;
+                        tempBuffer[threadIdx.x].y += dy;
+                        tempBuffer[threadIdx.x].z += dz;
+                        af.x                   -= dx;
+                        af.y                   -= dy;
+                        af.z                   -= dz;
+
+                        // Sum the forces on atom j.
+
+                        if (tgx % 2 == 0)
+                        {
+                            tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+1].x;
+                            tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+1].y;
+                            tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+1].z;
+                        }
+                        if (tgx % 4 == 0)
+                        {
+                            tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+2].x;
+                            tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+2].y;
+                            tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+2].z;
+                        }
+                        if (tgx % 8 == 0)
+                        {
+                            tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+4].x;
+                            tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+4].y;
+                            tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+4].z;
+                        }
+                        if (tgx % 16 == 0)
+                        {
+                            tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+8].x;
+                            tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+8].y;
+                            tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+8].z;
+                        }
+                        if (tgx == 0)
+                        {
+                            psA[j].fx += tempBuffer[threadIdx.x].x + tempBuffer[threadIdx.x+16].x;
+                            psA[j].fy += tempBuffer[threadIdx.x].y + tempBuffer[threadIdx.x+16].y;
+                            psA[j].fz += tempBuffer[threadIdx.x].z + tempBuffer[threadIdx.x+16].z;
+                        }
+                    }
+                }
+            }
+#endif
+
+            // Write results
+            float4 of;
+#ifdef USE_OUTPUT_BUFFER_PER_WARP
+            unsigned int offset         = x + tgx + warp*cSim.stride;
+            of                          = cSim.pForce4[offset];
+            of.x                       += af.x;
+            of.y                       += af.y;
+            of.z                       += af.z;
+            cSim.pForce4[offset]        = of;
+            offset                      = y + tgx + warp*cSim.stride;
+            of                          = cSim.pForce4[offset];
+            of.x                       += sA[threadIdx.x].fx;
+            of.y                       += sA[threadIdx.x].fy;
+            of.z                       += sA[threadIdx.x].fz;
+            cSim.pForce4[offset]        = of;
+#else
+            unsigned int offset         = x + tgx + (y >> GRIDBITS) * cSim.stride;
+            of                          = cSim.pForce4[offset];
+            of.x                       += af.x;
+            of.y                       += af.y;
+            of.z                       += af.z;
+            of.w                        = 0.0f;
+            cSim.pForce4[offset]        = of;
+
+            offset                      = y + tgx + (x >> GRIDBITS) * cSim.stride;
+            of                          = cSim.pForce4[offset];
+            of.x                       += sA[threadIdx.x].fx;
+            of.y                       += sA[threadIdx.x].fy;
+            of.z                       += sA[threadIdx.x].fz;
+            cSim.pForce4[offset]        = of;
+#endif
+        }
+        lasty = y;
+        pos++;
+    }
+}
--- a/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateNonbondedSoftcore.cu
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateNonbondedSoftcore.cu
@@ -395,3 +395,85 @@ fprintf( stderr, "kCalculateCDLJSoftcoreForces: bOutputBufferPerWarp=%u blks=%u
 #endif
    }
 }
+
+void kPrintForces(gpuContext gpu, std::string idString, int call )
+{
+ //   printf("kReduceForces\n");
+#define GBVI_DEBUG 4
+#if ( GBVI_DEBUG == 4 )
+
+                gpu->psBornRadii->Download();
+                gpu->psObcData->Download();
+                gpu->psObcChain->Download();
+                gpu->psBornForce->Download();
+                gpu->psForce4->Download();
+                gpu->psPosq4->Download();
+                int maxPrint = 0; 
+int   nanHit       = 0;
+int   targetIndex  = 852;
+float maxForce     = 3.0e+04;
+float maxPosition  = 2.0e+02;
+                for( int ii = 0; ii < gpu->natoms; ii++ ){
+
+int   hit  = 0;
+float dist = sqrtf( gpu->psPosq4->_pSysStream[0][ii].x*gpu->psPosq4->_pSysStream[0][ii].x + 
+                    gpu->psPosq4->_pSysStream[0][ii].y*gpu->psPosq4->_pSysStream[0][ii].y +
+                    gpu->psPosq4->_pSysStream[0][ii].z*gpu->psPosq4->_pSysStream[0][ii].z );
+
+if( fabs( gpu->psForce4->_pSysStream[0][ii].x ) > maxForce ||
+    fabs( gpu->psForce4->_pSysStream[0][ii].y ) > maxForce ||
+    fabs( gpu->psForce4->_pSysStream[0][ii].z ) > maxForce ||
+//    gpu->psBornRadii->_pSysStream[0][ii] <= 0.0            ||
+    dist > maxPosition                                     ||
+    isnan( gpu->psForce4->_pSysStream[0][ii].x )           ||
+    isnan( gpu->psForce4->_pSysStream[0][ii].y )           ||
+    isnan( gpu->psForce4->_pSysStream[0][ii].z )  ){  
+   hit = 1;
+} else {
+   hit = 0;
+}
+if( ii == targetIndex || ii == (targetIndex+1) || ii == (targetIndex+2) )hit = 1;
+if( isnan( gpu->psBornForce->_pSysStream[0][ii] ) ||
+    isnan( gpu->psBornRadii->_pSysStream[0][ii] ) ||
+    isnan( gpu->psObcChain->_pSysStream[0][ii]  ) ||
+    isnan( gpu->psForce4->_pSysStream[0][ii].x  )  ||
+    isnan( gpu->psForce4->_pSysStream[0][ii].y  )  ||
+    isnan( gpu->psForce4->_pSysStream[0][ii].z  )  ){  
+   hit    = 1;
+   nanHit = 1;
+}
+
+                //if( hit || ii < maxPrint || ii >= (gpu->natoms - maxPrint) )
+                if( hit ){
+                    static int firstHit = 1;
+                    if( firstHit ){
+                       firstHit = 0;
+                       (void) fprintf( stderr, "\nkPrintForces: %d [r, scl q] b[r/c/f] f[] x[] Born radii/force (%p %p)\n", call,
+                                       gpu->psBornForce, gpu->psBornForce->_pDevStream[0] );
+                    }
+                    (void) fprintf( stderr, "%6d [%8.3f %8.3f %8.3f] b[%13.6e %13.6e %13.6e] f[%13.6e %13.6e %13.6e] x[%13.6e %13.6e %13.6e] %10.3e %s %s %d\n",
+                                    ii, 
+                                    (gpu->psObcData->_pSysStream[0][ii].x + 0.009f),
+                                    (gpu->psObcData->_pSysStream[0][ii].y/gpu->psObcData->_pSysStream[0][ii].x),
+                                    gpu->psPosq4->_pSysStream[0][ii].w,
+                                    gpu->psBornRadii->_pSysStream[0][ii],
+                                    gpu->psObcChain->_pSysStream[0][ii],
+                                    gpu->psBornForce->_pSysStream[0][ii],
+
+                                    gpu->psForce4->_pSysStream[0][ii].x,
+                                    gpu->psForce4->_pSysStream[0][ii].y,
+                                    gpu->psForce4->_pSysStream[0][ii].z,
+
+                                    gpu->psPosq4->_pSysStream[0][ii].x,
+                                    gpu->psPosq4->_pSysStream[0][ii].y,
+                                    gpu->psPosq4->_pSysStream[0][ii].z, dist,
+
+                                    (hit ? "XXXXXXX" : "" ), idString.c_str(), call );
+                }
+                }
+                (void) fflush( stderr );
+//                if( nanHit )exit(0);
+#endif
+
+}
+
--- a/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateObcGbsaSoftcoreForces2.h
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateObcGbsaSoftcoreForces2.h
@@ -135,12 +135,12 @@ __global__ void METHOD_NAME(kCalculateObcGbsaSoftcore, Forces2_kernel)(unsigned
 #else
            unsigned int offset         = x + tgx + (x >> GRIDBITS) * cSim.stride;
 #endif
-            of                          = cSim.pForce4b[offset];
+            of                          = cSim.pForce4[offset];
            of.x                       += af.x + sA[threadIdx.x].fx;
            of.y                       += af.y + sA[threadIdx.x].fy;
            of.z                       += af.z + sA[threadIdx.x].fz;
            of.w                        = 0.0f;
-            cSim.pForce4b[offset]       = of;
+            cSim.pForce4[offset]       = of;
        }
        else
        {
@@ -400,22 +400,22 @@ __global__ void METHOD_NAME(kCalculateObcGbsaSoftcore, Forces2_kernel)(unsigned
 #else
            unsigned int offset         = x + tgx + (y >> GRIDBITS) * cSim.stride;
 #endif
-            of                          = cSim.pForce4b[offset];
+            of                          = cSim.pForce4[offset];
            of.x                       += af.x;
            of.y                       += af.y;
            of.z                       += af.z;
-            cSim.pForce4b[offset]       = of;
+            cSim.pForce4[offset]        = of;

 #ifdef USE_OUTPUT_BUFFER_PER_WARP
            offset                      = y + tgx + warp*cSim.stride;
 #else
            offset                      = y + tgx + (x >> GRIDBITS) * cSim.stride;
 #endif
-            of                          = cSim.pForce4b[offset];
+            of                          = cSim.pForce4[offset];
            of.x                       += sA[threadIdx.x].fx;
            of.y                       += sA[threadIdx.x].fy;
            of.z                       += sA[threadIdx.x].fz;
-            cSim.pForce4b[offset]       = of;
+            cSim.pForce4[offset]        = of;
        }
        lasty = y;
        pos++;