Extended free energy plugin to allow cutoffs; cleaned up code and added tests

bc85b9f0 · Mark Friedrichs · d4441c15 · bc85b9f0 · d4441c15 · bc85b9f0
Commit bc85b9f0 authored Oct 24, 2011 by Mark Friedrichs
20 changed files
--- a/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateCDLJObcGbsaSoftcoreForces1.h
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateCDLJObcGbsaSoftcoreForces1.h
@@ -30,17 +30,26 @@
 * different versions of the kernels.
 */

+#define USE_SOFTCORE_LJ
+
 #ifdef USE_SOFTCORE_LJ
 #include "kSoftcoreLJ.h"
 #endif

-/* Cuda compiler on Windows does not recognized "static const float" values */
-#define LOCAL_HACK_PI 3.1415926535897932384626433832795
-
-#define COULOMB_ON
+#undef TARGET
+//#define TARGET 0

-__global__ void METHOD_NAME(kCalculateCDLJObcGbsaSoftcore, Forces1_kernel)(unsigned int* workUnit)
+__global__ 
+#if (__CUDA_ARCH__ >= 200)
+__launch_bounds__(GF1XX_NONBOND_THREADS_PER_BLOCK, 1)
+#elif (__CUDA_ARCH__ >= 120)
+__launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1)
+#else
+__launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
+#endif
+void METHOD_NAME(kCalculateCDLJObcGbsaSoftcore, Forces1_kernel)(unsigned int* workUnit )
 {
+//void METHOD_NAME(kCalculateCDLJObcGbsaSoftcore, Forces1_kernel)(unsigned int* workUnit, float4* pdE1, float4* pdE2 )
    extern __shared__ Atom sA[];
    unsigned int totalWarps        = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
    unsigned int warp              = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
@@ -53,10 +62,6 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsaSoftcore, Forces1_kernel)(unsig
    float* tempBuffer              = (float*) &sA[cSim.nonbond_threads_per_block];
 #endif

-#ifdef USE_EWALD
-    const float TWO_OVER_SQRT_PI = 2.0f/sqrt(LOCAL_HACK_PI);
-#endif
-
    unsigned int lasty             = -0xFFFFFFFF;
    while (pos < end)
    {
@@ -69,30 +74,36 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsaSoftcore, Forces1_kernel)(unsig
        unsigned int tgx                    = threadIdx.x & (GRID - 1);
        unsigned int i                      = x + tgx;
        float4 apos                         = cSim.pPosq[i];
-        float2 a                            = cSim.pAttr[i];
-        float  softCoreLJLambda             = feSimDev.pParticleSoftCoreLJLambda[i];
+        float4 a                            = feSimDev.pSigEps4[i];
+        float  softCoreLJLambda             = a.z;
        float br                            = cSim.pBornRadii[i];
        unsigned int tbx                    = threadIdx.x - tgx;
        unsigned int tj                     = tgx;
        Atom* psA                           = &sA[tbx];
+
        float4 af;
        af.x                                = 0.0f;
        af.y                                = 0.0f;
        af.z                                = 0.0f;
        af.w                                = 0.0f;
+
        if (x == y) // Handle diagonals uniquely at 50% efficiency
        {
            // Read fixed atom data into registers and GRF
+
            sA[threadIdx.x].x                    = apos.x;
            sA[threadIdx.x].y                    = apos.y;
            sA[threadIdx.x].z                    = apos.z;
-            sA[threadIdx.x].q                    = apos.w;
-            float q2                             = cSim.preFactor * apos.w;
-            apos.w                              *= cSim.epsfac;
+
+            sA[threadIdx.x].q                    = a.w;
            sA[threadIdx.x].sig                  = a.x;
            sA[threadIdx.x].eps                  = a.y;
            sA[threadIdx.x].br                   = br;
            sA[threadIdx.x].softCoreLJLambda     = softCoreLJLambda;
+
+            float q2                             = cSim.preFactor*a.w;
+            a.w                                 *= cSim.epsfac;
+
            if (!bExclusionFlag)
            {
                for (unsigned int j = 0; j < GRID; j++)
@@ -118,29 +129,16 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsaSoftcore, Forces1_kernel)(unsig
                    sig2                   *= sig2;
                    float sig6              = sig2 * sig2 * sig2;
                    float dEdR              = eps * (12.0f * sig6 - 6.0f) * sig6;
-                    /* E */ 
 		              CDLJObcGbsa_energy      = eps * (sig6 - 1.0f) * sig6;
 #endif
 #ifdef USE_CUTOFF
-    #ifdef USE_EWALD
-                    float r                 = sqrt(r2);
-                    float alphaR            = cSim.alphaEwald * r;
-                    float erfcAlphaR        = fastErfc(alphaR);
-                    dEdR                   += apos.w * psA[j].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
-		    /* E */
-                    CDLJObcGbsa_energy     += apos.w * psA[j].q * invR * erfcAlphaR;
-    #else
-                    dEdR                   += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2);
-                    /* E */
-                    CDLJObcGbsa_energy     += apos.w * psA[j].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
-    #endif
+                    dEdR                   += a.w * psA[j].q * (invR - 2.0f * feSimDev.reactionFieldK * r2);
+                    CDLJObcGbsa_energy     += a.w * psA[j].q * (invR + feSimDev.reactionFieldK * r2 - feSimDev.reactionFieldC);
 #else

-#ifdef COULOMB_ON
-                    float factorX           = apos.w * psA[j].q * invR;
+                    float factorX           = a.w * psA[j].q * invR;
                    dEdR                   += factorX;
                    CDLJObcGbsa_energy     += factorX;
-#endif

 #endif
                    dEdR                   *= invR * invR;
@@ -153,37 +151,57 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsaSoftcore, Forces1_kernel)(unsig
                    float denominator       = sqrt(denominator2);
                    float Gpol              = (q2 * psA[j].q) / (denominator * denominator2);
                    float dGpol_dalpha2_ij  = -0.5f * Gpol * expTerm * (1.0f + D_ij);
-                    af.w                   += dGpol_dalpha2_ij * psA[j].br;
                    dEdR                   += Gpol * (1.0f - 0.25f * expTerm);
-		    /* E */
 		              CDLJObcGbsa_energy     += (q2 * psA[j].q) / denominator;
 #ifdef USE_CUTOFF
-                    if (r2 > cSim.nonbondedCutoffSqr)
+                    if ( i >= cSim.atoms || (x+j) >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
+#else
+                    if ( i >= cSim.atoms || (x+j) >= cSim.atoms)
+#endif
                    {
                        dEdR                = 0.0f;
- 			/* E */
 			               CDLJObcGbsa_energy  = 0.0f;
+                        dGpol_dalpha2_ij    = 0.0f;
                    }
-#endif
-		    /* E */
-                    if (i < cSim.atoms)
-                    {
+                    af.w                   += dGpol_dalpha2_ij * psA[j].br;
+
+
+/*
+int jIdx = j;
+if( i == TARGET ){
+int tjj     = y+jIdx;
+pdE1[tjj].x = dGpol_dalpha2_ij * psA[j].br;
+pdE1[tjj].y = sqrt(r2);
+pdE1[tjj].z =  (q2 * psA[j].q) / denominator;
+pdE1[tjj].w = 1.0f;
+}
+
+if( (y+jIdx) == TARGET ){
+int tjj     = i;
+pdE1[tjj].x = dEdR - Gpol * (1.0f - 0.25f * expTerm);
+pdE1[tjj].y = r2;
+pdE1[tjj].z = CDLJObcGbsa_energy - (q2 * psA[jIdx].q) / denominator;
+pdE1[tjj].w = -1.0f;
+} */
                    energy                 += 0.5f*CDLJObcGbsa_energy;
-                    }
+
                    // Add Forces
+
                    dx                     *= dEdR;
                    dy                     *= dEdR;
                    dz                     *= dEdR;
+
                    af.x                   -= dx;
                    af.y                   -= dy;
                    af.z                   -= dz;
                }
-            }
-            else  // bExclusion
-            {
+
+            } else {
+
                unsigned int xi   = x>>GRIDBITS;
                unsigned int cell = xi+xi*cSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
                unsigned int excl = cSim.pExclusion[cSim.pExclusionIndex[cell]+tgx];
+
                for (unsigned int j = 0; j < GRID; j++)
                {
                    float dx                = psA[j].x - apos.x;
@@ -198,60 +216,42 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsaSoftcore, Forces1_kernel)(unsig
                    float invR              = 1.0f / sqrt(r2);
                    float sig               = a.x + psA[j].sig;
                    float eps               = a.y * psA[j].eps;
+
 #ifdef USE_SOFTCORE_LJ
                    float dEdR              = getSoftCoreLJ( r2, sig, eps, softCoreLJLambda, psA[j].softCoreLJLambda, &CDLJObcGbsa_energy );
+                    //float dEdR              = getSoftCoreLJMod( (invR*sig), eps, softCoreLJLambda, psA[j].softCoreLJLambda, &CDLJObcGbsa_energy );
 #else

                    // CDLJ part
+
                    float sig2              = invR * sig;
                    sig2                   *= sig2;
                    float sig6              = sig2 * sig2 * sig2;
                    float dEdR              = eps * (12.0f * sig6 - 6.0f) * sig6;
-                    /* E */ 
 		              CDLJObcGbsa_energy      = eps * (sig6 - 1.0f) * sig6;
 #endif
-#ifdef USE_CUTOFF
-    #ifdef USE_EWALD
-                    float r                 = sqrt(r2);
-                    float alphaR            = cSim.alphaEwald * r;
-                    float erfcAlphaR        = fastErfc(alphaR);
-                    dEdR                   += apos.w * psA[j].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
-		    /* E */
-                    CDLJObcGbsa_energy     += apos.w * psA[j].q * invR * erfcAlphaR;
-                    bool needCorrection = !(excl & 0x1) && x+tgx != y+j && x+tgx < cSim.atoms && y+j < cSim.atoms;
-                    if (needCorrection)
-                    {
-                        // Subtract off the part of this interaction that was included in the reciprocal space contribution.

-                        dEdR               = -apos.w * psA[j].q * invR * ((1.0f-erfcAlphaR) - alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
-                        CDLJObcGbsa_energy = -apos.w * psA[j].q * invR * (1.0f-erfcAlphaR);
-                    }
-    #else
-                    dEdR                   += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2);
-                    /* E */
-                    CDLJObcGbsa_energy     += apos.w * psA[j].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
-    #endif
+#ifdef USE_CUTOFF
+                    dEdR                   += a.w * psA[j].q * (invR - 2.0f * feSimDev.reactionFieldK * r2);
+                    CDLJObcGbsa_energy     += a.w * psA[j].q * (invR + feSimDev.reactionFieldK * r2 - feSimDev.reactionFieldC);
 #else

-#ifdef COULOMB_ON
-                    float factorX           = apos.w * psA[j].q * invR;
+                    float factorX           = a.w * psA[j].q * invR;
                    dEdR                   += factorX;
                    CDLJObcGbsa_energy     += factorX;
-#endif

 #endif
                    dEdR                   *= invR * invR;
-#ifdef USE_EWALD
-                    if (!(excl & 0x1) && !needCorrection)
-#else
+
                    if (!(excl & 0x1))
-#endif
                    {
                        dEdR                = 0.0f;
                        CDLJObcGbsa_energy  = 0.0f;
                    }
+//float dEdRx = dEdR;

                    // ObcGbsaForce1 part
+
                    float alpha2_ij         = br * psA[j].br;
                    float D_ij              = r2 / (4.0f * alpha2_ij);
                    float expTerm           = exp(-D_ij);
@@ -259,34 +259,54 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsaSoftcore, Forces1_kernel)(unsig
                    float denominator       = sqrt(denominator2);
                    float Gpol              = (q2 * psA[j].q) / (denominator * denominator2);
                    float dGpol_dalpha2_ij  = -0.5f * Gpol * expTerm * (1.0f + D_ij);
-                    af.w                   += dGpol_dalpha2_ij * psA[j].br;
                    dEdR                   += Gpol * (1.0f - 0.25f * expTerm);
-                    /* E */
                    CDLJObcGbsa_energy     += (q2 * psA[j].q) / denominator;
-#if defined USE_PERIODIC
+
+#if defined USE_CUTOFF
                    if (i >= cSim.atoms || x+j >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
+#else
+                    if (i >= cSim.atoms || x+j >= cSim.atoms )
+#endif
                    {
                        dEdR               = 0.0f;
 		                  CDLJObcGbsa_energy = 0.0f;
-                    }
-#elif defined USE_CUTOFF
-                    if (r2 > cSim.nonbondedCutoffSqr)
-                    {
-                        dEdR               = 0.0f;
-		                  CDLJObcGbsa_energy = 0.0f;
-                    }
-#endif
-                    if (i < cSim.atoms)
-                    {
+                        dGpol_dalpha2_ij   = 0.0f;
+//dEdRx = 0.0f;
+                    }
+
+/*
+int jIdx    = j;
+if( i == TARGET ){
+int tjj     =  (y+jIdx);
+pdE1[tjj].x = dGpol_dalpha2_ij * psA[j].br;
+pdE1[tjj].y = sqrt(r2);
+pdE1[tjj].z =  (q2 * psA[j].q) / denominator;
+pdE1[tjj].x = dEdRx;
+pdE1[tjj].y = sqrt(r2);
+pdE1[tjj].z = CDLJObcGbsa_energy - (q2 * psA[jIdx].q) / denominator;
+pdE1[tjj].w = 2.0f;
+}
+if( (y+jIdx) == TARGET ){
+int tjj     = i;
+pdE1[tjj].x = dGpol_dalpha2_ij * psA[j].br;
+pdE1[tjj].y = sqrt(r2);
+pdE1[tjj].z =  (q2 * psA[j].q) / denominator;
+pdE1[tjj].w = -2.0f;
+} */
+
+                    af.w                  += dGpol_dalpha2_ij * psA[j].br;
                    energy                += 0.5f*CDLJObcGbsa_energy;
-                    }
+                     
                    // Add Forces
+
                    dx                     *= dEdR;
                    dy                     *= dEdR;
                    dz                     *= dEdR;
+
                    af.x                   -= dx;
                    af.y                   -= dy;
                    af.z                   -= dz;
+
                    excl                  >>= 1;
                }
            }
@@ -307,32 +327,32 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsaSoftcore, Forces1_kernel)(unsig
            cSim.pForce4[offset]        = of;
            cSim.pBornForce[offset]     = bf;

-        }
-        else        // 100% utilization
-        {
+        } else { 
+
            // Read fixed atom data into registers and GRF
+
            if (lasty != y)
            {
                unsigned int j                       = y + tgx;
                float4 temp                          = cSim.pPosq[j];
-                float2 temp1                         = cSim.pAttr[j];
-                float  temp2                         = feSimDev.pParticleSoftCoreLJLambda[j];
-//float  temp2 = 1.0f;
+                float4 temp1                         = feSimDev.pSigEps4[j];
                sA[threadIdx.x].br                   = cSim.pBornRadii[j];
                sA[threadIdx.x].x                    = temp.x;
                sA[threadIdx.x].y                    = temp.y;
                sA[threadIdx.x].z                    = temp.z;
-                sA[threadIdx.x].q                    = temp.w;
+                sA[threadIdx.x].q                    = temp1.w;
                sA[threadIdx.x].sig                  = temp1.x;
                sA[threadIdx.x].eps                  = temp1.y;
-                sA[threadIdx.x].softCoreLJLambda     = temp2;
+                sA[threadIdx.x].softCoreLJLambda     = temp1.z;
            }
+
            sA[threadIdx.x].fx          = 0.0f;
            sA[threadIdx.x].fy          = 0.0f;
            sA[threadIdx.x].fz          = 0.0f;
            sA[threadIdx.x].fb          = 0.0f;
-            float q2                    = apos.w * cSim.preFactor;
-            apos.w                     *= cSim.epsfac;
+
+            float q2                    = a.w * cSim.preFactor;
+            a.w                        *= cSim.epsfac;
            if (!bExclusionFlag)
            {
 #ifdef USE_CUTOFF
@@ -369,33 +389,22 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsaSoftcore, Forces1_kernel)(unsig
                        sig2                   *= sig2;
                        float sig6              = sig2 * sig2 * sig2;
                        float dEdR              = eps * (12.0f * sig6 - 6.0f) * sig6;
-                        /* E */ 
                        CDLJObcGbsa_energy      = eps * (sig6 - 1.0f) * sig6;
 #endif
 #ifdef USE_CUTOFF
-    #ifdef USE_EWALD
-                        float r                 = sqrt(r2);
-                        float alphaR            = cSim.alphaEwald * r;
-                        float erfcAlphaR        = fastErfc(alphaR);
-                        dEdR                   += apos.w * psA[tj].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
-                        /* E */
-                        CDLJObcGbsa_energy     += apos.w * psA[tj].q * invR * erfcAlphaR;
-    #else
-                        dEdR                   += apos.w * psA[tj].q * (invR - 2.0f * cSim.reactionFieldK * r2);
-                        /* E */
-                        CDLJObcGbsa_energy     += apos.w * psA[tj].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
-    #endif
+                        dEdR                   += a.w * psA[tj].q * (invR - 2.0f * feSimDev.reactionFieldK * r2);
+                        CDLJObcGbsa_energy     += a.w * psA[tj].q * (invR + feSimDev.reactionFieldK * r2 - feSimDev.reactionFieldC);
 #else

-#ifdef COULOMB_ON
-                        float factorX           = apos.w * psA[tj].q * invR;
+                        float factorX           = a.w * psA[tj].q * invR;
                        dEdR                   += factorX;
                        CDLJObcGbsa_energy     += factorX;
-#endif
 #endif
                        dEdR                   *= invR * invR;
+//float dEdRx = dEdR;

                        // ObcGbsaForce1 part
+
                        float alpha2_ij         = br * psA[tj].br;
                        float D_ij              = r2 / (4.0f * alpha2_ij);
                        float expTerm           = exp(-D_ij);
@@ -403,38 +412,64 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsaSoftcore, Forces1_kernel)(unsig
                        float denominator       = sqrt(denominator2);
                        float Gpol              = (q2 * psA[tj].q) / (denominator * denominator2);
                        float dGpol_dalpha2_ij  = -0.5f * Gpol * expTerm * (1.0f + D_ij);
-                        af.w                   += dGpol_dalpha2_ij * psA[tj].br;
-                        psA[tj].fb             += dGpol_dalpha2_ij * br;
                        dEdR                   += Gpol * (1.0f - 0.25f * expTerm);
-                        /* E */
                        CDLJObcGbsa_energy     += (q2 * psA[tj].q) / denominator;
 #ifdef USE_CUTOFF
-                        if (r2 > cSim.nonbondedCutoffSqr)
+                        if ( i >= cSim.atoms || (y+tj) >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
+#else
+                        if ( i >= cSim.atoms || (y+tj) >= cSim.atoms)
+#endif
                        {
                            dEdR               = 0.0f;
       			             CDLJObcGbsa_energy = 0.0f;
+                            dGpol_dalpha2_ij   = 0.0f;
+//dEdRx = 0.0f;
                        }
-#endif
-                        if (i < cSim.atoms)
-                        {
+                        psA[tj].fb             += dGpol_dalpha2_ij * br;
+                        af.w                   += dGpol_dalpha2_ij * psA[tj].br;
                        energy                 += CDLJObcGbsa_energy;
-                        }
+
+
+
+/*
+int jIdx = tj;
+if( i == TARGET ){
+int tjj     = y+jIdx;
+pdE1[tjj].x = dEdRx;
+pdE1[tjj].y = sqrt(r2);
+pdE1[tjj].z = -dEdRx*dz;
+pdE1[tjj].w = 3.0f;
+}
+if( (y+jIdx) == TARGET ){
+int tjj     = i;
+pdE1[tjj].x = dEdRx;
+pdE1[tjj].y = sqrt(r2);
+pdE1[tjj].z = dEdRx*dz;
+pdE1[tjj].w = -3.0f;
+} */
+
+
+
                        // Add forces
+
                        dx                     *= dEdR;
                        dy                     *= dEdR;
                        dz                     *= dEdR;
+
                        af.x                   -= dx;
                        af.y                   -= dy;
                        af.z                   -= dz;
+
                        psA[tj].fx             += dx;
                        psA[tj].fy             += dy;
                        psA[tj].fz             += dz;
+
                        tj                      = (tj + 1) & (GRID - 1);
                    }
                }
 #ifdef USE_CUTOFF
-                else
-                {
+                else {
+
                    // Compute only a subset of the interactions in this block.

                    for (unsigned int j = 0; j < GRID; j++)
@@ -462,28 +497,16 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsaSoftcore, Forces1_kernel)(unsig
                            sig2                   *= sig2;
                            float sig6              = sig2 * sig2 * sig2;
                            float dEdR              = eps * (12.0f * sig6 - 6.0f) * sig6;
-                            /* E */ 
                            CDLJObcGbsa_energy      = eps * (sig6 - 1.0f) * sig6;
 #endif
 #ifdef USE_CUTOFF
-    #ifdef USE_EWALD
-                            float r                 = sqrt(r2);
-                            float alphaR            = cSim.alphaEwald * r;
-                            float erfcAlphaR        = fastErfc(alphaR);
-                            dEdR                   += apos.w * psA[j].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
-                            CDLJObcGbsa_energy     += apos.w * psA[j].q * invR * erfcAlphaR;
-    #else
-                            dEdR                   += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2);
-                            /* E */
-                            CDLJObcGbsa_energy     += apos.w * psA[j].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
-    #endif
+                            dEdR                   += a.w * psA[j].q * (invR - 2.0f * feSimDev.reactionFieldK * r2);
+                            CDLJObcGbsa_energy     += a.w * psA[j].q * (invR + feSimDev.reactionFieldK * r2 - feSimDev.reactionFieldC);
 #else

-#ifdef COULOMB_ON
-                            float factorX           = apos.w * psA[j].q * invR;
+                            float factorX           = a.w * psA[j].q * invR;
                            dEdR                   += factorX;
                            CDLJObcGbsa_energy     += factorX;
-#endif
 #endif
                            dEdR                   *= invR * invR;

@@ -495,11 +518,21 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsaSoftcore, Forces1_kernel)(unsig
                            float denominator       = sqrt(denominator2);
                            float Gpol              = (q2 * psA[j].q) / (denominator * denominator2);
                            float dGpol_dalpha2_ij  = -0.5f * Gpol * expTerm * (1.0f + D_ij);
-                            af.w                   += dGpol_dalpha2_ij * psA[j].br;
                            dEdR                   += Gpol * (1.0f - 0.25f * expTerm);
-                            /* E */
                            CDLJObcGbsa_energy     += (q2 * psA[j].q) / denominator;

+#ifdef USE_CUTOFF
+                            if ( i >= cSim.atoms || (y+j) >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
+#else
+                            if ( i >= cSim.atoms || (y+j) >= cSim.atoms)
+#endif
+                            {
+                                dEdR                = 0.0f;
+				                    CDLJObcGbsa_energy  = 0.0f;
+                                dGpol_dalpha2_ij    = 0.0f;
+                            }
+                            af.w                   += dGpol_dalpha2_ij * psA[j].br;
+
                            // Sum the Born forces.

                            tempBuffer[threadIdx.x] = dGpol_dalpha2_ij * br;
@@ -513,17 +546,28 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsaSoftcore, Forces1_kernel)(unsig
                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+8];
                            if (tgx == 0)
                                psA[j].fb += tempBuffer[threadIdx.x] + tempBuffer[threadIdx.x+16];
-#ifdef USE_CUTOFF
-                            if (r2 > cSim.nonbondedCutoffSqr)
-                            {
-                                dEdR                = 0.0f;
-				                    CDLJObcGbsa_energy  = 0.0f;
-                            }
-#endif
-                            if (i < cSim.atoms)
-                            {
+
+/*
+int jIdx = j;
+if( i == TARGET ){
+int tjj     = y+jIdx;
+pdE1[tjj].x = dEdR;
+pdE1[tjj].y = r2;
+pdE1[tjj].z = CDLJObcGbsa_energy - (q2 * psA[jIdx].q) / denominator;
+pdE1[tjj].w = -4.7f;
+}
+if( (y+jIdx) == TARGET ){
+int tjj     = i;
+pdE1[tjj].x = dEdR;
+pdE1[tjj].y = r2;
+pdE1[tjj].z = CDLJObcGbsa_energy - (q2 * psA[jIdx].q) / denominator;
+pdE1[tjj].w = -4.7f;
+} */
+
+
+
                            energy                 += CDLJObcGbsa_energy;
-                            }
+
                            // Add forces
                            dx                     *= dEdR;
                            dy                     *= dEdR;
@@ -568,9 +612,8 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsaSoftcore, Forces1_kernel)(unsig
                    }
                }
 #endif
-            }
-            else  // bExclusion
-            {
+            } else  {
+
                unsigned int xi   = x>>GRIDBITS;
                unsigned int yi   = y>>GRIDBITS;
                unsigned int cell = xi+yi*cSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
@@ -599,49 +642,25 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsaSoftcore, Forces1_kernel)(unsig
                    sig2                   *= sig2;
                    float sig6              = sig2 * sig2 * sig2;
                    float dEdR              = eps * (12.0f * sig6 - 6.0f) * sig6;
-                    /* E */ 
 		              CDLJObcGbsa_energy      = eps * (sig6 - 1.0f) * sig6;
 #endif
 #ifdef USE_CUTOFF
-    #ifdef USE_EWALD
-                    float r                 = sqrt(r2);
-                    float alphaR            = cSim.alphaEwald * r;
-                    float erfcAlphaR        = fastErfc(alphaR);
-                    dEdR                   += apos.w * psA[tj].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
-                    /* E */
-                    CDLJObcGbsa_energy     += apos.w * psA[tj].q * invR * erfcAlphaR;
-                    bool needCorrection = !(excl & 0x1) && x+tgx != y+tj && x+tgx < cSim.atoms && y+tj < cSim.atoms;
-                    if (needCorrection)
-                    {
-                        // Subtract off the part of this interaction that was included in the reciprocal space contribution.
-
-                        dEdR               = -apos.w * psA[tj].q * invR * ((1.0f-erfcAlphaR) - alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
-                        CDLJObcGbsa_energy = -apos.w * psA[tj].q * invR * (1.0f-erfcAlphaR);
-                    }
-    #else
-                    dEdR                   += apos.w * psA[tj].q * (invR - 2.0f * cSim.reactionFieldK * r2);
-                    /* E */
-                    CDLJObcGbsa_energy     += apos.w * psA[tj].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
-    #endif
+                    dEdR                   += a.w * psA[tj].q * (invR - 2.0f * feSimDev.reactionFieldK * r2);
+                    CDLJObcGbsa_energy     += a.w * psA[tj].q * (invR + feSimDev.reactionFieldK * r2 - feSimDev.reactionFieldC);
 #else
-#ifdef COULOMB_ON
-                    float factorX           = apos.w * psA[tj].q * invR;
+                    float factorX           = a.w * psA[tj].q * invR;
                    dEdR                   += factorX;
                    CDLJObcGbsa_energy     += factorX;
-#endif
 #endif
                    dEdR                   *= invR * invR;
-#ifdef USE_EWALD
-                    if (!(excl & 0x1) && !needCorrection)
-#else
                    if (!(excl & 0x1))
-#endif
                    {
                        dEdR               = 0.0f;
 			               CDLJObcGbsa_energy = 0.0f;
                    }
-
+//float dEdRx = dEdR;
                    // ObcGbsaForce1 part
+
                    float alpha2_ij         = br * psA[tj].br;
                    float D_ij              = r2 / (4.0f * alpha2_ij);
                    float expTerm           = exp(-D_ij);
@@ -649,37 +668,57 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsaSoftcore, Forces1_kernel)(unsig
                    float denominator       = sqrt(denominator2);
                    float Gpol              = (q2 * psA[tj].q) / (denominator * denominator2);
                    float dGpol_dalpha2_ij  = -0.5f * Gpol * expTerm * (1.0f + D_ij);
-                    af.w                   += dGpol_dalpha2_ij * psA[tj].br;
-                    psA[tj].fb             += dGpol_dalpha2_ij * br;
                    dEdR                   += Gpol * (1.0f - 0.25f * expTerm);
 		              CDLJObcGbsa_energy     += (q2 * psA[tj].q) / denominator;
-#if defined USE_PERIODIC
+#if defined USE_CUTOFF
                    if (i >= cSim.atoms || y+tj >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
+#else
+                    if (i >= cSim.atoms || y+tj >= cSim.atoms)
+#endif
                    {
                        dEdR               = 0.0f;
 			               CDLJObcGbsa_energy = 0.0f;
-                    }
-#elif defined USE_CUTOFF
-                    if (r2 > cSim.nonbondedCutoffSqr)
-                    {
-                        dEdR = 0.0f;
-			               CDLJObcGbsa_energy = 0.0f;
-                    }
-#endif
-                    if (i < cSim.atoms)
-                    {
+                        dGpol_dalpha2_ij   = 0.0f;
+//dEdRx = 0.0;
+                    }
+
+/*
+int jIdx = tj;
+if( i == TARGET ){
+int tjj     = y+jIdx;
+pdE1[tjj].x = dEdRx;
+pdE1[tjj].y = sqrt(r2);
+pdE1[tjj].z = dEdRx*dz;
+pdE1[tjj].w = 6.0f;
+}
+if( (y+jIdx) == TARGET ){
+int tjj     = i;
+pdE1[tjj].x = dEdRx;
+pdE1[tjj].y = sqrt(r2);
+pdE1[tjj].z = dEdRx*dz;
+pdE1[tjj].w = -6.0f;
+}  */
+
+
+
+                    af.w                   += dGpol_dalpha2_ij * psA[tj].br;
+                    psA[tj].fb             += dGpol_dalpha2_ij * br;
                    energy                 += CDLJObcGbsa_energy;
-                    }
+
                    // Add forces
+
                    dx                     *= dEdR;
                    dy                     *= dEdR;
                    dz                     *= dEdR;
+
                    af.x                   -= dx;
                    af.y                   -= dy;
                    af.z                   -= dz;
+
                    psA[tj].fx             += dx;
                    psA[tj].fy             += dy;
                    psA[tj].fz             += dz;
+
                    excl                  >>= 1;
                    tj                      = (tj + 1) & (GRID - 1);
                }
@@ -714,40 +753,6 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsaSoftcore, Forces1_kernel)(unsig
            cSim.pForce4[offset]        = of;
            cSim.pBornForce[offset]     = bf;

-#if 0
-
-#ifdef USE_OUTPUT_BUFFER_PER_WARP
-            unsigned int offset         = x + tgx + warp*cSim.stride;
-            float4 of                   = cSim.pForce4[offset];
-            of.x                       += af.x;
-            of.y                       += af.y;
-            of.z                       += af.z;
-            of.w                       += af.w;
-            cSim.pForce4[offset]        = of; 
-            cSim.pBornForce[offset]     = af.w;
-            offset                      = y + tgx + warp*cSim.stride;
-            of                          = cSim.pForce4[offset];
-            of.x                       += sA[threadIdx.x].fx;
-            of.y                       += sA[threadIdx.x].fy;
-            of.z                       += sA[threadIdx.x].fz;
-            of.w                       += sA[threadIdx.x].fb;
-            cSim.pForce4[offset]        = of; 
-            cSim.pBornForce[offset]     = af.w;
-#else
-            unsigned int offset         = x + tgx + (y >> GRIDBITS) * cSim.stride;
-            cSim.pForce4[offset]        = af; 
-            cSim.pBornForce[offset]     = af.w;
-            af.x                        = sA[threadIdx.x].fx;
-            af.y                        = sA[threadIdx.x].fy;
-            af.z                        = sA[threadIdx.x].fz;
-            af.w                        = sA[threadIdx.x].fb;
-            offset                      = y + tgx + (x >> GRIDBITS) * cSim.stride;
-            cSim.pForce4[offset]        = af; 
-            cSim.pBornForce[offset]     = af.w;
-#endif
-
-#endif
-
            lasty                       = y;
        }
        pos++;

--- a/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateGBVISoftcoreAux.h
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateGBVISoftcoreAux.h
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Mark Friedrichs                                                   *
- * Contributors:                                                              *
- *                                                                            *
- * Permission is hereby granted, free of charge, to any person obtaining a    *
- * copy of this software and associated documentation files (the "Software"), *
- * to deal in the Software without restriction, including without limitation  *
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
- * and/or sell copies of the Software, and to permit persons to whom the      *
- * Software is furnished to do so, subject to the following conditions:       *
- *                                                                            *
- * The above copyright notice and this permission notice shall be included in *
- * all copies or substantial portions of the Software.                        *
- *                                                                            *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
- * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
- * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
- * -------------------------------------------------------------------------- */
-
-#ifndef __Gpu_GBVI_SOFTCORE_AUX_H__
-#define __Gpu_GBVI_SOFTCORE_AUX_H__
-
-/**
- * This file contains subroutines used in evaluating quantities associated w/ the GB/VI function
- */
-
-__device__ float getGBVI_L( float r, float x, float S )
-{
-
-   float rInv   = 1.0f/r;
-
-   float xInv   = 1.0f/x;
-   float xInv2  = xInv*xInv;
-   float diff2  = (r + S)*(r - S);
-
-   return (1.5f*xInv2)*( (0.25f*rInv) - (xInv/3.0f) + (0.125f*diff2*xInv2*rInv) );
-}
-
-__device__ float getGBVI_Volume( float r_ij, float R, float S )
-{
-
-     float upperBound        = r_ij + S; 
-     float rdiffS            = r_ij - S; 
-     float lowerBound        = R > rdiffS ? R : rdiffS;
-     float L_upper           = getGBVI_L( r_ij, upperBound, S );
-     float L_lower           = getGBVI_L( r_ij, lowerBound, S );
-     float mask              = r_ij < (R - S) ? 0.0f : 1.0f;  
-     float addOn             = r_ij < (S - R) ? (1.0f/(R*R*R)) : 0.0f;  
-
-     return (mask*( L_upper - L_lower ) + addOn);
-
-}
-
-__device__ float getGBVI_dL_dr( float r, float x, float S )
-{
-
-   float rInv   = 1.0f/r;
-   float rInv2  = rInv*rInv;
-
-   float xInv   = 1.0f/x;
-   float xInv2  = xInv*xInv;
-   float xInv3  = xInv2*xInv;
-   
-   float diff2  = (r + S)*(r - S);
-
-   return ( (-1.5f*xInv2*rInv2)*( 0.25f + 0.125f*diff2*xInv2 ) + 0.375f*xInv3*xInv );
-   //return 0.0f;
-
-}
-
-__device__ float getGBVI_dL_dx( float r, float x, float S )
-{
-
-   float rInv   = 1.0f/r;
-
-   float xInv   = 1.0f/x;
-   float xInv2  = xInv*xInv;
-   float xInv3  = xInv2*xInv;
-
-   float diff   = (r + S)*(r - S);
-
-   return ( (-1.5f*xInv3)*( (0.5f*rInv) - xInv + (0.5f*diff*xInv2*rInv) ));
-
-
-}
-
-__device__ float getGBVI_dE2( float r, float R, float S, float bornForce )
-{
-
-    float diff              = S - R;
-    float absDiff           = fabsf( S - R );
-    float dE                = getGBVI_dL_dr( r, r+S, S ) + getGBVI_dL_dx( r, r+S, S );
-    float mask;
-    float lowerBound;
-    if( (R > (r - S)) && (absDiff < r) ){
-        mask       = 0.0f;
-        lowerBound = R;
-    } else {
-        mask       = 1.0f;
-        lowerBound = (r - S);  
-    }   
-    dE                    -= getGBVI_dL_dr( r, lowerBound, S ) + mask*getGBVI_dL_dx( r, lowerBound, S );
-    dE                     = (absDiff >= r) && r >= diff ? 0.0f : dE; 
-    dE                    *= ( (r > 1.0e-08f) ? (bornForce/r) : 0.0f);
-
-    return (-dE);
-
-}
-
-__device__ float getGBVIBornForce2( float bornRadius, float R, float bornForce, float gamma )
-{ 
-    float ratio                     = (R/bornRadius);
-    float returnBornForce           = bornForce + (3.0f*gamma*ratio*ratio*ratio)/bornRadius; // 'cavity' term
-    float br2                       = bornRadius*bornRadius;
-          returnBornForce          *= (1.0f/3.0f)*br2*br2;
-
-   return returnBornForce;
-
-}
-
-#endif // __Gpu_GBVI_SOFTCORE_AUX_H__
--- a/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateGBVISoftcoreBornSum.cu
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateGBVISoftcoreBornSum.cu
@@ -29,133 +29,120 @@
 * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
 * -------------------------------------------------------------------------- */

-#include "GpuGBVISoftcore.h"
 #include "GpuFreeEnergyCudaKernels.h"
+#include "freeEnergyGpuTypes.h"
+
+#include "openmm/OpenMMException.h"
 #include <cuda.h>
+#include <sstream>

-struct cudaFreeEnergySimulationGBVI {
-    float quinticLowerLimitFactor;
-    float quinticUpperLimit;
-    float* pSwitchDerivative;
-};
-struct cudaFreeEnergySimulationGBVI gbviSim;
+#define PARAMETER_PRINT 0
+#define MAX_PARAMETER_PRINT 10
+//#define DEBUG

 static __constant__ cudaGmxSimulation cSim;
-static __constant__ cudaFreeEnergySimulationGBVI gbviSimDev;
+static __constant__ cudaFreeEnergyGmxSimulation gbviSimDev;

-void SetCalculateGBVISoftcoreBornSumGpuSim(gpuContext gpu)
+void SetCalculateGBVISoftcoreBornSumGpuSim( freeEnergyGpuContext freeEnergyGpu)
 {
    cudaError_t status;
-    status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));     
+    status = cudaMemcpyToSymbol( cSim, &freeEnergyGpu->gpuContext->sim, sizeof(cudaGmxSimulation));    
    RTERROR(status, "cudaMemcpyToSymbol: SetCalculateGBVISoftcoreBornSumGpuSim copy to cSim failed");
-    //(void) fprintf( stderr, "SetCalculateGBVISoftcoreBornSumGpuSim\n" );
-}

-void GetCalculateGBVISoftcoreBornSumSim(gpuContext gpu)
-{
-    cudaError_t status;
-    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));     
-    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
-}
-
-void SetCalculateGBVISoftcoreSupplementarySim( GpuGBVISoftcore* gpuGBVISoftcore )
-{
-    cudaError_t status;
-    gbviSim.pSwitchDerivative        = gpuGBVISoftcore->getGpuSwitchDerivative();
-    gbviSim.quinticLowerLimitFactor  = gpuGBVISoftcore->getQuinticLowerLimitFactor();
-    gbviSim.quinticUpperLimit        = gpuGBVISoftcore->getQuinticUpperLimit();
-    status                           = cudaMemcpyToSymbol(gbviSimDev, &gbviSim, sizeof(cudaFreeEnergySimulationGBVI));
-    RTERROR(status, "cudaMemcpyToSymbol: SetCalculateGBVISoftcoreSupplementarySim");
-
-    //(void) fprintf( stderr, "SetCalculateGBVISoftcoreSupplementarySim %14.6e %14.6e swDerv=%p\n",
-    //                gbviSim.quinticLowerLimitFactor, gbviSim.quinticUpperLimit, gbviSim.pSwitchDerivative );
+    status = cudaMemcpyToSymbol( gbviSimDev, &freeEnergyGpu->freeEnergySim, sizeof(cudaFreeEnergyGmxSimulation));    
+    RTERROR(status, "cudaMemcpyToSymbol: SetCalculateGBVISoftcoreBornSumGpuSim copy to feSim failed");
 }

 // create, initialize and enter BornRadiusScaleFactors values (used to scale contribution of atoms to Born sum of other atoms)
-// return handle to GpuGBVISoftcore object

 extern "C"
-GpuGBVISoftcore* gpuSetGBVISoftcoreParameters(gpuContext gpu, float innerDielectric, float solventDielectric, const std::vector<int>& atom,
+void gpuSetGBVISoftcoreParameters( freeEnergyGpuContext freeEnergyGpu, float innerDielectric, float solventDielectric, const std::vector<int>& atom,
                                   const std::vector<float>& radius, const std::vector<float>& gamma,
                                   const std::vector<float>& scaledRadii, const std::vector<float>& bornRadiusScaleFactors,
-                                              const std::vector<float>& quinticSplineParameters)
-{
+                                   const std::vector<float>& quinticSplineParameters ){
+
+    // the switch-derivative buffer below is allocated with the padded atom count, like the per-atom
+    // GBVI data filled later in this function; NOTE(review): confirm kernels never index past natoms before shrinking it
+    gpuContext gpu                                    = freeEnergyGpu->gpuContext;
+
     static const float electricConstant               = -166.02691f;
-    unsigned int atoms                          = atom.size();
     double tau                                        = ((1.0f/innerDielectric)-(1.0f/solventDielectric)); 
+    freeEnergyGpu->psSwitchDerivative                 = new CUDAStream<float>( gpu->sim.paddedNumberOfAtoms, 1, "SwitchDerivative");
+    freeEnergyGpu->freeEnergySim.pSwitchDerivative    = freeEnergyGpu->psSwitchDerivative->_pDevData;

     // record the Born-radii scaling settings in freeEnergySim and load the
     // per-particle GB/VI parameters into psGBVIData; the separate GpuGBVISoftcore
     // helper object used by the initial implementation is no longer created here

-    GpuGBVISoftcore* gpuGBVISoftcore            = new GpuGBVISoftcore();
-    unsigned int numberOfParticles              = radius.size();

    // check if quintic scaling to be applied

    if( quinticSplineParameters.size() == 2 ){
-       gpuGBVISoftcore->setBornRadiiScalingMethod( 1 );
-       gpuGBVISoftcore->setQuinticLowerLimitFactor( quinticSplineParameters[0] );
-       gpuGBVISoftcore->setQuinticUpperLimit(       quinticSplineParameters[1] );
-       gpuGBVISoftcore->initializeGpuSwitchDerivative(  gpu->sim.paddedNumberOfAtoms );
+       freeEnergyGpu->freeEnergySim.bornRadiiScalingMethod        = 1;
+       freeEnergyGpu->freeEnergySim.quinticLowerLimitFactor       = quinticSplineParameters[0];
+       freeEnergyGpu->freeEnergySim.quinticUpperLimit             = quinticSplineParameters[1];
+    } else {
+       freeEnergyGpu->freeEnergySim.bornRadiiScalingMethod        = 0;
+       freeEnergyGpu->freeEnergySim.quinticLowerLimitFactor       = 0.8f;
+       freeEnergyGpu->freeEnergySim.quinticUpperLimit             = 5.0f;
    }

-    for (unsigned int i = 0; i < bornRadiusScaleFactors.size(); i++) 
-    {
-            (*gpu->psGBVIData)[i].x = radius[i];
-            (*gpu->psGBVIData)[i].y = scaledRadii[i];
-            (*gpu->psGBVIData)[i].z = tau*gamma[i];
-            (*gpu->psGBVIData)[i].w = bornRadiusScaleFactors[i];
-
-(*gpu->psObcData)[i].x  = radius[i];
-(*gpu->psObcData)[i].y  = 0.9f*radius[i];
-
+    for( unsigned int ii = 0; ii < bornRadiusScaleFactors.size(); ii++ ){
+            (*gpu->psGBVIData)[ii].x = radius[ii];
+            (*gpu->psGBVIData)[ii].y = scaledRadii[ii];
+            (*gpu->psGBVIData)[ii].z = tau*gamma[ii];
+            (*gpu->psGBVIData)[ii].w = bornRadiusScaleFactors[ii];
    }

    // Dummy out extra atom data

-    for (unsigned int i = atoms; i < gpu->sim.paddedNumberOfAtoms; i++)
-    {
-        (*gpu->psBornRadii)[i]      = 0.2f;
-        (*gpu->psGBVIData)[i].x     = 0.01f;
-        (*gpu->psGBVIData)[i].y     = 0.01f;
-        (*gpu->psGBVIData)[i].z     = 0.01f;
-        (*gpu->psGBVIData)[i].w     = 1.00f;
+    for( unsigned int ii = bornRadiusScaleFactors.size(); ii < gpu->sim.paddedNumberOfAtoms; ii++ ){
+        (*gpu->psGBVIData)[ii].x     = 0.01f;
+        (*gpu->psGBVIData)[ii].y     = 0.01f;
+        (*gpu->psGBVIData)[ii].z     = 0.0f;
+        (*gpu->psGBVIData)[ii].w     = 0.0f;
    }

-#undef DUMP_PARAMETERS
-#define DUMP_PARAMETERS 0
-#if (DUMP_PARAMETERS == 1)
-    (void) fprintf( stderr,"GBVI softcore param %u %u sclMeth=%d LwFct=%8.3f UpLmt=[%12.5e (nm) %12.5e]\nR scaledR gamma*tau= bornRadiusScaleFactor \n",
-                    bornRadiusScaleFactors.size(), gpu->sim.paddedNumberOfAtoms,
-                    gpuGBVISoftcore->getBornRadiiScalingMethod(), gpuGBVISoftcore->getQuinticLowerLimitFactor(),
-                    powf( gpuGBVISoftcore->getQuinticUpperLimit(), -0.3333333f ), gpuGBVISoftcore->getQuinticUpperLimit() );
-    int maxPrint = 31;
-    for (unsigned int ii = 0; ii < gpu->sim.paddedNumberOfAtoms; ii++) 
-    {
+    gpu->sim.preFactor               = 2.0f*electricConstant*((1.0f/innerDielectric)-(1.0f/solventDielectric))*gpu->sim.forceConversionFactor;
+
+    // diagnostics

-        (void) fprintf( stderr,"%6u %14.7e %14.7e %14.7e %14.7e\n",
+    if( freeEnergyGpu->log ){
+        (void) fprintf( freeEnergyGpu->log,"GBVISoftcore: part.=%u padded=%u sclMeth=%d\n",
+                        static_cast<unsigned int>(bornRadiusScaleFactors.size()), static_cast<unsigned int>(gpu->sim.paddedNumberOfAtoms),
+                        freeEnergyGpu->freeEnergySim.bornRadiiScalingMethod );
+        if( quinticSplineParameters.size() == 2 ){
+            (void) fprintf( freeEnergyGpu->log,"QuinticScaling: LwFct=%8.3f UpLmt=[%12.5e (nm) %12.5e]\n",
+                        freeEnergyGpu->freeEnergySim.quinticLowerLimitFactor,
+                        powf( freeEnergyGpu->freeEnergySim.quinticUpperLimit, -0.3333333f ), freeEnergyGpu->freeEnergySim.quinticUpperLimit );
+        }
+        (void) fprintf( freeEnergyGpu->log, "gpuSetGBVISoftcoreParameters: preFactor=%14.6e elecCnstnt=%.4f frcCnvrsnFctr=%.4f tau=%.4f.\n",
+                        gpu->sim.preFactor, 2.0f*electricConstant, gpu->sim.forceConversionFactor, ((1.0f/innerDielectric)-(1.0f/solventDielectric)) );
+#if (PARAMETER_PRINT == 1)  // PARAMETER_PRINT is always defined, so test its value: 0 disables the per-particle dump
+        unsigned int maxPrint = MAX_PARAMETER_PRINT;
+        (void) fprintf( freeEnergyGpu->log, "               radius  scaled radius      tau*gamma         lambda\n" );
+        for( unsigned int ii = 0; ii < bornRadiusScaleFactors.size(); ii++ ){
+    
+            (void) fprintf( freeEnergyGpu->log,"%6u %14.7e %14.7e %14.7e %14.7e\n",
                            ii, (*gpu->psGBVIData)[ii].x, (*gpu->psGBVIData)[ii].y, (*gpu->psGBVIData)[ii].z, (*gpu->psGBVIData)[ii].w ); 
            if( ii == maxPrint ){
-            ii = gpu->sim.paddedNumberOfAtoms - maxPrint;
+                ii = bornRadiusScaleFactors.size() - maxPrint;
                if( ii < maxPrint )ii = maxPrint;
            }
        }
+        unsigned int offset = gpu->sim.paddedNumberOfAtoms - MAX_PARAMETER_PRINT;
+        if( offset > 0 && gpu->sim.paddedNumberOfAtoms > bornRadiusScaleFactors.size()  ){
+            for( unsigned int ii = offset; ii < gpu->sim.paddedNumberOfAtoms; ii++ ){
+                (void) fprintf( freeEnergyGpu->log,"%6u %14.7e %14.7e %14.7e %14.7e\n",
+                                ii, (*gpu->psGBVIData)[ii].x, (*gpu->psGBVIData)[ii].y, (*gpu->psGBVIData)[ii].z, (*gpu->psGBVIData)[ii].w ); 
+            }
+        }
 #endif
+    }

-    gpu->psBornRadii->Upload();
    gpu->psGBVIData->Upload();
-gpu->psObcData->Upload();
-    gpu->sim.preFactor              = 2.0f*electricConstant*((1.0f/innerDielectric)-(1.0f/solventDielectric))*gpu->sim.forceConversionFactor;
-    gpuGBVISoftcore->upload( gpu );
-
-#if (DUMP_PARAMETERS == 1)
-(void) fprintf( stderr, "gpuSetGBVISoftcoreParameters: preFactor=%14.6e elecCnstnt=%.4f frcCnvrsnFctr=%.4f tau=%.4f.\n",
-                gpu->sim.preFactor, 2.0f*electricConstant, gpu->sim.forceConversionFactor, ((1.0f/innerDielectric)-(1.0f/solventDielectric)) );
-#endif
-
-    return gpuGBVISoftcore;

+    return;
 }

 struct Atom {
@@ -224,7 +211,9 @@ __global__ void kReduceGBVISoftcoreBornForces_kernel()

        float ratio         = (gbviData.x/bornRadius);
        float ratio3        = ratio*ratio*ratio;
-        energy             -= gbviData.z*ratio3;
+
+        energy             -= gbviData.z*ratio3;                   //  gbviData.z = gamma*tau
+
        totalForce         += (3.0f*gbviData.z*ratio3)/bornRadius; // 'cavity' term
        float br2           = bornRadius*bornRadius;
        totalForce         *= (1.0f/3.0f)*br2*br2;
@@ -236,8 +225,9 @@ __global__ void kReduceGBVISoftcoreBornForces_kernel()
    cSim.pEnergy[blockIdx.x * blockDim.x + threadIdx.x] += energy;
 }

-void kReduceGBVISoftcoreBornForces(gpuContext gpu)
+void kReduceGBVISoftcoreBornForces( freeEnergyGpuContext freeEnergyGpu )
 {
+    gpuContext gpu = freeEnergyGpu->gpuContext;
    kReduceGBVISoftcoreBornForces_kernel<<<gpu->sim.blocks, gpu->sim.bf_reduce_threads_per_block>>>();
    LAUNCHERROR("kReduceGBVISoftcoreBornForces");

@@ -257,7 +247,6 @@ __global__ void kReduceGBVISoftcoreBornSum_kernel()
        for (int i = 0; i < cSim.nonbondOutputBuffers; i++)
        {
            sum += *pSt;
-       //     printf("%4d %4d A: %9.4f\n", pos, i, *pSt);
            pSt += cSim.stride;
        }
        
@@ -270,33 +259,10 @@ __global__ void kReduceGBVISoftcoreBornSum_kernel()
    }   
 }

-void kReduceGBVISoftcoreBornSum(gpuContext gpu)
+void kReduceGBVISoftcoreBornSum( freeEnergyGpuContext freeEnergyGpu )
 {
-    //printf("kReduceGBVISoftcoreBornSum\n");
-#define GBVISoftcore_DEBUG 0
-#if ( GBVISoftcore_DEBUG == 1 )
-               gpu->psGBVISoftcoreData->Download();
-               gpu->psBornSum->Download();
-               gpu->psPosq4->Download();
-                (void) fprintf( stderr, "\nkReduceGBVISoftcoreBornSum: Post BornSum %s Born radii & params\n", 
-                               (gpu->bIncludeGBVISoftcore ? "GBVI" : "Obc") );
-                for( int ii = 0; ii < gpu->natoms; ii++ ){
-                   (void) fprintf( stderr, "%d bSum=%14.6e param[%14.6e %14.6e %14.6e] x[%14.6f %14.6f %14.6f %14.6f]\n",
-                                   ii, 
-                                   gpu->psBornSum->_pSysStream[0][ii],
-                                   gpu->psGBVISoftcoreData->_pSysStream[0][ii].x,
-                                   gpu->psGBVISoftcoreData->_pSysStream[0][ii].y,
-                                   gpu->psGBVISoftcoreData->_pSysStream[0][ii].z,
-                                   gpu->psPosq4->_pSysStream[0][ii].x, gpu->psPosq4->_pSysStream[0][ii].y,
-                                   gpu->psPosq4->_pSysStream[0][ii].z, gpu->psPosq4->_pSysStream[0][ii].w
-                                 );  
-                }   
-#endif
-#undef GBVISoftcore_DEBUG
-
-
+    gpuContext gpu = freeEnergyGpu->gpuContext;
    kReduceGBVISoftcoreBornSum_kernel<<<gpu->sim.blocks, 384>>>();
-    gpu->bRecalculateBornRadii = false;
    LAUNCHERROR("kReduceGBVISoftcoreBornSum");
 }

@@ -311,7 +277,6 @@ void kReduceGBVISoftcoreBornSum(gpuContext gpu)

 // Include versions of the kernels with cutoffs.

-#if 0
 #undef METHOD_NAME
 #undef USE_OUTPUT_BUFFER_PER_WARP
 #define USE_CUTOFF
@@ -333,19 +298,6 @@ void kReduceGBVISoftcoreBornSum(gpuContext gpu)
 #undef METHOD_NAME
 #define METHOD_NAME(a, b) a##PeriodicByWarp##b
 #include "kCalculateGBVISoftcoreBornSum.h"
-#endif
-
-#if 0
-__global__ void kClearGBVISoftcoreBornSum_kernel()
-{
-    unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
-    while (pos < cSim.stride * cSim.nonbondOutputBuffers)
-    {
-        ((float*)cSim.pBornSum)[pos] = 0.0f;
-        pos += gridDim.x * blockDim.x;
-    }
-}
-#endif

 __device__ void quinticSpline( float  x, float rl, float ru, float* outValue, float* outDerivative )
 {
@@ -373,7 +325,6 @@ __global__ void kReduceGBVIBornSumQuinticScaling_kernel()
        for (int i = 0; i < cSim.nonbondOutputBuffers; i++)
        {
            sum += *pSt;
-       //     printf("%4d %4d A: %9.4f\n", pos, i, *pSt);
            pSt += cSim.stride;
        }
        
@@ -382,7 +333,6 @@ __global__ void kReduceGBVIBornSumQuinticScaling_kernel()
        float Rinv           = 1.0f/atom.x;
        float r3             = Rinv*Rinv*Rinv;
        float splineL        = gbviSimDev.quinticLowerLimitFactor*r3;
-//float bSum           = sum;
        float switchDeriviative;
        if( sum > splineL ){
            if( sum < r3 ){
@@ -400,47 +350,16 @@ __global__ void kReduceGBVIBornSumQuinticScaling_kernel()
        }

        cSim.pBornRadii[pos]              = pow( sum, (-1.0f/3.0f) ); 
-//cSim.pBornSum[pos]              = bSum;
        gbviSimDev.pSwitchDerivative[pos] = switchDeriviative;
        pos += gridDim.x * blockDim.x;
    }   
 }

-void kReduceGBVIBornSumQuinticScaling(gpuContext gpu, GpuGBVISoftcore* gpuGBVISoftcore)
+void kReduceGBVIBornSumQuinticScaling( freeEnergyGpuContext freeEnergyGpu )
 {
-    //printf("kReduceGBVIBornSumQuinticScaling_kernel\n");
+    gpuContext gpu = freeEnergyGpu->gpuContext;
    kReduceGBVIBornSumQuinticScaling_kernel<<<gpu->sim.blocks, 384>>>();
-    gpu->bRecalculateBornRadii = false;
    LAUNCHERROR("kReduceGBVIBornSumQuinticScaling_kernel");
-
-#define GBVI_DEBUG 0
-#if ( GBVI_DEBUG == 1 )
-               gpu->psGBVIData->Download();
-               gpu->psBornSum->Download();
-               gpu->psBornRadii->Download();
-               gpu->psPosq4->Download();
-               CUDAStream<float>* psSwitchDerivative = gpuGBVISoftcore->getSwitchDerivative();
-                
-               psSwitchDerivative->Download();
-                (void) fprintf( stderr, "\nkReduceGBVIBornSumQuinticScaling: Post BornSum %s Born radii & params\n", 
-                               (gpu->bIncludeGBVI ? "GBVI" : "Obc") );
-                for( int ii = 0; ii < gpu->natoms; ii++ ){
-                   (void) fprintf( stderr, "%6d bSum=%14.6e bR=%14.6e swDerv=%14.6e param[%14.6e %14.6e %14.6e] x[%14.6f %14.6f %14.6f %14.6f] %s\n",
-                                   ii, 
-                                   gpu->psBornSum->_pSysStream[0][ii],
-                                   gpu->psBornRadii->_pSysStream[0][ii],
-                                   psSwitchDerivative->_pSysStream[0][ii],
-                                   gpu->psGBVIData->_pSysStream[0][ii].x,
-                                   gpu->psGBVIData->_pSysStream[0][ii].y,
-                                   gpu->psGBVIData->_pSysStream[0][ii].z,
-                                   gpu->psPosq4->_pSysStream[0][ii].x, gpu->psPosq4->_pSysStream[0][ii].y,
-                                   gpu->psPosq4->_pSysStream[0][ii].z, gpu->psPosq4->_pSysStream[0][ii].w,
-                                   (fabs( psSwitchDerivative->_pSysStream[0][ii] - 1.0 ) > 1.0e-05 ? "SWWWWW" : "")
-                                 );  
-                }   
-#endif
-#undef GBVI_DEBUG
-
 }

 __global__ void kReduceGBVIBornForcesQuinticScaling_kernel()
@@ -497,53 +416,108 @@ __global__ void kReduceGBVIBornForcesQuinticScaling_kernel()
    cSim.pEnergy[blockIdx.x * blockDim.x + threadIdx.x] += energy;
 }

-void kReduceGBVIBornForcesQuinticScaling(gpuContext gpu)
+void kReduceGBVIBornForcesQuinticScaling( freeEnergyGpuContext freeEnergyGpu )
 {
    //printf("kReduceObcGbsaBornForces\n");
+    gpuContext gpu = freeEnergyGpu->gpuContext;
    kReduceGBVIBornForcesQuinticScaling_kernel<<<gpu->sim.blocks, gpu->sim.bf_reduce_threads_per_block>>>();
    LAUNCHERROR("kReduceGBVIBornForcesQuinticScaling");
 }

-void kPrintGBVISoftcore(gpuContext gpu, GpuGBVISoftcore* gpuGBVISoftcore, std::string callId, int call)
+void kPrintGBVISoftcore( freeEnergyGpuContext freeEnergyGpu, std::string callId, int call, FILE* log)
 {
-    int maxPrint = 20;
-    (void) fprintf( stderr, "kPrintGBVgSoftcore %s %d\n", callId.c_str(), call );
+    gpuContext gpu = freeEnergyGpu->gpuContext;
+    if( log == NULL )log = stderr;   // never hand fprintf a null stream; stderr is where this dump used to go
+
    gpu->psGBVIData->Download();
    gpu->psBornRadii->Download();
    gpu->psBornForce->Download();
    gpu->psPosq4->Download();
-    CUDAStream<float>* switchDeriviative = gpuGBVISoftcore-> getSwitchDerivative( );
+
+    CUDAStream<float>* switchDeriviative = freeEnergyGpu->psSwitchDerivative;
+    CUDAStream<float4>* sigEps4          = freeEnergyGpu->psSigEps4;
+
    switchDeriviative->Download();
+    sigEps4->Download();

-    (void) fprintf( stderr, "BornSum Born radii & params\n" );
+    (void) fprintf( log, "kPrintGBViSoftcore Cuda comp bR bF swd   prm    sigeps4\n" );
    for( int ii = 0; ii < gpu->natoms; ii++ ){
-        (void) fprintf( stderr, "%6d prm[%14.6e %14.6e %14.6e] bR=%14.6e bF=%14.6e swDrv=%14.6e x[%14.6f %14.6f %14.6f %14.6f]\n",
+        (void) fprintf( log, "%6d %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e \n",
                        ii, 
-                        gpu->psGBVIData->_pSysStream[0][ii].x,
-                        gpu->psGBVIData->_pSysStream[0][ii].y,
-                        gpu->psGBVIData->_pSysStream[0][ii].z,
-                        gpu->psBornRadii->_pSysStream[0][ii],
-                        gpu->psBornForce->_pSysStream[0][ii],
-                        switchDeriviative->_pSysStream[0][ii],
-                        gpu->psPosq4->_pSysStream[0][ii].x, gpu->psPosq4->_pSysStream[0][ii].y,
-                        gpu->psPosq4->_pSysStream[0][ii].z, gpu->psPosq4->_pSysStream[0][ii].w );
-        if( (ii == maxPrint) && ( ii < (gpu->natoms - maxPrint)) ){
-            ii = gpu->natoms - maxPrint;
-        }
+                        gpu->psBornRadii->_pSysData[ii],
+                        gpu->psBornForce->_pSysData[ii],
+                        switchDeriviative->_pSysData[ii],
+
+                        gpu->psGBVIData->_pSysData[ii].x,
+                        gpu->psGBVIData->_pSysData[ii].y,
+                        gpu->psGBVIData->_pSysData[ii].z,
+                        gpu->psGBVIData->_pSysData[ii].w,
+
+                        sigEps4->_pSysData[ii].x,
+                        sigEps4->_pSysData[ii].y,
+                        sigEps4->_pSysData[ii].z,
+                        sigEps4->_pSysData[ii].w );
+
    }
+
 }

-void kCalculateGBVISoftcoreBornSum(gpuContext gpu)
+extern __global__ void kFindBlockBoundsCutoff_kernel();
+extern __global__ void kFindBlockBoundsPeriodic_kernel();
+
+extern __global__ void kFindBlocksWithInteractionsCutoff_kernel();
+extern __global__ void kFindBlocksWithInteractionsPeriodic_kernel();
+
+extern __global__ void kFindInteractionsWithinBlocksCutoff_kernel(unsigned int*);
+extern __global__ void kFindInteractionsWithinBlocksPeriodic_kernel(unsigned int*);
+
+void kCalculateGBVISoftcoreBornSum( freeEnergyGpuContext freeEnergyGpu )
 {
-    //printf("kCalculateGBVIBornSum\n");
+
+    gpuContext gpu = freeEnergyGpu->gpuContext;
+
+#ifdef DEBUG
+fprintf( stderr, "kCalculateCDLJObcGbsaSoftcoreForces1 cutoff=%15.7e\n", gpu->sim.nonbondedCutoffSqr );
+int psize = gpu->sim.paddedNumberOfAtoms;
+CUDAStream<float4>* pdE1 = new CUDAStream<float4>( psize, 1, "pdE");
+CUDAStream<float4>* pdE2 = new CUDAStream<float4>( psize, 1, "pdE");
+float bF; 
+float bF1; 
+showWorkUnitsFreeEnergy( freeEnergyGpu, 1 );
+
+for( int ii = 0; ii < psize; ii++ ){
+
+pdE1->_pSysData[ii].x = 0.0f;
+pdE1->_pSysData[ii].y = 0.001f;
+pdE1->_pSysData[ii].z = 0.001f;
+pdE1->_pSysData[ii].w = 0.001f;
+
+pdE2->_pSysData[ii].x = 0.001f;
+pdE2->_pSysData[ii].y = 0.001f;
+pdE2->_pSysData[ii].z = 0.001f;
+pdE2->_pSysData[ii].w = 0.001f;
+}
+pdE1->Upload();
+pdE2->Upload();
+
+#endif
+
    kClearGBVISoftcoreBornSum( gpu );
    LAUNCHERROR("kClearGBVIBornSum from kCalculateGBVISoftcoreBornSum");

-    //size_t numWithInteractions;
-    switch (gpu->sim.nonbondedMethod)
+    switch (freeEnergyGpu->freeEnergySim.nonbondedMethod)
    {   
-        case NO_CUTOFF:
+        case FREE_ENERGY_NO_CUTOFF:

+#ifdef DEBUG
+            if (gpu->bOutputBufferPerWarp){
+                kCalculateGBVISoftcoreN2ByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit, pdE1->_pDevData, pdE2->_pDevData);
+            } else {
+                kCalculateGBVISoftcoreN2BornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit, pdE1->_pDevData, pdE2->_pDevData);
+            }
+#else
            if (gpu->bOutputBufferPerWarp){
                kCalculateGBVISoftcoreN2ByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit);
@@ -551,25 +525,93 @@ void kCalculateGBVISoftcoreBornSum(gpuContext gpu)
                kCalculateGBVISoftcoreN2BornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit);
            }
+#endif
+
            break;
-#if 0
-        case CUTOFF:
+
+        case FREE_ENERGY_CUTOFF:
+
+            kFindBlockBoundsCutoff_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
+            LAUNCHERROR("kFindBlockBoundsCutoff");
+            kFindBlocksWithInteractionsCutoff_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
+            LAUNCHERROR("kFindBlocksWithInteractionsCutoff");
+            compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
+            kFindInteractionsWithinBlocksCutoff_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                    sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
+
+#ifdef DEBUG
+            (void) fprintf( stderr, "kCalculateGBVISoftcoreBornSum cutoff=%15.7e warp=%u GridBoundingBox.length=%u interaction_blocks=%u interaction_threads_per_block=%u nonbond_blocks=%u nonbond_threads_per_block=%u\n",
+                            gpu->sim.nonbondedCutoffSqr, gpu->bOutputBufferPerWarp, gpu->psGridBoundingBox->_length, gpu->sim.interaction_blocks,
+                            gpu->sim.interaction_threads_per_block, gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block ); fflush( stderr );
+
+
            if (gpu->bOutputBufferPerWarp)
-                kCalculateGBVICutoffByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                kCalculateGBVISoftcoreCutoffByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, pdE1->_pDevData, pdE2->_pDevData);
+            else
+                kCalculateGBVISoftcoreCutoffBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, pdE1->_pDevData, pdE2->_pDevData );
+
+#else
+            if (gpu->bOutputBufferPerWarp)
+                kCalculateGBVISoftcoreCutoffByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
                        (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
            else
-                kCalculateGBVICutoffBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                kCalculateGBVISoftcoreCutoffBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
                        (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit );
            break;
-        case PERIODIC:
+#endif
+
+        case FREE_ENERGY_PERIODIC:
+
+            kFindBlockBoundsPeriodic_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
+            LAUNCHERROR("kFindBlockBoundsPeriodic");
+            kFindBlocksWithInteractionsPeriodic_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
+            LAUNCHERROR("kFindBlocksWithInteractionsPeriodic");
+            compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
+            kFindInteractionsWithinBlocksPeriodic_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                    sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
+
+#ifdef DEBUG
            if (gpu->bOutputBufferPerWarp)
-                kCalculateGBVIPeriodicByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                kCalculateGBVISoftcorePeriodicByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, pdE1->_pDevData, pdE2->_pDevData  );
+            else
+                kCalculateGBVISoftcorePeriodicBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, pdE1->_pDevData, pdE2->_pDevData  );
+#else
+            if (gpu->bOutputBufferPerWarp)
+                kCalculateGBVISoftcorePeriodicByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
                        (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit );
            else
-                kCalculateGBVIPeriodicBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                kCalculateGBVISoftcorePeriodicBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
                        (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit );
-            break;
 #endif
+            break;
+
+        default:
+            throw OpenMM::OpenMMException( "Nonbonded softcore method not recognized." );
+
    }
    LAUNCHERROR("kCalculateGBVISoftcoreBornSum");
+
+#ifdef DEBUG
+pdE1->Download();
+pdE2->Download();
+fprintf( stderr, "bSum Cud method=%u warp=%u\n", freeEnergyGpu->freeEnergySim.nonbondedMethod, gpu->bOutputBufferPerWarp );
+bF  = 0.0;
+bF1 = 0.0;
+for( int ii = 0; ii < gpu->natoms; ii++ ){
+    if( fabsf( pdE1->_pSysData[ii].w ) > 0.002 ){
+        bF1 += pdE1->_pSysData[ii].x;
+        if( fabsf( pdE1->_pSysData[ii].x ) > 0.001 ){
+            fprintf( stderr, "%4d %15.7e %15.7e %15.7e %15.7e    %15.7e %15.7e %15.7e %15.7e\n", ii,
+                     pdE1->_pSysData[ii].x, pdE1->_pSysData[ii].y, pdE1->_pSysData[ii].z, pdE1->_pSysData[ii].w,
+                     pdE2->_pSysData[ii].x, pdE2->_pSysData[ii].y, pdE2->_pSysData[ii].z, pdE2->_pSysData[ii].w );
+        }
+    }
+    bF += pdE1->_pSysData[ii].x;
+}
+fprintf( stderr, "bSum Cud %6d %15.7e %15.7e\n", TARGET, bF, bF1 );
+#endif
 }
--- a/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateGBVISoftcoreBornSum.h
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateGBVISoftcoreBornSum.h
@@ -37,7 +37,22 @@

 #include "kCalculateGBVIAux.h"

-__global__ void METHOD_NAME(kCalculateGBVISoftcore, BornSum_kernel)(unsigned int* workUnit)
+#undef TARGET
+//#define TARGET 5443
+
+__global__ 
+#if (__CUDA_ARCH__ >= 200)
+__launch_bounds__(GF1XX_NONBOND_THREADS_PER_BLOCK, 1)
+#elif (__CUDA_ARCH__ >= 120)
+__launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1)
+#else
+__launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
+#endif
+#ifdef DEBUG
+void METHOD_NAME(kCalculateGBVISoftcore, BornSum_kernel)(unsigned int* workUnit, float4* pdE1, float4* pdE2 )
+#else
+void METHOD_NAME(kCalculateGBVISoftcore, BornSum_kernel)(unsigned int* workUnit)
+#endif
 {
    extern __shared__ Atom sA[];

@@ -47,8 +62,6 @@ __global__ void METHOD_NAME(kCalculateGBVISoftcore, BornSum_kernel)(unsigned int
    unsigned int pos          = warp*numWorkUnits/totalWarps;
    unsigned int end          = (warp+1)*numWorkUnits/totalWarps;

-//    int end = workUnits / gridDim.x;
-//    int pos = end - (threadIdx.x >> GRIDBITS) - 1;
 #ifdef USE_CUTOFF
    float* tempBuffer         = (float*) &sA[cSim.nonbond_threads_per_block];
 #endif
@@ -85,7 +98,7 @@ __global__ void METHOD_NAME(kCalculateGBVISoftcore, BornSum_kernel)(unsigned int
            sA[threadIdx.x].r                       = ar.x;
            sA[threadIdx.x].sr                      = ar.y;
            sA[threadIdx.x].bornRadiusScaleFactor   = ar.w;
-            apos.w                                  = 0.0f;
+            float bSum                              = 0.0f;

            for (unsigned int j = 0; j < GRID; j++)
            {
@@ -98,55 +111,80 @@ __global__ void METHOD_NAME(kCalculateGBVISoftcore, BornSum_kernel)(unsigned int
                dz                                 -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
 #endif
                r2                                  = dx * dx + dy * dy + dz * dz;
-#if defined USE_PERIODIC
-                if (i < cSim.atoms && x+j < cSim.atoms && r2 < cSim.nonbondedCutoffSqr)
-#elif defined USE_CUTOFF
-                if (r2 < cSim.nonbondedCutoffSqr)
+#if defined USE_CUTOFF
+                if (i < cSim.atoms && x+j < cSim.atoms && r2 < cSim.nonbondedCutoffSqr && j != tgx)
+#else
+                if (i < cSim.atoms && x+j < cSim.atoms && j != tgx )
 #endif
                {
-                    r                       = sqrt(r2);
-                    if ((j != tgx) )
-                    {
-                        apos.w             += psA[j].bornRadiusScaleFactor*getGBVI_Volume( r, ar.x, psA[j].sr );
-                    }
+                    bSum  += psA[j].bornRadiusScaleFactor*getGBVI_Volume( sqrt(r2), ar.x, psA[j].sr );
+
+#ifdef DEBUG
+int jIdx = j;
+if( i == TARGET ){
+int tjj     = y+jIdx;
+pdE1[tjj].x = psA[jIdx].bornRadiusScaleFactor*getGBVI_Volume( sqrt(r2), ar.x, psA[jIdx].sr );
+pdE1[tjj].y = psA[jIdx].bornRadiusScaleFactor;
+pdE1[tjj].z = ar.x;
+pdE1[tjj].w = 1.0f;
+pdE2[tjj].x = sqrt(r2);
+pdE2[tjj].y = psA[jIdx].sr;
+pdE2[tjj].z = ar.x;
+pdE2[tjj].w = 1.0f;
+}
+if( (y+jIdx) == TARGET ){
+int tjj     = i;
+pdE1[tjj].x =  psA[jIdx].bornRadiusScaleFactor*getGBVI_Volume( sqrt(r2), ar.x, psA[jIdx].sr );
+pdE1[tjj].y =  psA[jIdx].bornRadiusScaleFactor;
+pdE1[tjj].z = ar.x;
+pdE1[tjj].w = -1.0f;
+} 
+#endif
+
                }
            }

            // Write results
 #ifdef USE_OUTPUT_BUFFER_PER_WARP
            unsigned int offset = x + tgx + warp*cSim.stride;
-            cSim.pBornSum[offset] += apos.w;
+            cSim.pBornSum[offset] += bSum;
 #else
            unsigned int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
-            cSim.pBornSum[offset] = apos.w;
+            cSim.pBornSum[offset] = bSum;
 #endif
-        }
-        else        // 100% utilization
-        {
+
+
+        } else {
+
            // Read fixed atom data into registers and GRF
            unsigned int j                              = y + tgx;
            unsigned int i                              = x + tgx;

            float4 temp                                 = cSim.pPosq[j];
            float4 temp1                                = cSim.pGBVIData[j];
+
            float4 apos                                 = cSim.pPosq[i];        // Local atom x, y, z, sum
            float4 ar                                   = cSim.pGBVIData[i];    // Local atom vr, sr
+
            sA[threadIdx.x].x                           = temp.x;
            sA[threadIdx.x].y                           = temp.y;
            sA[threadIdx.x].z                           = temp.z;
+
            sA[threadIdx.x].r                           = temp1.x;
            sA[threadIdx.x].sr                          = temp1.y;
            sA[threadIdx.x].bornRadiusScaleFactor       = temp1.w;
-            sA[threadIdx.x].sum             = apos.w    = 0.0f;
+
+            sA[threadIdx.x].sum                         = 0.0f;
+            apos.w                                      = 0.0f;

 #ifdef USE_CUTOFF
-            //unsigned int flags = cSim.pInteractionFlag[pos + (blockIdx.x*workUnits)/gridDim.x];
            unsigned int flags = cSim.pInteractionFlag[pos];
            if (flags == 0)
            {
                // No interactions in this block.
            }
            else if (flags == 0xFFFFFFFF)
+            //else if (flags )
 #endif
            {
                // Compute all interactions within this block.
@@ -162,10 +200,10 @@ __global__ void METHOD_NAME(kCalculateGBVISoftcore, BornSum_kernel)(unsigned int
                    dz                     -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
 #endif
                    r2                      = dx * dx + dy * dy + dz * dz;
-#ifdef USE_PERIODIC
+#ifdef USE_CUTOFF
                    if (i < cSim.atoms && y+tj < cSim.atoms && r2 < cSim.nonbondedCutoffSqr)
-#elif defined USE_CUTOFF
-                    if (r2 < cSim.nonbondedCutoffSqr)
+#else
+                    if (i < cSim.atoms && y+tj < cSim.atoms )
 #endif
                    {
                        r                       = sqrt(r2);
@@ -175,9 +213,37 @@ __global__ void METHOD_NAME(kCalculateGBVISoftcore, BornSum_kernel)(unsigned int

                        apos.w                 += psA[tj].bornRadiusScaleFactor*getGBVI_Volume( r, ar.x, psA[tj].sr );
                        psA[tj].sum            += ar.w*getGBVI_Volume( r, psA[tj].r, ar.y );
+
+#ifdef DEBUG
+int jIdx = tj;
+if( i == TARGET ){
+
+int tjj     = y+jIdx;
+pdE1[tjj].x = psA[jIdx].bornRadiusScaleFactor*getGBVI_Volume( r, ar.x, psA[jIdx].sr );
+pdE1[tjj].y = psA[jIdx].bornRadiusScaleFactor;
+pdE1[tjj].z = ar.x;
+pdE1[tjj].w = 2.0f;
+
+float R =  ar.x;
+float S =  psA[tj].sr;
+pdE2[tjj].x = getGBVI_L( r, (r + S), S );
+pdE2[tjj].y = -getGBVI_L( r, (r - S), S );
+pdE2[tjj].z = -getGBVI_L( r, R, S );
+pdE2[tjj].w = (1.0f/(R*R*R));
+
+}
+if( (y+jIdx) == TARGET ){
+int tjj     = i;
+pdE1[tjj].x = ar.w*getGBVI_Volume( r, psA[jIdx].r, ar.y );
+pdE1[tjj].y = ar.w;
+pdE1[tjj].z = psA[jIdx].r;
+pdE1[tjj].w = -2.0f;
+}
+#endif
                    }
                    tj = (tj - 1) & (GRID - 1);
                }
+
            }
 #ifdef USE_CUTOFF
            else
@@ -198,14 +264,15 @@ __global__ void METHOD_NAME(kCalculateGBVISoftcore, BornSum_kernel)(unsigned int
                        dz                     -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
 #endif
                        r2                      = dx * dx + dy * dy + dz * dz;
-#ifdef USE_PERIODIC
+#ifdef USE_CUTOFF
                        if (i < cSim.atoms && y+j < cSim.atoms && r2 < cSim.nonbondedCutoffSqr)
-#elif defined USE_CUTOFF
-                        if (r2 < cSim.nonbondedCutoffSqr)
+#else
+                        if (i < cSim.atoms && y+j < cSim.atoms)
 #endif
                        {
                            r                       = sqrt(r2);
-                            tempBuffer[threadIdx.x] = ar.w*getGBVI_Volume( r, psA[tj].r, ar.y );
+                            tempBuffer[threadIdx.x] = ar.w*getGBVI_Volume( r, psA[j].r, ar.y );
+                            apos.w                 += psA[j].bornRadiusScaleFactor*getGBVI_Volume( r, ar.x, psA[j].sr );
                        }

                        // Sum the terms.
@@ -226,6 +293,7 @@ __global__ void METHOD_NAME(kCalculateGBVISoftcore, BornSum_kernel)(unsigned int
 #endif

            // Write results
+
 #ifdef USE_OUTPUT_BUFFER_PER_WARP
            unsigned int offset = x + tgx + warp*cSim.stride;
            cSim.pBornSum[offset] += apos.w;
@@ -237,6 +305,7 @@ __global__ void METHOD_NAME(kCalculateGBVISoftcore, BornSum_kernel)(unsigned int
            offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
            cSim.pBornSum[offset] = sA[threadIdx.x].sum;
 #endif
+
        }

        pos++;

--- a/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateGBVISoftcoreForces2.cu
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateGBVISoftcoreForces2.cu
@@ -57,22 +57,14 @@ struct Atom {

 static __constant__ cudaGmxSimulation cSim;

-void SetCalculateGBVISoftcoreForces2Sim(gpuContext gpu)
+void SetCalculateGBVISoftcoreForces2Sim( freeEnergyGpuContext gpu)
 {
    cudaError_t status;
-    status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));     
-    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
-    //(void) fprintf( stderr, "SetCalculateGBVISoftcoreForces2Sim called.\n" );
+    status = cudaMemcpyToSymbol(cSim, &gpu->gpuContext->sim, sizeof(cudaGmxSimulation));     
+    RTERROR(status, "cudaMemcpyToSymbol: SetCalculateGBVISoftcoreForces2Sim copy to cSim failed");
 }

-void GetCalculateGBVISoftcoreForces2Sim(gpuContext gpu)
-{
-    cudaError_t status;
-    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));     
-    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
-}
-
-#include "kCalculateGBVISoftcoreAux.h"
+#include "kCalculateGBVIAux.h"

 /**
 * This file contains the kernel for evaluating the second stage of GBSA.  It is included
@@ -134,6 +126,8 @@ __global__ void kCalculateGBVISoftcoreForces2a_kernel()

 }

+#define TARGET 0
+
 // Include versions of the kernels for N^2 calculations.

 #define METHOD_NAME(a, b) a##N2##b
@@ -167,78 +161,83 @@ __global__ void kCalculateGBVISoftcoreForces2a_kernel()
 #define METHOD_NAME(a, b) a##PeriodicByWarp##b
 #include "kCalculateGBVISoftcoreForces2.h"

-void kCalculateGBVISoftcoreForces2(gpuContext gpu)
+void kCalculateGBVISoftcoreForces2( freeEnergyGpuContext freeEnergyGpu )
 {
-    //printf("kCalculateGBVISoftcoreForces2\n");
-    size_t numWithInteractions;
-
-#if 0
-    kClearForces(gpu);
-    (void) fprintf( stderr, "\nkCalculateGBVISoftcoreForces2: cleared force prior loop2\n" ); (void) fflush( stderr );
-    kCalculateGBVISoftcoreForces2a_kernel<<<gpu->sim.blocks, 384>>>();
-    (void) fprintf( stderr, "\ncalled kCalculateGBVISoftcoreForces2a\n" ); (void) fflush( stderr );
-    return;
-#endif
+    gpuContext gpu = freeEnergyGpu->gpuContext;

-    switch (gpu->sim.nonbondedMethod)
+    /*fprintf( stderr,"kCalculateGBVISoftcoreForces2 nonbondedMethod=%d bornForce2_blocks=%d bornForce2_threads_per_block=%d\n",
+             freeEnergyGpu->freeEnergySim.nonbondedMethod,
+             gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block, gpu->psInteractionCount->_pSysData[0] ); fflush( stderr );
+    */
+
+    switch (freeEnergyGpu->freeEnergySim.nonbondedMethod)
    {
-        case NO_CUTOFF:
+        case FREE_ENERGY_NO_CUTOFF:
+
+#ifdef DEBUG
+int psize = 64;
+CUDAStream<float4>* pdE1 = new CUDAStream<float4>( psize, 1, "pdE");
+CUDAStream<float4>* pdE2 = new CUDAStream<float4>( psize, 1, "pdE");
+for( int ii = 0; ii < 32; ii++ ){
+
+pdE1->_pSysData[ii].x = 0.0f;
+pdE1->_pSysData[ii].y = 0.0f;
+pdE1->_pSysData[ii].z = 0.0f;
+pdE1->_pSysData[ii].w = 0.0f;
+
+pdE2->_pSysData[ii].x = 0.0f;
+pdE2->_pSysData[ii].y = 0.0f;
+pdE2->_pSysData[ii].z = 0.0f;
+pdE2->_pSysData[ii].w = 0.0f;
+}
+pdE1->Upload();
+pdE2->Upload();
+
            if (gpu->bOutputBufferPerWarp)
                kCalculateGBVISoftcoreN2ByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
-                        sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits);
+                        sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits, pdE1->_pDevData, pdE2->_pDevData);
            else
                kCalculateGBVISoftcoreN2Forces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
-                        sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits);
-//(void) fprintf( stderr, "\nkCalculateGBVIForces2: Born radii/force forces warp=%u\n", gpu->bOutputBufferPerWarp ); (void) fflush( stderr );
-#define GBVI_DEBUG 0
-#if ( GBVI_DEBUG == 1 )
-                (void) fprintf( stderr, "\nkCalculateGBVISoftcoreForces2: Born radii/force forces:\n" ); (void) fflush( stderr );
-                gpu->psBornForce->Download();
-                gpu->psForce4->Download();
-                for( int ii = 0; ii < gpu->natoms; ii++ ){
-                    (void) fprintf( stderr, "%d bF=%14.6e Fa[%14.6e %14.6e %14.6e] Fb[%14.6e %14.6e %14.6e]\n",
-                                    ii,
-                                    gpu->psBornForce->_pSysStream[0][ii],
-                                    gpu->psForce4->_pSysStream[0][ii].x,
-                                    gpu->psForce4->_pSysStream[0][ii].y,
-                                    gpu->psForce4->_pSysStream[0][ii].z,
-                                    gpu->psForce4->_pSysStream[1][ii].x,
-                                    gpu->psForce4->_pSysStream[1][ii].y,
-                                    gpu->psForce4->_pSysStream[1][ii].z
-                                  );  
-                }   
-                for( int ii = 0; ii < gpu->sim.paddedNumberOfAtoms*2; ii++ ){
-                    (void) fprintf( stderr, "%d bF=%14.6e Fa[%14.6e %14.6e %14.6e %14.6e]\n",
-                                    ii,
-                                    gpu->psBornForce->_pSysStream[0][ii],
-                                    gpu->psForce4->_pSysStream[0][ii].x,
-                                    gpu->psForce4->_pSysStream[0][ii].y,
-                                    gpu->psForce4->_pSysStream[0][ii].z,
-                                    gpu->psForce4->_pSysStream[0][ii].w
-                                  );  
-                }   
+                        sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits, pdE1->_pDevData, pdE2->_pDevData);
+pdE1->Download();
+pdE2->Download();
+fprintf( stderr, "Pde\n" );
+for( int ii = 0; ii < 32; ii++ ){
+fprintf( stderr, "%4d %15.7e %15.7e %15.7e %15.7e    %15.7e %15.7e %15.7e %15.7e\n", ii, 
+         pdE1->_pSysData[ii].x, pdE1->_pSysData[ii].y, pdE1->_pSysData[ii].z, pdE1->_pSysData[ii].w,
+         pdE2->_pSysData[ii].x, pdE2->_pSysData[ii].y, pdE2->_pSysData[ii].z, pdE2->_pSysData[ii].w );
+}
+            break;
 #endif
-#undef GBVI_DEBUG

+            if (gpu->bOutputBufferPerWarp)
+                kCalculateGBVISoftcoreN2ByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
+                        sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pWorkUnit);
+            else
+                kCalculateGBVISoftcoreN2Forces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
+                        sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pWorkUnit);
            break;
-        case CUTOFF:
-            numWithInteractions = gpu->psInteractionCount->_pSysData[0];
+
+        case FREE_ENERGY_CUTOFF:
+
            if (gpu->bOutputBufferPerWarp)
                kCalculateGBVISoftcoreCutoffByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
-                        (sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
+                        (sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit );
            else
                kCalculateGBVISoftcoreCutoffForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
-                        (sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
+                        (sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit );
            break;
-        case PERIODIC:
-            numWithInteractions = gpu->psInteractionCount->_pSysData[0];
+
+        case FREE_ENERGY_PERIODIC:
+
            if (gpu->bOutputBufferPerWarp)
                kCalculateGBVISoftcorePeriodicByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
-                        (sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
+                        (sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit );
            else
                kCalculateGBVISoftcorePeriodicForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
-                        (sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
+                        (sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit );
            break;
+
    }
    LAUNCHERROR("kCalculateGBVISoftcoreForces2");
 }
--- a/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateGBVISoftcoreForces2.h
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateGBVISoftcoreForces2.h
@@ -29,19 +29,29 @@
 * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
 * -------------------------------------------------------------------------- */

-#include "kCalculateGBVISoftcoreAux.h"
+#include "kCalculateGBVIAux.h"

 /**
- * This file contains the kernel for evalauating the second stage of GBSA.  It is included
+ * This file contains the kernel for evaluating the second stage of GB/VI.  It is included
 * several times in kCalculateGBVISoftcoreForces2.cu with different #defines to generate
 * different versions of the kernels.
 */

-__global__ void METHOD_NAME(kCalculateGBVISoftcore, Forces2_kernel)(unsigned int* workUnit, unsigned int numWorkUnits)
+__global__
+#if (__CUDA_ARCH__ >= 200)
+__launch_bounds__(GF1XX_BORNFORCE2_THREADS_PER_BLOCK, 1)
+#elif (__CUDA_ARCH__ >= 120)
+__launch_bounds__(GT2XX_BORNFORCE2_THREADS_PER_BLOCK, 1)
+#else
+__launch_bounds__(G8X_BORNFORCE2_THREADS_PER_BLOCK, 1)
+#endif
+void METHOD_NAME(kCalculateGBVISoftcore, Forces2_kernel)(unsigned int* workUnit )
 {
+//METHOD_NAME(kCalculateGBVISoftcore, Forces2_kernel)(unsigned int* workUnit, float4* pdE1, float4* pdE2 )
    extern __shared__ Atom sA[];
    unsigned int totalWarps   = cSim.bornForce2_blocks*cSim.bornForce2_threads_per_block/GRID;
    unsigned int warp         = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
+    unsigned int numWorkUnits = cSim.pInteractionCount[0];
    unsigned int pos          = warp*numWorkUnits/totalWarps;
    unsigned int end          = (warp+1)*numWorkUnits/totalWarps;
 #ifdef USE_CUTOFF
@@ -63,11 +73,17 @@ __global__ void METHOD_NAME(kCalculateGBVISoftcore, Forces2_kernel)(unsigned int
        float fb                        = cSim.pBornForce[i];
        unsigned int tbx                = threadIdx.x - tgx;
        unsigned int tj                 = tgx;
+
        Atom* psA                       = &sA[tbx];
+        sA[threadIdx.x].fx              = 0.0f;
+        sA[threadIdx.x].fy              = 0.0f;
+        sA[threadIdx.x].fz              = 0.0f;
+
        float3 af;
-        sA[threadIdx.x].fx = af.x   = 0.0f;
-        sA[threadIdx.x].fy = af.y   = 0.0f;
-        sA[threadIdx.x].fz = af.z   = 0.0f;
+        af.x                            = 0.0f;
+        af.y                            = 0.0f;
+        af.z                            = 0.0f;
+
        if (x == y) // Handle diagonals uniquely at 50% efficiency
        {
            // Read fixed atom data into registers and GRF
@@ -82,9 +98,11 @@ __global__ void METHOD_NAME(kCalculateGBVISoftcore, Forces2_kernel)(unsigned int

            for (unsigned int j = (tgx+1)&(GRID-1); j != tgx; j = (j+1)&(GRID-1))
            {
+
                float dx                = psA[j].x - apos.x;
                float dy                = psA[j].y - apos.y;
                float dz                = psA[j].z - apos.z;
+
 #ifdef USE_PERIODIC
                dx                     -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
                dy                     -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
@@ -95,31 +113,43 @@ __global__ void METHOD_NAME(kCalculateGBVISoftcore, Forces2_kernel)(unsigned int

                // Atom I Born forces and sum
                float dE                = psA[j].bornRadiusScaleFactor*getGBVI_dE2( r, ar.x, psA[j].sr, fb );
-               
-#if defined USE_PERIODIC
-                if (i >= cSim.atoms || x+j >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
-                {
-                    dE              = 0.0f;
-                }
-#endif
 #if defined USE_CUTOFF
-                if (r2 > cSim.nonbondedCutoffSqr)
+                if (i >= cSim.atoms || x+j >= cSim.atoms || (i == (x+j)) || r2 > cSim.nonbondedCutoffSqr)
+#else
+                if(i >= cSim.atoms || x+j >= cSim.atoms || (i == (x+j)) )
+#endif
                {
                    dE              = 0.0f;
                }
-#endif
+
+/*
+if( i == TARGET ){
+    pdE1[x+j].x = dE;
+    pdE1[x+j].y = psA[j].bornRadiusScaleFactor;
+    pdE1[x+j].z = r;
+    pdE1[x+j].w = dE1;
+}
+if( (x+j) == TARGET ){
+    pdE2[i].x = dE;
+    pdE2[i].y = psA[j].bornRadiusScaleFactor;
+    pdE2[i].z = r;
+    pdE2[i].w = psA[j].sr-ar.x;
+}*/
                float d             = dx * dE;
                af.x               -= d;
                psA[j].fx          += d;
+
                d                   = dy * dE;
                af.y               -= d;
                psA[j].fy          += d;
+
                d                   = dz * dE;
                af.z               -= d;
                psA[j].fz          += d;
            }

            // Write results
+
            float4 of;
 #ifdef USE_OUTPUT_BUFFER_PER_WARP
            unsigned int offset         = x + tgx + warp*cSim.stride;
@@ -181,18 +211,15 @@ __global__ void METHOD_NAME(kCalculateGBVISoftcore, Forces2_kernel)(unsigned int

                    float dE                = psA[tj].bornRadiusScaleFactor*getGBVI_dE2( r, ar.x, psA[tj].sr, fb );

-#if defined USE_PERIODIC
+#if defined USE_CUTOFF
                    if (i >= cSim.atoms || y+tj >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
-                    {
-                        dE                  = 0.0f;
-                    }
+#else
+                    if (i >= cSim.atoms || y+tj >= cSim.atoms )
 #endif
-#if defined USE_CUTOFF
-                    if (r2 > cSim.nonbondedCutoffSqr)
                    {
                        dE                  = 0.0f;
                    }
-#endif
+

                    float d                 = dx * dE;
                    af.x                   -= d;
@@ -207,18 +234,15 @@ __global__ void METHOD_NAME(kCalculateGBVISoftcore, Forces2_kernel)(unsigned int
                    // Atom J Born sum term
                    dE                      = ar.w*getGBVI_dE2( r, psA[tj].r, ar.y, psA[tj].fb );

-#ifdef USE_PERIODIC
+#if defined USE_CUTOFF
                    if (i >= cSim.atoms || y+tj >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
-                    {
-                        dE                  = 0.0f;
-                    }
+#else
+                    if (i >= cSim.atoms || y+tj >= cSim.atoms )
 #endif
-#if defined USE_CUTOFF
-                    if (r2 > cSim.nonbondedCutoffSqr)
                    {
                        dE                  = 0.0f;
                    }
-#endif
+
                    dx                     *= dE;
                    dy                     *= dE;
                    dz                     *= dE;
@@ -254,18 +278,14 @@ __global__ void METHOD_NAME(kCalculateGBVISoftcore, Forces2_kernel)(unsigned int
                        // Interleaved Atom I and J Born Forces and sum components
                        float dE                = psA[j].bornRadiusScaleFactor*getGBVI_dE2( r, ar.x, psA[j].sr, fb );

-#if defined USE_PERIODIC
+#if defined USE_CUTOFF
                        if (i >= cSim.atoms || y+j >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
-                        {
-                            dE                  = 0.0f;
-                        }
+#else
+                        if (i >= cSim.atoms || y+j >= cSim.atoms )
 #endif
-#if defined USE_CUTOFF
-                        if (r2 > cSim.nonbondedCutoffSqr)
                        {
                            dE                  = 0.0f;
                        }
-#endif

                        float d                 = dx * dE;
                        af.x                   -= d;
@@ -280,18 +300,15 @@ __global__ void METHOD_NAME(kCalculateGBVISoftcore, Forces2_kernel)(unsigned int
                        // Atom J Born sum term
                        dE                      = ar.w*getGBVI_dE2( r, psA[j].r, ar.y, psA[j].fb );

-#ifdef USE_PERIODIC
+#if defined USE_CUTOFF
                        if (i >= cSim.atoms || y+j >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
-                        {
-                            dE                  = 0.0f;
-                        }
+#else
+                        if (i >= cSim.atoms || y+j >= cSim.atoms )
 #endif
-#if defined USE_CUTOFF
-                        if (r2 > cSim.nonbondedCutoffSqr)
                        {
                            dE                  = 0.0f;
                        }
-#endif
+
                        dx                     *= dE;
                        dy                     *= dE;
                        dz                     *= dE;

--- a/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateLocalSoftcoreForces.cu
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateLocalSoftcoreForces.cu
@@ -24,454 +24,123 @@
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
 * -------------------------------------------------------------------------- */

-#include "GpuLJ14Softcore.h"
 #include "GpuFreeEnergyCudaKernels.h"
-//#include <cuda.h>
+#include "freeEnergyGpuTypes.h"
+#include <cudatypes.h>
+#include "kSoftcoreLJ.h"
+
+#define PARAMETER_PRINT 1 // only tested with #ifdef further down, so the value itself is not read; enables the per-pair dump in gpuSetLJ14SoftcoreParameters
+#define MAX_PARAMETER_PRINT 10 // copied into maxPrint, which limits how many leading/trailing pairs that dump prints

 static __constant__ cudaGmxSimulation cSim; // filled from gpu->gpuContext->sim by SetCalculateLocalSoftcoreGpuSim
-static __constant__ cudaFreeEnergySimulationNonbonded14 feSim;
+static __constant__ cudaFreeEnergyGmxSimulation feSim; // filled from gpu->freeEnergySim by SetCalculateLocalSoftcoreGpuSim

-/* Cuda compiler on Windows does not recognized "static const float" values */
-#define LOCAL_HACK_PI 3.1415926535897932384626433832795
+void SetCalculateLocalSoftcoreGpuSim( freeEnergyGpuContext gpu) // copies the two host-side simulation structs held by gpu into this file's __constant__ symbols cSim and feSim
+{
+    cudaError_t status; // return code of each copy; handed to RTERROR right after the call
+    status = cudaMemcpyToSymbol(cSim, &gpu->gpuContext->sim, sizeof(cudaGmxSimulation));     // cSim <- gpu->gpuContext->sim
+    RTERROR(status, "cudaMemcpyToSymbol: SetCalculateLocalSoftcoreGpuSim copy to cSim failed"); // NOTE(review): RTERROR is defined elsewhere; presumably reports and aborts on a non-success status -- confirm

-#define DOT3(v1, v2) (v1.x * v2.x + v1.y * v2.y + v1.z * v2.z)
+    status = cudaMemcpyToSymbol(feSim, &gpu->freeEnergySim, sizeof(cudaFreeEnergyGmxSimulation));     // feSim <- gpu->freeEnergySim
+    RTERROR(status, "cudaMemcpyToSymbol: SetCalculateLocalSoftcoreGpuSim copy to feSim failed");

-#define GETNORMEDDOTPRODUCT(v1, v2, dp) \
-{ \
-    dp          = DOT3(v1, v2); \
-    float norm1 = DOT3(v1, v1); \
-    float norm2 = DOT3(v2, v2); \
-    dp /= sqrt(norm1 * norm2); \
-    dp = min(dp, 1.0f); \
-    dp = max(dp, -1.0f); \
 }

-#define CROSS_PRODUCT(v1, v2, c) \
-    c.x = v1.y * v2.z - v1.z * v2.y; \
-    c.y = v1.z * v2.x - v1.x * v2.z; \
-    c.z = v1.x * v2.y - v1.y * v2.x;
+/**
+ * Build the softcore LJ 1-4 pair tables, record them in gpu->freeEnergySim and upload them.
+ *
+ * For pair ii the ID entry holds (atom1, atom2, slot1, slot2), where each slot is a running
+ * per-atom count of the pairs seen so far in this call; the kernel in this file multiplies
+ * the slot by cSim.stride when forming its offsets into cSim.pForce4.
+ * The parameter entry holds (c6*c6/c12, (c12/c6)^(1/6), epsfac*qProd, softcoreLJLambda);
+ * a pair whose c6 or c12 is zero gets (0, 1) for the first two components instead.
+ *
+ * @param gpu                    free-energy context that receives the streams, pair count and device pointers
+ * @param epsfac                 factor multiplied into every charge product
+ * @param atom1                  first atom index of each 1-4 pair; its size defines the pair count
+ * @param atom2                  second atom index of each 1-4 pair
+ * @param c6                     LJ c6 coefficient of each pair
+ * @param c12                    LJ c12 coefficient of each pair
+ * @param qProd                  charge product of each pair
+ * @param softcoreLJLambdaArray  softcore lambda of each pair
+ *
+ * Every vector is indexed up to atom1.size() and every atom index is used to index a table of
+ * gpu->gpuContext->sim.atoms counters; neither is validated here.
+ * NOTE(review): streams already stored in gpu->psLJ14ID / gpu->psLJ14Parameter are overwritten
+ * without being released -- confirm this is only called once per context.
+ */
+extern "C"
+void gpuSetLJ14SoftcoreParameters( freeEnergyGpuContext gpu, float epsfac, const std::vector<int>& atom1, const std::vector<int>& atom2,
+                                   const std::vector<float>& c6, const std::vector<float>& c12, const std::vector<float>& qProd,
+                                   const std::vector<float>& softcoreLJLambdaArray ){

-#define GETPREFACTORSGIVENANGLECOSINE(cosine, param, dEdR) \
-{ \
-   float angle          = acos(cosine); \
-   float deltaIdeal     = angle - (param.x * (LOCAL_HACK_PI / 180.0f)); \
-   dEdR                 = param.y * deltaIdeal; \
-}
+    // explicit narrowing: the count is stored and compared as unsigned int everywhere below
+    unsigned int LJ14s                                  = static_cast<unsigned int>( atom1.size() );
+    gpu->freeEnergySim.LJ14_count                       = LJ14s;

-#define GETENERGYGIVENANGLECOSINE(cosine, param, dEdR) \
-{ \
-   float angle          = acos(cosine); \
-   float deltaIdeal     = angle - (param.x * (LOCAL_HACK_PI / 180.0f)); \
-   dEdR                 = param.y * deltaIdeal * deltaIdeal; \
-}
+    gpu->psLJ14ID                                       = new CUDAStream<int4>(LJ14s, 1, "LJ14SoftcoreID");
+    CUDAStream<int4>* psLJ14ID                          = gpu->psLJ14ID;
+    gpu->freeEnergySim.pLJ14ID                          = psLJ14ID->_pDevData;

-#define GETANGLEBETWEENTWOVECTORS(v1, v2, angle) \
-{ \
-    float dp; \
-    GETNORMEDDOTPRODUCT(v1, v2, dp); \
-    angle = acos(dp); \
-}
+    gpu->psLJ14Parameter                                = new CUDAStream<float4>(LJ14s, 1, "LJ14SoftcoreParameter");
+    CUDAStream<float4>* psLJ14Parameter                 = gpu->psLJ14Parameter;
+    gpu->freeEnergySim.pLJ14Parameter                   = psLJ14Parameter->_pDevData;

-#define GETANGLECOSINEBETWEENTWOVECTORS(v1, v2, angle, cosine) \
-{ \
-    GETNORMEDDOTPRODUCT(v1, v2, cosine); \
-    angle = acos(cosine); \
-}
+    // per-atom running count of pairs handled so far; starts at zero for every atom
+    std::vector<int> outputBufferCounter( gpu->gpuContext->sim.atoms, 0 );

-#define GETDIHEDRALANGLEBETWEENTHREEVECTORS(vector1, vector2, vector3, signVector, cp0, cp1, angle) \
-{ \
-    CROSS_PRODUCT(vector1, vector2, cp0); \
-    CROSS_PRODUCT(vector2, vector3, cp1); \
-    GETANGLEBETWEENTWOVECTORS(cp0, cp1, angle); \
-    float dp = DOT3(signVector, cp1); \
-    angle = (dp >= 0) ? angle : -angle; \
-}                                                          
+    for( unsigned int ii = 0; ii < LJ14s; ii++ ){

-#define GETDIHEDRALANGLECOSINEBETWEENTHREEVECTORS(vector1, vector2, vector3, signVector, cp0, cp1, angle, cosine) \
-{ \
-    CROSS_PRODUCT(vector1, vector2, cp0); \
-    CROSS_PRODUCT(vector2, vector3, cp1); \
-    GETANGLECOSINEBETWEENTWOVECTORS(cp0, cp1, angle, cosine); \
-    float dp = DOT3(signVector, cp1); \
-    angle = (dp >= 0) ? angle : -angle; \
-}
+        (*psLJ14ID)[ii].x          = atom1[ii];
+        (*psLJ14ID)[ii].y          = atom2[ii];
+        (*psLJ14ID)[ii].z          = outputBufferCounter[atom1[ii]]++;
+        (*psLJ14ID)[ii].w          = outputBufferCounter[atom2[ii]]++;

-void SetCalculateLocalSoftcoreGpuSim(gpuContext gpu)
-{
-    cudaError_t status;
-    status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));     
-    RTERROR(status, "cudaMemcpyToSymbol: SetCalculateLocalSoftcoreGpuSim copy to cSim failed");
+        float p0, p1, p2, p3;
+        // A zero c6 with non-zero c12 would make c12/c6 infinite; the kernel would then
+        // multiply a zero prefactor by an infinite power and produce NaN. Treat either
+        // coefficient being zero as "no LJ term" (prefactor 0, finite placeholder length 1).
+        if( c12[ii] == 0.0f || c6[ii] == 0.0f ){
+            p0 = 0.0f;
+            p1 = 1.0f;

-}
+        } else {
+            p0 = c6[ii] * c6[ii] / c12[ii];
+            p1 = pow(c12[ii] / c6[ii], 1.0f / 6.0f);
+        }

-void SetCalculateLocalSoftcoreSim( GpuLJ14Softcore* gpuLJ14Softcore)
-{
-    cudaError_t status;
+        p2                       = epsfac*qProd[ii];
+        p3                       = softcoreLJLambdaArray[ii];

-    status = cudaMemcpyToSymbol(feSim, &gpuLJ14Softcore->feSim, sizeof(cudaFreeEnergySimulationNonbonded14));     
-    RTERROR(status, "cudaMemcpyToSymbol: SetCalculateLocalSoftcoreSim copy to cSim failed");
-}
+        (*psLJ14Parameter)[ii].x = p0;
+        (*psLJ14Parameter)[ii].y = p1;
+        (*psLJ14Parameter)[ii].z = p2;
+        (*psLJ14Parameter)[ii].w = p3;
+    }

-void GetCalculateLocalSoftcoreForcesSim(gpuContext gpu)
-{
-    cudaError_t status;
-    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));     
-    RTERROR(status, "cudaMemcpyFromSymbol: GetCalculateLocalSoftcoreForcesSim copy from cSim failed");
+    // logging info
+
+    if( gpu->log ){
+        (void) fprintf( gpu->log, "gpuSetLJ14SoftcoreParameters: number of 1-4 bonds=%5u\n", LJ14s );
+#ifdef PARAMETER_PRINT
+        unsigned int maxPrint = MAX_PARAMETER_PRINT;
+        for( unsigned int ii = 0; ii < LJ14s; ii++ ){
+            // the charge product is printed from the input rather than as z/epsfac,
+            // which would be inf/NaN when epsfac is zero
+            (void) fprintf( gpu->log, "    %5u [%5d %5d %5d %5d] %15.7e %15.7e %15.7e %15.7e\n",
+                            ii, (*psLJ14ID)[ii].x, (*psLJ14ID)[ii].y, (*psLJ14ID)[ii].z, (*psLJ14ID)[ii].w,
+                            (*psLJ14Parameter)[ii].x, (*psLJ14Parameter)[ii].y,
+                            qProd[ii], (*psLJ14Parameter)[ii].w );
+            // after the leading entries, jump ahead so only the trailing entries follow;
+            // reaching this point implies LJ14s > maxPrint, so the subtraction cannot wrap
+            if( ii == maxPrint ){
+                (void) fprintf( gpu->log, "\n" );
+                ii = LJ14s - maxPrint;
+                if( ii < maxPrint )ii = maxPrint;
+            }
+        }
+        (void) fprintf( gpu->log, "\n" );
+#endif
+        (void) fflush( gpu->log );
+    }
+
+    // copy the host-side tables filled above to the device
+    psLJ14ID->Upload();
+    psLJ14Parameter->Upload();
+
+    return;
 }

-#define USE_SOFTCORE_LJ
-#ifdef USE_SOFTCORE_LJ
-#include "kSoftcoreLJ.h"
-#endif
+#define DOT3(v1, v2) (v1.x * v2.x + v1.y * v2.y + v1.z * v2.z) // dot product over the .x/.y/.z members; arguments are pasted unparenthesized and each is expanded three times

 __global__ void kCalculateLocalSoftcoreForces_kernel()
 {
    unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
-    //Vectors* A       = &sV[threadIdx.x];

    float energy     = 0.0f;

-#if 0
-    while (pos < cSim.bond_offset)
-    {
-        if (pos < cSim.bonds)
-        {
-            int4   atom         = cSim.pBondID[pos];
-            float4 atomA        = cSim.pPosq[atom.x];
-            float4 atomB        = cSim.pPosq[atom.y];
-            float2 bond         = cSim.pBondParameter[pos];
-            float dx            = atomB.x - atomA.x;
-            float dy            = atomB.y - atomA.y;
-            float dz            = atomB.z - atomA.z;
-            float r2            = dx * dx + dy * dy + dz * dz;
-            float r             = sqrt(r2);
-            float deltaIdeal    = r - bond.x;
-/* E */     energy             += 0.5f * bond.y * deltaIdeal * deltaIdeal;
-            float dEdR          = bond.y * deltaIdeal;
-            dEdR                = (r > 0.0f) ? (dEdR / r) : 0.0f;
-//            printf("D: %11.4f %11.4f %11.4f %11.4f %11.4f %11.4f\n", dx, dy, dz, r, deltaIdeal, dEdR);
-            dx                 *= dEdR;
-            dy                 *= dEdR;
-            dz                 *= dEdR;
-            unsigned int offsetA                = atom.x + atom.z * cSim.stride;
-            unsigned int offsetB                = atom.y + atom.w * cSim.stride;
-            float4 forceA                       = cSim.pForce4[offsetA];
-            float4 forceB                       = cSim.pForce4[offsetB];
-            forceA.x                           += dx;
-            forceA.y                           += dy;
-            forceA.z                           += dz;
-            forceB.x                           -= dx;
-            forceB.y                           -= dy;
-            forceB.z                           -= dz;
-            cSim.pForce4[offsetA]               = forceA;
-            cSim.pForce4[offsetB]               = forceB;    
-        }
-        pos += blockDim.x * gridDim.x;
-    }
-
-    while (pos < cSim.bond_angle_offset)
+    if (feSim.nonbondedMethod == NO_CUTOFF)
    {
-        unsigned int pos1   = pos - cSim.bond_offset;
-        if (pos1 < cSim.bond_angles)
+        while (pos < feSim.LJ14_count)
        {
-            int4   atom1            = cSim.pBondAngleID1[pos1];  
-            float2 bond_angle       = cSim.pBondAngleParameter[pos1];
-            float4 a1               = cSim.pPosq[atom1.x];
-            float4 a2               = cSim.pPosq[atom1.y];
-            float4 a3               = cSim.pPosq[atom1.z];
-            A->v0.x                 = a2.x - a1.x;
-            A->v0.y                 = a2.y - a1.y;
-            A->v0.z                 = a2.z - a1.z;
-            A->v1.x                 = a2.x - a3.x;
-            A->v1.y                 = a2.y - a3.y;
-            A->v1.z                 = a2.z - a3.z;
-            float3 cp;
-            CROSS_PRODUCT(A->v0, A->v1, cp);
-            float rp                = DOT3(cp, cp); //cx * cx + cy * cy + cz * cz;
-            rp                      = max(sqrt(rp), 1.0e-06f);
-            float r21               = DOT3(A->v0, A->v0); // dx1 * dx1 + dy1 * dy1 + dz1 * dz1;
-            float r23               = DOT3(A->v1, A->v1); // dx2 * dx2 + dy2 * dy2 + dz2 * dz2;
-            float dot               = DOT3(A->v0, A->v1); // dx1 * dx2 + dy1 * dy2 + dz1 * dz2;
-            float cosine            = dot / sqrt(r21 * r23);
-
-            float angle_energy;
-/* E */     GETENERGYGIVENANGLECOSINE(cosine, bond_angle, angle_energy);
-            energy                 += 0.5f*angle_energy;
-
-            float dEdR;
-            GETPREFACTORSGIVENANGLECOSINE(cosine, bond_angle, dEdR);
-            //printf("%11.4f %11.4f\n", cosine, dEdR);
-            float termA             =  dEdR / (r21 * rp);
-            float termC             = -dEdR / (r23 * rp);
-            float3 c21;
-            float3 c23;
-            CROSS_PRODUCT(A->v0, cp, c21);
-            CROSS_PRODUCT(A->v1, cp, c23);
-            c21.x                  *= termA;
-            c21.y                  *= termA;
-            c21.z                  *= termA;
-            c23.x                  *= termC;
-            c23.y                  *= termC;
-            c23.z                  *= termC;
-            int2 atom2              = cSim.pBondAngleID2[pos1];
-            unsigned int offset     = atom1.x + atom1.w * cSim.stride;
-            float4 force            = cSim.pForce4[offset]; 
-            force.x                += c21.x;
-            force.y                += c21.y;
-            force.z                += c21.z;
-            cSim.pForce4[offset]    = force;
-            offset                  = atom1.y + atom2.x * cSim.stride;
-            force                   = cSim.pForce4[offset];
-            force.x                -= (c21.x + c23.x);
-            force.y                -= (c21.y + c23.y);
-            force.z                -= (c21.z + c23.z);
-            cSim.pForce4[offset]    = force;
-            offset                  = atom1.z + atom2.y * cSim.stride;
-            force                   = cSim.pForce4[offset];
-            force.x                += c23.x;
-            force.y                += c23.y;
-            force.z                += c23.z;
-            cSim.pForce4[offset]    = force;
-        }
-        pos += blockDim.x * gridDim.x;
-    }
+            int4 atom               = feSim.pLJ14ID[pos];
+            float4 LJ14             = feSim.pLJ14Parameter[pos];

-    while (pos < cSim.dihedral_offset)
-    {
-        unsigned int pos1 = pos - cSim.bond_angle_offset;
-        if (pos1 < cSim.dihedrals)
-        {
-            int4   atom1        = cSim.pDihedralID1[pos1];  
-            float4 atomA        = cSim.pPosq[atom1.x];
-            float4 atomB        = cSim.pPosq[atom1.y];
-            float4 atomC        = cSim.pPosq[atom1.z];
-            float4 atomD        = cSim.pPosq[atom1.w];            
-            A->v0.x             = atomA.x - atomB.x;
-            A->v0.y             = atomA.y - atomB.y;
-            A->v0.z             = atomA.z - atomB.z;
-            A->v1.x             = atomC.x - atomB.x;
-            A->v1.y             = atomC.y - atomB.y;
-            A->v1.z             = atomC.z - atomB.z;
-            A->v2.x             = atomC.x - atomD.x;
-            A->v2.y             = atomC.y - atomD.y;
-            A->v2.z             = atomC.z - atomD.z; 
-            float3 cp0, cp1;
-            float dihedralAngle;
-            GETDIHEDRALANGLEBETWEENTHREEVECTORS(A->v0, A->v1, A->v2, A->v0, cp0, cp1, dihedralAngle);
-            float4 dihedral         = cSim.pDihedralParameter[pos1];
-            float deltaAngle        = dihedral.z * dihedralAngle - (dihedral.y * PI / 180.0f);
-
-	    // ATTENTION: This section leads to a divergent deltaAngle values wrt
-	    // forces and energies. We separate the case dihedral.z = n = 0, which
-	    // is treated by the calculation of energies via a harmonic potential
-/* E */     if (dihedral.z) energy += dihedral.x * (1.0f + cos(deltaAngle));
-/* E */     else
-	    {
-		float deltaAngle    = dihedralAngle - dihedral.y;
-		if (deltaAngle < -PI) deltaAngle += 2.0f * PI;
-		else if (deltaAngle > PI) deltaAngle -= 2.0f * PI;
-                energy             += dihedral.x * deltaAngle * deltaAngle;
-	    }
-
-            float sinDeltaAngle     = sin(deltaAngle);
-            float dEdAngle          = -dihedral.x * dihedral.z * sinDeltaAngle;
-            float normCross1        = DOT3(cp0, cp0);
-            float normBC            = sqrt(DOT3(A->v1, A->v1));
-            float4 ff;
-            ff.x                    = (-dEdAngle * normBC) / normCross1;
-            float normCross2        = DOT3(cp1, cp1);
-            ff.w                    = (dEdAngle * normBC) / normCross2;
-            float dp                = 1.0f / DOT3(A->v1, A->v1);
-            ff.y                    = DOT3(A->v0, A->v1) * dp;
-            ff.z                    = DOT3(A->v2, A->v1) * dp;
-            int4  atom2             = cSim.pDihedralID2[pos1];   
-            float3 internalF0;
-            float3 internalF3;
-            float3 s;
-            
-//            printf("%4d: %9.4f %9.4f %9.4f %9.4f\n", pos1, ff.x, ff.y, ff.z, ff.w);  
-            unsigned int offset                 = atom1.x + atom2.x * cSim.stride;
-            float4 force                        = cSim.pForce4[offset]; 
-            internalF0.x                        = ff.x * cp0.x; 
-            force.x                            += internalF0.x;
-            internalF0.y                        = ff.x * cp0.y;
-            force.y                            += internalF0.y;
-            internalF0.z                        = ff.x * cp0.z;       
-            force.z                            += internalF0.z;
-            cSim.pForce4[offset]                = force;
-            
-            //printf("%4d - 0: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
-            offset                              = atom1.w + atom2.w * cSim.stride;
-            force                               = cSim.pForce4[offset];
-            internalF3.x                        = ff.w * cp1.x;
-            force.x                            += internalF3.x;
-            internalF3.y                        = ff.w * cp1.y;
-            force.y                            += internalF3.y;
-            internalF3.z                        = ff.w * cp1.z;
-            force.z                            += internalF3.z;
-            cSim.pForce4[offset]                = force;
-            
-           // printf("%4d - 3: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
-            s.x                                 = ff.y * internalF0.x - ff.z * internalF3.x;   
-            s.y                                 = ff.y * internalF0.y - ff.z * internalF3.y;  
-            s.z                                 = ff.y * internalF0.z - ff.z * internalF3.z;        
-            offset                              = atom1.y + atom2.y * cSim.stride;
-            force                               = cSim.pForce4[offset];
-            force.x                            += -internalF0.x + s.x;
-            force.y                            += -internalF0.y + s.y;
-            force.z                            += -internalF0.z + s.z;
-            cSim.pForce4[offset]                = force;
-            
-            //printf("%4d - 1: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
-            offset                              = atom1.z + atom2.z * cSim.stride;
-            force                               = cSim.pForce4[offset];
-            force.x                            += -internalF3.x - s.x;
-            force.y                            += -internalF3.y - s.y;
-            force.z                            += -internalF3.z - s.z;
-            cSim.pForce4[offset]                = force;
-            //printf("%4d - 2: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
-        }
-        pos += blockDim.x * gridDim.x;
-    }
-
-    // Ryckaert Bellemans dihedrals
-    while (pos < cSim.rb_dihedral_offset)
-    {
-        unsigned int pos1 = pos - cSim.dihedral_offset;
-        if (pos1 < cSim.rb_dihedrals)
-        {
-            int4   atom1        = cSim.pRbDihedralID1[pos1];
-            float4 atomA        = cSim.pPosq[atom1.x];
-            float4 atomB        = cSim.pPosq[atom1.y];
-            float4 atomC        = cSim.pPosq[atom1.z];
-            float4 atomD        = cSim.pPosq[atom1.w];
-            A->v0.x             = atomA.x - atomB.x;
-            A->v0.y             = atomA.y - atomB.y;
-            A->v0.z             = atomA.z - atomB.z;
-            A->v1.x             = atomC.x - atomB.x;
-            A->v1.y             = atomC.y - atomB.y;
-            A->v1.z             = atomC.z - atomB.z;
-            A->v2.x             = atomC.x - atomD.x;
-            A->v2.y             = atomC.y - atomD.y;
-            A->v2.z             = atomC.z - atomD.z;
-            float3 cp0, cp1;
-            float dihedralAngle, cosPhi;
-      //      printf("%4d - 0 : %9.4f %9.4f %9.4f\n", pos1, A->v0.x, A->v0.y, A->v0.z);
-      //      printf("%4d - 1 : %9.4f %9.4f %9.4f\n", pos1, A->v1.x, A->v1.y, A->v1.z);
-      //      printf("%4d - 2 : %9.4f %9.4f %9.4f\n", pos1, A->v2.x, A->v2.y, A->v2.z);
-            GETDIHEDRALANGLECOSINEBETWEENTHREEVECTORS(A->v0, A->v1, A->v2, A->v0, cp0, cp1, dihedralAngle, cosPhi);
-            if (dihedralAngle < 0.0f )
-            {
-                dihedralAngle += PI;
-            }
-            else
-            {
-                dihedralAngle -= PI;
-            }
-            cosPhi                  = -cosPhi;
-         //   printf("%4d: %9.4f %9.4f\n", pos1, dihedralAngle, cosPhi);
-            float4 dihedral1        = cSim.pRbDihedralParameter1[pos1];
-            float2 dihedral2        = cSim.pRbDihedralParameter2[pos1];
-            float cosFactor         = cosPhi;
-            float dEdAngle          = -dihedral1.y;
-
-/* E */     float rb_energy         = dihedral1.x;
-            rb_energy              += dihedral1.y * cosFactor;
-        //    printf("%4d - 1: %9.4f %9.4f\n", pos1, dEdAngle, 1.0f);
-            dEdAngle               -= 2.0f * dihedral1.z * cosFactor;
-       //     printf("%4d - 2: %9.4f %9.4f\n", pos1, dEdAngle, cosFactor);
-            cosFactor              *= cosPhi;
-            dEdAngle               -= 3.0f * dihedral1.w * cosFactor;
-            rb_energy              += dihedral1.z * cosFactor;
-    //       printf("%4d - 3: %9.4f %9.4f\n", pos1, dEdAngle, cosFactor);
-            cosFactor              *= cosPhi;
-            dEdAngle               -= 4.0f * dihedral2.x * cosFactor;
-            rb_energy              += dihedral1.w * cosFactor;
-  //         printf("%4d - 4: %9.4f %9.4f\n", pos1, dEdAngle, cosFactor);
-            cosFactor              *= cosPhi;
-            dEdAngle               -= 5.0f * dihedral2.y * cosFactor;
-            rb_energy              += dihedral2.x * cosFactor;
-            rb_energy              += dihedral2.y * cosFactor * cosPhi;
-/* E */     energy                 += rb_energy;
- //           printf("%4d - 5: %9.4f %9.4f\n", pos1, dEdAngle, cosFactor);
-            dEdAngle               *= sin(dihedralAngle);
-//            printf("%4d - f: %9.4f\n", pos1, dEdAngle);
-
-            float normCross1        = DOT3(cp0, cp0);
-            float normBC            = sqrt(DOT3(A->v1, A->v1));
-            float4 ff;
-            ff.x                    = (-dEdAngle * normBC) / normCross1;
-            float normCross2        = DOT3(cp1, cp1);
-            ff.w                    = (dEdAngle * normBC) / normCross2;
-            float dp                = 1.0f / DOT3(A->v1, A->v1);
-            ff.y                    = DOT3(A->v0, A->v1) * dp;
-            ff.z                    = DOT3(A->v2, A->v1) * dp;
-            int4  atom2             = cSim.pRbDihedralID2[pos1];
-            float3 internalF0;
-            float3 internalF3;
-            float3 s;
-
-//            printf("%4d: %9.4f %9.4f %9.4f %9.4f\n", pos1, ff.x, ff.y, ff.z, ff.w);
-            unsigned int offset                 = atom1.x + atom2.x * cSim.stride;
-            float4 force                        = cSim.pForce4[offset];
-            internalF0.x                        = ff.x * cp0.x;
-            force.x                            += internalF0.x;
-            internalF0.y                        = ff.x * cp0.y;
-            force.y                            += internalF0.y;
-            internalF0.z                        = ff.x * cp0.z;
-            force.z                            += internalF0.z;
-            cSim.pForce4[offset]                = force;
-
- //           printf("%4d - 0: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
-            offset                              = atom1.w + atom2.w * cSim.stride;
-            force                               = cSim.pForce4[offset];
-            internalF3.x                        = ff.w * cp1.x;
-            force.x                            += internalF3.x;
-            internalF3.y                        = ff.w * cp1.y;
-            force.y                            += internalF3.y;
-            internalF3.z                        = ff.w * cp1.z;
-            force.z                            += internalF3.z;
-            cSim.pForce4[offset]                = force;
-
-   //         printf("%4d - 3: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
-            s.x                                 = ff.y * internalF0.x - ff.z * internalF3.x;
-            s.y                                 = ff.y * internalF0.y - ff.z * internalF3.y;
-            s.z                                 = ff.y * internalF0.z - ff.z * internalF3.z;
-            offset                              = atom1.y + atom2.y * cSim.stride;
-            force                               = cSim.pForce4[offset];
-            force.x                            += -internalF0.x + s.x;
-            force.y                            += -internalF0.y + s.y;
-            force.z                            += -internalF0.z + s.z;
-            cSim.pForce4[offset]                = force;
-     //       printf("%4d - 1: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
-            offset                              = atom1.z + atom2.z * cSim.stride;
-            force                               = cSim.pForce4[offset];
-            force.x                            += -internalF3.x - s.x;
-            force.y                            += -internalF3.y - s.y;
-            force.z                            += -internalF3.z - s.z;
-            cSim.pForce4[offset]                = force;
-     //       printf("%4d - 2: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
-        }         
-        pos += blockDim.x * gridDim.x;
-    }   
-#endif
-
-    if (cSim.nonbondedMethod == NO_CUTOFF)
-    {
-        while (pos < feSim.LJ14_offset)
-        {
-            //unsigned int pos1       = pos - feSim.rb_dihedral_offset;
-            unsigned int pos1       = pos;
-            if (pos1 < feSim.LJ14s)
-            {
-                int4 atom               = feSim.pLJ14ID[pos1];
-                float4 LJ14             = feSim.pLJ14Parameter[pos1];
            float4 a1               = cSim.pPosq[atom.x];
            float4 a2               = cSim.pPosq[atom.y];
+
            float3 d;
            d.x                     = a1.x - a2.x;
            d.y                     = a1.y - a2.y;
            d.z                     = a1.z - a2.z;
+
            float r2                = DOT3(d, d);
            float inverseR          = 1.0f / sqrt(r2);
 #ifdef USE_SOFTCORE_LJ
@@ -503,21 +172,14 @@ __global__ void kCalculateLocalSoftcoreForces_kernel()
            forceB.z               -= d.z;
            cSim.pForce4[offsetA]   = forceA;
            cSim.pForce4[offsetB]   = forceB;
-            }
            pos                    += blockDim.x * gridDim.x;
        }
-    }
-    else if (cSim.nonbondedMethod == CUTOFF)
-    {
+
+    } else if (feSim.nonbondedMethod == CUTOFF) {
        float LJ14_energy;
-        while (pos < feSim.LJ14_offset)
-        {
-            //unsigned int pos1       = pos - feSim.rb_dihedral_offset;
-            unsigned int pos1       = pos;
-            if (pos1 < feSim.LJ14s)
-            {
-                int4 atom               = feSim.pLJ14ID[pos1];
-                float4 LJ14             = feSim.pLJ14Parameter[pos1];
+        while (pos < feSim.LJ14_count ){
+            int4 atom               = feSim.pLJ14ID[pos];
+            float4 LJ14             = feSim.pLJ14Parameter[pos];
            float4 a1               = cSim.pPosq[atom.x];
            float4 a2               = cSim.pPosq[atom.y];
            float3 d;
@@ -533,19 +195,16 @@ __global__ void kCalculateLocalSoftcoreForces_kernel()
            sig2                   *= sig2;
            float sig6              = sig2 * sig2 * sig2;
            float dEdR              = LJ14.x * (12.0f * sig6 - 6.0f) * sig6;                
-                /* E */
            LJ14_energy             = LJ14.x * (sig6 - 1.0f) * sig6;
 #endif
            LJ14_energy            += LJ14.z * (inverseR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
            dEdR                   += LJ14.z * (inverseR - 2.0f * cSim.reactionFieldK * r2);
            dEdR                   *= inverseR * inverseR;
-                if (r2 > cSim.nonbondedCutoffSqr)
+            if (r2 > feSim.nonbondedCutoffSqr)
            {                   
                dEdR = 0.0f;
-                    /* E */
                LJ14_energy = 0.0f;
            }
-                /* E */
            energy                 += LJ14_energy;
 
            unsigned int offsetA    = atom.x + atom.z * cSim.stride;
@@ -563,21 +222,13 @@ __global__ void kCalculateLocalSoftcoreForces_kernel()
            forceB.z               -= d.z;
            cSim.pForce4[offsetA]   = forceA;
            cSim.pForce4[offsetB]   = forceB;
-            }
            pos                    += blockDim.x * gridDim.x;
        }
-    }
-    else if (cSim.nonbondedMethod == PERIODIC)
-    {
+    } else if (feSim.nonbondedMethod == PERIODIC ){
        float LJ14_energy;
-        while (pos < feSim.LJ14_offset)
-        {
-            //unsigned int pos1       = pos - feSim.rb_dihedral_offset;
-            unsigned int pos1       = pos;
-            if (pos1 < feSim.LJ14s)
-            {
-                int4 atom               = feSim.pLJ14ID[pos1];
-                float4 LJ14             = feSim.pLJ14Parameter[pos1];
+        while (pos < feSim.LJ14_count ){
+            int4 atom               = feSim.pLJ14ID[pos];
+            float4 LJ14             = feSim.pLJ14Parameter[pos];
            float4 a1               = cSim.pPosq[atom.x];
            float4 a2               = cSim.pPosq[atom.y];
            float3 d;
@@ -596,20 +247,17 @@ __global__ void kCalculateLocalSoftcoreForces_kernel()
            sig2                   *= sig2;
            float sig6              = sig2 * sig2 * sig2;
            float dEdR              = LJ14.x * (12.0f * sig6 - 6.0f) * sig6;
-                /* E */
            LJ14_energy             = LJ14.x * (sig6 - 1.0f) * sig6;
 #endif
            LJ14_energy            += LJ14.z * (inverseR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);

            dEdR                   += LJ14.z * (inverseR - 2.0f * cSim.reactionFieldK * r2);
            dEdR                   *= inverseR * inverseR;
-                if (r2 > cSim.nonbondedCutoffSqr)
+            if (r2 > feSim.nonbondedCutoffSqr)
            {
                dEdR = 0.0f;
-                    /* E */
                LJ14_energy = 0.0f;
            }
-                /* E */
            energy                 += LJ14_energy;

            unsigned int offsetA    = atom.x + atom.z * cSim.stride;
@@ -627,87 +275,15 @@ __global__ void kCalculateLocalSoftcoreForces_kernel()
            forceB.z               -= d.z;
            cSim.pForce4[offsetA]   = forceA;
            cSim.pForce4[offsetB]   = forceB;
-            }
            pos                    += blockDim.x * gridDim.x;
        }
    }
    cSim.pEnergy[blockIdx.x * blockDim.x + threadIdx.x] += energy;
 }

-extern "C"
-GpuLJ14Softcore* gpuSetLJ14SoftcoreParameters(gpuContext gpu, float epsfac, float fudge, const std::vector<int>& atom1, const std::vector<int>& atom2,
-                                              const std::vector<float>& c6, const std::vector<float>& c12, const std::vector<float>& q1,
-                                              const std::vector<float>& q2, const std::vector<float>& softcoreLJLambdaArray)
-{
-    int LJ14s                                   = atom1.size();
-    float scale                                 = epsfac * fudge;
-
-    GpuLJ14Softcore* gpuLJ14Softcore            = new GpuLJ14Softcore();
-    gpuLJ14Softcore->feSim.LJ14s                = LJ14s;
-
-    CUDAStream<int4>* psLJ14ID                  = new CUDAStream<int4>(LJ14s, 1, "LJ14SoftcoreID");
-    gpuLJ14Softcore->psLJ14SoftcoreID           = psLJ14ID;
-    gpuLJ14Softcore->feSim.pLJ14ID              = psLJ14ID->_pDevStream[0];
-
-    CUDAStream<float4>* psLJ14Parameter         = new CUDAStream<float4>(LJ14s, 1, "LJ14SoftcoreParameter");
-    gpuLJ14Softcore->psLJ14SoftcoreParameter    = psLJ14Parameter;
-    gpuLJ14Softcore->feSim.pLJ14Parameter       = psLJ14Parameter->_pDevStream[0];
-    gpuLJ14Softcore->feSim.LJ14_offset          = LJ14s;
-
-    for (int i = 0; i < LJ14s; i++)
-    {
-        (*psLJ14ID)[i].x          = atom1[i];
-        (*psLJ14ID)[i].y          = atom2[i];
-        psLJ14ID->_pSysData[i].z  = gpu->pOutputBufferCounter[psLJ14ID->_pSysData[i].x]++;
-        psLJ14ID->_pSysData[i].w  = gpu->pOutputBufferCounter[psLJ14ID->_pSysData[i].y]++;
-        float p0, p1, p2, p3;
-        if (c12[i] == 0.0f)
-        {
-            p0 = 0.0f;
-            p1 = 1.0f;
-        }
-        else
-        {
-            p0 = c6[i] * c6[i] / c12[i];
-            p1 = pow(c12[i] / c6[i], 1.0f / 6.0f);
-        }
-        p2 = scale * q1[i] * q2[i];
-        p3 = softcoreLJLambdaArray[i];
-        (*psLJ14Parameter)[i].x = p0;
-        (*psLJ14Parameter)[i].y = p1;
-        (*psLJ14Parameter)[i].z = p2;
-        (*psLJ14Parameter)[i].w = p3;
-    }
-#if (DUMP_PARAMETERS == 1)
-        cout << 
-            i << " " <<
-            (*psLJ14ID)[i].x << " " <<
-            (*psLJ14ID)[i].y << " " <<
-            (*psLJ14ID)[i].z << " " <<
-            (*psLJ14ID)[i].w << " " <<
-            (*psLJ14Parameter)[i].x << " " <<
-            (*psLJ14Parameter)[i].y << " " <<
-            (*psLJ14Parameter)[i].z << " " <<
-            (*psLJ14Parameter)[i].w << " " <<
-            p0 << " " << 
-            p1 << " " << 
-            p2 << " " << 
-            p3 << " " << 
-            endl;
-#endif
-    psLJ14ID->Upload();
-    psLJ14Parameter->Upload();
-    SetCalculateLocalSoftcoreSim( gpuLJ14Softcore );
-
-    return gpuLJ14Softcore;
-}
-
-void kCalculateLocalSoftcoreForces(gpuContext gpu)
+void kCalculateLocalSoftcoreForces( freeEnergyGpuContext freeEnergyGpuContext ) // host-side launcher for kCalculateLocalSoftcoreForces_kernel; note the parameter name is the same identifier as its type
 {
-  //  printf("kCalculateLocalForces\n");
-//    fprintf( stderr, "kCalculateLocalSoftcoreForces blks=%u localForces_threads_per_block=%u szVector=%u total=%u\n", gpu->sim.blocks, gpu->sim.localForces_threads_per_block, sizeof(Vectors),
-//             gpu->sim.localForces_threads_per_block * sizeof(Vectors) ); fflush( stderr );
-
+    gpuContext gpu = freeEnergyGpuContext->gpuContext; // launch geometry (sim.blocks, sim.localForces_threads_per_block) is read from the wrapped base context; dynamic shared memory is one Vectors per thread
    kCalculateLocalSoftcoreForces_kernel<<<gpu->sim.blocks, gpu->sim.localForces_threads_per_block, gpu->sim.localForces_threads_per_block * sizeof(Vectors)>>>();
    LAUNCHERROR("kCalculateLocalSoftcoreForces");
 }

--- a/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateNonbondedSoftcore.cu
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateNonbondedSoftcore.cu
@@ -24,161 +24,171 @@
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
 * -------------------------------------------------------------------------- */

-#include "GpuNonbondedSoftcore.h"
+#include "freeEnergyGpuTypes.h"
 #include "GpuFreeEnergyCudaKernels.h"
 #include "openmm/OpenMMException.h"
-#include <algorithm>
+#include <sstream>

-// structure containing array of softcore lambdas
-
-struct cudaFreeEnergySimulationNonBonded {
-    float* pParticleSoftCoreLJLambda;
-};
-//struct cudaFreeEnergySimulationNonBonded feSim;
+#define PARAMETER_PRINT 0
+#define MAX_PARAMETER_PRINT 10

 // device handles

 static __constant__ cudaGmxSimulation cSim;
-static __constant__ cudaFreeEnergySimulationNonBonded feSimDev;
+static __constant__ cudaFreeEnergyGmxSimulation feSimDev;

 // write address of structs to devices

-void SetCalculateCDLJSoftcoreGpuSim( gpuContext gpu )
-{
+void SetCalculateCDLJSoftcoreGpuSim( freeEnergyGpuContext freeEnergyGpu ){ // copies both host-side simulation structs into this file's __constant__ symbols (cSim and feSimDev)
    cudaError_t status;
-    status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
+    status = cudaMemcpyToSymbol(cSim, &freeEnergyGpu->gpuContext->sim, sizeof(cudaGmxSimulation)); // base simulation struct -> cSim
    RTERROR(status, "cudaMemcpyToSymbol: SetCalculateCDLJSoftcoreGpuSim copy to cSim failed");

-    //(void) fprintf( stderr, "SetCalculateCDLJSoftcoreGpuSim gpu=%p cSim=%p sizeof=%u\n", gpu, &gpu->sim, sizeof(cudaGmxSimulation) ); fflush( stderr );
+    status = cudaMemcpyToSymbol( feSimDev, &freeEnergyGpu->freeEnergySim, sizeof(cudaFreeEnergyGmxSimulation)); // free-energy simulation struct -> feSimDev
+    RTERROR(status, "cudaMemcpyToSymbol: SetCalculateCDLJSoftcoreGpuSim copy to feSimDev failed"); // each copy is checked separately so the message names the symbol that failed
 }

-void SetCalculateCDLJSoftcoreSupplementarySim( float* gpuParticleSoftCoreLJLambda)
+extern "C"
+void freeEnergyGpuSetPeriodicBoxSize( freeEnergyGpuContext freeEnergyGpu, float xsize, float ysize, float zsize) // records the periodic box dimensions and derived quantities in freeEnergySim, then forwards the sizes to the base gpuContext
 {
-    cudaError_t status;
-    struct cudaFreeEnergySimulationNonBonded feSim;
+    freeEnergyGpu->freeEnergySim.periodicBoxSizeX    = xsize; // NOTE(review): only the host-side struct is written in this function; presumably uploaded later by a Set...Sim call -- confirm
+    freeEnergyGpu->freeEnergySim.periodicBoxSizeY    = ysize;
+    freeEnergyGpu->freeEnergySim.periodicBoxSizeZ    = zsize;

-    feSim.pParticleSoftCoreLJLambda = gpuParticleSoftCoreLJLambda;
-    status = cudaMemcpyToSymbol(feSimDev, &feSim, sizeof(cudaFreeEnergySimulationNonBonded));
-    RTERROR(status, "cudaMemcpyToSymbol: SetCalculateCDLJSoftcoreSupplementarySim");
+    freeEnergyGpu->freeEnergySim.invPeriodicBoxSizeX = 1.0f/xsize; // reciprocal box lengths; NOTE(review): no guard against a zero box size
+    freeEnergyGpu->freeEnergySim.invPeriodicBoxSizeY = 1.0f/ysize;
+    freeEnergyGpu->freeEnergySim.invPeriodicBoxSizeZ = 1.0f/zsize;

-    //(void) fprintf( stderr, "SetCalculateCDLJSoftcoreSupplementarySim\n" );
-}
+    freeEnergyGpu->freeEnergySim.recipBoxSizeX       = 2.0f*PI/freeEnergyGpu->freeEnergySim.periodicBoxSizeX; // 2*PI divided by the box length along each axis
+    freeEnergyGpu->freeEnergySim.recipBoxSizeY       = 2.0f*PI/freeEnergyGpu->freeEnergySim.periodicBoxSizeY;
+    freeEnergyGpu->freeEnergySim.recipBoxSizeZ       = 2.0f*PI/freeEnergyGpu->freeEnergySim.periodicBoxSizeZ;

-void GetCalculateCDLJSoftcoreForcesSim(float* gpuParticleSoftCoreLJLambda)
-{
-//    cudaError_t status;
-//    status = cudaMemcpyFromSymbol(gpuParticleSoftCoreLJLambda, particleSoftCoreLJLambdaDev, sizeof(float*));
-//    RTERROR(status, "cudaMemcpyFromSymbol: GetCalculateCDLJSoftcoreForcesSim failed");
-}
+    freeEnergyGpu->freeEnergySim.cellVolume          = freeEnergyGpu->freeEnergySim.periodicBoxSizeX*freeEnergyGpu->freeEnergySim.periodicBoxSizeY*freeEnergyGpu->freeEnergySim.periodicBoxSizeZ; // product of the three box lengths

-// create, initialize and entrt SoftCoreLJLambda values
-// return handle to GpuNonbondedSoftcore object
-
-static void setSoftcoreExclusions(gpuContext gpu, const std::vector<std::vector<int> >& exclusions) {
-    if (gpu->exclusions.size() > 0) { 
-        bool ok = (exclusions.size() == gpu->exclusions.size());
-        for (unsigned int i = 0; i < exclusions.size() && ok; i++) {
-            if (exclusions[i].size() != gpu->exclusions[i].size())
-                ok = false;
-            else {
-                for (unsigned int j = 0; j < exclusions[i].size(); j++) 
-                    if (find(gpu->exclusions[i].begin(), gpu->exclusions[i].end(), exclusions[i][j]) == gpu->exclusions[i].end())
-                        ok = false;
-            }
-        }
-        if (!ok)
-            throw OpenMM::OpenMMException("All nonbonded forces must have identical sets of exceptions");
-    }    
-    gpu->exclusions = exclusions;
+    gpuSetPeriodicBoxSize( freeEnergyGpu->gpuContext, xsize, ysize, zsize ); // pass the same sizes to the wrapped base context (gpuSetPeriodicBoxSize is defined elsewhere)
 }

 extern "C"
-GpuNonbondedSoftcore* gpuSetNonbondedSoftcoreParameters(gpuContext gpu, float epsfac, const std::vector<int>& atom, const std::vector<float>& c6,
+void gpuSetNonbondedSoftcoreParameters( freeEnergyGpuContext freeEnergyGpu, float epsfac, const std::vector<int>& atom, const std::vector<float>& c6,
                                        const std::vector<float>& c12, const std::vector<float>& q,
                                        const std::vector<float>& softcoreLJLambdaArray, const std::vector<char>& symbol,
-                                                        const std::vector<std::vector<int> >& exclusions, CudaNonbondedMethod method)
-{
+                                        const std::vector<std::vector<int> >& exclusions, CudaFreeEnergyNonbondedMethod method,
+                                        float cutoffDistance, float solventDielectric ){ // stores method/cutoff/reaction-field settings, builds the per-atom float4 parameter buffer (psSigEps4) and uploads it; throws OpenMMException when the atom counts are inconsistent
+
    unsigned int numberOfParticles                         = c6.size();
-    gpu->sim.epsfac                    = epsfac;
-    gpu->sim.nonbondedMethod           = method;
-    if (numberOfParticles > 0)
-        setSoftcoreExclusions(gpu, exclusions);
+    gpuContext gpu                                         = freeEnergyGpu->gpuContext;
+    int paddedNumberOfAtoms                                = gpu->sim.paddedNumberOfAtoms; // buffer length; entries past numberOfParticles are filled with dummy values below

-    // create gpuNonbondedSoftcore
+    // sanity checks

-    GpuNonbondedSoftcore* gpuNonbondedSoftcore = new GpuNonbondedSoftcore();
-    gpuNonbondedSoftcore->initializeParticleSoftCoreLJLambda( numberOfParticles );
-    float minSoftcore                          = 1.0e+10;
-    for (unsigned int i = 0; i < numberOfParticles; i++)
-    {
-            float p0               = q[i];
+    if( paddedNumberOfAtoms < 1 ){
+        std::stringstream msg;
+        msg << "gpuSetNonbondedSoftcoreParameters: number of padded atoms=" <<  gpu->sim.paddedNumberOfAtoms << " is less than 1.";
+        throw OpenMM::OpenMMException( msg.str() );
+    }

-            // track min softcore value
+    if( freeEnergyGpu->gpuContext->sim.atoms != numberOfParticles  ){ // the particle count is taken from c6.size(); it must agree with the base context
+        std::stringstream msg;
+        msg << "gpuSetNonbondedSoftcoreParameters: number of atoms in gpuContext does not match input count: " << freeEnergyGpu->gpuContext->sim.atoms << " " << numberOfParticles << ".";
+        throw OpenMM::OpenMMException( msg.str() );
+    }
+
+    freeEnergyGpu->freeEnergySim.epsfac                    = epsfac;
+    freeEnergyGpu->freeEnergySim.nonbondedMethod           = method;
+
+    freeEnergyGpu->freeEnergySim.nonbondedCutoff           = cutoffDistance;
+    freeEnergyGpu->freeEnergySim.nonbondedCutoffSqr        = cutoffDistance*cutoffDistance;
+
+    gpu->sim.nonbondedCutoff                               = cutoffDistance; // mirror the cutoff into the base context as well
+    gpu->sim.nonbondedCutoffSqr                            = cutoffDistance*cutoffDistance;

-            float softcoreLJLambda = softcoreLJLambdaArray[i];
-            if( minSoftcore > softcoreLJLambda ){
-                minSoftcore = softcoreLJLambda;
+    if( cutoffDistance > 0.0f ){
+        freeEnergyGpu->freeEnergySim.reactionFieldK        = pow(cutoffDistance, -3.0f)*(solventDielectric-1.0f)/(2.0f*solventDielectric+1.0f); // K = rc^-3 * (eps-1)/(2*eps+1)
+        freeEnergyGpu->freeEnergySim.reactionFieldC        = (1.0f / cutoffDistance)*(3.0f*solventDielectric)/(2.0f*solventDielectric+1.0f); // C = (1/rc) * 3*eps/(2*eps+1)
+        gpu->sim.reactionFieldK                            = freeEnergyGpu->freeEnergySim.reactionFieldK;
+        gpu->sim.reactionFieldC                            = freeEnergyGpu->freeEnergySim.reactionFieldC;
+    } else { // NOTE(review): gpu->sim.reactionFieldK/C are left untouched on this path -- confirm that is intended
+        freeEnergyGpu->freeEnergySim.reactionFieldK        = 0.0f;
+        freeEnergyGpu->freeEnergySim.reactionFieldC        = 0.0f;
    }
-            gpuNonbondedSoftcore->setParticleSoftCoreLJLambda( i, softcoreLJLambda );

-            float p1 = 0.5f, p2 = 0.0f;               
-            if ((c6[i] > 0.0f) && (c12[i] > 0.0f))
-            {
-                p1 = 0.5f * powf(c12[i] / c6[i], 1.0f / 6.0f);
-                p2 = c6[i] * sqrtf(1.0f / c12[i]);
+    setExclusions( gpu, exclusions );
+ 
+    // parameters
+ 
+    freeEnergyGpu->psSigEps4                               = new CUDAStream<float4>( paddedNumberOfAtoms, 1, "freeEnergyGpuSigEps4"); // one float4 per padded atom; NOTE(review): allocated on every call -- confirm this runs once per context
+    freeEnergyGpu->freeEnergySim.pSigEps4                  = freeEnergyGpu->psSigEps4->_pDevData;
+
+    for( unsigned int ii = 0; ii < numberOfParticles; ii++ ){
+
+        float p1 = 0.5f; // defaults kept when c6 or c12 is not strictly positive
+        float p2 = 0.0f;               
+
+        if( (c6[ii] > 0.0f) && (c12[ii] > 0.0f) ){
+            p1 = 0.5f * powf(c12[ii] / c6[ii], 1.0f / 6.0f);
+            p2 = c6[ii] * sqrtf(1.0f / c12[ii]);
        }
+/*
            if (symbol.size() > 0)
-                gpu->pAtomSymbol[i] = symbol[i];
+                freeEnergyGpu->pAtomSymbol[ii] = symbol[ii];
+*/

-            (*gpu->psPosq4)[i].w          = p0;
-            (*gpu->psSigEps2)[i].x        = p1;
-            (*gpu->psSigEps2)[i].y        = p2;
+        (*freeEnergyGpu->psSigEps4)[ii].x        = p1; // float4 layout: x = p1, y = p2, z = softcore LJ lambda, w = charge
+        (*freeEnergyGpu->psSigEps4)[ii].y        = p2;
+        (*freeEnergyGpu->psSigEps4)[ii].z        = softcoreLJLambdaArray[ii];
+        (*freeEnergyGpu->psSigEps4)[ii].w        = q[ii];
    }
-    gpuNonbondedSoftcore->setSoftCoreLJLambda( minSoftcore );

    // Dummy out extra atom data
-    for (unsigned int i = numberOfParticles; i < gpu->sim.paddedNumberOfAtoms; i++)
-    {
-        (*gpu->psPosq4)[i].x                = 100000.0f + i * 10.0f;
-        (*gpu->psPosq4)[i].y                = 100000.0f + i * 10.0f;
-        (*gpu->psPosq4)[i].z                = 100000.0f + i * 10.0f;
-        (*gpu->psPosq4)[i].w                = 0.0f;
-        (*gpu->psSigEps2)[i].x              = 0.0f;
-        (*gpu->psSigEps2)[i].y              = 0.0f;
+
+    for( unsigned int ii = numberOfParticles; ii < paddedNumberOfAtoms; ii++ ){ // padding entries: zero eps/lambda/charge
+
+        (*freeEnergyGpu->psSigEps4)[ii].x              = 1.0f;
+        (*freeEnergyGpu->psSigEps4)[ii].y              = 0.0f;
+        (*freeEnergyGpu->psSigEps4)[ii].z              = 0.0f;
+        (*freeEnergyGpu->psSigEps4)[ii].w              = 0.0f;
+
+        (*gpu->psPosq4)[ii].x                          = 100000.0f + ii * 10.0f; // padded atoms get distinct, large coordinates; presumably so they never fall within interaction range -- confirm
+        (*gpu->psPosq4)[ii].y                          = 100000.0f + ii * 10.0f;
+        (*gpu->psPosq4)[ii].z                          = 100000.0f + ii * 10.0f;
+        (*gpu->psPosq4)[ii].w                          = 0.0f;
+
    }

-#undef DUMP_PARAMETERS
-#define DUMP_PARAMETERS 0
-#if (DUMP_PARAMETERS == 1)
-    (void) fprintf( stderr,"gpuSetNonbondedSoftcoreParameters: %5u epsfac=%14.7e method=%d\n", numberOfParticles, gpu->sim.paddedNumberOfAtoms, epsfac, method );
-    int maxPrint = 31;
-    for (unsigned int ii = 0; ii < gpu->sim.paddedNumberOfAtoms; ii++){
-        (void) fprintf( stderr,"%6u x[%14.7e %14.7e %14.7e %14.7e] sig[%14.7e %14.7e]\n",
-                        ii, (*gpu->psPosq4)[ii].x, (*gpu->psPosq4)[ii].y, (*gpu->psPosq4)[ii].z, (*gpu->psPosq4)[ii].w,
-                        (*gpu->psSigEps2)[ii].x, (*gpu->psSigEps2)[ii].y );
-        if( ii == maxPrint && ii < gpu->sim.paddedNumberOfAtoms - maxPrint ){
-           ii = gpu->sim.paddedNumberOfAtoms - maxPrint;
+    if( freeEnergyGpu->log ){ // the summary line is written whenever a log stream is attached
+        (void) fprintf( freeEnergyGpu->log,"freeEnergyGpuSetNonbondedSoftcoreParameters: %5u padded=%u epsfac=%14.7e method=%d cutoffDistance=%9.2f solventDielectric=%9.2f\n",
+                        numberOfParticles, freeEnergyGpu->gpuContext->sim.paddedNumberOfAtoms, epsfac, method, cutoffDistance, solventDielectric );
+#if PARAMETER_PRINT // was #ifdef, which is always true because the macro is defined (as 0); the per-atom dump now honours the macro's value
+        int maxPrint = MAX_PARAMETER_PRINT;
+        for (unsigned int ii = 0; ii < numberOfParticles; ii++){
+            (void) fprintf( freeEnergyGpu->log,"%6u sig[%14.7e %14.7e] lambda=%10.3f q=%10.3f\n",
+                            ii, 
+                            (*freeEnergyGpu->psSigEps4)[ii].x, (*freeEnergyGpu->psSigEps4)[ii].y, (*freeEnergyGpu->psSigEps4)[ii].z, (*freeEnergyGpu->psSigEps4)[ii].w );
+            if( ii == maxPrint && ii < numberOfParticles - maxPrint ){ // guard uses numberOfParticles (the jump target's base) so the skip can only move forward; the padded count here allowed a backwards jump and an endless loop
+               ii = numberOfParticles - maxPrint;
+            }
+        }
+        unsigned int offset = paddedNumberOfAtoms - maxPrint; // first index of the trailing maxPrint padded entries
+        if( offset > 0 ){
+            if( offset > numberOfParticles ){
+                (void) fprintf( freeEnergyGpu->log,"Dummy padded entries\n" );
+                for (unsigned int ii = offset; ii < paddedNumberOfAtoms; ii++){
+                    (void) fprintf( freeEnergyGpu->log,"%6u sig[%14.7e %14.7e] lambda=%10.3f q=%10.3f\n",
+                                    ii, 
+                                    (*freeEnergyGpu->psSigEps4)[ii].x, (*freeEnergyGpu->psSigEps4)[ii].y, (*freeEnergyGpu->psSigEps4)[ii].z, (*freeEnergyGpu->psSigEps4)[ii].w );
+                }
            }
        }
 #endif
+        (void) fflush( freeEnergyGpu->log );
+    }
 
    // upload data to board

-    gpuNonbondedSoftcore->upload( gpu );
-
+    freeEnergyGpu->psSigEps4->Upload(); // both the new parameter buffer and the (padding-modified) position/charge buffer are uploaded
    gpu->psPosq4->Upload();
-    gpu->psSigEps2->Upload();

-    return gpuNonbondedSoftcore;
-}
-
-// delete gpuNonbondedSoftcore
-
-extern "C"
-void gpuDeleteNonbondedSoftcoreParameters( void* gpuNonbondedSoftcore)
-{
-    GpuNonbondedSoftcore* internalGNonbondedSoftcore = static_cast<GpuNonbondedSoftcore*>(gpuNonbondedSoftcore);
-    delete internalGNonbondedSoftcore;
+    return;
 }

 extern "C"
@@ -202,46 +212,25 @@ struct Atom {
    float fz;
 };

-#if 0
-texture<float, 1, cudaReadModeElementType> tabulatedErfcRef;
-
-__device__ float fastErfc(float r)
-{
-    float normalized = cSim.tabulatedErfcScale*r;
-    int index = (int) normalized;
-    float fract2 = normalized-index;
-    float fract1 = 1.0f-fract2;
-    return fract1*tex1Dfetch(tabulatedErfcRef, index) + fract2*tex1Dfetch(tabulatedErfcRef, index+1);
-}
-
-// Include versions of the kernels for N^2 calculations.
-
-#define METHOD_NAME(a, b) a##N2##b
-#include "kCalculateNonbondedSoftcore.h"
-#define USE_OUTPUT_BUFFER_PER_WARP
-#undef METHOD_NAME
-#define METHOD_NAME(a, b) a##N2ByWarp##b
-#include "kCalculateNonbondedSoftcore.h"
+// Include versions of the kernels for N^2 calculations with softcore LJ.

+#define USE_SOFTCORE_LJ
+#ifdef USE_SOFTCORE_LJ
+#include "kSoftcoreLJ.h"
 #endif

-// Include versions of the kernels for N^2 calculations with softcore LJ.
-
 #undef METHOD_NAME
 #define METHOD_NAME(a, b) a##N2SoftcoreLJ##b
 #undef USE_OUTPUT_BUFFER_PER_WARP
-#define USE_SOFTCORE_LJ
 #include "kCalculateNonbondedSoftcore.h"

 #define USE_OUTPUT_BUFFER_PER_WARP
 #undef METHOD_NAME
 #define METHOD_NAME(a, b) a##N2SoftcoreLJByWarp##b
 #include "kCalculateNonbondedSoftcore.h"
-#undef USE_SOFTCORE_LJ

 // Include versions of the kernels with cutoffs.

-#if 0
 #undef METHOD_NAME
 #undef USE_OUTPUT_BUFFER_PER_WARP
 #define USE_CUTOFF
@@ -266,74 +255,29 @@ __device__ float fastErfc(float r)
 #define METHOD_NAME(a, b) a##PeriodicByWarp##b
 #include "kCalculateNonbondedSoftcore.h"

-// Include versions of the kernels for Ewald
-
-#undef METHOD_NAME
-#undef USE_OUTPUT_BUFFER_PER_WARP
-#define USE_PERIODIC
-#define USE_EWALD
-#define METHOD_NAME(a, b) a##Ewald##b
-#include "kCalculateNonbondedSoftcore.h"
-#define USE_OUTPUT_BUFFER_PER_WARP
-#undef METHOD_NAME
-#define METHOD_NAME(a, b) a##EwaldByWarp##b
-#include "kCalculateNonbondedSoftcore.h"
-
-// Reciprocal Space Ewald summation is in a separate kernel
-#include "kCalculateCDLJEwaldFastReciprocal.h"
-
-void kCalculatePME(gpuContext gpu);
-#endif
-
-void kCalculateCDLJSoftcoreForces(gpuContext gpu )
+void kCalculateCDLJSoftcoreForces( freeEnergyGpuContext freeEnergyGpu )
 {

-    //printf("kCalculateCDLJCutoffForces %d\n", gpu->sim.nonbondedMethod); fflush( stdout );
-    switch (gpu->sim.nonbondedMethod)
+    gpuContext gpu = freeEnergyGpu->gpuContext;
+    // (void) fprintf( stderr,"kCalculateCDLJCutoffForces %d warp=%u nonbond_blocks=%u nonbond_threads_per_block=%u rfK=%15.7e rfC=%15.7e\n", freeEnergyGpu->freeEnergySim.nonbondedMethod,
+    //                 gpu->bOutputBufferPerWarp, gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, gpu->sim.reactionFieldK, gpu->sim.reactionFieldC); fflush( stderr );
+
+    switch (freeEnergyGpu->freeEnergySim.nonbondedMethod)
    {
-        case NO_CUTOFF:
+        case FREE_ENERGY_NO_CUTOFF:
+
           if (gpu->bOutputBufferPerWarp)
                kCalculateCDLJSoftcoreN2SoftcoreLJByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
                         sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit);
           else
                   kCalculateCDLJSoftcoreN2SoftcoreLJForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
                           sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit );
-//(gpu->sim.pWorkUnit, gpuNonbondedSoftcore->getGpuParticleSoftCoreLJLambda());
            LAUNCHERROR("kCalculateCDLJSoftcoreN2Forces");

-#if 0
-int maxPrint = 31; 
-gpu->psWorkUnit->Download();
-fprintf( stderr, "kCalculateCDLJSoftcoreForces: bOutputBufferPerWarp=%u blks=%u th/blk=%u wu=%u %u shrd=%u\n", gpu->bOutputBufferPerWarp,
-                 gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, gpu->sim.workUnits, gpu->psWorkUnit->_pSysStream[0][0],
-        sizeof(Atom)*gpu->sim.nonbond_threads_per_block );
-
-               gpu->psPosq4->Download();
-
-                (void) fprintf( stderr, "\nkCalculateGBVISoftcoreBornSum: pre BornSum %s Born radii & params\n",
-                               (gpu->bIncludeGBVI ? "GBVI" : "Obc") );
-                for( int ii = 0; ii < gpu->natoms; ii++ ){
-                   (void) fprintf( stderr, "%6d bSum=%14.6e param[%14.6e %14.6e %14.6e] x[%14.6f %14.6f %14.6f %14.6f]\n",
-                                   ii,
-                                   gpu->psBornSum->_pSysStream[0][ii],
-                                   gpu->psGBVIData->_pSysStream[0][ii].x,
-                                   gpu->psGBVIData->_pSysStream[0][ii].y,
-                                   gpu->psGBVIData->_pSysStream[0][ii].z,
-                                   gpu->psPosq4->_pSysStream[0][ii].x, gpu->psPosq4->_pSysStream[0][ii].y,
-                                   gpu->psPosq4->_pSysStream[0][ii].z, gpu->psPosq4->_pSysStream[0][ii].w
-                                 );
-                   if( (ii == maxPrint) && ( ii < (gpu->natoms - maxPrint)) ){
-                      ii = gpu->natoms - maxPrint;
-                   }
-                }
-
-#endif
-#undef GBVI
+            break;

+        case FREE_ENERGY_CUTOFF:

-            break;
-#if 0
-        case CUTOFF:
            kFindBlockBoundsCutoff_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
            LAUNCHERROR("kFindBlockBoundsCutoff");
            kFindBlocksWithInteractionsCutoff_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
@@ -341,44 +285,19 @@ fprintf( stderr, "kCalculateCDLJSoftcoreForces: bOutputBufferPerWarp=%u blks=%u
            compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
            kFindInteractionsWithinBlocksCutoff_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
                    sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
-#if 0
-    static int iteration = 0;
-    if (iteration >= 0)
-    {
-        gpu->psInteractingWorkUnit->Download();
-        gpu->psInteractionCount->Download();
-/*
-    unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
-    unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
-    unsigned int numWorkUnits = cSim.pInteractionCount[0];
-    unsigned int pos = warp*numWorkUnits/totalWarps;
-    unsigned int end = (warp+1)*numWorkUnits/totalWarps;
-*/
- 
-        printf("# Post kCalculateCDLJCutoffForces %d atoms warps=%d cnt=%u bOutputBufferPerWarp=%d zC=%d\n", 
-                gpu->natoms, ((gpu->sim.nonbond_blocks*gpu->sim.nonbond_threads_per_block)/GRID),
-                gpu->psInteractionCount->_pSysStream[0][0], gpu->bOutputBufferPerWarp,
-                (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block);
-        fflush( stdout );
-        for (int i = 0; i < gpu->psInteractingWorkUnit->_stride; i++)
-        {
-            printf("%5d %u\n", i, gpu->psInteractingWorkUnit->_pSysStream[0][i] );
-            fflush( stdout );
-        }
-    }
-    iteration++;
-#endif
-

            if (gpu->bOutputBufferPerWarp)
-                kCalculateCDLJCutoffByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                kCalculateCDLJSoftcoreCutoffByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
                        (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
            else
-                kCalculateCDLJCutoffForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                kCalculateCDLJSoftcoreCutoffForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
                        (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
-            LAUNCHERROR("kCalculateCDLJCutoffForces");
+
+            LAUNCHERROR("kCalculateCDLJSoftcoreCutoffForces");
            break;
-        case PERIODIC:
+
+        case FREE_ENERGY_PERIODIC:
+
            kFindBlockBoundsPeriodic_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
            LAUNCHERROR("kFindBlockBoundsPeriodic");
            kFindBlocksWithInteractionsPeriodic_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
@@ -387,123 +306,17 @@ fprintf( stderr, "kCalculateCDLJSoftcoreForces: bOutputBufferPerWarp=%u blks=%u
            kFindInteractionsWithinBlocksPeriodic_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
                    sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
            if (gpu->bOutputBufferPerWarp)
-                kCalculateCDLJPeriodicByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                kCalculateCDLJSoftcorePeriodicByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
                        (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
            else
-                kCalculateCDLJPeriodicForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                kCalculateCDLJSoftcorePeriodicForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
                        (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
-            LAUNCHERROR("kCalculateCDLJPeriodicForces");
+            LAUNCHERROR("kCalculateCDLJSoftcorePeriodicForces");
            break;
-        case EWALD:
-        case PARTICLE_MESH_EWALD:
-            kFindBlockBoundsPeriodic_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
-            LAUNCHERROR("kFindBlockBoundsPeriodic");
-            kFindBlocksWithInteractionsPeriodic_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
-            LAUNCHERROR("kFindBlocksWithInteractionsPeriodic");
-            compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
-            kFindInteractionsWithinBlocksPeriodic_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                    sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
-            LAUNCHERROR("kFindInteractionsWithinBlocksPeriodic");
-            cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
-            cudaBindTexture(NULL, &tabulatedErfcRef, gpu->psTabulatedErfc->_pDevData, &channelDesc, gpu->psTabulatedErfc->_length*sizeof(float));
-            if (gpu->bOutputBufferPerWarp)
-                kCalculateCDLJEwaldByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
-            else
-                kCalculateCDLJEwaldForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
-            LAUNCHERROR("kCalculateCDLJEwaldForces");
-            if (gpu->sim.nonbondedMethod == EWALD)
-            {
-                kCalculateEwaldFastCosSinSums_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
-                LAUNCHERROR("kCalculateEwaldFastCosSinSums");
-                kCalculateEwaldFastForces_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
-                LAUNCHERROR("kCalculateEwaldFastForces");
-            }
-            else
-                kCalculatePME(gpu);
-#endif
-    }
-}

-void kPrintForces(gpuContext gpu, std::string idString, int call )
-{
- //   printf("kReduceForces\n");
-#define GBVI_DEBUG 4
-#if ( GBVI_DEBUG == 4 )
-
-                gpu->psBornRadii->Download();
-                gpu->psObcData->Download();
-                gpu->psObcChain->Download();
-                gpu->psBornForce->Download();
-                gpu->psForce4->Download();
-                gpu->psPosq4->Download();
-                int maxPrint = 30; 
-int   nanHit       = 0;
-int   targetIndex  = -852;
-float maxForce     = 3.0e+04;
-float maxPosition  = 2.0e+02;
-                for( int ii = 0; ii < gpu->natoms; ii++ ){
-
-int   hit  = 0;
-float dist = sqrtf( gpu->psPosq4->_pSysStream[0][ii].x*gpu->psPosq4->_pSysStream[0][ii].x + 
-                    gpu->psPosq4->_pSysStream[0][ii].y*gpu->psPosq4->_pSysStream[0][ii].y +
-                    gpu->psPosq4->_pSysStream[0][ii].z*gpu->psPosq4->_pSysStream[0][ii].z );
-
-if( fabs( gpu->psForce4->_pSysStream[0][ii].x ) > maxForce ||
-    fabs( gpu->psForce4->_pSysStream[0][ii].y ) > maxForce ||
-    fabs( gpu->psForce4->_pSysStream[0][ii].z ) > maxForce ||
-//    gpu->psBornRadii->_pSysStream[0][ii] <= 0.0            ||
-    dist > maxPosition                                     ||
-    isnan( gpu->psForce4->_pSysStream[0][ii].x )           ||
-    isnan( gpu->psForce4->_pSysStream[0][ii].y )           ||
-    isnan( gpu->psForce4->_pSysStream[0][ii].z )  ){  
-   hit = 1;
-} else {
-   hit = 0;
-}
-if( ii == targetIndex || ii == (targetIndex+1) || ii == (targetIndex+2) )hit = 1;
-if( isnan( gpu->psBornForce->_pSysStream[0][ii] ) ||
-    isnan( gpu->psBornRadii->_pSysStream[0][ii] ) ||
-    isnan( gpu->psObcChain->_pSysStream[0][ii]  ) ||
-    isnan( gpu->psForce4->_pSysStream[0][ii].x  )  ||
-    isnan( gpu->psForce4->_pSysStream[0][ii].y  )  ||
-    isnan( gpu->psForce4->_pSysStream[0][ii].z  )  ){  
-   hit    = 1;
-   nanHit = 1;
-}
+        default:
+            throw OpenMM::OpenMMException( "Nonbonded softcore method not recognized." );

-                if( hit || ii < maxPrint || ii >= (gpu->natoms - maxPrint) ){
-                //if( hit ){
-                    static int firstHit = 1;
-                    if( firstHit ){
-                       firstHit = 0;
-                       (void) fprintf( stderr, "\nkPrintForces: %d [r, scl q] b[r/c/f] f[] x[] Born radii/force (%p %p)\n", call,
-                                       gpu->psBornForce, gpu->psBornForce->_pDevStream[0] );
-                    }
-                    (void) fprintf( stderr, "%6d [%8.3f %8.3f %8.3f] b[%13.6e %13.6e %13.6e] f[%13.6e %13.6e %13.6e] x[%13.6e %13.6e %13.6e] %10.3e %s %s %d\n",
-                                    ii, 
-                                    (gpu->psObcData->_pSysStream[0][ii].x + 0.009f),
-                                    (gpu->psObcData->_pSysStream[0][ii].y/gpu->psObcData->_pSysStream[0][ii].x),
-                                    gpu->psPosq4->_pSysStream[0][ii].w,
-                                    gpu->psBornRadii->_pSysStream[0][ii],
-                                    gpu->psObcChain->_pSysStream[0][ii],
-                                    gpu->psBornForce->_pSysStream[0][ii],
-
-                                    gpu->psForce4->_pSysStream[0][ii].x,
-                                    gpu->psForce4->_pSysStream[0][ii].y,
-                                    gpu->psForce4->_pSysStream[0][ii].z,
-
-                                    gpu->psPosq4->_pSysStream[0][ii].x,
-                                    gpu->psPosq4->_pSysStream[0][ii].y,
-                                    gpu->psPosq4->_pSysStream[0][ii].z, dist,
-
-                                    (hit ? "XXXXXXX" : "" ), idString.c_str(), call );
    }
-                }
-                (void) fflush( stderr );
-//                if( nanHit )exit(0);
-#endif
-
 }

--- a/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateNonbondedSoftcore.h
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateNonbondedSoftcore.h
@@ -25,20 +25,20 @@
 * -------------------------------------------------------------------------- */

 /**
- * This file contains the kernels for evalauating nonbonded forces.  It is included
- * several times in kCalculateCDLJForces.cu with different #defines to generate
+ * This file contains the kernels for evaluating nonbonded softcore forces.  It is included
+ * several times in kCalculateNonbondedSoftcore.cu with different #defines to generate
 * different versions of the kernels.
 */

-#ifdef USE_SOFTCORE_LJ
-#include "kSoftcoreLJ.h"
+__global__ 
+#if (__CUDA_ARCH__ >= 200)
+__launch_bounds__(GF1XX_NONBOND_THREADS_PER_BLOCK, 1)
+#elif (__CUDA_ARCH__ >= 120)
+__launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1)
+#else
+__launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
 #endif
-
-/* Cuda compiler on Windows does not recognized "static const float" values */
-#define LOCAL_HACK_PI 3.1415926535897932384626433832795f
-
-//__global__ void METHOD_NAME(kCalculateCDLJSoftcore, Forces_kernel)(unsigned int* workUnit, float* softCoreLJLambdaArray)
-__global__ void METHOD_NAME(kCalculateCDLJSoftcore, Forces_kernel)(unsigned int* workUnit )
+void METHOD_NAME(kCalculateCDLJSoftcore, Forces_kernel)(unsigned int* workUnit )
 {
    extern __shared__ Atom sA[];
    unsigned int totalWarps   = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
@@ -52,10 +52,6 @@ __global__ void METHOD_NAME(kCalculateCDLJSoftcore, Forces_kernel)(unsigned int*
    float3* tempBuffer        = (float3*) &sA[cSim.nonbond_threads_per_block];
 #endif

-#ifdef USE_EWALD
-    const float TWO_OVER_SQRT_PI = 2.0f/sqrt(LOCAL_HACK_PI);
-#endif
-
    unsigned int lasty = 0xFFFFFFFF;
    while (pos < end)
    {
@@ -75,16 +71,17 @@ __global__ void METHOD_NAME(kCalculateCDLJSoftcore, Forces_kernel)(unsigned int*
        float sig;
        float eps;
        float dEdR;
+
        unsigned int tgx             = threadIdx.x & (GRID - 1);
        unsigned int tbx             = threadIdx.x - tgx;
        unsigned int tj              = tgx;
+
        Atom* psA                    = &sA[tbx];
        unsigned int i               = x + tgx;
+
        apos                         = cSim.pPosq[i];
-        float2 a                     = cSim.pAttr[i];
-        //float softCoreLJLambda       = cSim.pSoftCoreLJLambda[i];
-        //float softCoreLJLambda       = softCoreLJLambdaArray[i];
-        float softCoreLJLambda       = feSimDev.pParticleSoftCoreLJLambda[i];
+        float4 a                     = feSimDev.pSigEps4[i];
+        float softCoreLJLambda       = a.z;
        af.x                         = 0.0f;
        af.y                         = 0.0f;
        af.z                         = 0.0f;
@@ -94,11 +91,11 @@ __global__ void METHOD_NAME(kCalculateCDLJSoftcore, Forces_kernel)(unsigned int*
            sA[threadIdx.x].x                     = apos.x;
            sA[threadIdx.x].y                     = apos.y;
            sA[threadIdx.x].z                     = apos.z;
-            sA[threadIdx.x].q                     = apos.w;
+            sA[threadIdx.x].q                     = a.w;
            sA[threadIdx.x].sig                   = a.x;
            sA[threadIdx.x].eps                   = a.y;
-            sA[threadIdx.x].softCoreLJLambda      = softCoreLJLambda;
-            apos.w                               *= cSim.epsfac;
+            sA[threadIdx.x].softCoreLJLambda      = a.z;
+            a.w                                  *= cSim.epsfac;
            if (!bExclusionFlag)
            {
                for (unsigned int j = 0; j < GRID; j++)
@@ -126,33 +123,20 @@ __global__ void METHOD_NAME(kCalculateCDLJSoftcore, Forces_kernel)(unsigned int*
 #endif

 #ifdef USE_CUTOFF
-    #ifdef USE_EWALD
-                    float r         = sqrt(r2);
-                    float alphaR    = cSim.alphaEwald * r;
-                    float erfcAlphaR = fastErfc(alphaR);
-                    dEdR           += apos.w * psA[j].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI );
-		              /* E */
-                    CDLJ_energy    += apos.w * psA[j].q * invR * erfcAlphaR;
-    #else
-                    dEdR           += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2);
-		              /* E */
-		              CDLJ_energy    += apos.w * psA[j].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
-    #endif
+                    dEdR           += a.w * psA[j].q * (invR - 2.0f * feSimDev.reactionFieldK * r2);
+		              CDLJ_energy    += a.w * psA[j].q * (invR + feSimDev.reactionFieldK * r2 - feSimDev.reactionFieldC);
 #else
-                    dEdR           += apos.w * psA[j].q * invR;
-		              /* E */
-		              CDLJ_energy    += apos.w * psA[j].q * invR;
+                    dEdR           += a.w * psA[j].q * invR;
+		              CDLJ_energy    += a.w * psA[j].q * invR;
 #endif
                    dEdR           *= invR * invR;
 #ifdef USE_CUTOFF
                    if (r2 > cSim.nonbondedCutoffSqr)
                    {
                        dEdR        = 0.0f;
-                        /* E */
                        CDLJ_energy = 0.0f;
                    }
 #endif
-		              /* E */
 		              energy         += 0.5f*CDLJ_energy;
                    dx             *= dEdR;
                    dy             *= dEdR;
@@ -161,12 +145,13 @@ __global__ void METHOD_NAME(kCalculateCDLJSoftcore, Forces_kernel)(unsigned int*
                    af.y           -= dy;
                    af.z           -= dz;
                }
-            }
-            else  // bExclusion
-            {
+
+            } else  {
+
                unsigned int xi   = x>>GRIDBITS;
                unsigned int cell = xi+xi*cSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
                unsigned int excl = cSim.pExclusion[cSim.pExclusionIndex[cell]+tgx];
+
                for (unsigned int j = 0; j < GRID; j++)
                {
                    dx              = psA[j].x - apos.x;
@@ -188,53 +173,27 @@ __global__ void METHOD_NAME(kCalculateCDLJSoftcore, Forces_kernel)(unsigned int*
                    sig2           *= sig2;
                    float sig6      = sig2 * sig2 * sig2;
                    dEdR            = eps * (12.0f * sig6 - 6.0f) * sig6;
-		              /* E */
                    CDLJ_energy     = eps * (sig6 - 1.0f) * sig6;
 #endif

 #ifdef USE_CUTOFF
-    #ifdef USE_EWALD
-                    float r         = sqrt(r2);
-                    float alphaR    = cSim.alphaEwald * r;
-                    float erfcAlphaR = fastErfc(alphaR);
-                    dEdR           += apos.w * psA[j].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
-                    /* E */
-		              CDLJ_energy    += apos.w * psA[j].q * invR * erfcAlphaR;
-                    bool needCorrection = !(excl & 0x1) && x+tgx != y+j && x+tgx < cSim.atoms && y+j < cSim.atoms;
-                    if (needCorrection)
-                    {   
-                        // Subtract off the part of this interaction that was included in the reciprocal space contribution.
-
-                        dEdR        = -apos.w * psA[j].q * invR * ((1.0f-erfcAlphaR) - alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
-                        CDLJ_energy = -apos.w * psA[j].q * invR * (1.0f-erfcAlphaR);
-                    }   
-
-    #else
-                    dEdR           += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2);
-                    /* E */
-		              CDLJ_energy    += apos.w * psA[j].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
-    #endif
+                    dEdR           += a.w * psA[j].q * (invR - 2.0f * feSimDev.reactionFieldK * r2);
+		              CDLJ_energy    += a.w * psA[j].q * (invR + feSimDev.reactionFieldK * r2 - feSimDev.reactionFieldC);
 #else
-                    dEdR           += apos.w * psA[j].q * invR;
-                    /* E */
-		              CDLJ_energy    += apos.w * psA[j].q * invR;
+                    dEdR           += a.w * psA[j].q * invR;
+		              CDLJ_energy    += a.w * psA[j].q * invR;
 #endif
                    dEdR           *= invR * invR;
 #ifdef USE_CUTOFF
-    #ifdef USE_EWALD
-                    if ((!(excl & 0x1) && !needCorrection) || r2 > cSim.nonbondedCutoffSqr)
-    #else
                    if (!(excl & 0x1) || r2 > cSim.nonbondedCutoffSqr)
-    #endif
 #else
                    if (!(excl & 0x1))
 #endif
                    {
                        dEdR = 0.0f;
-                			/* E */
 		                  CDLJ_energy  = 0.0f;
                    }
-		              /* E */
+
                    energy         += 0.5f*CDLJ_energy;
                    dx             *= dEdR;
                    dy             *= dEdR;
@@ -263,22 +222,23 @@ __global__ void METHOD_NAME(kCalculateCDLJSoftcore, Forces_kernel)(unsigned int*
            unsigned int offset                 = x + tgx + (x >> GRIDBITS) * cSim.stride;
            cSim.pForce4[offset]                = of;
 #endif
-        }
-        else        // 100% utilization
-        {
+
+        } else {
+
            // Read fixed atom data into registers and GRF
            if (lasty != y)
            {
                unsigned int j                   = y + tgx;
                float4 temp                      = cSim.pPosq[j];
-                float2 temp1                     = cSim.pAttr[j];
+                //float2 temp1                     = cSim.pAttr[j];
+                float4 temp1                     = feSimDev.pSigEps4[j];
                //float  temp3                     = cSim.pSoftCoreLJLambda[j];
                //float  temp3                     = softCoreLJLambdaArray[j];
-                float temp3                      = feSimDev.pParticleSoftCoreLJLambda[j];
+                float temp3                      = temp1.z;
                sA[threadIdx.x].x                = temp.x;
                sA[threadIdx.x].y                = temp.y;
                sA[threadIdx.x].z                = temp.z;
-                sA[threadIdx.x].q                = temp.w;
+                sA[threadIdx.x].q                = temp1.w;
                sA[threadIdx.x].sig              = temp1.x;
                sA[threadIdx.x].eps              = temp1.y;
                sA[threadIdx.x].softCoreLJLambda = temp3;
@@ -286,7 +246,7 @@ __global__ void METHOD_NAME(kCalculateCDLJSoftcore, Forces_kernel)(unsigned int*
            sA[threadIdx.x].fx      = 0.0f;
            sA[threadIdx.x].fy      = 0.0f;
            sA[threadIdx.x].fz      = 0.0f;
-            apos.w                 *= cSim.epsfac;
+            a.w                *= cSim.epsfac;
            if (!bExclusionFlag)
            {
 #ifdef USE_CUTOFF
@@ -324,33 +284,21 @@ __global__ void METHOD_NAME(kCalculateCDLJSoftcore, Forces_kernel)(unsigned int*
 			               CDLJ_energy     = eps * (sig6 - 1.0f) * sig6;
 #endif
 #ifdef USE_CUTOFF
-    #ifdef USE_EWALD
-                        float r         = sqrt(r2);
-                        float alphaR    = cSim.alphaEwald * r;
-                        float erfcAlphaR = fastErfc(alphaR);
-                        dEdR           += apos.w * psA[tj].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
-                        /* E */
-                        CDLJ_energy    += apos.w * psA[tj].q * invR * erfcAlphaR;
-    #else
-                        dEdR           += apos.w * psA[tj].q * (invR - 2.0f * cSim.reactionFieldK * r2);
-			/* E */
-                        CDLJ_energy    += apos.w * psA[tj].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
-    #endif
+                        dEdR           += a.w * psA[tj].q * (invR - 2.0f * feSimDev.reactionFieldK * r2);
+                        CDLJ_energy    += a.w * psA[tj].q * (invR + feSimDev.reactionFieldK * r2 - feSimDev.reactionFieldC);
 #else
-                        dEdR           += apos.w * psA[tj].q * invR;
-                        /* E */
-                        CDLJ_energy    += apos.w * psA[tj].q * invR;
+                        dEdR           += a.w * psA[tj].q * invR;
+                        CDLJ_energy    += a.w * psA[tj].q * invR;
 #endif
                        dEdR           *= invR * invR;
 #ifdef USE_CUTOFF
                        if (r2 > cSim.nonbondedCutoffSqr)
                        {
                            dEdR = 0.0f;
-			                   /* E */
       			             CDLJ_energy = 0.0f;
                        }
 #endif
-			               /* E */
+
 			               energy         += CDLJ_energy;
                        dx             *= dEdR;
                        dy             *= dEdR;
@@ -392,36 +340,23 @@ __global__ void METHOD_NAME(kCalculateCDLJSoftcore, Forces_kernel)(unsigned int*
                            sig2           *= sig2;
                            float sig6      = sig2 * sig2 * sig2;
                            dEdR            = eps * (12.0f * sig6 - 6.0f) * sig6;
-			                   /* E */
                			    CDLJ_energy     = eps * (sig6 - 1.0f) * sig6;
 #endif
 #ifdef USE_CUTOFF
-    #ifdef USE_EWALD
-                            float r         = sqrt(r2);
-                            float alphaR    = cSim.alphaEwald * r;
-                            float erfcAlphaR = fastErfc(alphaR);
-                            dEdR           += apos.w * psA[j].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
-                            CDLJ_energy    += apos.w * psA[j].q * invR * erfcAlphaR;
-    #else
-                            dEdR           += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2);
-                            /* E */
-                            CDLJ_energy    += apos.w * psA[j].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
-    #endif
+                            dEdR           += a.w * psA[j].q * (invR - 2.0f * feSimDev.reactionFieldK * r2);
+                            CDLJ_energy    += a.w * psA[j].q * (invR + feSimDev.reactionFieldK * r2 - feSimDev.reactionFieldC);
 #else
-                            dEdR           += apos.w * psA[j].q * invR;
-                            /* E */
-                            CDLJ_energy    += apos.w * psA[j].q * invR;
+                            dEdR           += a.w * psA[j].q * invR;
+                            CDLJ_energy    += a.w * psA[j].q * invR;
 #endif
                            dEdR           *= invR * invR;
 #ifdef USE_CUTOFF
                            if (r2 > cSim.nonbondedCutoffSqr)
                            {
                                dEdR = 0.0f;
-				                    /* E */
 				                    CDLJ_energy = 0.0f;
                            }
 #endif
-			                   /* E */
 			                   energy         += CDLJ_energy;
                            dx             *= dEdR;
                            dy             *= dEdR;
@@ -499,52 +434,27 @@ __global__ void METHOD_NAME(kCalculateCDLJSoftcore, Forces_kernel)(unsigned int*
                    sig2           *= sig2;
                    float sig6      = sig2 * sig2 * sig2;
                    dEdR            = eps * (12.0f * sig6 - 6.0f) * sig6;
-		              /* E */
 		              CDLJ_energy     = eps * (sig6 - 1.0f) * sig6;
 #endif

 #ifdef USE_CUTOFF
-    #ifdef USE_EWALD
-                    float r         = sqrt(r2);
-                    float alphaR    = cSim.alphaEwald * r;
-                    float erfcAlphaR = fastErfc(alphaR);
-                    dEdR           += apos.w * psA[tj].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
-                    /* E */
-                    CDLJ_energy    += apos.w * psA[tj].q * invR * erfcAlphaR;
-                    bool needCorrection = !(excl & 0x1) && x+tgx != y+tj && x+tgx < cSim.atoms && y+tj < cSim.atoms;
-                    if (needCorrection)
-                    {
-                        // Subtract off the part of this interaction that was included in the reciprocal space contribution.
-
-                        dEdR        = -apos.w * psA[tj].q * invR * ((1.0f-erfcAlphaR) - alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
-                        CDLJ_energy = -apos.w * psA[tj].q * invR * (1.0f-erfcAlphaR);
-                    }
-    #else
-                    dEdR           += apos.w * psA[tj].q * (invR - 2.0f * cSim.reactionFieldK * r2);
-                    /* E */
-	                 CDLJ_energy    += apos.w * psA[tj].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
-    #endif
+                    dEdR           += a.w * psA[tj].q * (invR - 2.0f * feSimDev.reactionFieldK * r2);
+	                 CDLJ_energy    += a.w * psA[tj].q * (invR + feSimDev.reactionFieldK * r2 - feSimDev.reactionFieldC);
 #else
-                    dEdR           += apos.w * psA[tj].q * invR;
-                    /* E */
-                    CDLJ_energy    += apos.w * psA[tj].q * invR;
+                    dEdR           += a.w * psA[tj].q * invR;
+                    CDLJ_energy    += a.w * psA[tj].q * invR;
 #endif
                    dEdR           *= invR * invR;
 #ifdef USE_CUTOFF
-    #ifdef USE_EWALD
-                    if ((!(excl & 0x1) && !needCorrection) || r2 > cSim.nonbondedCutoffSqr)
-    #else
                    if (!(excl & 0x1) || r2 > cSim.nonbondedCutoffSqr)
-    #endif
 #else
                    if (!(excl & 0x1))
 #endif
                    {
                        dEdR = 0.0f;			
-                        /* E */
 	                     CDLJ_energy  = 0.0f;
                    }
-             	    /* E */
+
 		              energy         += CDLJ_energy;
                    dx             *= dEdR;
                    dy             *= dEdR;

--- a/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateObcGbsaSoftcoreBornSum.cu
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateObcGbsaSoftcoreBornSum.cu
@@ -25,11 +25,16 @@
 * -------------------------------------------------------------------------- */

 #include "gputypes.h"
+#include "freeEnergyGpuTypes.h"
+#include "GpuFreeEnergyCudaKernels.h"
 #include "kernels/cudaKernels.h"
-#include "GpuObcGbsaSoftcore.h"
+#include "openmm/OpenMMException.h"

 #include <cuda.h>
-#include <string>
+
+#define PARAMETER_PRINT 0
+#define MAX_PARAMETER_PRINT 10
+//#define DEBUG

 struct Atom {
    float x;
@@ -41,38 +46,17 @@ struct Atom {
    float polarScaleData;
 };

-struct cudaFreeEnergySimulationObcGbsaSoftcore {
-    float* pNonPolarScalingFactors;
-};
-struct cudaFreeEnergySimulationObcGbsaSoftcore gbsaSim;
-
 static __constant__ cudaGmxSimulation cSim;
-static __constant__ cudaFreeEnergySimulationObcGbsaSoftcore gbsaSimDev;
+static __constant__ cudaFreeEnergyGmxSimulation gbsaSimDev;

-extern "C"
-void SetCalculateObcGbsaSoftcoreBornSumSim(gpuContext gpu)
-{
-    cudaError_t status;
-    status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));     
-    RTERROR(status, "SetCalculateObcGbsaSoftcoreBornSumSim: cudaMemcpyToSymbol: SetSim copy to cSim failed");
-}
-
-extern "C"
-void SetCalculateObcGbsaSoftcoreNonPolarScalingFactorsSim( float* nonPolarScalingFactors )
+extern "C" void SetCalculateObcGbsaSoftcoreBornSumSim( freeEnergyGpuContext freeEnergyGpu) // Copies both host-side simulation structs into this file's __constant__ symbols (cSim, gbsaSimDev).
 {
     cudaError_t status;
-    gbsaSim.pNonPolarScalingFactors = nonPolarScalingFactors;
-    status                          = cudaMemcpyToSymbol(gbsaSimDev, &gbsaSim, sizeof(cudaFreeEnergySimulationObcGbsaSoftcore));
-    RTERROR(status, "cudaMemcpyToSymbol: SetCalculateObcGbsaSoftcoreNonPolarScalingFactorsSim");
+    status = cudaMemcpyToSymbol( cSim, &freeEnergyGpu->gpuContext->sim, sizeof(cudaGmxSimulation)); // base simulation struct -> cSim
+    RTERROR(status, "cudaMemcpyToSymbol: SetCalculateObcGbsaSoftcoreBornSumSim copy to cSim failed."); // RTERROR is defined elsewhere; presumably reports a non-success status

-    //(void) fprintf( stderr, "In SetCalculateObcGbsaSoftcoreNonPolarScalingFactorsSim\n" );
-}
-
-void GetCalculateObcGbsaSoftcoreBornSumSim(gpuContext gpu)
-{
-    cudaError_t status;
-    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));     
-    RTERROR(status, "GetCalculateObcGbsaSoftcoreBornSumSim: cudaMemcpyFromSymbol: SetSim copy from cSim failed");
+    status = cudaMemcpyToSymbol( gbsaSimDev, &freeEnergyGpu->freeEnergySim, sizeof(cudaFreeEnergyGmxSimulation)); // free-energy struct -> gbsaSimDev; a by-value snapshot, so fields set on freeEnergySim afterwards need another call to reach the device
+    RTERROR(status, "cudaMemcpyToSymbol: SetCalculateObcGbsaSoftcoreBornSumSim copy to gbsaSimDev failed.");
 }

 __global__ void kClearObcGbsaSoftcoreBornSum_kernel()
@@ -112,6 +96,7 @@ __global__ void kReduceObcGbsaSoftcoreBornForces_kernel()
 {
    unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
    float energy     = 0.0f;
+
    while (pos < cSim.atoms)
    {
        float bornRadius         = cSim.pBornRadii[pos];
@@ -153,7 +138,6 @@ __global__ void kReduceObcGbsaSoftcoreBornForces_kernel()
        float saTerm       = nonPolarScaleData*cSim.surfaceAreaFactor * r * r * ratio6;
        totalForce        += saTerm / bornRadius; // 1.102 == Temp mysterious fudge factor, FIX FIX FIX

-        /* E */
        energy            += saTerm;

        totalForce        *= bornRadius * bornRadius * obcChain;
@@ -162,14 +146,13 @@ __global__ void kReduceObcGbsaSoftcoreBornForces_kernel()
        *pFt               = totalForce;
        pos               += gridDim.x * blockDim.x;
    }
-    /* E */
+
    // correct for surface area factor of -6
    cSim.pEnergy[blockIdx.x * blockDim.x + threadIdx.x] += energy / -6.0f;
 }


-void kReduceObcGbsaSoftcoreBornForces(gpuContext gpu)
-{
+void kReduceObcGbsaSoftcoreBornForces( gpuContext gpu ){

    kReduceObcGbsaSoftcoreBornForces_kernel<<<gpu->sim.blocks, gpu->sim.bsf_reduce_threads_per_block>>>();
    LAUNCHERROR("kReduceObcGbsaSoftcoreBornForces");
@@ -187,7 +170,6 @@ void kReduceObcGbsaSoftcoreBornForces(gpuContext gpu)

 // Include versions of the kernels with cutoffs.

-#if 0
 #undef METHOD_NAME
 #undef USE_OUTPUT_BUFFER_PER_WARP
 #define USE_CUTOFF
@@ -209,7 +191,6 @@ void kReduceObcGbsaSoftcoreBornForces(gpuContext gpu)
 #undef METHOD_NAME
 #define METHOD_NAME(a, b) a##PeriodicByWarp##b
 #include "kCalculateObcGbsaSoftcoreBornSum.h"
-#endif

 __global__ void kReduceObcGbsaSoftcoreBornSum_kernel()
 {
@@ -222,10 +203,8 @@ __global__ void kReduceObcGbsaSoftcoreBornSum_kernel()
        float2 atom = cSim.pObcData[pos];
        
        // Get summed Born data
-        for (int i = 0; i < cSim.nonbondOutputBuffers; i++)
-        {
+        for( int i = 0; i < cSim.nonbondOutputBuffers; i++ ){
            sum += *pSt;
-       //     printf("%4d %4d A: %9.4f\n", pos, i, *pSt);
            pSt += cSim.stride;
        }
        
@@ -257,7 +236,7 @@ void kReduceObcGbsaSoftcoreBornSum(gpuContext gpu)
 /** 
 * Initialize parameters for Cuda Obc softcore
 * 
- * @param gpu                  gpu context
+ * @param freeEnergyGpu        free energy context; the base gpuContext is reached through freeEnergyGpu->gpuContext
 * @param innerDielectric      solute dielectric
 * @param solventDielectric    solvent dielectric
 * @param radius               intrinsic Born radii
@@ -268,7 +247,7 @@ void kReduceObcGbsaSoftcoreBornSum(gpuContext gpu)
 */

 extern "C"
-GpuObcGbsaSoftcore* gpuSetObcSoftcoreParameters(gpuContext gpu, float innerDielectric, float solventDielectric, float nonPolarPrefactor,
+void  gpuSetObcSoftcoreParameters( freeEnergyGpuContext freeEnergyGpu, float innerDielectric, float solventDielectric, float nonPolarPrefactor,
                                   const std::vector<float>& radius, const std::vector<float>& scale,
                                   const std::vector<float>& charge, const std::vector<float>& nonPolarScalingFactors)
 {
@@ -281,97 +260,252 @@ GpuObcGbsaSoftcore* gpuSetObcSoftcoreParameters(gpuContext gpu, float innerDiele

 // ---------------------------------------------------------------------------------------

-    unsigned int atoms                     = radius.size();
+    unsigned int numberOfParticles                       = radius.size(); // count comes from radius; scale, charge and nonPolarScalingFactors are indexed over the same range below without a size check
+    gpuContext gpu                                       = freeEnergyGpu->gpuContext; // base context owned by the free-energy context

     // initialize parameters

-//    gpu->bIncludeGBSA = true;
-    GpuObcGbsaSoftcore* gpuObcGbsaSoftcore = new GpuObcGbsaSoftcore();
-    gpuObcGbsaSoftcore->initializeNonPolarScalingFactors( gpu->sim.paddedNumberOfAtoms );
+    freeEnergyGpu->psNonPolarScalingFactors              = new CUDAStream<float>( gpu->sim.paddedNumberOfAtoms, 1, "ObcSoftcoreNonPolarScaling"); // NOTE(review): any previous psNonPolarScalingFactors is overwritten without being freed; no delete is visible in this function -- confirm ownership
+    freeEnergyGpu->freeEnergySim.pNonPolarScalingFactors = freeEnergyGpu->psNonPolarScalingFactors->_pDevData; // publish the stream's _pDevData pointer through freeEnergySim

     gpu->sim.surfaceAreaFactor                           =  -6.0f*PI*4.0f*nonPolarPrefactor;
-    for (unsigned int i = 0; i < atoms; i++)
-    {
-            (*gpu->psObcData)[i].x = radius[i] - dielectricOffset;
-            (*gpu->psObcData)[i].y = scale[i] * (*gpu->psObcData)[i].x;
-            (*gpu->psPosq4)[i].w   = charge[i];
-            gpuObcGbsaSoftcore->setNonPolarScalingFactors( i, nonPolarScalingFactors[i] );
+    gpu->sim.preFactor                                   = 2.0f*electricConstant*((1.0f/innerDielectric)-(1.0f/solventDielectric))*gpu->sim.forceConversionFactor; // computed before the diagnostics so the log line below can print it

+    for( unsigned int ii = 0; ii < numberOfParticles; ii++ ){ // fill per-particle values; they are pushed with the Upload() calls further down
+        (*gpu->psObcData)[ii].x                        = radius[ii] - dielectricOffset; // dielectricOffset is declared outside the visible hunk
+        (*gpu->psObcData)[ii].y                        = scale[ii] * (*gpu->psObcData)[ii].x; // .y holds scale times the offset radius, not the bare scale factor
+        (*gpu->psPosq4)[ii].w                          = charge[ii];
+        (*freeEnergyGpu->psNonPolarScalingFactors)[ii] = nonPolarScalingFactors[ii];
     }

     // diagnostics
-#define DUMP_PARAMETERS 0
-#if (DUMP_PARAMETERS == 1)
-    (void) fprintf( stderr, "%s %u %u\n", methodName.c_str(), gpu->natoms, gpu->sim.paddedNumberOfAtoms );
-    for (unsigned int i = 0; i < atoms; i++)
-    {
-       (void) fprintf( stderr, "%6u %13.6e %13.6e %8.3f %8.3f\n", i,  (*gpu->psObcData)[i].x, (*gpu->psObcData)[i].y, (*gpu->psPosq4)[i].w , nonPolarScalingFactors[i] );
+    // Diagnostics go to freeEnergyGpu->log and are skipped when that pointer is null.
+    if( freeEnergyGpu->log ){
+        (void) fprintf( freeEnergyGpu->log, "%s %u %u\n", methodName.c_str(), gpu->natoms, gpu->sim.paddedNumberOfAtoms );
+        (void) fprintf( freeEnergyGpu->log, "surfaceAreaFactor=%15.7e preFactor=%15.7e\n", gpu->sim.surfaceAreaFactor, gpu->sim.preFactor);
+#ifdef PARAMETER_PRINT // NOTE(review): PARAMETER_PRINT is #defined as 0 in this file, so #ifdef is always true; "#if PARAMETER_PRINT" may have been intended -- confirm
+        int maxPrint = MAX_PARAMETER_PRINT;
+        for( unsigned int ii = 0; ii < numberOfParticles; ii++ ){
+            (void) fprintf( freeEnergyGpu->log, "%6u %13.6e %13.6e %8.3f %8.3f\n", ii, 
+                            (*gpu->psObcData)[ii].x, (*gpu->psObcData)[ii].y, (*gpu->psPosq4)[ii].w, (*freeEnergyGpu->psNonPolarScalingFactors)[ii] );
+             if( ii == maxPrint ){ // after the row for index maxPrint, jump towards the tail; the loop's ii++ then resumes one past the new value
+                ii = numberOfParticles - maxPrint;
+                if( ii < maxPrint )ii = maxPrint; // never jump backwards when the list is short
+            }
         }
 #endif
+    }

     // dummy out extra atom data

-    for (unsigned int i = gpu->natoms; i < gpu->sim.paddedNumberOfAtoms; i++)
-    {
-        (*gpu->psBornRadii)[i]     = 0.2f;
-        (*gpu->psObcData)[i].x     = 0.01f;
-        (*gpu->psObcData)[i].y     = 0.01f;
+    for (unsigned int ii = gpu->natoms; ii < gpu->sim.paddedNumberOfAtoms; ii++ ){ // padding slots [natoms, paddedNumberOfAtoms); psBornRadii padding is no longer set here
+        (*gpu->psObcData)[ii].x                         = 0.01f;
+        (*gpu->psObcData)[ii].y                         = 0.01f;
+        (*freeEnergyGpu->psNonPolarScalingFactors)[ii]  = 0.0f; // padding entries get a zero scaling factor
     }

     // load data to board

-    gpuObcGbsaSoftcore->upload( gpu );
-    gpu->psBornRadii->Upload();
     gpu->psObcData->Upload();
     gpu->psPosq4->Upload();
+    freeEnergyGpu->psNonPolarScalingFactors->Upload();

-    gpu->sim.preFactor = 2.0f*electricConstant*((1.0f/innerDielectric)-(1.0f/solventDielectric))*gpu->sim.forceConversionFactor;
+    return; // results live on freeEnergyGpu / gpu; nothing is returned any more
+}
+
+void kPrintObcGbsaSoftcore( freeEnergyGpuContext freeEnergyGpu, std::string callId, int call, FILE* log){
+    // Debug dump: downloads the OBC, Born and softcore parameter streams and prints one row per atom to log.
+    gpuContext gpu = freeEnergyGpu->gpuContext;
+    int maxPrint   = gpu->natoms; // equal to natoms, so the skip-ahead test at the bottom of the loop never fires and every atom is printed
+    // NOTE(review): log is passed to fprintf without a null check -- callers must supply a valid FILE*.
+    (void) fprintf( log, "kPrintObcGbsaSoftcore %s %d\n", callId.c_str(), call );
+    // Download() is called on each stream whose _pSysData is read below (presumably a device-to-host copy).
+    gpu->psObcData->Download();
+    gpu->psBornRadii->Download();
+    gpu->psBornForce->Download();
+    gpu->psPosq4->Download(); // only referenced by the commented-out arguments further down
+    freeEnergyGpu->psNonPolarScalingFactors->Download();
+
+    CUDAStream<float4>* sigEps4          = freeEnergyGpu->psSigEps4;
+    sigEps4->Download();
+
+    (void) fprintf( log, "BornSum Born radii & params\n" );
+    for( int ii = 0; ii < gpu->natoms; ii++ ){
+        //(void) fprintf( log, "%6d prm[%15.7e %15.7e %15.7e %15.7e] [%15.7e %15.7e %15.7e %15.7e] bR=%15.7e bF=%15.7e swDrv=%3.1f x[%8.3f %8.3f %8.3f %15.7f]\n",
+        (void) fprintf( log, "%6d prm[%15.7e %15.7e %15.7e] sig/eps4[%15.7e %15.7e %15.7e %15.7e] bR=%15.7e bF=%15.7e\n",
+                        ii,
+                        // prm[]: psObcData .x and .y, then the non-polar scaling factor
+                        gpu->psObcData->_pSysData[ii].x,
+                        gpu->psObcData->_pSysData[ii].y,
+                        freeEnergyGpu->psNonPolarScalingFactors->_pSysData[ii],
+                        // sig/eps4[]: the four components of psSigEps4 (presumably sigma, epsilon, softcore LJ lambda, charge -- confirm)
+                        sigEps4->_pSysData[ii].x,
+                        sigEps4->_pSysData[ii].y,
+                        sigEps4->_pSysData[ii].z,
+                        sigEps4->_pSysData[ii].w,
+                        // bR / bF: entries of psBornRadii and psBornForce
+                        gpu->psBornRadii->_pSysData[ii],
+                        gpu->psBornForce->_pSysData[ii]
+/*
+                        gpu->psPosq4->_pSysData[ii].x,
+                        gpu->psPosq4->_pSysData[ii].y,
+                        gpu->psPosq4->_pSysData[ii].z,
+                        gpu->psPosq4->_pSysData[ii].w );
+*/
+                        );
+        // Skip-ahead for truncated output; unreachable while maxPrint == natoms.
+        if( (ii == maxPrint) && ( ii < (gpu->natoms - maxPrint)) ){
+            ii = gpu->natoms - maxPrint;
+        }
+    }

-    return gpuObcGbsaSoftcore;
 }
+extern __global__ void kFindBlockBoundsCutoff_kernel();
+extern __global__ void kFindBlockBoundsPeriodic_kernel();
+
+extern __global__ void kFindBlocksWithInteractionsCutoff_kernel();
+extern __global__ void kFindBlocksWithInteractionsPeriodic_kernel();

-void kCalculateObcGbsaSoftcoreBornSum(gpuContext gpu)
+extern __global__ void kFindInteractionsWithinBlocksCutoff_kernel(unsigned int*);
+extern __global__ void kFindInteractionsWithinBlocksPeriodic_kernel(unsigned int*);
+
+// Host-side driver for the OBC softcore Born-sum pass. Calls kClearObcGbsaSoftcoreBornSum, runs the
+// kFindBlock*/compactStream sequence for the cutoff and periodic methods, then launches the Born-sum
+// kernel variant chosen by freeEnergySim.nonbondedMethod and gpu->bOutputBufferPerWarp.
+void kCalculateObcGbsaSoftcoreBornSum( freeEnergyGpuContext freeEnergyGpu )
 {
  //  printf("kCalculateObcGbsaSoftcoreBornSum\n");
+    gpuContext gpu = freeEnergyGpu->gpuContext;
+#ifdef DEBUG 
+// DEBUG builds pass two scratch buffers (pdE1/pdE2) to the kernels and dump them after the launch.
+fprintf( stderr, "kCalculateObcGbsaSoftcoreBornSum cutoff=%15.7e\n", gpu->sim.nonbondedCutoffSqr );
+int psize = gpu->sim.paddedNumberOfAtoms;
+CUDAStream<float4>* pdE1 = new CUDAStream<float4>( psize, 1, "pdE");
+CUDAStream<float4>* pdE2 = new CUDAStream<float4>( psize, 1, "pdE");
+float bF; 
+float bF1; 
+showWorkUnitsFreeEnergy( freeEnergyGpu, 1 );
+for( int ii = 0; ii < psize; ii++ ){
+pdE1->_pSysData[ii].x = 0.0f;
+pdE1->_pSysData[ii].y = 0.001f;
+pdE1->_pSysData[ii].z = 0.001f;
+pdE1->_pSysData[ii].w = 0.001f;
+pdE2->_pSysData[ii].x = 0.001f;
+pdE2->_pSysData[ii].y = 0.001f;
+pdE2->_pSysData[ii].z = 0.001f;
+pdE2->_pSysData[ii].w = 0.001f;
+}
+pdE1->Upload();
+pdE2->Upload();
+#endif
+
    kClearObcGbsaSoftcoreBornSum(gpu);
    LAUNCHERROR("kClearBornSum from kCalculateObcGbsaSoftcoreBornSum");

-    switch (gpu->sim.nonbondedMethod)
+    switch ( freeEnergyGpu->freeEnergySim.nonbondedMethod )
    {
-        case NO_CUTOFF:
-#define GBSA 0
-#if GBSA == 1
-gpu->psWorkUnit->Download();
-fprintf( stderr, "kCalculateObcGbsaSoftcoreBornSum: bOutputBufferPerWarp=%u blks=%u th/blk=%u wu=%u %u\n", gpu->bOutputBufferPerWarp,
-                 gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, gpu->sim.workUnits, gpu->psWorkUnit->_pSysData[0] );
-#endif
-#undef GBSA
+        case FREE_ENERGY_NO_CUTOFF:

+#ifdef DEBUG 
+            if (gpu->bOutputBufferPerWarp)
+                kCalculateObcGbsaSoftcoreN2ByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit,  pdE1->_pDevData, pdE2->_pDevData);
+            else
+                kCalculateObcGbsaSoftcoreN2BornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit,  pdE1->_pDevData, pdE2->_pDevData);
+
+#else
            if (gpu->bOutputBufferPerWarp)
                kCalculateObcGbsaSoftcoreN2ByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit);
            else
                kCalculateObcGbsaSoftcoreN2BornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit);
+#endif
            break;
-#if 0
-        case CUTOFF:
+
+        case FREE_ENERGY_CUTOFF:
+
+            kFindBlockBoundsCutoff_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
+            LAUNCHERROR("kFindBlockBoundsCutoff");
+            kFindBlocksWithInteractionsCutoff_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
+            LAUNCHERROR("kFindBlocksWithInteractionsCutoff");
+            compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
+            kFindInteractionsWithinBlocksCutoff_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                    sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
+            LAUNCHERROR("kFindInteractionsWithinBlocksCutoff");
+#ifdef DEBUG
+            if (gpu->bOutputBufferPerWarp)
+                kCalculateObcGbsaSoftcoreCutoffByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, pdE1->_pDevData, pdE2->_pDevData);
+            else
+                kCalculateObcGbsaSoftcoreCutoffBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, pdE1->_pDevData, pdE2->_pDevData);
+#else
            if (gpu->bOutputBufferPerWarp)
-                kCalculateObcGbsaCutoffByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                kCalculateObcGbsaSoftcoreCutoffByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
                        (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
            else
-                kCalculateObcGbsaCutoffBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                kCalculateObcGbsaSoftcoreCutoffBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
                        (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
+#endif
+
            break;
-        case PERIODIC:
+
+        case FREE_ENERGY_PERIODIC:
+
+            kFindBlockBoundsPeriodic_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
+            LAUNCHERROR("kFindBlockBoundsPeriodic");
+            kFindBlocksWithInteractionsPeriodic_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
+            LAUNCHERROR("kFindBlocksWithInteractionsPeriodic");
+            compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
+            kFindInteractionsWithinBlocksPeriodic_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                    sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
+            LAUNCHERROR("kFindInteractionsWithinBlocksPeriodic");
+#ifdef DEBUG
+
            if (gpu->bOutputBufferPerWarp)
-                kCalculateObcGbsaPeriodicByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                kCalculateObcGbsaSoftcorePeriodicByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, pdE1->_pDevData, pdE2->_pDevData);
+            else
+                kCalculateObcGbsaSoftcorePeriodicBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, pdE1->_pDevData, pdE2->_pDevData);
+#else
+            if (gpu->bOutputBufferPerWarp)
+                kCalculateObcGbsaSoftcorePeriodicByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
                        (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
            else
-                kCalculateObcGbsaPeriodicBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                kCalculateObcGbsaSoftcorePeriodicBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
                        (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
-            break;
+
 #endif
+            break;
+
+        default:
+            throw OpenMM::OpenMMException( "Nonbonded softcore method not recognized." );
+
    }
    LAUNCHERROR("kCalculateObcGbsaSoftcoreBornSum");
+
+#ifdef DEBUG 
+// NOTE(review): TARGET is used below but kCalculateObcGbsaSoftcoreBornSum.h #undefs it; presumably its #define must be re-enabled for DEBUG builds -- confirm.
+pdE1->Download();
+pdE2->Download();
+//gpu->psBornRadii->Download();
+//gpu->psObcData->Download();
+fprintf( stderr, "bL Obc Cud\n" );
+bF  = 0.0;
+bF1 = 0.0;
+for( int ii = 0; ii < gpu->natoms; ii++ ){
+    bF1 += pdE1->_pSysData[ii].x;
+            fprintf( stderr, "%4d %15.7e %15.7e %15.7e %15.7e    %15.7e %15.7e %15.7e %15.7e\n", ii,
+                     pdE1->_pSysData[ii].x, pdE1->_pSysData[ii].y, pdE1->_pSysData[ii].z, pdE1->_pSysData[ii].w,
+                     pdE2->_pSysData[ii].x, pdE2->_pSysData[ii].y, pdE2->_pSysData[ii].z, pdE2->_pSysData[ii].w );
+    bF += pdE1->_pSysData[ii].x;
+}
+fprintf( stderr, "bS Obc Cud %6d %15.7e %15.7e\n", TARGET, bF, bF1 );
+// Release the diagnostic buffers; they are allocated with new on every call and were never freed.
+delete pdE1;
+delete pdE2;
+#endif
 }
--- a/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateObcGbsaSoftcoreBornSum.h
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateObcGbsaSoftcoreBornSum.h
@@ -30,7 +30,22 @@
 * different versions of the kernels.
 */

-__global__ void METHOD_NAME(kCalculateObcGbsaSoftcore, BornSum_kernel)(unsigned int* workUnit)
+#undef TARGET
+//#define TARGET 1
+
+__global__ 
+#if (__CUDA_ARCH__ >= 200)
+__launch_bounds__(GF1XX_NONBOND_THREADS_PER_BLOCK, 1)
+#elif (__CUDA_ARCH__ >= 120)
+__launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1)
+#else
+__launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
+#endif
+#ifdef DEBUG
+void METHOD_NAME(kCalculateObcGbsaSoftcore, BornSum_kernel)(unsigned int* workUnit, float4* pdE1, float4* pdE2)
+#else
+void METHOD_NAME(kCalculateObcGbsaSoftcore, BornSum_kernel)(unsigned int* workUnit)
+#endif
 {
    extern __shared__ Atom sA[];
    unsigned int totalWarps   = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
@@ -51,15 +66,18 @@ __global__ void METHOD_NAME(kCalculateObcGbsaSoftcore, BornSum_kernel)(unsigned
        unsigned int x = workUnit[pos];
        unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
        x              = (x >> 17) << GRIDBITS;
+
        float       dx;
        float       dy;
        float       dz;
+
        float       r2;
        float       r;

        unsigned int tgx = threadIdx.x & (GRID - 1);
        unsigned int tbx = threadIdx.x - tgx;
        unsigned int tj  = tgx;
+
        Atom* psA        = &sA[tbx];

        if (x == y) // Handle diagonals uniquely at 50% efficiency
@@ -69,9 +87,11 @@ __global__ void METHOD_NAME(kCalculateObcGbsaSoftcore, BornSum_kernel)(unsigned
            float4 apos                             = cSim.pPosq[i];    // Local atom x, y, z, sum
            float2 ar                               = cSim.pObcData[i];   // Local atom vr, sr
            float polarScaleData                    = gbsaSimDev.pNonPolarScalingFactors[i];  // scale contribution
+
            sA[threadIdx.x].x                       = apos.x;
            sA[threadIdx.x].y                       = apos.y;
            sA[threadIdx.x].z                       = apos.z;
+
            sA[threadIdx.x].r                       = ar.x;
            sA[threadIdx.x].sr                      = ar.y;
            sA[threadIdx.x].polarScaleData          = polarScaleData;
@@ -87,35 +107,60 @@ __global__ void METHOD_NAME(kCalculateObcGbsaSoftcore, BornSum_kernel)(unsigned
                dy                     -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
                dz                     -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
 #endif
+
                r2                      = dx * dx + dy * dy + dz * dz;
-#if defined USE_PERIODIC
+
+#if defined USE_CUTOFF
                if (i < cSim.atoms && x+j < cSim.atoms && r2 < cSim.nonbondedCutoffSqr)
-#elif defined USE_CUTOFF
-                if (r2 < cSim.nonbondedCutoffSqr)
+#else
+                if (i < cSim.atoms && x+j < cSim.atoms )
 #endif
                {
                    r                       = sqrt(r2);
-                    float rInverse          = 1.0f / r;
+                    float rInverse          = 1.0f/r;
                    float rScaledRadiusJ    = r + psA[j].sr;
-                    if ((j != tgx) && (ar.x < rScaledRadiusJ))
-                    {
+                    if( (j != tgx) && (ar.x < rScaledRadiusJ) ){
                        float l_ij     = 1.0f / max(ar.x, fabs(r - psA[j].sr));
                        float u_ij     = 1.0f / rScaledRadiusJ;
                        float l_ij2    = l_ij * l_ij;
                        float u_ij2    = u_ij * u_ij;
                        float ratio    = log(u_ij / l_ij);
-                        float sum      = l_ij -
-                                         u_ij +
+                        float term     = l_ij - u_ij +
                                         0.25f * r * (u_ij2 - l_ij2) +
                                         (0.50f * rInverse * ratio) +
                                         (0.25f * psA[j].sr * psA[j].sr * rInverse) *
                                         (l_ij2 - u_ij2);
-                        float rj = psA[j].r;
-                        if (ar.x < (rj - r))
-                        {
-                            sum += 2.0f * ((1.0f / ar.x) - l_ij);
+                        float rj       = psA[j].sr;
+                        if( ar.x < (rj - r) ){
+                            term += 2.0f * ((1.0f / ar.x) - l_ij);
                        }
-                        apos.w +=  psA[j].polarScaleData*sum;
+                        apos.w += psA[j].polarScaleData*term;
+#ifdef DEBUG
+int jIdx = j;
+if( i == TARGET ){
+
+int tjj     = y+jIdx;
+pdE1[tjj].x = term;
+pdE1[tjj].y = r;
+pdE1[tjj].z = ar.x;
+pdE1[tjj].w = 1.0f;
+
+pdE2[tjj].x = r;
+pdE2[tjj].y = l_ij;
+pdE2[tjj].z = rj;
+pdE2[tjj].w = 1.0f;
+}
+/*
+if( (y+jIdx) == TARGET ){
+int tjj     = i;
+pdE2[tjj].x = sum;
+pdE2[tjj].y = psA[jIdx].polarScaleData;
+pdE2[tjj].z = ar.x;
+pdE2[tjj].w = -1.0f;
+} */
+#endif
+
+
                    }
                }
            }
@@ -128,41 +173,49 @@ __global__ void METHOD_NAME(kCalculateObcGbsaSoftcore, BornSum_kernel)(unsigned
            unsigned int offset   = x + tgx + (x >> GRIDBITS) * cSim.stride;
            cSim.pBornSum[offset] = apos.w;
 #endif
-        }
-        else        // 100% utilization
-        {
+
+        } else {
+
            // Read fixed atom data into registers and GRF
+
            unsigned int j                  = y + tgx;
            unsigned int i                  = x + tgx;

            float4 temp                     = cSim.pPosq[j];
            float2 temp1                    = cSim.pObcData[j];
            float polarScaleDataJ           = gbsaSimDev.pNonPolarScalingFactors[j];  // scale contribution
+
            float4 apos                     = cSim.pPosq[i];        // Local atom x, y, z, sum
+            apos.w                          = 0.0f;
+
            float2 ar                       = cSim.pObcData[i];    // Local atom vr, sr
            float polarScaleDataI           = gbsaSimDev.pNonPolarScalingFactors[i];  // scale contribution
+
            sA[threadIdx.x].x               = temp.x;
            sA[threadIdx.x].y               = temp.y;
            sA[threadIdx.x].z               = temp.z;
+
            sA[threadIdx.x].r               = temp1.x;
            sA[threadIdx.x].sr              = temp1.y;
+
            sA[threadIdx.x].polarScaleData  = polarScaleDataJ;
-            sA[threadIdx.x].sum = apos.w    = 0.0f;
+
+            sA[threadIdx.x].sum             = 0.0f;

 #ifdef USE_CUTOFF
-            //unsigned int flags = cSim.pInteractionFlag[pos + (blockIdx.x*numWorkUnits)/gridDim.x];
            unsigned int flags              = cSim.pInteractionFlag[pos];
            if (flags == 0)
            {
                // No interactions in this block.
            }
-            else if (flags == 0xFFFFFFFF)
+//            else if (flags == 0xFFFFFFFF)
+            else if (flags )
 #endif
            {
                // Compute all interactions within this block.

-                for (unsigned int j = 0; j < GRID; j++)
-                {
+                for( unsigned int j = 0; j < GRID; j++ ){
+
                    dx  = psA[tj].x - apos.x;
                    dy  = psA[tj].y - apos.y;
                    dz  = psA[tj].z - apos.z;
@@ -172,10 +225,10 @@ __global__ void METHOD_NAME(kCalculateObcGbsaSoftcore, BornSum_kernel)(unsigned
                    dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
 #endif
                    r2                      = dx * dx + dy * dy + dz * dz;
-#ifdef USE_PERIODIC
+#ifdef USE_CUTOFF
                    if (i < cSim.atoms && y+tj < cSim.atoms && r2 < cSim.nonbondedCutoffSqr)
-#elif defined USE_CUTOFF
-                    if (r2 < cSim.nonbondedCutoffSqr)
+#else
+                    if (i < cSim.atoms && y+tj < cSim.atoms )
 #endif
                    {
                        r                       = sqrt(r2);
@@ -188,8 +241,7 @@ __global__ void METHOD_NAME(kCalculateObcGbsaSoftcore, BornSum_kernel)(unsigned
                            float l_ij2    = l_ij * l_ij;
                            float u_ij2    = u_ij * u_ij;
                            float ratio    = log(u_ij / l_ij);
-                            float term     = l_ij -
-                                             u_ij +
+                            float term     = l_ij - u_ij +
                                             0.25f * r * (u_ij2 - l_ij2) +
                                             (0.50f * rInverse * ratio) +
                                             (0.25f * psA[tj].sr * psA[tj].sr * rInverse) *
@@ -200,8 +252,39 @@ __global__ void METHOD_NAME(kCalculateObcGbsaSoftcore, BornSum_kernel)(unsigned
                            {
                                term += 2.0f * ((1.0f / ar.x) - l_ij);
                            }
-                            //apos.w        += term;
                            apos.w        += (scale*term);
+
+#ifdef DEBUG
+int jIdx = tj;
+if( i == TARGET ){
+
+int tjj     = y+jIdx;
+pdE1[tjj].x = term;
+pdE1[tjj].y = r;
+pdE1[tjj].z = ar.x;
+pdE1[tjj].w = 2.0f;
+/*
+pdE2[tjj].x = r;
+pdE2[tjj].y = l_ij;
+pdE2[tjj].z = rj;
+pdE2[tjj].w = 2.0f;
+*/
+}
+
+if( (y+jIdx) == TARGET ){
+int tjj     = i;
+/*
+pdE1[tjj].x = term;
+pdE1[tjj].y = r;
+pdE1[tjj].z = ar.x;
+pdE1[tjj].w = -2.0f;
+*/
+pdE2[tjj].x = term;
+pdE2[tjj].y = r;
+pdE2[tjj].z = ar.x;
+pdE2[tjj].w = -2.0f;
+}
+#endif
                        }
                        float rScaledRadiusI    = r + ar.y;
                        if (psA[tj].r < rScaledRadiusI)
@@ -211,18 +294,38 @@ __global__ void METHOD_NAME(kCalculateObcGbsaSoftcore, BornSum_kernel)(unsigned
                            float l_ij2    = l_ij * l_ij;
                            float u_ij2    = u_ij * u_ij;
                            float ratio    = log(u_ij / l_ij);
-                            float term     = l_ij -
-                                             u_ij +
+                            float term     = l_ij - u_ij +
                                             0.25f * r * (u_ij2 - l_ij2) +
                                             (0.50f * rInverse * ratio) +
                                             (0.25f * ar.y * ar.y * rInverse) *
                                             (l_ij2 - u_ij2);
+
                            float rj = psA[tj].r;
                            if (rj < (ar.y - r))
                            {
                                term += 2.0f * ((1.0f / psA[tj].r) - l_ij);
                            }
                            psA[tj].sum    += polarScaleDataI*term;
+
+#ifdef DEBUG
+int jIdx = tj;
+if( i == TARGET ){
+
+int tjj     = y+jIdx;
+pdE1[tjj].x = term;
+pdE1[tjj].y = r;
+pdE1[tjj].z = ar.x;
+pdE1[tjj].w = 3.0f;
+}
+
+if( (y+jIdx) == TARGET ){
+int tjj     = i;
+pdE2[tjj].x = term;
+pdE2[tjj].y = r;
+pdE2[tjj].z = ar.x;
+pdE2[tjj].w = -3.0f;
+}
+#endif
                        }
                    }
                    tj = (tj - 1) & (GRID - 1);
@@ -247,10 +350,10 @@ __global__ void METHOD_NAME(kCalculateObcGbsaSoftcore, BornSum_kernel)(unsigned
                        dz                     -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
 #endif
                        r2                      = dx * dx + dy * dy + dz * dz;
-#ifdef USE_PERIODIC
+#ifdef USE_CUTOFF
                        if (i < cSim.atoms && y+j < cSim.atoms && r2 < cSim.nonbondedCutoffSqr)
-#elif defined USE_CUTOFF
-                        if (r2 < cSim.nonbondedCutoffSqr)
+#else
+                        if (i < cSim.atoms && y+j < cSim.atoms )
 #endif
                        {
                            r                       = sqrt(r2);

--- a/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateObcGbsaSoftcoreForces2.cu
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateObcGbsaSoftcoreForces2.cu
@@ -34,7 +34,7 @@
 using namespace std;

 #include "gputypes.h"
-#include "GpuObcGbsaSoftcore.h"
+#include "freeEnergyGpuTypes.h"

 struct Atom {
    float x;
@@ -49,38 +49,19 @@ struct Atom {
    float fb;
 };

-struct cudaFreeEnergySimulationObcGbsaSoftcore {
-    float* pNonPolarScalingFactors;
-};
-struct cudaFreeEnergySimulationObcGbsaSoftcore gbsaSimObc2;
-
 static __constant__ cudaGmxSimulation cSim;
-static __constant__ cudaFreeEnergySimulationObcGbsaSoftcore gbsaSimDev;
+static __constant__ cudaFreeEnergyGmxSimulation feSimDev;

 extern "C"
-void SetCalculateObcGbsaSoftcoreForces2Sim(gpuContext gpu)
+void SetCalculateObcGbsaSoftcoreForces2Sim( freeEnergyGpuContext freeEnergyGpu )  // uploads both host simulation structs to this file's __constant__ symbols
 {
    cudaError_t status;
-    status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));     
-    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
-}
+    status = cudaMemcpyToSymbol(cSim, &freeEnergyGpu->gpuContext->sim, sizeof(cudaGmxSimulation));  // base simulation struct -> cSim
+    RTERROR(status, "cudaMemcpyToSymbol: SetCalculateObcGbsaSoftcoreForces2Sim copy to cSim failed");

-extern "C" 
-void SetCalculateObcGbsaSoftcoreNonPolarScalingFactorsObc2Sim( float* nonPolarScalingFactors )
-{
-    cudaError_t status;
-    gbsaSimObc2.pNonPolarScalingFactors = nonPolarScalingFactors;
-    status                              = cudaMemcpyToSymbol(gbsaSimDev, &gbsaSimObc2, sizeof(cudaFreeEnergySimulationObcGbsaSoftcore));
-    RTERROR(status, "cudaMemcpyToSymbol: SetCalculateObcGbsaSoftcoreNonPolarScalingFactorsObc2Sim");
+    status = cudaMemcpyToSymbol( feSimDev, &freeEnergyGpu->freeEnergySim, sizeof(cudaFreeEnergyGmxSimulation));  // free-energy struct -> feSimDev (kernels read feSimDev.pNonPolarScalingFactors)
+    RTERROR(status, "cudaMemcpyToSymbol: SetCalculateObcGbsaSoftcoreForces2Sim copy to feSimDev failed");

-    //(void) fprintf( stderr, "In SetCalculateObcGbsaSoftcoreNonPolarScalingFactorsObc2Sim\n" );
-}
-
-void GetCalculateObcGbsaSoftcoreForces2Sim(gpuContext gpu)
-{
-    cudaError_t status;
-    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));     
-    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
 }

 // Include versions of the kernels for N^2 calculations.
@@ -116,15 +97,14 @@ void GetCalculateObcGbsaSoftcoreForces2Sim(gpuContext gpu)
 #define METHOD_NAME(a, b) a##PeriodicByWarp##b
 #include "kCalculateObcGbsaSoftcoreForces2.h"

-void kCalculateObcGbsaSoftcoreForces2(gpuContext gpu)
+void kCalculateObcGbsaSoftcoreForces2( freeEnergyGpuContext freeEnergyGpu )
 {
    //printf("kCalculateObcGbsaSoftcoreForces2\n");
-    //fprintf( stderr, "kCalculateObcGbsaSoftcoreForces2 nonbondedMethod=%d warp=%d\n", gpu->sim.nonbondedMethod, gpu->bOutputBufferPerWarp);
-//fprintf( stderr, "kCalculateObcGbsaSoftcoreForces2 nonbondedMethod=%d calling kReduceForces\n", gpu->sim.nonbondedMethod);
-//kReduceForces(gpu);
-    switch (gpu->sim.nonbondedMethod)
+    gpuContext gpu                     = freeEnergyGpu->gpuContext;
+    switch (freeEnergyGpu->freeEnergySim.nonbondedMethod)
    {
-        case NO_CUTOFF:
+        case FREE_ENERGY_NO_CUTOFF:
+
            if (gpu->bOutputBufferPerWarp)
                kCalculateObcGbsaSoftcoreN2ByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
                        sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pWorkUnit);
@@ -132,7 +112,9 @@ void kCalculateObcGbsaSoftcoreForces2(gpuContext gpu)
                kCalculateObcGbsaSoftcoreN2Forces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
                        sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pWorkUnit);
            break;
-        case CUTOFF:
+
+        case FREE_ENERGY_CUTOFF:
+
            if (gpu->bOutputBufferPerWarp)
                kCalculateObcGbsaSoftcoreCutoffByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
                        (sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
@@ -140,7 +122,9 @@ void kCalculateObcGbsaSoftcoreForces2(gpuContext gpu)
                kCalculateObcGbsaSoftcoreCutoffForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
                        (sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
            break;
-        case PERIODIC:
+
+        case FREE_ENERGY_PERIODIC:
+
            if (gpu->bOutputBufferPerWarp)
                kCalculateObcGbsaSoftcorePeriodicByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
                        (sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);

--- a/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateObcGbsaSoftcoreForces2.h
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateObcGbsaSoftcoreForces2.h
@@ -30,7 +30,15 @@
 * different versions of the kernels.
 */

-__global__ void METHOD_NAME(kCalculateObcGbsaSoftcore, Forces2_kernel)(unsigned int* workUnit)
+__global__
+#if (__CUDA_ARCH__ >= 200)
+__launch_bounds__(GF1XX_BORNFORCE2_THREADS_PER_BLOCK, 1)
+#elif (__CUDA_ARCH__ >= 120)
+__launch_bounds__(GT2XX_BORNFORCE2_THREADS_PER_BLOCK, 1)
+#else
+__launch_bounds__(G8X_BORNFORCE2_THREADS_PER_BLOCK, 1)
+#endif
+void METHOD_NAME(kCalculateObcGbsaSoftcore, Forces2_kernel)(unsigned int* workUnit)
 {
    extern __shared__ Atom sA[];
    unsigned int totalWarps   = cSim.bornForce2_blocks*cSim.bornForce2_threads_per_block/GRID;
@@ -55,14 +63,19 @@ __global__ void METHOD_NAME(kCalculateObcGbsaSoftcore, Forces2_kernel)(unsigned
        float4 apos                     = cSim.pPosq[i];
        float2 a                        = cSim.pObcData[i];
        float fb                        = cSim.pBornForce[i];
-        float  nonPolarScaleDataI       = gbsaSimDev.pNonPolarScalingFactors[i];
+        float  nonPolarScaleDataI       = feSimDev.pNonPolarScalingFactors[i];
        unsigned int tbx                = threadIdx.x - tgx;
        unsigned int tj                 = tgx;
+
        Atom* psA                       = &sA[tbx];
+        sA[threadIdx.x].fx              = 0.0f;
+        sA[threadIdx.x].fy              = 0.0f;
+        sA[threadIdx.x].fz              = 0.0f;
+
        float3 af;
-        sA[threadIdx.x].fx = af.x   = 0.0f;
-        sA[threadIdx.x].fy = af.y   = 0.0f;
-        sA[threadIdx.x].fz = af.z   = 0.0f;
+        af.x                            = 0.0f;
+        af.y                            = 0.0f;
+        af.z                            = 0.0f;
        if (x == y) // Handle diagonals uniquely at 50% efficiency
        {
            // Read fixed atom data into registers and GRF
@@ -104,28 +117,28 @@ __global__ void METHOD_NAME(kCalculateObcGbsaSoftcore, Forces2_kernel)(unsigned
                t1                     *= rInverse;

                // Born Forces term
-                float term          =  0.125f *
-                                      (1.000f + psA[j].sr * psA[j].sr * r2Inverse) * t3 +
+                float term              =  0.125f * (1.000f + psA[j].sr * psA[j].sr * r2Inverse) * t3 +
                                           0.250f * t1 * r2Inverse;
                term                   *= psA[j].npScale*nonPolarScaleDataI;
                float dE                = fb * term;

-#if defined USE_PERIODIC
+#if defined USE_CUTOFF
                if (a.x >= rScaledRadiusJ || i >= cSim.atoms || x+j >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
-#elif defined USE_CUTOFF
-                if (a.x >= rScaledRadiusJ || r2 > cSim.nonbondedCutoffSqr)
 #else
-                if (a.x >= rScaledRadiusJ)
+                if (a.x >= rScaledRadiusJ || i >= cSim.atoms || x+j >= cSim.atoms )
 #endif
                {
                    dE              = 0.0f;
                }
+
                float d             = dx * dE;
                af.x               -= d;
                psA[j].fx          += d;
+
                d                   = dy * dE;
                af.y               -= d;
                psA[j].fy          += d;
+
                d                   = dz * dE;
                af.z               -= d;
                psA[j].fz          += d;
@@ -144,9 +157,9 @@ __global__ void METHOD_NAME(kCalculateObcGbsaSoftcore, Forces2_kernel)(unsigned
            of.z                       += af.z + sA[threadIdx.x].fz;
            of.w                        = 0.0f;
            cSim.pForce4[offset]        = of;
-        }
-        else
-        {
+
+        } else {
+
            // Read fixed atom data into registers and GRF
            if (lasty != y)
            {
@@ -154,7 +167,7 @@ __global__ void METHOD_NAME(kCalculateObcGbsaSoftcore, Forces2_kernel)(unsigned
                float4 temp                 = cSim.pPosq[j];
                float2 temp1                = cSim.pObcData[j];
                sA[threadIdx.x].fb          = cSim.pBornForce[j];
-                sA[threadIdx.x].npScale     = gbsaSimDev.pNonPolarScalingFactors[j];
+                sA[threadIdx.x].npScale     = feSimDev.pNonPolarScalingFactors[j];
                sA[threadIdx.x].x           = temp.x;
                sA[threadIdx.x].y           = temp.y;
                sA[threadIdx.x].z           = temp.z;
@@ -215,12 +228,10 @@ __global__ void METHOD_NAME(kCalculateObcGbsaSoftcore, Forces2_kernel)(unsigned
                    term                   *= psA[tj].npScale*nonPolarScaleDataI;
                    float dE                = fb * term;

-#if defined USE_PERIODIC
+#if defined USE_CUTOFF
                    if (a.x >= rScaledRadiusJ || i >= cSim.atoms || y+tj >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
-#elif defined USE_CUTOFF
-                    if (a.x >= rScaledRadiusJ || r2 > cSim.nonbondedCutoffSqr)
 #else
-                    if (a.x >= rScaledRadiusJ)
+                    if (a.x >= rScaledRadiusJ || i >= cSim.atoms || y+tj >= cSim.atoms)
 #endif
                    {
                        dE                  = 0.0f;
@@ -244,12 +255,10 @@ __global__ void METHOD_NAME(kCalculateObcGbsaSoftcore, Forces2_kernel)(unsigned
                    dE                      = psA[tj].fb * term;

                    float rj = psA[tj].r;
-#ifdef USE_PERIODIC
+#ifdef USE_CUTOFF
                    if (rj >= rScaledRadiusI || i >= cSim.atoms || y+tj >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
-#elif defined USE_CUTOFF
-                    if (rj >= rScaledRadiusI || r2 > cSim.nonbondedCutoffSqr)
 #else
-                    if (rj >= rScaledRadiusI)
+                    if (rj >= rScaledRadiusI || i >= cSim.atoms || y+tj >= cSim.atoms )
 #endif
                    {
                        dE                  = 0.0f;

--- a/plugins/freeEnergy/platforms/cuda/src/kernels/kSoftcoreLJ.h
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/kSoftcoreLJ.h
@@ -35,14 +35,14 @@
 static __device__ float getSoftCoreLJ( float r2, float sig, float  eps, float lambdaI, float lambdaJ, float* energy)
 {

-   float r                         = sqrt(r2);
   float lambda                    = lambdaI < lambdaJ ? lambdaI : lambdaJ;
   eps                            *= lambda;


    // (r/sig)
-    float sig2                     = r/sig;
+    float sig2                     = 1.0f/sig;
          sig2                    *= sig2;
+          sig2                    *= r2;
    float sig6                     = sig2*sig2*sig2;

    float softcoreLJTerm           = 0.5f*( 1.0f -  lambda) + sig6;
@@ -53,6 +53,27 @@ static __device__ float getSoftCoreLJ( float r2, float sig, float  eps, float la
    return eps*softcoreLJInv2*( 12.0f*softcoreLJInv - 6.0f )*sig6;
    
 }
+
+static __device__ float getSoftCoreLJMod( float sigInvR, float  eps, float lambdaI, float lambdaJ, float* energy)
+{
+    // Softcore LJ term from a pre-divided ratio sigInvR (presumably r/sigma -- confirm at call sites).
+    // eps is scaled by the smaller of the two lambdas; *energy receives eps*(inv^2 - inv) and the
+    // return value is eps*inv^2*(12*inv - 6)*ratio^6, where inv = 1/(0.5*(1 - lambda) + ratio^6).
+    const float pairLambda = ( lambdaJ > lambdaI ) ? lambdaI : lambdaJ;
+    const float scaledEps  = eps*pairLambda;
+
+    // ratio^2 and ratio^6
+    const float ratio2     = sigInvR*sigInvR;
+    const float ratio6     = ratio2*ratio2*ratio2;
+
+    // inverse of the softened denominator, and its square
+    const float inv        = 1.0f/( 0.5f*( 1.0f -  pairLambda) + ratio6 );
+    const float invSq      = inv*inv;
+
+    *energy                = scaledEps*(invSq - inv);
+    return scaledEps*invSq*( 12.0f*inv - 6.0f )*ratio6;
+}
+
 #endif

 #endif
--- a/plugins/freeEnergy/platforms/cuda/tests/CMakeLists.txt
+++ b/plugins/freeEnergy/platforms/cuda/tests/CMakeLists.txt
@@ -7,18 +7,34 @@ INCLUDE_DIRECTORIES(${CUDA_INCLUDE})
 INCLUDE_DIRECTORIES(${OPENMM_DIR}/platforms/cuda/include)
 INCLUDE_DIRECTORIES(${OPENMM_DIR}/platforms/cuda/src)
 INCLUDE_DIRECTORIES(${OPENMM_DIR}/platforms/cuda/src/kernels)
-Set( SHARED_OPENMM_TARGET OpenMMFreeEnergy)
-Set( STATIC_OPENMM_TARGET OpenMMFreeEnergy_static)
-Set( SHARED_CUDA_TARGET OpenMMCuda)
-Set( STATIC_CUDA_TARGET OpenMMCuda_static)
+
+# Set INCLUDE_SERIALIZATION to TRUE to compile the tests with -DOPENMM_SERIALIZE and link the
+# serialization libraries, so the GBVI and GBSAOBC softcore test cases can be serialized.
+
+SET( INCLUDE_SERIALIZATION FALSE )
+#SET( INCLUDE_SERIALIZATION TRUE )
+
+SET( SHARED_OPENMM_TARGET OpenMMFreeEnergy)
+SET( STATIC_OPENMM_TARGET OpenMMFreeEnergy_static)
+SET( SHARED_CUDA_TARGET OpenMMCuda)
+SET( STATIC_CUDA_TARGET OpenMMCuda_static)
+
+IF( INCLUDE_SERIALIZATION )
+    INCLUDE_DIRECTORIES(${OPENMM_DIR}/serialization/include)
+    SET( SHARED_OPENMM_SERIALIZATION OpenMMSerialization )
+    SET( SHARED_FREE_ENERGY_SERIALIZATION FreeEnergySerialization )
+ENDIF( INCLUDE_SERIALIZATION )
+
 IF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
    SET(SHARED_CUDA_TARGET   ${SHARED_CUDA_TARGET}_d)
    SET(SHARED_OPENMM_TARGET ${SHARED_OPENMM_TARGET}_d)
+    IF( INCLUDE_SERIALIZATION )
+        SET(SHARED_OPENMM_SERIALIZATION ${SHARED_OPENMM_SERIALIZATION}_d)
+        SET(SHARED_FREE_ENERGY_SERIALIZATION ${SHARED_FREE_ENERGY_SERIALIZATION}_d)
+    ENDIF( INCLUDE_SERIALIZATION )
    SET(STATIC_CUDA_TARGET ${STATIC_CUDA_TARGET}_d)
-    Set(STATIC_OPENMM_TARGET ${STATIC_OPENMM_TARGET}_d)
+    SET(STATIC_OPENMM_TARGET ${STATIC_OPENMM_TARGET}_d)
 ENDIF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
-#LINK_DIRECTORIES
-

 # Automatically create tests using files named "Test*.cpp"
 FILE(GLOB TEST_PROGS "*Test*.cpp")
@@ -26,45 +42,28 @@ FOREACH(TEST_PROG ${TEST_PROGS})
    GET_FILENAME_COMPONENT(TEST_ROOT ${TEST_PROG} NAME_WE)

    # Link with shared library
+
    CUDA_ADD_EXECUTABLE(${TEST_ROOT} ${TEST_PROG})
+    IF( INCLUDE_SERIALIZATION ) # serialization builds also link the two serialization libraries
+        TARGET_LINK_LIBRARIES(${TEST_ROOT} ${SHARED_TARGET} ${SHARED_OPENMM_TARGET} ${SHARED_CUDA_TARGET} ${SHARED_OPENMM_SERIALIZATION} ${SHARED_FREE_ENERGY_SERIALIZATION})
+    ELSE( INCLUDE_SERIALIZATION )
        TARGET_LINK_LIBRARIES(${TEST_ROOT} ${SHARED_TARGET} ${SHARED_OPENMM_TARGET} ${SHARED_CUDA_TARGET})
-    ADD_TEST(${TEST_ROOT} ${EXECUTABLE_OUTPUT_PATH}/${TEST_ROOT})
+    ENDIF( INCLUDE_SERIALIZATION )
+    SET(DEFINE_STRING "-DUSE_SOFTCORE") # per-test compile definitions, rebuilt on every loop pass
+    IF( INCLUDE_SERIALIZATION )
+        SET(DEFINE_STRING "${DEFINE_STRING} -DOPENMM_SERIALIZE")
+    ENDIF( INCLUDE_SERIALIZATION )

+    IF( TEST_ROOT STREQUAL "TestCudaOBCSoftcoreForce" ) # bare name: if() dereferences it exactly once
+        SET(DEFINE_STRING "${DEFINE_STRING} -DIMPLICIT_SOLVENT=1") # 1 is OBC_FLAG in TestCudaGBVISoftcoreForce.cpp
+    ENDIF( TEST_ROOT STREQUAL "TestCudaOBCSoftcoreForce" )

+    IF( TEST_ROOT STREQUAL "TestCudaGBVISoftcoreForce" ) # bare name: if() dereferences it exactly once
+        SET(DEFINE_STRING "${DEFINE_STRING} -DIMPLICIT_SOLVENT=2") # 2 is GBVI_FLAG in TestCudaGBVISoftcoreForce.cpp
+    ENDIF( TEST_ROOT STREQUAL "TestCudaGBVISoftcoreForce" )

-    # Link with static library
-#     SET(TEST_STATIC ${TEST_ROOT}Static)
-#     CUDA_ADD_EXECUTABLE(${TEST_STATIC} ${TEST_PROG})
-#     SET_TARGET_PROPERTIES(${TEST_STATIC}
-#                 PROPERTIES
-#                 COMPILE_FLAGS "-DOPENMM_USE_STATIC_LIBRARIES"
-#                 )
-#     TARGET_LINK_LIBRARIES(${TEST_STATIC} ${STATIC_TARGET} ${STATIC_OPENMM_TARGET} ${STATIC_CUDA_TARGET})
-#     ADD_TEST(${TEST_STATIC} ${EXECUTABLE_OUTPUT_PATH}/${TEST_STATIC})
+    MESSAGE( STATUS "${TEST_ROOT} ${DEFINE_STRING}" ) # STATUS keeps this informational line off stderr
+    SET_TARGET_PROPERTIES(${TEST_ROOT} PROPERTIES COMPILE_FLAGS "${DEFINE_STRING}" ) # quoted: always one argument
+    ADD_TEST(${TEST_ROOT} ${EXECUTABLE_OUTPUT_PATH}/${TEST_ROOT})

 ENDFOREACH(TEST_PROG ${TEST_PROGS})
-
-# TestCudaUsingParameterFile customized w/ command-line argument (input file name used in test) 
-
-#ADD_EXECUTABLE(TestFreeEnergyCudaUsingParameterFile TstFreeEnergyCudaUsingParameterFile.cpp)
-#TARGET_LINK_LIBRARIES(TestFreeEnergyCudaUsingParameterFile ${SHARED_TARGET} ${SHARED_OPENMM_TARGET} ${SHARED_CUDA_TARGET})
-#ADD_TEST(TestCudaUsingParameterFile "${EXECUTABLE_OUTPUT_PATH}/TestCudaUsingParameterFile" "-parameterFileName" "${CMAKE_CURRENT_SOURCE_DIR}/lambdaSdObcParameters.txt")
-#ADD_TEST(TestCudaUsingParameterFile "${EXECUTABLE_OUTPUT_PATH}/TestCudaUsingParameterFile" "-parameterFileName" "${CMAKE_CURRENT_SOURCE_DIR}/bptiMdRfNoPbcParameters.txt")
-#
-#SET(TEST_ROOT TestCudaUsingParameterFile)
-#SET(TEST_PROG TstCudaUsingParameterFile.cpp)
-#SET(TEST_STATIC ${TEST_ROOT}Static)
-#SET(INCLUDE_CUDA_STATIC 1)
-#IF(INCLUDE_CUDA_STATIC)
-#   ADD_EXECUTABLE(${TEST_STATIC} ${TEST_PROG})
-#   SET_TARGET_PROPERTIES(${TEST_STATIC}
-#                         PROPERTIES
-#                         COMPILE_FLAGS "-DOPENMM_USE_STATIC_LIBRARIES"
-#                        )
-#   TARGET_LINK_LIBRARIES(${TEST_STATIC} ${STATIC_TARGET} ${STATIC_BROOK_TARGET})
-#   ADD_TEST(${TEST_STATIC} "${EXECUTABLE_OUTPUT_PATH}/TestCudaUsingParameterFileStatic" "-parameterFileName" "${CMAKE_CURRENT_SOURCE_DIR}/lambdaSdObcParameters.txt")
-#   ADD_TEST(${TEST_STATIC} "${EXECUTABLE_OUTPUT_PATH}/TestCudaUsingParameterFileStatic" "-parameterFileName" "${CMAKE_CURRENT_SOURCE_DIR}/bptiMdRfNoPbcParameters.txt")
-#  ADD_TEST(${TEST_STATIC} "${EXECUTABLE_OUTPUT_PATH}/TestCudaUsingParameterFileStatic" "-parameterFileName" "${CMAKE_CURRENT_SOURCE_DIR}/bptiMdRfPbcParameters.txt" " +checkEnergyForceConsistent -checkForces" )
-#ENDIF(INCLUDE_CUDA_STATIC)
-
-
--- a/plugins/freeEnergy/platforms/cuda/tests/TestCudaGBVISoftcoreForce.cpp
+++ b/plugins/freeEnergy/platforms/cuda/tests/TestCudaGBVISoftcoreForce.cpp
@@ -33,80 +33,32 @@
 * This tests the reference implementation of GBVIForce.
 */

-#include "../../../tests/AssertionUtilities.h"
-#include "openmm/Context.h"
-#include "CudaPlatform.h"
-#include "ReferencePlatform.h"
-#include "openmm/GBVISoftcoreForce.h"
-#include "openmm/GBSAOBCForce.h"
-#include "openmm/System.h"
-#include "openmm/LangevinIntegrator.h"
-#include "openmm/NonbondedForce.h"
-#include "openmm/NonbondedSoftcoreForce.h"
-#include "../src/SimTKUtilities/SimTKOpenMMRealType.h"
-#include "OpenMMFreeEnergy.h"
-#include "openmm/freeEnergyKernels.h"
-#include "ReferenceFreeEnergyKernelFactory.h"
-#include "CudaFreeEnergyKernelFactory.h"
+#include "TestCudaSoftcoreForce.h"

-#include <iostream>
-#include <cstdio>
-#include <vector>
+//#define USE_SOFTCORE
+//#define IMPLICIT_SOLVENT GBVI
+//#define IMPLICIT_SOLVENT OBC

-using namespace OpenMM;
-using namespace std;
+#define OBC_FLAG  1
+#define GBVI_FLAG 2

-const double TOL = 1e-5;
-
-#define PRINT_ON 0
-
-int compareForcesOfTwoStates( int numParticles, State& state1, State& state2, double relativeTolerance, double absoluteTolerance ) {
+#include "openmm/GBVIForce.h"
+#include "openmm/GBSAOBCForce.h"
+#include "openmm/NonbondedForce.h"

-    int error = 0;
-    for (int i = 0; i < numParticles; ++i) {
-        Vec3 f1       = state1.getForces()[i];
-        Vec3 f2       = state2.getForces()[i];
-        double diff   = (f1[0] - f2[0])*(f1[0] - f2[0]) +
-                        (f1[1] - f2[1])*(f1[1] - f2[1]) +
-                        (f1[2] - f2[2])*(f1[2] - f2[2]); 
-        double denom1 = fabs( f1[0] ) + fabs( f1[1] ) +fabs( f1[2] );
-        double denom2 = fabs( f2[0] ) + fabs( f2[1] ) +fabs( f2[2] );
-        int        ok = 1;
-        if( (denom1 > 0.0 || denom2 > 0.0) && (sqrt( diff )/(denom1+denom2)) > relativeTolerance ){
-           error++;
-           ok = 0;
-        }
-#if PRINT_ON == 1
-        (void) fprintf( stderr, "F %d [%14.6e %14.6e %14.6e] [%14.6e %14.6e %14.6e] %s\n", i, 
-                        f1[0], f1[1], f1[2], f2[0], f2[1], f2[2], (ok ? "":"XXXXXX") );
+#ifdef USE_SOFTCORE
+#include "openmm/GBVISoftcoreForce.h"
+#include "openmm/GBSAOBCSoftcoreForce.h"
+#include "openmm/NonbondedSoftcoreForce.h"
 #endif
-    }
-
-    return error;
-}
-
-void testSingleParticle() {

-#if 1
-    CudaPlatform platform;
-    CudaFreeEnergyKernelFactory* factory  = new CudaFreeEnergyKernelFactory();
-    platform.registerKernelFactory(CalcNonbondedSoftcoreForceKernel::Name(), factory);
-    platform.registerKernelFactory(CalcGBVISoftcoreForceKernel::Name(), factory);
-    platform.registerKernelFactory(CalcGBSAOBCSoftcoreForceKernel::Name(), factory);
-
-#else
+#include <iomanip>

-    ReferencePlatform platform;
-
-    ReferenceFreeEnergyKernelFactory* referenceFactoryT  = new ReferenceFreeEnergyKernelFactory();
-    platform.registerKernelFactory(CalcNonbondedSoftcoreForceKernel::Name(), referenceFactoryT);
-    platform.registerKernelFactory(CalcGBVISoftcoreForceKernel::Name(), referenceFactoryT);
-    platform.registerKernelFactory(CalcGBSAOBCSoftcoreForceKernel::Name(), referenceFactoryT);
-#endif
+void testSingleParticle( FILE* log ) {

    System system;
    system.addParticle(2.0);
-    LangevinIntegrator integrator(0, 0.1, 0.01);
+    VerletIntegrator integrator(0.01);

    GBVISoftcoreForce* forceField = new GBVISoftcoreForce;

@@ -121,7 +73,7 @@ void testSingleParticle() {
    nonbonded->addParticle( charge, 1.0, 0.0);
    system.addForce(nonbonded);

-    Context context(system, integrator, platform);
+    Context context(system, integrator, Platform::getPlatformByName( "Cuda") );
    vector<Vec3> positions(1);
    positions[0] = Vec3(0, 0, 0);
    context.setPositions(positions);
@@ -132,46 +84,22 @@ void testSingleParticle() {
    double tau            = (1.0/forceField->getSoluteDielectric()-1.0/forceField->getSolventDielectric());

    double bornEnergy     = (-charge*charge/(8*PI_M*eps0))*tau/bornRadius;
-    double nonpolarEnergy = -0.1*gamma*tau*std::pow( radius/bornRadius, 3.0);
+    double nonpolarEnergy = -gamma*tau*std::pow( radius/bornRadius, 3.0);

    double expectedE      = (bornEnergy+nonpolarEnergy); 
    double obtainedE      = state.getPotentialEnergy(); 
    double diff           = fabs( obtainedE - expectedE );
-#if PRINT_ON == 1
-    (void) fprintf( stderr, "testSingleParticle expected=%14.6e obtained=%14.6e diff=%14.6e breakdown:[%14.6e %14.6e]\n",
+    if( log ){
+        (void) fprintf( log, "testSingleParticle expected=%14.6e obtained=%14.6e diff=%14.6e breakdown:[%14.6e %14.6e]\n",
                        expectedE, obtainedE, diff, bornEnergy, nonpolarEnergy );
-#endif
+    }
    ASSERT_EQUAL_TOL((bornEnergy+nonpolarEnergy), state.getPotentialEnergy(), 0.01);
 }

-void testEnergyEthaneSwitchingFunction( int useSwitchingFunction ) {
+void testEnergyEthaneSwitchingFunction( int useSwitchingFunction, FILE* log ) {

    std::string methodName = "testEnergyEthaneSwitchingFunction";

-#if 1
-    CudaPlatform platform;
-    CudaFreeEnergyKernelFactory* factory  = new CudaFreeEnergyKernelFactory();
-    platform.registerKernelFactory(CalcNonbondedSoftcoreForceKernel::Name(), factory);
-    platform.registerKernelFactory(CalcGBVISoftcoreForceKernel::Name(), factory);
-    platform.registerKernelFactory(CalcGBSAOBCSoftcoreForceKernel::Name(), factory);
-
-#else
-
-    ReferencePlatform platform;
-
-    ReferenceFreeEnergyKernelFactory* referenceFactoryT  = new ReferenceFreeEnergyKernelFactory();
-    platform.registerKernelFactory(CalcNonbondedSoftcoreForceKernel::Name(), referenceFactoryT);
-    platform.registerKernelFactory(CalcGBVISoftcoreForceKernel::Name(), referenceFactoryT);
-    platform.registerKernelFactory(CalcGBSAOBCSoftcoreForceKernel::Name(), referenceFactoryT);
-#endif
-
-    ReferencePlatform referencePlatform;
-
-    ReferenceFreeEnergyKernelFactory* referenceFactory  = new ReferenceFreeEnergyKernelFactory();
-    referencePlatform.registerKernelFactory(CalcNonbondedSoftcoreForceKernel::Name(), referenceFactory);
-    referencePlatform.registerKernelFactory(CalcGBVISoftcoreForceKernel::Name(), referenceFactory);
-    referencePlatform.registerKernelFactory(CalcGBSAOBCSoftcoreForceKernel::Name(), referenceFactory);
-
    System system;
    const int numParticles = 8;
    for( int i = 0; i < numParticles; i++ ){
@@ -215,12 +143,12 @@ void testEnergyEthaneSwitchingFunction( int useSwitchingFunction ) {
    //double bornRadiusScaleFactorsEven = 1.0;
    //double bornRadiusScaleFactorsOdd  = 0.5;
    double bornRadiusScaleFactorsOdd  = 1.0;
-#if PRINT_ON == 1
-    (void) fprintf( stderr, "%s: Applying GB/VI\n", methodName.c_str() );
-    (void) fprintf( stderr, "C[%14.7e %14.7e %14.7e] H[%14.7e %14.7e %14.7e] scale[%.1f %.1f]\n",
+    if( log ){
+        (void) fprintf( log, "%s: Applying GB/VI\n", methodName.c_str() );
+        (void) fprintf( log, "C[%14.7e %14.7e %14.7e] H[%14.7e %14.7e %14.7e] scale[%.1f %.1f]\n",
                    C_charge, C_radius, C_gamma, H_charge, H_radius, H_gamma,
                    bornRadiusScaleFactorsEven, bornRadiusScaleFactorsOdd);
-#endif
+    }

    GBVISoftcoreForce* forceField             = new GBVISoftcoreForce();
    for( int i = 0; i < numParticles; i++ ){
@@ -281,10 +209,10 @@ void testEnergyEthaneSwitchingFunction( int useSwitchingFunction ) {

    system.addForce(nonbonded);

-    LangevinIntegrator integrator1(0, 0.1, 0.01);
-    LangevinIntegrator integrator2(0, 0.1, 0.01);
-    Context referenceContext(system, integrator1, referencePlatform);
-    Context context(system, integrator2, platform);
+    VerletIntegrator integrator1(0.01);
+    VerletIntegrator integrator2(0.01);
+    Context referenceContext(system, integrator1,  Platform::getPlatformByName( "Reference") );
+    Context context(system, integrator2,  Platform::getPlatformByName( "Cuda") );
    
    vector<Vec3> positions(numParticles);
    positions[0] = Vec3(0.5480,    1.7661,    0.0000);
@@ -315,13 +243,15 @@ void testEnergyEthaneSwitchingFunction( int useSwitchingFunction ) {
       State state           = context.getState(State::Forces | State::Energy);
       State referenceState  = referenceContext.getState(State::Forces | State::Energy);
   
-#if PRINT_ON == 1
-       (void) fprintf( stderr, "cudaE=%14.7e refE=%14.7e\n", state.getPotentialEnergy(), referenceState.getPotentialEnergy() );
-#endif
+       
+       if( log ){
+           (void) fprintf( log, "cudaE=%14.7e refE=%14.7e\n", state.getPotentialEnergy(), referenceState.getPotentialEnergy() );
+       }
       
       // Take a small step in the direction of the energy gradient.
       
-       if( compareForcesOfTwoStates( numParticles, state, referenceState, 0.001, 0.001 ) ){
+       DoubleVector stats;
+       if( compareForcesOfTwoStates( state, referenceState, 0.001, stats, log ) ){
          ASSERT_EQUAL_TOL(0.0, 1.0, 0.01)
       }
   
@@ -336,9 +266,9 @@ void testEnergyEthaneSwitchingFunction( int useSwitchingFunction ) {
       }
       norm               = std::sqrt(norm);
   
-#if PRINT_ON == 1
-       (void) fprintf( stderr, "Fsum [%14.7e %14.7e %14.7e] norm=%14.7e\n", forceSum[0], forceSum[1], forceSum[2], norm );
-#endif
+       if( log ){
+           (void) fprintf( log, "Fsum [%14.7e %14.7e %14.7e] norm=%14.7e\n", forceSum[0], forceSum[1], forceSum[2], norm );
+       }
   
       const double delta = 1e-03;
       double step        = delta/norm;
@@ -354,10 +284,10 @@ void testEnergyEthaneSwitchingFunction( int useSwitchingFunction ) {
       double diff  = (state2.getPotentialEnergy()-state.getPotentialEnergy())/delta;
       double off   = fabs( diff - norm )/norm;
   
-#if PRINT_ON == 1
-       (void) fprintf( stderr, "%2d Energies %.8e %.8e norms[%13.7e %13.7e] deltaNorms=%13.7e delta=%.2e\n",
+       if( log ){
+           (void) fprintf( log, "%2d Energies %.8e %.8e norms[%13.7e %13.7e] deltaNorms=%13.7e delta=%.2e\n",
                           ii, state.getPotentialEnergy(), state2.getPotentialEnergy(), diff, norm, off, delta );
-#endif
+       }
   
       // See whether the potential energy changed by the expected amount.
       
@@ -372,9 +302,9 @@ void testEnergyEthaneSwitchingFunction( int useSwitchingFunction ) {
       
 //           positions[8][2] -=  static_cast<double>(ii+1)*0.1;
 //           positions[8][2] -=  0.001;
-#if PRINT_ON == 1
-           (void) fprintf( stderr, "r48=%14.6e r28=%14.6e r24=%14.6e\n", positions[8][2]-positions[4][2], positions[8][2], positions[4][2] );
-#endif
+           if( log ){
+               (void) fprintf( log, "r48=%14.6e r28=%14.6e r24=%14.6e\n", positions[8][2]-positions[4][2], positions[8][2], positions[4][2] );
+           }
       }
 #if 0
       int carbonIndex    = 1;
@@ -388,7 +318,7 @@ void testEnergyEthaneSwitchingFunction( int useSwitchingFunction ) {
          for( int kk = 0; kk < 3; kk++ ){
             dist += (positions[carbonIndex][kk] - positions[hydrogenIndex][kk] )*(positions[carbonIndex][kk] - positions[hydrogenIndex][kk]);
          }
-           (void) fprintf( stderr, "H=%d C=%d r=%14.6e\n", hydrogenIndex, carbonIndex, dist );
+           (void) fprintf( log, "H=%d C=%d r=%14.6e\n", hydrogenIndex, carbonIndex, dist );
          hydrogenIndex++;
          if( hydrogenIndex == carbonIndex ){
             hydrogenIndex++;
@@ -403,13 +333,557 @@ void testEnergyEthaneSwitchingFunction( int useSwitchingFunction ) {
   }
 }

+static GBVISoftcoreForce* copyGbviSoftcoreForce( const GBVISoftcoreForce& gbviSoftcoreForce ){
+    // Duplicate the force through its copy constructor and hand the new heap object to the caller.
+    GBVISoftcoreForce* const duplicate = new GBVISoftcoreForce( gbviSoftcoreForce );
+/* Disabled member-by-member alternative, retained for reference:
+    GBVISoftcoreForce* duplicate = new GBVISoftcoreForce();
+
+    duplicate->setNonbondedMethod( gbviSoftcoreForce.getNonbondedMethod() );
+
+    duplicate->setCutoffDistance( gbviSoftcoreForce.getCutoffDistance() );
+
+    duplicate->setSolventDielectric( gbviSoftcoreForce.getSolventDielectric() );
+    duplicate->setSoluteDielectric( gbviSoftcoreForce.getSoluteDielectric() );
+
+    duplicate->setBornRadiusScalingMethod( gbviSoftcoreForce.getBornRadiusScalingMethod() );
+    duplicate->setQuinticLowerLimitFactor( gbviSoftcoreForce.getQuinticLowerLimitFactor() );
+    duplicate->setQuinticUpperBornRadiusLimit( gbviSoftcoreForce.getQuinticUpperBornRadiusLimit() );
+
+    // per-particle parameters
+
+    for( unsigned int ii = 0; ii < gbviSoftcoreForce.getNumParticles(); ii++ ){
+
+        double charge;
+        double sigma;
+        double gamma;
+        double softcoreLJLambda;
+        gbviSoftcoreForce.getParticleParameters(ii, charge, sigma, gamma, softcoreLJLambda);
+        duplicate->addParticle( charge, sigma, gamma, softcoreLJLambda);
+    }
+
+    // bond list
+
+    for( unsigned int ii = 0; ii < gbviSoftcoreForce.getNumBonds(); ii++ ){
+        int particle1, particle2;
+        double distance;
+        gbviSoftcoreForce.getBondParameters( ii, particle1, particle2, distance);
+        duplicate->addBond( particle1, particle2, distance );
+    }
+*/
+    return duplicate;
+}
+
+static GBVIForce* copyGbviForce( const GBVIForce& gbviForce ){
+    return new GBVIForce(gbviForce); // heap-allocated duplicate built with GBVIForce's copy constructor
+}
+
+static GBSAOBCSoftcoreForce* copyGBSAOBCSoftcoreForce( const GBSAOBCSoftcoreForce& gbviSoftcoreForce ){
+    return new GBSAOBCSoftcoreForce(gbviSoftcoreForce); // heap-allocated copy; despite the 'gbvi' parameter name the argument is an OBC softcore force
+}
+
+static GBSAOBCForce* copyGbsaObcForce( const GBSAOBCForce& gbviForce ){
+    return new GBSAOBCForce(gbviForce); // heap-allocated copy; despite the 'gbvi' parameter name the argument is a GBSAOBCForce
+}
+
+void testGbviSoftcore( MapStringToDouble& inputArgumentMap, FILE* log ){
+
+    double lambda1                       = 1.0;
+    double lambda2                       = 1.0;
+    int nonbondedMethod                  = 0;
+    int numMolecules                     = 1;
+    int numParticlesPerMolecule          = 2;
+    int useQuinticSpline                 = 1;
+    int applyAssert                      = 1;
+    int positionPlacementMethod          = 0;
+    int serialize                        = 0;
+    double boxSize                       = 10.0;
+    double relativeTolerance             = 1.0e-04;
+
+    setDoubleFromMapStringToDouble( inputArgumentMap, "lambda1",                      lambda1 );
+    setDoubleFromMapStringToDouble( inputArgumentMap, "lambda2",                      lambda2 );
+    setDoubleFromMapStringToDouble( inputArgumentMap, "boxSize",                      boxSize );
+    double cutoffDistance                = boxSize*0.4;;
+    setDoubleFromMapStringToDouble( inputArgumentMap, "cutoffDistance",               cutoffDistance);
+    setDoubleFromMapStringToDouble( inputArgumentMap, "relativeTolerance",            relativeTolerance );
+
+    setIntFromMapStringToDouble(    inputArgumentMap, "positionPlacementMethod",      positionPlacementMethod ) ;
+    setIntFromMapStringToDouble(    inputArgumentMap, "nonbondedMethod",              nonbondedMethod );
+    setIntFromMapStringToDouble(    inputArgumentMap, "numMolecules",                 numMolecules );
+    setIntFromMapStringToDouble(    inputArgumentMap, "numParticlesPerMolecule",      numParticlesPerMolecule );
+    setIntFromMapStringToDouble(    inputArgumentMap, "serialize",                    serialize );
+   
+    if( nonbondedMethod == 2 && cutoffDistance > boxSize*0.5 ){
+        cutoffDistance = boxSize*0.5;
+    }
+
+    int numParticles                     = numMolecules*numParticlesPerMolecule;
+    int includeGbvi                      = 1;
+    double reactionFieldDielectric       = 80.0;
+
+    if( log ){
+        double particleDensity = static_cast<double>(numParticles)/(boxSize*boxSize*boxSize);
+        double particleCube    = pow( particleDensity, (-1.0/3.0) );
+      
+        (void) fprintf( log, "\n--------------------------------------------------------------------------------------\n" );
+        (void) fprintf( log, "Input arguments\n" );
+        (void) fflush( log );
+        (void) fprintf( log, "    includeGbvi                 %d\n", includeGbvi );
+        (void) fprintf( log, "    nonbondedMethod             %d\n", nonbondedMethod );
+        (void) fprintf( log, "    numParticles                %d\n", numParticles );
+        (void) fprintf( log, "    numMolecules                %d\n", numMolecules );
+        (void) fprintf( log, "    numParticlesPerMolecule     %d\n", numParticlesPerMolecule );
+        (void) fprintf( log, "    useQuinticSpline            %d\n", useQuinticSpline );
+        (void) fprintf( log, "    positionPlacementMethod     %d\n", positionPlacementMethod);
+
+#ifdef USE_SOFTCORE
+        (void) fprintf( log, "    lambda1                     %8.3f\n", lambda1 );
+        (void) fprintf( log, "    lambda2                     %8.3f\n", lambda2 );
+#endif
+        (void) fprintf( log, "    boxSize                     %8.3f\n", boxSize );
+        (void) fprintf( log, "    cutoffDistance              %8.3f\n", cutoffDistance );
+        (void) fprintf( log, "    reactionFieldDielectric     %8.3f\n", reactionFieldDielectric );
+        (void) fprintf( log, "    relativeTolerance           %8.1e\n", relativeTolerance );
+        (void) fprintf( log, "    particleDensity             %8.2e\n", particleDensity );
+        (void) fprintf( log, "    particleCube                %8.2e\n", particleCube );
+    }
+
+    // Create two systems: one with GbviSoftcoreForce NonbondedSoftcoreForce forces, and one using a CustomNonbondedForce, CustomGBVI force to implement the same interaction.
+
+    System standardSystem;
+    for (int i = 0; i < numParticles; i++) {
+        standardSystem.addParticle(1.0);
+    }
+    standardSystem.setDefaultPeriodicBoxVectors(Vec3(boxSize, 0, 0), Vec3(0, boxSize, 0), Vec3(0, 0, boxSize));
+
+#ifdef USE_SOFTCORE
+    NonbondedSoftcoreForce* nonbondedSoftcoreForce   = new NonbondedSoftcoreForce();
+    if( nonbondedMethod == NoCutoff ){
+        nonbondedSoftcoreForce->setNonbondedMethod( NonbondedSoftcoreForce::NoCutoff );
+    } else {
+        if( nonbondedMethod == CutoffNonPeriodic ){
+            nonbondedSoftcoreForce->setNonbondedMethod( NonbondedSoftcoreForce::CutoffNonPeriodic );
+        } else {
+            nonbondedSoftcoreForce->setNonbondedMethod( NonbondedSoftcoreForce::CutoffPeriodic );
+        }
+    }
+#else
+    NonbondedForce* nonbondedSoftcoreForce = new NonbondedForce();
+    if( nonbondedMethod == NoCutoff ){
+        nonbondedSoftcoreForce->setNonbondedMethod( NonbondedForce::NoCutoff );
+    } else {
+        if( nonbondedMethod == CutoffNonPeriodic ){
+            nonbondedSoftcoreForce->setNonbondedMethod( NonbondedForce::CutoffNonPeriodic );
+        } else {
+            nonbondedSoftcoreForce->setNonbondedMethod( NonbondedForce::CutoffPeriodic );
+        }
+    }
+#endif
+    nonbondedSoftcoreForce->setCutoffDistance( cutoffDistance );
+    nonbondedSoftcoreForce->setReactionFieldDielectric( reactionFieldDielectric );
+
+#ifdef USE_SOFTCORE
+
+#if IMPLICIT_SOLVENT == GBVI_FLAG
+    GBVISoftcoreForce* gbviSoftcoreForce             = new GBVISoftcoreForce();
+    if( nonbondedMethod == NoCutoff ){
+        gbviSoftcoreForce->setNonbondedMethod( GBVISoftcoreForce::NoCutoff );
+    } else {
+        if( nonbondedMethod == CutoffNonPeriodic ){
+            gbviSoftcoreForce->setNonbondedMethod( GBVISoftcoreForce::CutoffNonPeriodic );
+        } else {
+            gbviSoftcoreForce->setNonbondedMethod( GBVISoftcoreForce::CutoffPeriodic );
+        }
+    }
+#else
+    GBSAOBCSoftcoreForce* gbviSoftcoreForce          = new GBSAOBCSoftcoreForce();
+    if( nonbondedMethod == NoCutoff ){
+        gbviSoftcoreForce->setNonbondedMethod( GBSAOBCSoftcoreForce::NoCutoff );
+    } else {
+        if( nonbondedMethod == CutoffNonPeriodic ){
+            gbviSoftcoreForce->setNonbondedMethod( GBSAOBCSoftcoreForce::CutoffNonPeriodic );
+        } else {
+            gbviSoftcoreForce->setNonbondedMethod( GBSAOBCSoftcoreForce::CutoffPeriodic );
+        }
+    }
+#endif
+
+#else
+
+#if IMPLICIT_SOLVENT == GBVI_FLAG
+    GBVIForce* gbviSoftcoreForce           = new GBVIForce();
+    if( nonbondedMethod == NoCutoff ){
+        gbviSoftcoreForce->setNonbondedMethod( GBVIForce::NoCutoff );
+    } else {
+        if( nonbondedMethod == CutoffNonPeriodic ){
+            gbviSoftcoreForce->setNonbondedMethod( GBVIForce::CutoffNonPeriodic );
+        } else {
+            gbviSoftcoreForce->setNonbondedMethod( GBVIForce::CutoffPeriodic );
+        }
+    }
+
+#else
+
+    GBSAOBCForce* gbviSoftcoreForce           = new GBSAOBCForce();
+    if( nonbondedMethod == NoCutoff ){
+        gbviSoftcoreForce->setNonbondedMethod( GBSAOBCForce::NoCutoff );
+    } else {
+        if( nonbondedMethod == CutoffNonPeriodic ){
+            gbviSoftcoreForce->setNonbondedMethod( GBSAOBCForce::CutoffNonPeriodic );
+        } else {
+            gbviSoftcoreForce->setNonbondedMethod( GBSAOBCForce::CutoffPeriodic );
+        }
+    }
+
+#endif
+
+#endif
+
+#if IMPLICIT_SOLVENT == GBVI_FLAG
+#ifdef USE_SOFTCORE
+    if( useQuinticSpline ){
+        gbviSoftcoreForce->setBornRadiusScalingMethod( GBVISoftcoreForce::QuinticSpline );
+    } else {
+        gbviSoftcoreForce->setBornRadiusScalingMethod( GBVISoftcoreForce::NoScaling );
+    }
+#else
+    if( useQuinticSpline ){
+        gbviSoftcoreForce->setBornRadiusScalingMethod( GBVIForce::QuinticSpline );
+    } else {
+        gbviSoftcoreForce->setBornRadiusScalingMethod( GBVIForce::NoScaling );
+    }
+#endif
+#endif
+
+    gbviSoftcoreForce->setSolventDielectric( 78.3 );
+    //gbviSoftcoreForce->setSolventDielectric( 1.0e+10 );
+    //gbviSoftcoreForce->setSolventDielectric( 1.0 );
+    gbviSoftcoreForce->setSoluteDielectric( 1.0 );
+    gbviSoftcoreForce->setCutoffDistance( nonbondedSoftcoreForce->getCutoffDistance( ) );
+
+    std::vector<Vec3> positions(numParticles);
+
+    OpenMM_SFMT::SFMT sfmt;
+    init_gen_rand(0, sfmt);
+
+    PositionGenerator positionGenerator( numMolecules, numParticlesPerMolecule, boxSize );
+    if( log ){
+        positionGenerator.setLog( log );
+    }
+    if( positionPlacementMethod == 1 ){
+        positionGenerator.setPositions( PositionGenerator::SimpleGrid, sfmt, positions );
+    } else {
+        positionGenerator.setBondDistance( 0.3 );
+        positionGenerator.setPositions( PositionGenerator::Random, sfmt, positions );
+    }
+
+    // show info on particle positions
+
+    if( log ){
+        Vec3 box[2];
+        positionGenerator.getEnclosingBox( positions, box );
+        (void) fprintf( log, "Enclosing Box (in A): [%15.7e %15.7e] [%15.7e %15.7e] [%15.7e %15.7e]   [%15.7e %15.7e %15.7e]\n",
+                        box[0][0], box[1][0], box[0][1], box[1][1], box[0][2], box[1][2],
+                        (box[1][0] - box[0][0]), (box[1][1] - box[0][1]), (box[1][2] - box[0][2]) );
+
+        int showIndex                        = 5;
+        int periodicBoundaryConditions       = (nonbondedMethod == 2) ? 1 : 0;
+
+        IntVector positionIndexVector;
+        positionIndexVector.push_back( 0 );
+        positionIndexVector.push_back( static_cast<int>(positions.size())-1 );
+        //positionIndexVector.push_back( 542 );
+
+        for( unsigned int ii = 0; ii < positionIndexVector.size(); ii++ ){
+            if( positionIndexVector[ii] < positions.size() ){
+                int positionIndex = positionIndexVector[ii];
+                IntDoublePairVector sortVector;
+                positionGenerator.getSortedDistances( periodicBoundaryConditions, positionIndex, positions, sortVector );
+                (void) fprintf( log, "Min/max distance from %6d:\n    ", positionIndex );
+                for( unsigned int jj = 0; jj < sortVector.size() && jj < showIndex; jj++ ){
+                    IntDoublePair pair = sortVector[jj];
+                    (void) fprintf( log, "[%6d %15.7e] ", pair.first, pair.second);
+                }
+                (void) fprintf( log, "\n    " );
+                for( unsigned int jj = (sortVector.size() - showIndex); jj < sortVector.size() && jj >= 0; jj++ ){
+                    IntDoublePair pair = sortVector[jj];
+                    (void) fprintf( log, "[%6d %15.7e] ", pair.first, pair.second);
+                }
+                (void) fprintf( log, "\n" );
+            }
+        }
+        IntIntPairVector pairs;
+        pairs.push_back( IntIntPair( 732, 0 ) );
+        pairs.push_back( IntIntPair( 732, 1 ) );
+        pairs.push_back( IntIntPair( 732, 2 ) );
+        pairs.push_back( IntIntPair( 732, 3 ) );
+        pairs.push_back( IntIntPair( 732, 4 ) );
+        for( IntIntPairVectorCI ii = pairs.begin(); ii != pairs.end(); ii++ ){
+            if( ii->first < positions.size() && ii->second < positions.size() ){
+                 double d = positionGenerator.getDistance( ii->first, ii->second, positions );
+                 (void) fprintf( log, "Distance %6d %6d  %15.7e d2=%15.7e\n", ii->first, ii->second,  d, d*d );
+            }
+        }
+    }    
+
+    const int numberOfParameters             = 5;
+
+    const int ChargeIndex                    = 0;
+    const int SigmaIndex                     = 1;
+    const int EpsIndex                       = 2;
+    const int GammaIndex                     = 3;
+    const int LambdaIndex                    = 4;
+
+    std::vector<double> parameterLowerBound( numberOfParameters, 0.0 );
+
+    double fixedCharge                       = 1.0;
+    parameterLowerBound[ChargeIndex]         = fixedCharge;  // charge
+    parameterLowerBound[SigmaIndex]          = 0.1;          // sigma
+    parameterLowerBound[EpsIndex]            = 0.5;          // eps
+    parameterLowerBound[GammaIndex]          = 0.1;          // gamma
+    parameterLowerBound[LambdaIndex]         = lambda1;      // lambda
+
+    std::vector<double> parameterUpperBound( parameterLowerBound );
+    parameterUpperBound[ChargeIndex]         = fixedCharge;  // charge
+    parameterUpperBound[SigmaIndex]          = 0.3;          // sigma
+    parameterUpperBound[EpsIndex]            = 40.0;         // eps
+    parameterUpperBound[GammaIndex]          = 40.0;         // gamma
+
+#if IMPLICIT_SOLVENT == OBC_FLAG
+    parameterLowerBound[GammaIndex]          = 0.1;          // overlap factor
+    parameterUpperBound[GammaIndex]          = 1.5;        
+#endif
+
+    std::vector<double> parameters( numberOfParameters );
+    double charge = fixedCharge;
+
+    for( int ii = 0; ii < numMolecules; ii++) {
+
+        charge       *= -1.0;
+
+        double lambda =  ii < (numMolecules/2) ? lambda1 : lambda2;
+        randomizeParameters( parameterLowerBound, parameterUpperBound, sfmt, parameters );
+
+#ifdef USE_SOFTCORE
+        nonbondedSoftcoreForce->addParticle(   charge,  parameters[SigmaIndex],  parameters[EpsIndex],    lambda );
+        gbviSoftcoreForce->addParticle(        charge,  parameters[SigmaIndex],  parameters[GammaIndex],  lambda );
+#else
+        nonbondedSoftcoreForce->addParticle(   charge,  parameters[SigmaIndex],  parameters[EpsIndex] );
+        gbviSoftcoreForce->addParticle(        charge,  parameters[SigmaIndex],  parameters[GammaIndex] );
+#endif
+
+        int baseParticleIndex                    = ii*numParticlesPerMolecule;
+        for( int jj = 1; jj < numParticlesPerMolecule; jj++) {
+
+            // alternate charges
+
+            charge *= -1.0;
+
+            randomizeParameters( parameterLowerBound, parameterUpperBound, sfmt, parameters );
+
+#ifdef USE_SOFTCORE
+            nonbondedSoftcoreForce->addParticle(   charge,  parameters[SigmaIndex],  parameters[EpsIndex],    lambda );
+            gbviSoftcoreForce->addParticle(        charge,  parameters[SigmaIndex],  parameters[GammaIndex],  lambda );
+#else
+            nonbondedSoftcoreForce->addParticle(   charge,  parameters[SigmaIndex],  parameters[EpsIndex] );
+            gbviSoftcoreForce->addParticle(        charge,  parameters[SigmaIndex],  parameters[GammaIndex] );
+#endif
+
+            nonbondedSoftcoreForce->addException( baseParticleIndex, baseParticleIndex+jj, 0.0f, 1.0, 0.0f );
+
+#if IMPLICIT_SOLVENT == GBVI_FLAG
+            double bondDistance  = positionGenerator.getDistance( baseParticleIndex, baseParticleIndex+jj, positions );
+            gbviSoftcoreForce->addBond( baseParticleIndex, baseParticleIndex+jj,  bondDistance );
+#endif
+        }
+
+        // alternate charge if numParticlesPerMolecule is odd
+
+        if( (numParticlesPerMolecule % 2) ){
+            charge *= -1.0;
+        }
+    }
+
+    standardSystem.addForce(nonbondedSoftcoreForce);
+    if( includeGbvi ){
+        standardSystem.addForce(gbviSoftcoreForce);
+    }
+
+    // copy system and forces
+
+    System* systemCopy = copySystem( standardSystem );
+
+#ifdef USE_SOFTCORE
+    NonbondedSoftcoreForce* nonbondedSoftcoreForceCopy;
+    nonbondedSoftcoreForceCopy = copyNonbondedSoftcoreForce( *nonbondedSoftcoreForce );
+#else
+    NonbondedForce* nonbondedSoftcoreForceCopy;
+    nonbondedSoftcoreForceCopy = copyNonbondedForce( *nonbondedSoftcoreForce );
+#endif
+    systemCopy->addForce( nonbondedSoftcoreForceCopy );
+    std::stringstream baseFileName;
+
+    if( includeGbvi ){
+#ifdef USE_SOFTCORE
+
+#if IMPLICIT_SOLVENT == GBVI_FLAG
+        GBVISoftcoreForce* gBVISoftcoreForceCopy  = copyGbviSoftcoreForce( *gbviSoftcoreForce );
+        baseFileName  << "GBVISoftcore";
+#endif
+#if IMPLICIT_SOLVENT == OBC_FLAG
+        baseFileName  << "GBSAObcSoftcore";
+        GBSAOBCSoftcoreForce* gBVISoftcoreForceCopy       = copyGBSAOBCSoftcoreForce( *gbviSoftcoreForce );
+#endif
+        baseFileName  << "_lbda" << std::fixed << setprecision(2) << lambda2;
+
+#else
+
+#if IMPLICIT_SOLVENT == GBVI_FLAG
+        GBVIForce* gBVISoftcoreForceCopy          = copyGbviForce( *gbviSoftcoreForce );
+        baseFileName  << "Gbvi";
+#endif
+#if IMPLICIT_SOLVENT == OBC_FLAG
+        GBSAOBCForce* gBVISoftcoreForceCopy       = copyGbsaObcForce( *gbviSoftcoreForce );
+        baseFileName  << "GBSAOBC";
+#endif
+
+#endif
+        systemCopy->addForce( gBVISoftcoreForceCopy );
+    }
+
+    // perform comparison
+
+    std::stringstream idString;
+    idString << "Nb " << nonbondedMethod << " l2 " << std::fixed << setprecision(2) << lambda2;
+    runSystemComparisonTest( standardSystem, *systemCopy, "Cuda", "Reference", positions, inputArgumentMap, idString.str(), log );
+
+    // serialize
+
+    baseFileName  << "_N"     << positions.size();
+    baseFileName  << "_Nb"    << nonbondedMethod;
+    serializeSystemAndPositions( standardSystem, positions, baseFileName.str(), log);
+
+    delete systemCopy;
+
+}
+
 int main() {
+
    try {
-        testSingleParticle();
-        testEnergyEthaneSwitchingFunction( 0 );
-        testEnergyEthaneSwitchingFunction( 1 );
+
+        registerFreeEnergyCudaKernelFactories( );
+
+        VectorOfMapStringToDouble vectorOfMapStringToDouble;
+        MapStringToDouble inputArgumentMap;
+        MapStringToDoubleVector generativeArgumentMaps;
+        //FILE* log = stderr;
+        FILE* log = NULL;
+/*
+        testSingleParticle( log );
+
+        testEnergyEthaneSwitchingFunction( 0, log );
+        testEnergyEthaneSwitchingFunction( 1, log );
+*/
+
+        inputArgumentMap["lambda2"]                         = 1.0;
+        inputArgumentMap["nonbondedMethod"]                 = 0;
+        inputArgumentMap["numMolecules"]                    = 10;
+        inputArgumentMap["boxSize"]                         = 5.0;
+        inputArgumentMap["positionPlacementMethod"]         = 0;
+        inputArgumentMap["cutoffDistance"]                  = 0.3*inputArgumentMap["boxSize"];
+        //inputArgumentMap["cutoffDistance"]                  = 1.0;
+        inputArgumentMap["relativeTolerance"]               = 5.0e-04;
+        inputArgumentMap["serialize"]                       = 1;
+        //inputArgumentMap["numParticlesPerMolecule"]         = 2;
+
+#ifdef USE_SOFTCORE
+        DoubleVector lamda2;
+        lamda2.push_back( 1.0 );
+        lamda2.push_back( 0.5 );
+        lamda2.push_back( 0.0 );
+        if( lamda2.size() > 0 ){
+            generativeArgumentMaps["lambda2"] = lamda2;
+            inputArgumentMap["lambda2"]       = lamda2[0];
        }   
-    catch(const exception& e) {
+#endif
+
+        DoubleVector numberOfMolecules;
+        numberOfMolecules.push_back( 10 );
+        numberOfMolecules.push_back( 100 );
+        numberOfMolecules.push_back( 1000 );
+        //numberOfMolecules.push_back( 2000 );
+        //numberOfMolecules.push_back( 4000 );
+        //numberOfMolecules.push_back( 8000 );
+        if( numberOfMolecules.size() > 0 ){
+            generativeArgumentMaps["numMolecules"] = numberOfMolecules;
+            inputArgumentMap["numMolecules"]       = numberOfMolecules[0];
+        }   
+
+        DoubleVector nonbondedMethod;
+        nonbondedMethod.push_back( 0 );
+        nonbondedMethod.push_back( 1 );
+        nonbondedMethod.push_back( 2 );
+        if( nonbondedMethod.size() > 0 ){
+            generativeArgumentMaps["nonbondedMethod"] = nonbondedMethod;
+            inputArgumentMap["nonbondedMethod"]       = nonbondedMethod[0];
+        }
+
+        vectorOfMapStringToDouble.push_back( inputArgumentMap );
+        generateInputArgumentMapsFromStringVectors( generativeArgumentMaps, vectorOfMapStringToDouble ); 
+
+        // big box/many particle tests
+
+        //bool bigBox = true;
+        bool bigBox = false;
+        if( bigBox ){
+            MapStringToDouble inputArgumentMapBig;
+            VectorOfMapStringToDouble vectorOfMapStringToDoubleBig;
+            inputArgumentMapBig["lambda2"]                         = 1.0;
+            inputArgumentMapBig["nonbondedMethod"]                 = 1;
+            inputArgumentMapBig["numMolecules"]                    = 10;
+            inputArgumentMapBig["boxSize"]                         = 20.0;
+            inputArgumentMapBig["relativeTolerance"]               = 6.0e-04;
+            vectorOfMapStringToDoubleBig.push_back( inputArgumentMapBig );
+            //MapStringToDoubleVector generativeArgumentMapsBig;
+
+            numberOfMolecules.resize( 0 );
+            numberOfMolecules.push_back( 4000 );
+            generativeArgumentMaps["numMolecules"] = numberOfMolecules;
+
+            nonbondedMethod.resize( 0 );
+            nonbondedMethod.push_back( 1 );
+            nonbondedMethod.push_back( 2 );
+            generativeArgumentMaps["nonbondedMethod"] = nonbondedMethod;
+            generateInputArgumentMapsFromStringVectors( generativeArgumentMaps, vectorOfMapStringToDoubleBig ); 
+            vectorOfMapStringToDouble.resize( 0 );
+            vectorOfMapStringToDouble.insert( vectorOfMapStringToDouble.end(), vectorOfMapStringToDoubleBig.begin(), vectorOfMapStringToDoubleBig.end() );
+        }
+
+        if( log ){
+            MapStringToInt exclude;
+            exclude["lambda1"]                 = 1;
+            exclude["numParticlesPerMolecule"] = 1;
+            std::stringstream outputStream;
+            std::sort( vectorOfMapStringToDouble.begin(), vectorOfMapStringToDouble.end(), TestMapSortPredicate);
+            StringVector printOrder;
+            printOrder.push_back( "numMolecules" );
+            printOrder.push_back( "nonbondedMethod" );
+            printOrder.push_back( "lambda2" );
+            printOrder.push_back( "boxSize" );
+            for( unsigned int kk = 0; kk < vectorOfMapStringToDouble.size(); kk++ ){
+                streamArgumentMapOneLine( vectorOfMapStringToDouble[kk], exclude, printOrder, kk, outputStream );
+            }
+            (void) fprintf( log, "Initial argument maps: %u\n%s", static_cast<unsigned int>(vectorOfMapStringToDouble.size()), outputStream.str().c_str() );
+        }
+
+        // run tests
+
+        for( unsigned int kk = 0; kk < vectorOfMapStringToDouble.size(); kk++ ){
+            testGbviSoftcore( vectorOfMapStringToDouble[kk], log );
+            sleep(2);
+        }
+
+    } catch(const exception& e) {
        cout << "exception: " << e.what() << endl;
        return 1;
    }

--- a/plugins/freeEnergy/platforms/cuda/tests/TestCudaLJSoftcoreForce.cpp
+++ b/plugins/freeEnergy/platforms/cuda/tests/TestCudaLJSoftcoreForce.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008-2009 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+/**
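+ * This tests the Cuda implementation of NonbondedSoftcoreForce by comparing its energies and forces
+ * with an equivalent CustomNonbondedForce/CustomBondForce system evaluated on the Reference platform.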
+ */
+
+#include "../../../tests/AssertionUtilities.h"
+
+#include "sfmt/SFMT.h"
+#include "openmm/Context.h"
+#include "openmm/CustomBondForce.h"
+#include "openmm/CustomNonbondedForce.h"
+
+#include "openmm/NonbondedForce.h"
+#include "openmm/NonbondedSoftcoreForce.h"
+
+#include "openmm/System.h"
+#include "openmm/VerletIntegrator.h"
+#include <iostream>
+#include <vector>
+#include <algorithm>
+
+extern "C" void registerFreeEnergyCudaKernelFactories();
+
+using namespace OpenMM;
+using namespace std;
+
+const double TOL = 1e-4;
+
+static const int NoCutoff          = 0;
+static const int CutoffNonPeriodic = 1;
+static const int CutoffPeriodic    = 2;
+
+/**
+ * Compare the Cuda NonbondedSoftcoreForce with an equivalent system evaluated on the Reference platform.
+ *
+ * Two systems of 70 two-particle molecules are built: one using NonbondedSoftcoreForce (run on Cuda)
+ * and one using a CustomNonbondedForce plus a CustomBondForce (for exceptions) that encode the same
+ * softcore LJ + Coulomb (or reaction-field) expression (run on Reference). The potential energies must
+ * agree to a tolerance of 1e-4 and the relative per-particle force difference must be below 1e-3;
+ * otherwise the ASSERT macros report the failure.
+ *
+ * @param lambda1          softcore lambda assigned to the first half of the molecules
+ * @param lambda2          softcore lambda assigned to the second half of the molecules
+ * @param nonbondedMethod  one of NoCutoff, CutoffNonPeriodic, CutoffPeriodic (file-level constants)
+ * @param log              optional log file; no output is written if NULL
+ */
+void testNonbondedSoftcore( double lambda1, double lambda2, int nonbondedMethod, FILE* log  ){
+
+    const int numMolecules               = 70;
+    const int numParticles               = numMolecules*2;
+    const double boxSize                 = 10.0;
+    const double reactionFieldDielectric = 80.0;
+    const double cutoffDistance          = 0.4*boxSize;
+
+    // Create two systems: one with a NonbondedSoftcoreForce, and one using a CustomNonbondedForce to implement the same interaction.
+
+    System standardSystem;
+    System customSystem;
+    for (int i = 0; i < numParticles; i++) {
+        standardSystem.addParticle(1.0);
+        customSystem.addParticle(1.0);
+    }
+    standardSystem.setDefaultPeriodicBoxVectors(Vec3(boxSize, 0, 0), Vec3(0, boxSize, 0), Vec3(0, 0, boxSize));
+    customSystem.setDefaultPeriodicBoxVectors(  Vec3(boxSize, 0, 0), Vec3(0, boxSize, 0), Vec3(0, 0, boxSize));
+
+    // The forces are handed to the systems below via addForce().
+
+    NonbondedSoftcoreForce* nonbondedSoftcoreForce   = new NonbondedSoftcoreForce();
+    CustomNonbondedForce* customNonbonded;
+    CustomBondForce* customBond;
+    if( nonbondedMethod == NoCutoff ){
+
+        nonbondedSoftcoreForce->setNonbondedMethod( NonbondedSoftcoreForce::NoCutoff );
+
+        // softcore LJ + plain Coulomb (note: the rsig definition appears twice in the expression)
+
+        customNonbonded          = new CustomNonbondedForce("lambda*4*eps*(dem^2-dem)+138.935456*q/r;"
+                                                            "q=q1*q2;"
+                                                            "dem=1.0/(soft+rsig);"
+                                                            "rsig=(r/sigma)^6;"
+                                                            "rsig=(r/sigma)^6;"
+                                                            "soft=0.5*(1.0-lambda);"
+                                                            "sigma=0.5*(sigma1+sigma2);"
+                                                            "eps=sqrt(eps1*eps2);"
+                                                            "lambda=min(lambda1,lambda2)");
+
+        customNonbonded->setNonbondedMethod( CustomNonbondedForce::NoCutoff );
+
+        customBond               = new CustomBondForce("lambda*4*eps*(dem^2-dem)+138.935456*q/r;"
+                                                       "dem=1.0/(soft+rsig);"
+                                                       "rsig=(r/sigma)^6;"
+                                                       "soft=0.5*(1.0-lambda)");
+
+    } else {
+
+        nonbondedSoftcoreForce->setCutoffDistance( cutoffDistance );
+        nonbondedSoftcoreForce->setReactionFieldDielectric( reactionFieldDielectric );
+        if( nonbondedMethod == CutoffNonPeriodic ){
+            nonbondedSoftcoreForce->setNonbondedMethod( NonbondedSoftcoreForce::CutoffNonPeriodic );
+        } else {
+            nonbondedSoftcoreForce->setNonbondedMethod( NonbondedSoftcoreForce::CutoffPeriodic );
+        }
+
+        // softcore LJ + reaction-field Coulomb using the global parameters krf and crf set below
+
+        customNonbonded          = new CustomNonbondedForce("lambda*4*eps*(dem^2-dem)+138.935456*q*(1.0/r+(krf*r*r)-crf);"
+                                                            "q=q1*q2;"
+                                                            "dem=1.0/(soft+rsig);"
+                                                            "rsig=(r/sigma)^6;"
+                                                            "rsig=(r/sigma)^6;"
+                                                            "soft=0.5*(1.0-lambda);"
+                                                            "sigma=0.5*(sigma1+sigma2);"
+                                                            "eps=sqrt(eps1*eps2);"
+                                                            "lambda=min(lambda1,lambda2)");
+
+        customBond               = new CustomBondForce("withinCutoff*(lambda*4*eps*(dem^2-dem)+138.935456*q*(1.0/r+(krf*r*r)-crf));"
+                                                       "withinCutoff=step(cutoff-r);"
+                                                       "dem=1.0/(soft+rsig);"
+                                                       "rsig=(r/sigma)^6;"
+                                                       "soft=0.5*(1.0-lambda)");
+ 
+        customNonbonded->setCutoffDistance( cutoffDistance );
+        if( nonbondedMethod == CutoffNonPeriodic ){
+            customNonbonded->setNonbondedMethod( CustomNonbondedForce::CutoffNonPeriodic );
+        } else {
+            customNonbonded->setNonbondedMethod( CustomNonbondedForce::CutoffPeriodic );
+        }
+
+        double eps2               = (reactionFieldDielectric - 1.0)/(2.0*reactionFieldDielectric+1.0);
+        double kValue             = eps2/(cutoffDistance*cutoffDistance*cutoffDistance);
+        customNonbonded->addGlobalParameter("krf", kValue );
+
+        customBond->addGlobalParameter("krf", kValue );
+
+        double cValue             = (1.0/cutoffDistance)*(3.0*reactionFieldDielectric)/(2.0*reactionFieldDielectric + 1.0); 
+        customNonbonded->addGlobalParameter("crf", cValue );
+        customBond->addGlobalParameter("crf", cValue );
+        customBond->addGlobalParameter("cutoff", cutoffDistance );
+    }
+
+    customNonbonded->addPerParticleParameter("q");
+    customNonbonded->addPerParticleParameter("sigma");
+    customNonbonded->addPerParticleParameter("eps");
+    customNonbonded->addPerParticleParameter("lambda");
+
+    customBond->addPerBondParameter("q");
+    customBond->addPerBondParameter("sigma");
+    customBond->addPerBondParameter("eps");
+    customBond->addPerBondParameter("lambda");
+
+    vector<Vec3> positions(numParticles);
+    vector<Vec3> velocities(numParticles);
+    OpenMM_SFMT::SFMT sfmt;
+    init_gen_rand(0, sfmt);
+
+    vector<double> params(4);
+
+    // periodic boundary conditions not possible w/ CustomBond?
+    // -> exceptions are only included when no cutoff is used
+
+    int includeExceptions = nonbondedMethod != NoCutoff ? 0 : 1;
+
+    for (int i = 0; i < numMolecules; i++) {
+
+        // the first half of the molecules uses lambda1, the second half lambda2;
+        // the two halves also differ in sigma of the second particle and in epsilon
+
+        const bool firstHalf  = i < numMolecules/2;
+        const double charge   = 1.0;
+        const double sigma1   = 0.2;
+        const double sigma2   = firstHalf ? 0.2 : 0.1;
+        const double eps1     = firstHalf ? 0.5 : 0.8;
+        const double eps2     = eps1;
+        const double lambda   = firstHalf ? lambda1 : lambda2;
+
+        nonbondedSoftcoreForce->addParticle( charge, sigma1, eps1, lambda);
+        nonbondedSoftcoreForce->addParticle(-charge, sigma2, eps2, lambda);
+
+        params[0] = charge;
+        params[1] = sigma1;
+        params[2] = eps1;
+        params[3] = lambda;
+        customNonbonded->addParticle(params);
+
+        params[0] = -charge;
+        params[1] = sigma2;
+        params[2] = eps2;
+        customNonbonded->addParticle(params);
+
+        // every fourth molecule index: add an exception to the softcore force and the matching
+        // exclusion + explicit bond to the custom system
+
+        if( includeExceptions && i && ((i%4) == 0) ){
+            vector<double> bondParams(4);
+            nonbondedSoftcoreForce->addException(i-4, i, charge*charge, sigma1, eps1, false, lambda);
+            customNonbonded->addExclusion( i-4,i);
+            bondParams[0] = charge*charge;
+            bondParams[1] = sigma1;
+            bondParams[2] = eps1;
+            bondParams[3] = lambda;
+            customBond->addBond(i-4,i, bondParams );
+        }
+
+        // first particle placed at random in the box, second particle offset by 1.0 along x
+
+        positions[2*i]    = Vec3(boxSize*genrand_real2(sfmt), boxSize*genrand_real2(sfmt), boxSize*genrand_real2(sfmt));
+        positions[2*i+1]  = Vec3(positions[2*i][0]+1.0, positions[2*i][1], positions[2*i][2]);
+        velocities[2*i]   = Vec3(genrand_real2(sfmt), genrand_real2(sfmt), genrand_real2(sfmt));
+        velocities[2*i+1] = Vec3(genrand_real2(sfmt), genrand_real2(sfmt), genrand_real2(sfmt));
+
+    }
+
+    standardSystem.addForce(nonbondedSoftcoreForce);
+    customSystem.addForce(customNonbonded);
+    customSystem.addForce(customBond);
+
+    VerletIntegrator integrator1(0.01);
+    VerletIntegrator integrator2(0.01);
+
+    Context context1(standardSystem, integrator1, Platform::getPlatformByName( "Cuda"));
+    context1.setPositions(positions);
+    context1.setVelocities(velocities);
+
+    State state1 = context1.getState(State::Forces | State::Energy);
+
+    Context context2(customSystem, integrator2, Platform::getPlatformByName( "Reference"));
+    context2.setPositions(positions);
+    context2.setVelocities(velocities);
+    State state2 = context2.getState(State::Forces | State::Energy);
+
+    // relative energy difference (only used for logging); left at zero if the energies are identical
+
+    const double energy1 = state1.getPotentialEnergy();
+    const double energy2 = state2.getPotentialEnergy();
+    double energyDiff    = 0.0;
+    if( energy1 - energy2 != 0.0 ){
+        energyDiff = fabs( energy1 - energy2 )/( fabs( energy1 ) + fabs( energy2 ) );
+    }
+
+    // emit a blank line in the log whenever the nonbonded method changes between calls
+
+    static int previousNonbondedMethod = -1;
+    if( previousNonbondedMethod != nonbondedMethod ){
+        if( log ){
+            (void) fprintf( log, "\n" );
+        }
+        previousNonbondedMethod = nonbondedMethod;
+    }
+
+    ASSERT_EQUAL_TOL(state1.getPotentialEnergy(), state2.getPotentialEnergy(), 1e-4);
+
+    const vector<Vec3>& forces1 = state1.getForces();
+    const vector<Vec3>& forces2 = state2.getForces();
+
+    double maxDiff   = -1.0;
+    int maxDiffIndex = -1;
+    for (int i = 0; i < numParticles; i++) {
+        Vec3 f1     = forces1[i];
+        Vec3 f2     = forces2[i];
+
+        double f1N  = sqrt( (f1[0]*f1[0]) + (f1[1]*f1[1]) + (f1[2]*f1[2]) );
+        double f2N  = sqrt( (f2[0]*f2[0]) + (f2[1]*f2[1]) + (f2[2]*f2[2]) );
+
+        double forceDiff = (f1[0]-f2[0])*(f1[0]-f2[0]) +
+                           (f1[1]-f2[1])*(f1[1]-f2[1]) +
+                           (f1[2]-f2[2])*(f1[2]-f2[2]);
+
+        // normalize by the mean force norm whenever either force is non-zero
+        // (previously f1N was tested twice and f2N was never checked)
+
+        if( f1N > 0.0 || f2N > 0.0 ){
+            forceDiff    = 2.0*sqrt( forceDiff )/(f1N + f2N);
+        }
+        if( forceDiff > maxDiff ){
+            maxDiff      = forceDiff;
+            maxDiffIndex = i;
+        }
+        ASSERT( forceDiff < 1.0e-3);
+    }
+    
+    if( log ){
+        (void) fprintf( log, "%d %d %10.1f %10.1f  %15.7e %15.7e %15.7e maxFDiff=%15.7e %d\n",
+                        nonbondedMethod, includeExceptions, lambda1, lambda2, energyDiff,
+                        state1.getPotentialEnergy(), state2.getPotentialEnergy(), maxDiff, maxDiffIndex); fflush( log );
+    }
+
+}
+
+/**
+ * Test driver: runs testNonbondedSoftcore() for every combination of nonbonded method
+ * and lambda pair. Returns 1 if an exception is thrown, 0 otherwise.
+ */
+int main() {
+
+    // (lambda1, lambda2) pairs exercised for each nonbonded method
+    const double lambdaPairs[4][2] = { { 1.0, 1.0 },
+                                       { 1.0, 0.0 },
+                                       { 1.0, 0.5 },
+                                       { 0.0, 0.0 } };
+
+    // boundary conditions/cutoffs, in the order they are tested
+    const int methods[3]           = { NoCutoff, CutoffNonPeriodic, CutoffPeriodic };
+
+    try {
+
+        registerFreeEnergyCudaKernelFactories( );
+
+        // test various combinations of lambdas and boundary conditions/cutoffs
+        FILE* log = NULL;
+        for( int methodIndex = 0; methodIndex < 3; methodIndex++ ){
+            for( int pairIndex = 0; pairIndex < 4; pairIndex++ ){
+                testNonbondedSoftcore( lambdaPairs[pairIndex][0], lambdaPairs[pairIndex][1], methods[methodIndex], log );
+            }
+        }
+
+    } catch(const exception& e) {
+        cout << "exception: " << e.what() << endl;
+        return 1;
+    }
+    cout << "Done" << endl;
+    return 0;
+}
+
--- a/plugins/freeEnergy/platforms/cuda/tests/TestCudaOBCSoftcoreForce.cpp
+++ b/plugins/freeEnergy/platforms/cuda/tests/TestCudaOBCSoftcoreForce.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008-2009 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+/**
+ * This tests the Cuda implementation of GBVISoftcoreForce (used together with NonbondedSoftcoreForce).
+ */
+
+#include "TestCudaSoftcoreForce.h"
+
+//#define USE_SOFTCORE
+//#define IMPLICIT_SOLVENT GBVI
+//#define IMPLICIT_SOLVENT OBC
+
+#define OBC_FLAG  1
+#define GBVI_FLAG 2
+
+#include "openmm/GBVIForce.h"
+#include "openmm/GBSAOBCForce.h"
+#include "openmm/NonbondedForce.h"
+
+#ifdef USE_SOFTCORE
+#include "openmm/GBVISoftcoreForce.h"
+#include "openmm/GBSAOBCSoftcoreForce.h"
+#include "openmm/NonbondedSoftcoreForce.h"
+#endif
+
+#include <iomanip>
+
+/**
+ * Check the Cuda energy of a single charged particle subject to a GBVISoftcoreForce
+ * and a NonbondedSoftcoreForce against the analytic Born + nonpolar energy.
+ *
+ * @param log  optional log file; nothing is written if NULL
+ */
+void testSingleParticle( FILE* log ) {
+
+    // parameters of the single particle
+    const double charge = 1.0;
+    const double radius = 0.15;
+    const double gamma  = 1.0;
+
+    System system;
+    system.addParticle(2.0);
+
+    GBVISoftcoreForce* forceField = new GBVISoftcoreForce;
+    forceField->addParticle(charge, radius, gamma);
+    system.addForce(forceField);
+
+    NonbondedSoftcoreForce* nonbonded = new NonbondedSoftcoreForce();
+    nonbonded->setNonbondedMethod(NonbondedSoftcoreForce::NoCutoff);
+    nonbonded->addParticle( charge, 1.0, 0.0);
+    system.addForce(nonbonded);
+
+    // evaluate the energy on the Cuda platform with the particle at the origin
+    VerletIntegrator integrator(0.01);
+    Context context(system, integrator, Platform::getPlatformByName( "Cuda") );
+    vector<Vec3> positions(1, Vec3(0, 0, 0));
+    context.setPositions(positions);
+    State state = context.getState(State::Energy);
+
+    // analytic result: the Born radius is taken equal to the particle radius
+    const double bornRadius     = radius;
+    const double tau            = (1.0/forceField->getSoluteDielectric()-1.0/forceField->getSolventDielectric());
+    const double bornEnergy     = (-charge*charge/(8*PI_M*EPSILON0))*tau/bornRadius;
+    const double nonpolarEnergy = -gamma*tau*std::pow( radius/bornRadius, 3.0);
+
+    const double expectedE      = (bornEnergy+nonpolarEnergy);
+    const double obtainedE      = state.getPotentialEnergy();
+
+    if( log ){
+        const double diff = fabs( obtainedE - expectedE );
+        (void) fprintf( log, "testSingleParticle expected=%14.6e obtained=%14.6e diff=%14.6e breakdown:[%14.6e %14.6e]\n",
+                        expectedE, obtainedE, diff, bornEnergy, nonpolarEnergy );
+    }
+    ASSERT_EQUAL_TOL(expectedE, obtainedE, 0.01);
+}
+
+void testEnergyEthaneSwitchingFunction( int useSwitchingFunction, FILE* log ) {
+
+    std::string methodName = "testEnergyEthaneSwitchingFunction";
+
+    System system;
+    const int numParticles = 8;
+    for( int i = 0; i < numParticles; i++ ){
+       system.addParticle(1.0);
+    }
+
+    double C_HBondDistance   = 0.1097;
+    double C_CBondDistance   = 0.1504;
+
+    NonbondedSoftcoreForce* nonbonded = new NonbondedSoftcoreForce();
+    nonbonded->setNonbondedMethod(NonbondedSoftcoreForce::NoCutoff);
+
+    double C_radius, C_gamma, C_charge, H_radius, H_gamma, H_charge;
+
+    int AM1_BCC = 1;
+    H_charge    = -0.053;
+    C_charge    = -3.0*H_charge;
+    if( AM1_BCC ){
+       C_radius =  0.180;
+//C_radius =  0.360;
+       C_gamma  = -0.2863;
+       C_gamma  =  1.0;
+       H_radius =  0.125;
+//H_radius =  0.25;
+       H_gamma  =  0.2437;
+       H_gamma  =  1.0;
+//H_charge = C_charge = 0.0;
+//H_gamma = C_gamma = 0.0;
+    } else {
+       C_radius =  0.215;
+       C_gamma  = -1.1087;
+       H_radius =  0.150;
+       H_gamma  =  0.1237;
+    }
+
+    // for ethane all Coulomb forces are excluded since all atoms 3 or
+    // fewer bonds away from all other atoms -- is this true for H's on
+    // difference carbons? -- should be computed in 14 ixn 
+  
+    double bornRadiusScaleFactorsEven = 0.5;
+    //double bornRadiusScaleFactorsEven = 1.0;
+    //double bornRadiusScaleFactorsOdd  = 0.5;
+    double bornRadiusScaleFactorsOdd  = 1.0;
+    if( log ){
+        (void) fprintf( log, "%s: Applying GB/VI\n", methodName.c_str() );
+        (void) fprintf( log, "C[%14.7e %14.7e %14.7e] H[%14.7e %14.7e %14.7e] scale[%.1f %.1f]\n",
+                    C_charge, C_radius, C_gamma, H_charge, H_radius, H_gamma,
+                    bornRadiusScaleFactorsEven, bornRadiusScaleFactorsOdd);
+    }
+
+    GBVISoftcoreForce* forceField             = new GBVISoftcoreForce();
+    for( int i = 0; i < numParticles; i++ ){
+       forceField->addParticle( H_charge, H_radius, H_gamma, (i%2) ? bornRadiusScaleFactorsOdd : bornRadiusScaleFactorsEven);
+       nonbonded->addParticle(  H_charge, H_radius, 0.0);
+    }
+
+    forceField->setParticleParameters( 1, C_charge, C_radius, C_gamma, bornRadiusScaleFactorsOdd);
+    nonbonded->setParticleParameters(  1, C_charge, C_radius, 0.0);
+
+    forceField->setParticleParameters( 4, C_charge, C_radius, C_gamma, bornRadiusScaleFactorsEven);
+    nonbonded->setParticleParameters(  4, C_charge, C_radius, 0.0);
+
+//       forceField->setParticleParameters( 8, C_charge, (C_radius+0.5), C_gamma, bornRadiusScaleFactorsEven);
+//       nonbonded->setParticleParameters(  8, C_charge, C_radius, 0.0);
+
+    if( useSwitchingFunction ){
+       forceField->setBornRadiusScalingMethod( GBVISoftcoreForce::QuinticSpline );
+    } else {
+       forceField->setBornRadiusScalingMethod( GBVISoftcoreForce::NoScaling );
+    }
+
+    forceField->addBond( 0, 1, C_HBondDistance );
+    forceField->addBond( 2, 1, C_HBondDistance );
+    forceField->addBond( 3, 1, C_HBondDistance );
+    forceField->addBond( 1, 4, C_CBondDistance );
+    forceField->addBond( 5, 4, C_HBondDistance );
+    forceField->addBond( 6, 4, C_HBondDistance );
+    forceField->addBond( 7, 4, C_HBondDistance );
+
+    std::vector<pair<int, int> > bonds;
+    std::vector<double> bondDistances;
+
+    bonds.push_back(pair<int, int>(0, 1));
+    bondDistances.push_back( C_HBondDistance );
+
+    bonds.push_back(pair<int, int>(2, 1));
+    bondDistances.push_back( C_HBondDistance );
+
+    bonds.push_back(pair<int, int>(3, 1));
+    bondDistances.push_back( C_HBondDistance );
+
+    bonds.push_back(pair<int, int>(1, 4));
+    bondDistances.push_back( C_CBondDistance );
+
+    bonds.push_back(pair<int, int>(5, 4));
+    bondDistances.push_back( C_HBondDistance );
+
+    bonds.push_back(pair<int, int>(6, 4));
+    bondDistances.push_back( C_HBondDistance );
+
+    bonds.push_back(pair<int, int>(7, 4));
+    bondDistances.push_back( C_HBondDistance );
+
+    nonbonded->createExceptionsFromBonds(bonds, 0.0, 0.0);
+
+    system.addForce(forceField);
+
+    system.addForce(nonbonded);
+
+    VerletIntegrator integrator1(0.01);
+    VerletIntegrator integrator2(0.01);
+    Context referenceContext(system, integrator1,  Platform::getPlatformByName( "Reference") );
+    Context context(system, integrator2,  Platform::getPlatformByName( "Cuda") );
+    
+    vector<Vec3> positions(numParticles);
+    positions[0] = Vec3(0.5480,    1.7661,    0.0000);
+    positions[1] = Vec3(0.7286,    0.8978,    0.6468);
+    positions[2] = Vec3(0.4974,    0.0000,    0.0588);
+    positions[3] = Vec3(0.0000,    0.9459,    1.4666);
+    positions[4] = Vec3(2.1421,    0.8746,    1.1615);
+    positions[5] = Vec3(2.3239,    0.0050,    1.8065);
+    positions[6] = Vec3(2.8705,    0.8295,    0.3416);
+    positions[7] = Vec3(2.3722,    1.7711,    1.7518);
+
+    //positions[8] = Vec3(2.1421,    0.8746,    2.1615);
+
+    vector<Vec3> originalPositions(numParticles);
+    for( int ii = 0; ii < numParticles; ii++ ){
+       originalPositions[ii][0] = positions[ii][0];
+       originalPositions[ii][1] = positions[ii][1];
+       originalPositions[ii][2] = positions[ii][2];
+    }
+
+    int tries                = 1;
+    double positionIncrement = 0.15;
+    for( int ii = 0; ii < tries; ii++ ){
+
+       context.setPositions(positions);
+       referenceContext.setPositions(positions);
+   
+       State state           = context.getState(State::Forces | State::Energy);
+       State referenceState  = referenceContext.getState(State::Forces | State::Energy);
+   
+       
+       if( log ){
+           (void) fprintf( log, "cudaE=%14.7e refE=%14.7e\n", state.getPotentialEnergy(), referenceState.getPotentialEnergy() );
+       }
+       
+       // Take a small step in the direction of the energy gradient.
+       
+       DoubleVector stats;
+       if( compareForcesOfTwoStates( state, referenceState, 0.001, stats, log ) ){
+          ASSERT_EQUAL_TOL(0.0, 1.0, 0.01)
+       }
+   
+       double norm        = 0.0;
+       double forceSum[3] = { 0.0, 0.0, 0.0 };
+       for (int i = 0; i < numParticles; ++i) {
+           Vec3 f       = state.getForces()[i];
+           norm        += f[0]*f[0] + f[1]*f[1] + f[2]*f[2];
+           forceSum[0] += f[0];
+           forceSum[1] += f[1];
+           forceSum[2] += f[2];
+       }
+       norm               = std::sqrt(norm);
+   
+       if( log ){
+           (void) fprintf( log, "Fsum [%14.7e %14.7e %14.7e] norm=%14.7e\n", forceSum[0], forceSum[1], forceSum[2], norm );
+       }
+   
+       const double delta = 1e-03;
+       double step        = delta/norm;
+       for (int i = 0; i < numParticles; ++i) {
+           Vec3 p = positions[i];
+           Vec3 f = state.getForces()[i];
+           positions[i] = Vec3(p[0]-f[0]*step, p[1]-f[1]*step, p[2]-f[2]*step);
+       }
+       context.setPositions(positions);
+       
+       State state2 = context.getState(State::Energy);
+   
+       double diff  = (state2.getPotentialEnergy()-state.getPotentialEnergy())/delta;
+       double off   = fabs( diff - norm )/norm;
+   
+       if( log ){
+           (void) fprintf( log, "%2d Energies %.8e %.8e norms[%13.7e %13.7e] deltaNorms=%13.7e delta=%.2e\n",
+                           ii, state.getPotentialEnergy(), state2.getPotentialEnergy(), diff, norm, off, delta );
+       }
+   
+       // See whether the potential energy changed by the expected amount.
+       
+       ASSERT_EQUAL_TOL(norm, (state2.getPotentialEnergy()-state.getPotentialEnergy())/delta, 1e-3*abs(state.getPotentialEnergy()) );
+
+       if( ii < (tries-1) ){
+           for( int jj = 0; jj < numParticles; jj++ ){
+              positions[jj][0]  = originalPositions[jj][0];
+              positions[jj][1]  = originalPositions[jj][1];
+              positions[jj][2]  = originalPositions[jj][2];
+           }
+       
+//           positions[8][2] -=  static_cast<double>(ii+1)*0.1;
+//           positions[8][2] -=  0.001;
+           if( log ){
+               (void) fprintf( log, "r48=%14.6e r28=%14.6e r24=%14.6e\n", positions[8][2]-positions[4][2], positions[8][2], positions[4][2] );
+           }
+       }
+#if 0
+       int carbonIndex    = 1;
+       int hydrogenIndex  = 0;
+       while( hydrogenIndex < 8 ){
+          Vec3 carbonDelta;
+          for( int kk = 0; kk < 3; kk++ ){
+             positions[hydrogenIndex][kk] += positionIncrement*(positions[carbonIndex][kk] - positions[hydrogenIndex][kk] );
+          }
+          double dist = 0.0;
+          for( int kk = 0; kk < 3; kk++ ){
+             dist += (positions[carbonIndex][kk] - positions[hydrogenIndex][kk] )*(positions[carbonIndex][kk] - positions[hydrogenIndex][kk]);
+          }
+           (void) fprintf( log, "H=%d C=%d r=%14.6e\n", hydrogenIndex, carbonIndex, dist );
+          hydrogenIndex++;
+          if( hydrogenIndex == carbonIndex ){
+             hydrogenIndex++;
+          }
+          if( carbonIndex == 1 && hydrogenIndex == 4 ){
+             carbonIndex    = 4;
+             hydrogenIndex  = 5;
+          }
+       }
+#endif
+
+   }
+}
+
+/**
+ * Create a heap-allocated duplicate of a GBVISoftcoreForce.
+ *
+ * The duplicate is produced with the GBVISoftcoreForce copy constructor; an earlier
+ * hand-written version copied the nonbonded method, cutoff, dielectrics, Born radius
+ * scaling settings, particles and bonds one call at a time.
+ *
+ * @param gbviSoftcoreForce   force to duplicate
+ * @return pointer to the newly allocated copy (allocated with new)
+ */
+static GBVISoftcoreForce* copyGbviSoftcoreForce( const GBVISoftcoreForce& gbviSoftcoreForce ){
+    return new GBVISoftcoreForce( gbviSoftcoreForce );
+}
+
+/**
+ * Create a heap-allocated duplicate of a GBVIForce using its copy constructor.
+ *
+ * @param gbviForce   force to duplicate
+ * @return pointer to the newly allocated copy (allocated with new)
+ */
+static GBVIForce* copyGbviForce( const GBVIForce& gbviForce ){
+    GBVIForce* duplicate = new GBVIForce( gbviForce );
+    return duplicate;
+}
+
+/**
+ * Create a heap-allocated duplicate of a GBSAOBCSoftcoreForce using its copy constructor.
+ *
+ * @param gbviSoftcoreForce   force to duplicate (an OBC softcore force, despite the parameter name)
+ * @return pointer to the newly allocated copy (allocated with new)
+ */
+static GBSAOBCSoftcoreForce* copyGBSAOBCSoftcoreForce( const GBSAOBCSoftcoreForce& gbviSoftcoreForce ){
+    GBSAOBCSoftcoreForce* duplicate = new GBSAOBCSoftcoreForce( gbviSoftcoreForce );
+    return duplicate;
+}
+
+/**
+ * Create a heap-allocated duplicate of a GBSAOBCForce using its copy constructor.
+ *
+ * @param gbviForce   force to duplicate (an OBC force, despite the parameter name)
+ * @return pointer to the newly allocated copy (allocated with new)
+ */
+static GBSAOBCForce* copyGbsaObcForce( const GBSAOBCForce& gbviForce ){
+    GBSAOBCForce* duplicate = new GBSAOBCForce( gbviForce );
+    return duplicate;
+}
+
+/**
+ * Build a randomly parameterized system containing a nonbonded force and an implicit
+ * solvent force, duplicate it, and hand both systems to runSystemComparisonTest()
+ * with the platform names "Cuda" and "Reference".
+ *
+ * Which force classes are used is selected at compile time:
+ *    USE_SOFTCORE defined       NonbondedSoftcoreForce + GBVISoftcoreForce/GBSAOBCSoftcoreForce
+ *    USE_SOFTCORE not defined   NonbondedForce         + GBVIForce/GBSAOBCForce
+ *    IMPLICIT_SOLVENT           GBVI_FLAG selects the GB/VI variant, OBC_FLAG the OBC variant
+ *
+ * Keys read from inputArgumentMap (defaults in parentheses):
+ *    lambda1 (1.0), lambda2 (1.0)   softcore lambda for the first/second half of the molecules
+ *    boxSize (10.0)                 edge length of the cubic periodic box
+ *    cutoffDistance (0.4*boxSize)   clamped to 0.5*boxSize when nonbondedMethod == 2
+ *    relativeTolerance (1.0e-04)    only logged here; the map itself is forwarded to runSystemComparisonTest()
+ *    positionPlacementMethod (0)    1 => PositionGenerator::SimpleGrid, otherwise PositionGenerator::Random
+ *    nonbondedMethod (0)            compared against NoCutoff/CutoffNonPeriodic; anything else => CutoffPeriodic
+ *    numMolecules (1)
+ *    numParticlesPerMolecule (2)
+ *    serialize (0)                  nonzero => call serializeSystemAndPositions() after the comparison
+ *
+ * @param inputArgumentMap   test settings (see above)
+ * @param log                optional log file; no diagnostics are written if NULL
+ */
+void testGbviSoftcore( MapStringToDouble& inputArgumentMap, FILE* log ){
+
+    double lambda1                       = 1.0;
+    double lambda2                       = 1.0;
+    int nonbondedMethod                  = 0;
+    int numMolecules                     = 1;
+    int numParticlesPerMolecule          = 2;
+    int useQuinticSpline                 = 1;
+    int positionPlacementMethod          = 0;
+    int serialize                        = 0;
+    double boxSize                       = 10.0;
+    double relativeTolerance             = 1.0e-04;
+
+    setDoubleFromMapStringToDouble( inputArgumentMap, "lambda1",                      lambda1 );
+    setDoubleFromMapStringToDouble( inputArgumentMap, "lambda2",                      lambda2 );
+    setDoubleFromMapStringToDouble( inputArgumentMap, "boxSize",                      boxSize );
+
+    // the default cutoff depends on the (possibly overridden) box size, so it is set after boxSize is read
+
+    double cutoffDistance                = boxSize*0.4;
+    setDoubleFromMapStringToDouble( inputArgumentMap, "cutoffDistance",               cutoffDistance);
+    setDoubleFromMapStringToDouble( inputArgumentMap, "relativeTolerance",            relativeTolerance );
+
+    setIntFromMapStringToDouble(    inputArgumentMap, "positionPlacementMethod",      positionPlacementMethod ) ;
+    setIntFromMapStringToDouble(    inputArgumentMap, "nonbondedMethod",              nonbondedMethod );
+    setIntFromMapStringToDouble(    inputArgumentMap, "numMolecules",                 numMolecules );
+    setIntFromMapStringToDouble(    inputArgumentMap, "numParticlesPerMolecule",      numParticlesPerMolecule );
+    setIntFromMapStringToDouble(    inputArgumentMap, "serialize",                    serialize );
+   
+    // for the periodic method (2) the cutoff may not exceed half the box size
+
+    if( nonbondedMethod == 2 && cutoffDistance > boxSize*0.5 ){
+        cutoffDistance = boxSize*0.5;
+    }
+
+    int numParticles                     = numMolecules*numParticlesPerMolecule;
+    int includeGbvi                      = 1;
+    double reactionFieldDielectric       = 80.0;
+
+    if( log ){
+        double particleDensity = static_cast<double>(numParticles)/(boxSize*boxSize*boxSize);
+        double particleCube    = pow( particleDensity, (-1.0/3.0) );
+      
+        (void) fprintf( log, "\n--------------------------------------------------------------------------------------\n" );
+        (void) fprintf( log, "Input arguments\n" );
+        (void) fflush( log );
+        (void) fprintf( log, "    includeGbvi                 %d\n", includeGbvi );
+        (void) fprintf( log, "    nonbondedMethod             %d\n", nonbondedMethod );
+        (void) fprintf( log, "    numParticles                %d\n", numParticles );
+        (void) fprintf( log, "    numMolecules                %d\n", numMolecules );
+        (void) fprintf( log, "    numParticlesPerMolecule     %d\n", numParticlesPerMolecule );
+        (void) fprintf( log, "    useQuinticSpline            %d\n", useQuinticSpline );
+        (void) fprintf( log, "    positionPlacementMethod     %d\n", positionPlacementMethod);
+
+#ifdef USE_SOFTCORE
+        (void) fprintf( log, "    lambda1                     %8.3f\n", lambda1 );
+        (void) fprintf( log, "    lambda2                     %8.3f\n", lambda2 );
+#endif
+        (void) fprintf( log, "    boxSize                     %8.3f\n", boxSize );
+        (void) fprintf( log, "    cutoffDistance              %8.3f\n", cutoffDistance );
+        (void) fprintf( log, "    reactionFieldDielectric     %8.3f\n", reactionFieldDielectric );
+        (void) fprintf( log, "    relativeTolerance           %8.1e\n", relativeTolerance );
+        (void) fprintf( log, "    particleDensity             %8.2e\n", particleDensity );
+        (void) fprintf( log, "    particleCube                %8.2e\n", particleCube );
+    }
+
+    // Create the system to be tested; a copy of it (system and forces) is made further below
+    // and the two are passed to runSystemComparisonTest() for the "Cuda" and "Reference" platforms.
+
+    System standardSystem;
+    for (int i = 0; i < numParticles; i++) {
+        standardSystem.addParticle(1.0);
+    }
+    standardSystem.setDefaultPeriodicBoxVectors(Vec3(boxSize, 0, 0), Vec3(0, boxSize, 0), Vec3(0, 0, boxSize));
+
+    // nonbonded force: class depends on USE_SOFTCORE, method on nonbondedMethod
+
+#ifdef USE_SOFTCORE
+    NonbondedSoftcoreForce* nonbondedSoftcoreForce   = new NonbondedSoftcoreForce();
+    if( nonbondedMethod == NoCutoff ){
+        nonbondedSoftcoreForce->setNonbondedMethod( NonbondedSoftcoreForce::NoCutoff );
+    } else {
+        if( nonbondedMethod == CutoffNonPeriodic ){
+            nonbondedSoftcoreForce->setNonbondedMethod( NonbondedSoftcoreForce::CutoffNonPeriodic );
+        } else {
+            nonbondedSoftcoreForce->setNonbondedMethod( NonbondedSoftcoreForce::CutoffPeriodic );
+        }
+    }
+#else
+    NonbondedForce* nonbondedSoftcoreForce = new NonbondedForce();
+    if( nonbondedMethod == NoCutoff ){
+        nonbondedSoftcoreForce->setNonbondedMethod( NonbondedForce::NoCutoff );
+    } else {
+        if( nonbondedMethod == CutoffNonPeriodic ){
+            nonbondedSoftcoreForce->setNonbondedMethod( NonbondedForce::CutoffNonPeriodic );
+        } else {
+            nonbondedSoftcoreForce->setNonbondedMethod( NonbondedForce::CutoffPeriodic );
+        }
+    }
+#endif
+    nonbondedSoftcoreForce->setCutoffDistance( cutoffDistance );
+    nonbondedSoftcoreForce->setReactionFieldDielectric( reactionFieldDielectric );
+
+    // implicit solvent force: class depends on USE_SOFTCORE and IMPLICIT_SOLVENT;
+    // the variable is named gbviSoftcoreForce in all four configurations
+
+#ifdef USE_SOFTCORE
+
+#if IMPLICIT_SOLVENT == GBVI_FLAG
+    GBVISoftcoreForce* gbviSoftcoreForce             = new GBVISoftcoreForce();
+    if( nonbondedMethod == NoCutoff ){
+        gbviSoftcoreForce->setNonbondedMethod( GBVISoftcoreForce::NoCutoff );
+    } else {
+        if( nonbondedMethod == CutoffNonPeriodic ){
+            gbviSoftcoreForce->setNonbondedMethod( GBVISoftcoreForce::CutoffNonPeriodic );
+        } else {
+            gbviSoftcoreForce->setNonbondedMethod( GBVISoftcoreForce::CutoffPeriodic );
+        }
+    }
+#else
+    GBSAOBCSoftcoreForce* gbviSoftcoreForce          = new GBSAOBCSoftcoreForce();
+    if( nonbondedMethod == NoCutoff ){
+        gbviSoftcoreForce->setNonbondedMethod( GBSAOBCSoftcoreForce::NoCutoff );
+    } else {
+        if( nonbondedMethod == CutoffNonPeriodic ){
+            gbviSoftcoreForce->setNonbondedMethod( GBSAOBCSoftcoreForce::CutoffNonPeriodic );
+        } else {
+            gbviSoftcoreForce->setNonbondedMethod( GBSAOBCSoftcoreForce::CutoffPeriodic );
+        }
+    }
+#endif
+
+#else
+
+#if IMPLICIT_SOLVENT == GBVI_FLAG
+    GBVIForce* gbviSoftcoreForce           = new GBVIForce();
+    if( nonbondedMethod == NoCutoff ){
+        gbviSoftcoreForce->setNonbondedMethod( GBVIForce::NoCutoff );
+    } else {
+        if( nonbondedMethod == CutoffNonPeriodic ){
+            gbviSoftcoreForce->setNonbondedMethod( GBVIForce::CutoffNonPeriodic );
+        } else {
+            gbviSoftcoreForce->setNonbondedMethod( GBVIForce::CutoffPeriodic );
+        }
+    }
+
+#else
+
+    GBSAOBCForce* gbviSoftcoreForce           = new GBSAOBCForce();
+    if( nonbondedMethod == NoCutoff ){
+        gbviSoftcoreForce->setNonbondedMethod( GBSAOBCForce::NoCutoff );
+    } else {
+        if( nonbondedMethod == CutoffNonPeriodic ){
+            gbviSoftcoreForce->setNonbondedMethod( GBSAOBCForce::CutoffNonPeriodic );
+        } else {
+            gbviSoftcoreForce->setNonbondedMethod( GBSAOBCForce::CutoffPeriodic );
+        }
+    }
+
+#endif
+
+#endif
+
+    // Born radius scaling is only set for the GB/VI variants
+
+#if IMPLICIT_SOLVENT == GBVI_FLAG
+#ifdef USE_SOFTCORE
+    if( useQuinticSpline ){
+        gbviSoftcoreForce->setBornRadiusScalingMethod( GBVISoftcoreForce::QuinticSpline );
+    } else {
+        gbviSoftcoreForce->setBornRadiusScalingMethod( GBVISoftcoreForce::NoScaling );
+    }
+#else
+    if( useQuinticSpline ){
+        gbviSoftcoreForce->setBornRadiusScalingMethod( GBVIForce::QuinticSpline );
+    } else {
+        gbviSoftcoreForce->setBornRadiusScalingMethod( GBVIForce::NoScaling );
+    }
+#endif
+#endif
+
+    gbviSoftcoreForce->setSolventDielectric( 78.3 );
+    //gbviSoftcoreForce->setSolventDielectric( 1.0e+10 );
+    //gbviSoftcoreForce->setSolventDielectric( 1.0 );
+    gbviSoftcoreForce->setSoluteDielectric( 1.0 );
+    gbviSoftcoreForce->setCutoffDistance( nonbondedSoftcoreForce->getCutoffDistance( ) );
+
+    std::vector<Vec3> positions(numParticles);
+
+    // fixed seed (0) for the random number generator
+
+    OpenMM_SFMT::SFMT sfmt;
+    init_gen_rand(0, sfmt);
+
+    PositionGenerator positionGenerator( numMolecules, numParticlesPerMolecule, boxSize );
+    if( log ){
+        positionGenerator.setLog( log );
+    }
+    if( positionPlacementMethod == 1 ){
+        positionGenerator.setPositions( PositionGenerator::SimpleGrid, sfmt, positions );
+    } else {
+        positionGenerator.setBondDistance( 0.3 );
+        positionGenerator.setPositions( PositionGenerator::Random, sfmt, positions );
+    }
+
+    // show info on particle positions
+
+    if( log ){
+        Vec3 box[2];
+        positionGenerator.getEnclosingBox( positions, box );
+        (void) fprintf( log, "Enclosing Box (in A): [%15.7e %15.7e] [%15.7e %15.7e] [%15.7e %15.7e]   [%15.7e %15.7e %15.7e]\n",
+                        box[0][0], box[1][0], box[0][1], box[1][1], box[0][2], box[1][2],
+                        (box[1][0] - box[0][0]), (box[1][1] - box[0][1]), (box[1][2] - box[0][2]) );
+
+        // number of entries shown from each end of the sorted distance list
+
+        const unsigned int showIndex         = 5;
+        int periodicBoundaryConditions       = (nonbondedMethod == 2) ? 1 : 0;
+
+        // report distances for the first and last particle
+
+        IntVector positionIndexVector;
+        positionIndexVector.push_back( 0 );
+        positionIndexVector.push_back( static_cast<int>(positions.size())-1 );
+        //positionIndexVector.push_back( 542 );
+
+        for( unsigned int ii = 0; ii < positionIndexVector.size(); ii++ ){
+            int positionIndex = positionIndexVector[ii];
+
+            // skip invalid indices (e.g. -1 when there are no particles)
+
+            if( positionIndex >= 0 && static_cast<unsigned int>(positionIndex) < positions.size() ){
+                IntDoublePairVector sortVector;
+                positionGenerator.getSortedDistances( periodicBoundaryConditions, positionIndex, positions, sortVector );
+                (void) fprintf( log, "Min/max distance from %6d:\n    ", positionIndex );
+                for( unsigned int jj = 0; jj < sortVector.size() && jj < showIndex; jj++ ){
+                    IntDoublePair entry = sortVector[jj];
+                    (void) fprintf( log, "[%6d %15.7e] ", entry.first, entry.second);
+                }
+                (void) fprintf( log, "\n    " );
+
+                // last showIndex entries; start at 0 if the list is shorter than showIndex
+                // (an unguarded size()-showIndex would wrap around as an unsigned value)
+
+                unsigned int tailStart = 0;
+                if( sortVector.size() > showIndex ){
+                    tailStart = static_cast<unsigned int>(sortVector.size() - showIndex);
+                }
+                for( unsigned int jj = tailStart; jj < sortVector.size(); jj++ ){
+                    IntDoublePair entry = sortVector[jj];
+                    (void) fprintf( log, "[%6d %15.7e] ", entry.first, entry.second);
+                }
+                (void) fprintf( log, "\n" );
+            }
+        }
+
+        // distances for a fixed set of particle pairs; pairs outside the particle range are skipped
+
+        IntIntPairVector pairs;
+        pairs.push_back( IntIntPair( 732, 0 ) );
+        pairs.push_back( IntIntPair( 732, 1 ) );
+        pairs.push_back( IntIntPair( 732, 2 ) );
+        pairs.push_back( IntIntPair( 732, 3 ) );
+        pairs.push_back( IntIntPair( 732, 4 ) );
+        for( IntIntPairVectorCI ii = pairs.begin(); ii != pairs.end(); ii++ ){
+            if( static_cast<unsigned int>(ii->first) < positions.size() && static_cast<unsigned int>(ii->second) < positions.size() ){
+                 double d = positionGenerator.getDistance( ii->first, ii->second, positions );
+                 (void) fprintf( log, "Distance %6d %6d  %15.7e d2=%15.7e\n", ii->first, ii->second,  d, d*d );
+            }
+        }
+    }    
+
+    // bounds used to randomize the per-particle parameters
+
+    const int numberOfParameters             = 5;
+
+    const int ChargeIndex                    = 0;
+    const int SigmaIndex                     = 1;
+    const int EpsIndex                       = 2;
+    const int GammaIndex                     = 3;
+    const int LambdaIndex                    = 4;
+
+    std::vector<double> parameterLowerBound( numberOfParameters, 0.0 );
+
+    double fixedCharge                       = 1.0;
+    parameterLowerBound[ChargeIndex]         = fixedCharge;  // charge
+    parameterLowerBound[SigmaIndex]          = 0.1;          // sigma
+    parameterLowerBound[EpsIndex]            = 0.5;          // eps
+    parameterLowerBound[GammaIndex]          = 0.1;          // gamma
+    parameterLowerBound[LambdaIndex]         = lambda1;      // lambda
+
+    std::vector<double> parameterUpperBound( parameterLowerBound );
+    parameterUpperBound[ChargeIndex]         = fixedCharge;  // charge
+    parameterUpperBound[SigmaIndex]          = 0.3;          // sigma
+    parameterUpperBound[EpsIndex]            = 40.0;         // eps
+    parameterUpperBound[GammaIndex]          = 40.0;         // gamma
+
+#if IMPLICIT_SOLVENT == OBC_FLAG
+    parameterLowerBound[GammaIndex]          = 0.1;          // overlap factor
+    parameterUpperBound[GammaIndex]          = 1.5;        
+#endif
+
+    std::vector<double> parameters( numberOfParameters );
+    double charge = fixedCharge;
+
+    // add particles molecule by molecule; the charge passed to addParticle() is the running
+    // 'charge' value (sign flipped for each particle), not the randomized charge parameter
+
+    for( int ii = 0; ii < numMolecules; ii++) {
+
+        charge       *= -1.0;
+
+        // first half of the molecules use lambda1, the rest lambda2
+
+        double lambda =  ii < (numMolecules/2) ? lambda1 : lambda2;
+        randomizeParameters( parameterLowerBound, parameterUpperBound, sfmt, parameters );
+
+#ifdef USE_SOFTCORE
+        nonbondedSoftcoreForce->addParticle(   charge,  parameters[SigmaIndex],  parameters[EpsIndex],    lambda );
+        gbviSoftcoreForce->addParticle(        charge,  parameters[SigmaIndex],  parameters[GammaIndex],  lambda );
+#else
+        nonbondedSoftcoreForce->addParticle(   charge,  parameters[SigmaIndex],  parameters[EpsIndex] );
+        gbviSoftcoreForce->addParticle(        charge,  parameters[SigmaIndex],  parameters[GammaIndex] );
+#endif
+
+        int baseParticleIndex                    = ii*numParticlesPerMolecule;
+        for( int jj = 1; jj < numParticlesPerMolecule; jj++) {
+
+            // alternate charges
+
+            charge *= -1.0;
+
+            randomizeParameters( parameterLowerBound, parameterUpperBound, sfmt, parameters );
+
+#ifdef USE_SOFTCORE
+            nonbondedSoftcoreForce->addParticle(   charge,  parameters[SigmaIndex],  parameters[EpsIndex],    lambda );
+            gbviSoftcoreForce->addParticle(        charge,  parameters[SigmaIndex],  parameters[GammaIndex],  lambda );
+#else
+            nonbondedSoftcoreForce->addParticle(   charge,  parameters[SigmaIndex],  parameters[EpsIndex] );
+            gbviSoftcoreForce->addParticle(        charge,  parameters[SigmaIndex],  parameters[GammaIndex] );
+#endif
+
+            // each additional particle gets an exception with (and, for GB/VI, a bond to) the molecule's first particle
+
+            nonbondedSoftcoreForce->addException( baseParticleIndex, baseParticleIndex+jj, 0.0f, 1.0, 0.0f );
+
+#if IMPLICIT_SOLVENT == GBVI_FLAG
+            double bondDistance  = positionGenerator.getDistance( baseParticleIndex, baseParticleIndex+jj, positions );
+            gbviSoftcoreForce->addBond( baseParticleIndex, baseParticleIndex+jj,  bondDistance );
+#endif
+        }
+
+        // alternate charge if numParticlesPerMolecule is odd
+
+        if( (numParticlesPerMolecule % 2) ){
+            charge *= -1.0;
+        }
+    }
+
+    standardSystem.addForce(nonbondedSoftcoreForce);
+    if( includeGbvi ){
+        standardSystem.addForce(gbviSoftcoreForce);
+    }
+
+    // copy system and forces
+
+    System* systemCopy = copySystem( standardSystem );
+
+#ifdef USE_SOFTCORE
+    NonbondedSoftcoreForce* nonbondedSoftcoreForceCopy;
+    nonbondedSoftcoreForceCopy = copyNonbondedSoftcoreForce( *nonbondedSoftcoreForce );
+#else
+    NonbondedForce* nonbondedSoftcoreForceCopy;
+    nonbondedSoftcoreForceCopy = copyNonbondedForce( *nonbondedSoftcoreForce );
+#endif
+    systemCopy->addForce( nonbondedSoftcoreForceCopy );
+
+    // base name handed to serializeSystemAndPositions(); encodes the force type (and lambda2 for softcore builds)
+
+    std::stringstream baseFileName;
+
+    if( includeGbvi ){
+#ifdef USE_SOFTCORE
+
+#if IMPLICIT_SOLVENT == GBVI_FLAG
+        GBVISoftcoreForce* gBVISoftcoreForceCopy  = copyGbviSoftcoreForce( *gbviSoftcoreForce );
+        baseFileName  << "GBVISoftcore";
+#endif
+#if IMPLICIT_SOLVENT == OBC_FLAG
+        baseFileName  << "GBSAObcSoftcore";
+        GBSAOBCSoftcoreForce* gBVISoftcoreForceCopy       = copyGBSAOBCSoftcoreForce( *gbviSoftcoreForce );
+#endif
+        baseFileName  << "_lbda" << std::fixed << setprecision(2) << lambda2;
+
+#else
+
+#if IMPLICIT_SOLVENT == GBVI_FLAG
+        GBVIForce* gBVISoftcoreForceCopy          = copyGbviForce( *gbviSoftcoreForce );
+        baseFileName  << "Gbvi";
+#endif
+#if IMPLICIT_SOLVENT == OBC_FLAG
+        GBSAOBCForce* gBVISoftcoreForceCopy       = copyGbsaObcForce( *gbviSoftcoreForce );
+        baseFileName  << "GBSAOBC";
+#endif
+
+#endif
+        systemCopy->addForce( gBVISoftcoreForceCopy );
+    }
+
+    // perform comparison
+
+    std::stringstream idString;
+    idString << "Nb " << nonbondedMethod << " l2 " << std::fixed << setprecision(2) << lambda2;
+    runSystemComparisonTest( standardSystem, *systemCopy, "Cuda", "Reference", positions, inputArgumentMap, idString.str(), log );
+
+    // serialize (only if requested via the 'serialize' argument)
+
+    if( serialize ){
+        baseFileName  << "_N"     << positions.size();
+        baseFileName  << "_Nb"    << nonbondedMethod;
+        serializeSystemAndPositions( standardSystem, positions, baseFileName.str(), log);
+    }
+
+    delete systemCopy;
+
+}
+
+/**
+ * Test driver: registers the free energy Cuda kernel factories, builds a list of
+ * argument maps from a baseline map plus per-argument value lists, and calls
+ * testGbviSoftcore() once for each map.
+ *
+ * @return 0 on success, 1 if an exception derived from std::exception was caught
+ */
+int main() {
+
+    try {
+
+        registerFreeEnergyCudaKernelFactories( );
+
+        // diagnostic output is off; point this at stderr to turn it on
+
+        //FILE* log = stderr;
+        FILE* log = NULL;
+
+        // currently disabled:
+        //     testSingleParticle( log );
+        //     testEnergyEthaneSwitchingFunction( 0, log );
+        //     testEnergyEthaneSwitchingFunction( 1, log );
+
+        // baseline arguments shared by all generated test cases
+
+        const double baseBoxSize = 5.0;
+
+        MapStringToDouble baseArguments;
+        baseArguments["lambda2"]                         = 1.0;
+        baseArguments["nonbondedMethod"]                 = 0;
+        baseArguments["numMolecules"]                    = 10;
+        baseArguments["boxSize"]                         = baseBoxSize;
+        baseArguments["positionPlacementMethod"]         = 0;
+        baseArguments["cutoffDistance"]                  = 0.3*baseBoxSize;
+        //baseArguments["cutoffDistance"]                  = 1.0;
+        baseArguments["relativeTolerance"]               = 5.0e-04;
+        baseArguments["serialize"]                       = 1;
+        //baseArguments["numParticlesPerMolecule"]         = 2;
+
+        // value lists handed to generateInputArgumentMapsFromStringVectors();
+        // the baseline map takes the first value of each list
+
+        MapStringToDoubleVector sweptArguments;
+
+#ifdef USE_SOFTCORE
+        DoubleVector lambda2Values;
+        lambda2Values.push_back( 1.0 );
+        lambda2Values.push_back( 0.5 );
+        lambda2Values.push_back( 0.0 );
+        sweptArguments["lambda2"] = lambda2Values;
+        baseArguments["lambda2"]  = lambda2Values[0];
+#endif
+
+        DoubleVector moleculeCounts;
+        moleculeCounts.push_back( 10 );
+        moleculeCounts.push_back( 100 );
+        moleculeCounts.push_back( 1000 );
+        //moleculeCounts.push_back( 2000 );
+        //moleculeCounts.push_back( 4000 );
+        //moleculeCounts.push_back( 8000 );
+        sweptArguments["numMolecules"] = moleculeCounts;
+        baseArguments["numMolecules"]  = moleculeCounts[0];
+
+        DoubleVector nonbondedMethods;
+        nonbondedMethods.push_back( 0 );
+        nonbondedMethods.push_back( 1 );
+        nonbondedMethods.push_back( 2 );
+        sweptArguments["nonbondedMethod"] = nonbondedMethods;
+        baseArguments["nonbondedMethod"]  = nonbondedMethods[0];
+
+        VectorOfMapStringToDouble testCases;
+        testCases.push_back( baseArguments );
+        generateInputArgumentMapsFromStringVectors( sweptArguments, testCases ); 
+
+        // big box/many particle tests (disabled); when enabled they replace the test cases built above
+
+        //const bool bigBox = true;
+        const bool bigBox = false;
+        if( bigBox ){
+
+            MapStringToDouble bigBoxArguments;
+            bigBoxArguments["lambda2"]                         = 1.0;
+            bigBoxArguments["nonbondedMethod"]                 = 1;
+            bigBoxArguments["numMolecules"]                    = 10;
+            bigBoxArguments["boxSize"]                         = 20.0;
+            bigBoxArguments["relativeTolerance"]               = 6.0e-04;
+
+            VectorOfMapStringToDouble bigBoxTestCases;
+            bigBoxTestCases.push_back( bigBoxArguments );
+
+            // reuse the value lists, overriding the molecule counts and nonbonded methods
+
+            moleculeCounts.resize( 0 );
+            moleculeCounts.push_back( 4000 );
+            sweptArguments["numMolecules"] = moleculeCounts;
+
+            nonbondedMethods.resize( 0 );
+            nonbondedMethods.push_back( 1 );
+            nonbondedMethods.push_back( 2 );
+            sweptArguments["nonbondedMethod"] = nonbondedMethods;
+
+            generateInputArgumentMapsFromStringVectors( sweptArguments, bigBoxTestCases ); 
+
+            testCases.resize( 0 );
+            testCases.insert( testCases.end(), bigBoxTestCases.begin(), bigBoxTestCases.end() );
+        }
+
+        // when logging, sort the test cases and print a one-line summary of each
+        // (note: the sort, and hence the run order, only applies when log is set)
+
+        if( log ){
+
+            MapStringToInt exclude;
+            exclude["lambda1"]                 = 1;
+            exclude["numParticlesPerMolecule"] = 1;
+
+            StringVector printOrder;
+            printOrder.push_back( "numMolecules" );
+            printOrder.push_back( "nonbondedMethod" );
+            printOrder.push_back( "lambda2" );
+            printOrder.push_back( "boxSize" );
+
+            std::sort( testCases.begin(), testCases.end(), TestMapSortPredicate);
+
+            std::stringstream summary;
+            for( unsigned int caseIndex = 0; caseIndex < testCases.size(); caseIndex++ ){
+                streamArgumentMapOneLine( testCases[caseIndex], exclude, printOrder, caseIndex, summary );
+            }
+            (void) fprintf( log, "Initial argument maps: %u\n%s", static_cast<unsigned int>(testCases.size()), summary.str().c_str() );
+        }
+
+        // run tests, pausing two seconds after each one
+
+        for( unsigned int caseIndex = 0; caseIndex < testCases.size(); caseIndex++ ){
+            testGbviSoftcore( testCases[caseIndex], log );
+            sleep(2);
+        }
+
+    } catch(const exception& e) {
+        cout << "exception: " << e.what() << endl;
+        return 1;
+    }
+
+    cout << "Done" << endl;
+    return 0;
+}
--- a/plugins/freeEnergy/platforms/cuda/tests/TestCudaSoftcoreForce.h
+++ b/plugins/freeEnergy/platforms/cuda/tests/TestCudaSoftcoreForce.h
+#ifndef TEST_CUDA_SOFTCORE_H_
+#define TEST_CUDA_SOFTCORE_H_
+
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008-2009 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+/**
+ * Utility methods shared across unit tests
+ */
+
+#include "../../../tests/AssertionUtilities.h"
+#include "openmm/Context.h"
+#include "openmm/System.h"
+#include "../src/SimTKUtilities/SimTKOpenMMRealType.h"
+#include "OpenMM.h"
+
+#include "OpenMMFreeEnergy.h"
+#include "openmm/freeEnergyKernels.h"
+
+#include "sfmt/SFMT.h"
+#include "openmm/VerletIntegrator.h"
+
+#ifdef OPENMM_SERIALIZE
+#include "../../../../../serialization/include/openmm/serialization/SerializationProxy.h"
+#include "../../../../../serialization/include/openmm/serialization/SerializationNode.h"
+#include "../../../../../serialization/include/openmm/serialization/XmlSerializer.h"
+//extern "C" void registerSerializationProxies();
+//extern "C" void registerAmoebaSerializationProxies();
+#endif
+
+#include <iostream>
+#include <vector>
+#include <algorithm>
+
+#include <iostream>
+#include <cstdio>
+#include <vector>
+#include <typeinfo>
+
+extern "C" void registerFreeEnergyCudaKernelFactories();
+
+using namespace OpenMM;
+using namespace std;
+
+const double TOL = 1e-5; // tolerance constant; NOTE(review): presumably the default comparison tolerance for tests including this header -- confirm at call sites
+// integer codes naming three cutoff modes (none / non-periodic / periodic)
+static const int NoCutoff          = 0;
+static const int CutoffNonPeriodic = 1;
+static const int CutoffPeriodic    = 2;
+// container aliases: each group declares a container type plus its iterator/const_iterator
+typedef std::vector<std::string> StringVector;
+typedef StringVector::iterator StringVectorI;
+typedef StringVector::const_iterator StringVectorCI;
+typedef std::vector<StringVector> StringVectorVector;
+// vector of int and vector of vector of int
+typedef std::vector<int> IntVector;
+typedef IntVector::iterator IntVectorI;
+typedef IntVector::const_iterator IntVectorCI;
+typedef std::vector<IntVector> IntVectorVector;
+// vector of double and vector of vector of double
+typedef std::vector<double> DoubleVector;
+typedef DoubleVector::iterator DoubleVectorI;
+typedef DoubleVector::const_iterator DoubleVectorCI;
+typedef std::vector<DoubleVector> DoubleVectorVector;
+
+// the following are used in parsing parameter file
+
+typedef std::vector<std::string> StringVector; // identical redeclaration of the StringVector typedefs above
+typedef StringVector::iterator StringVectorI;
+typedef StringVector::const_iterator StringVectorCI;
+// same underlying type as StringVectorVector above, under a second name
+typedef std::vector<StringVector> VectorStringVector;
+typedef VectorStringVector::iterator VectorStringVectorI;
+typedef VectorStringVector::const_iterator VectorStringVectorCI;
+// same underlying type as DoubleVectorVector above, under a second name
+typedef std::vector<std::vector<double> > VectorOfVectors;
+typedef VectorOfVectors::iterator VectorOfVectorsI;
+typedef VectorOfVectors::const_iterator VectorOfVectorsCI;
+// NOTE(review): std::map is used from here on but <map> is not in this header's visible include list -- presumably pulled in transitively; confirm
+typedef std::map< int, int> MapIntInt;
+typedef MapIntInt::iterator MapIntIntI;
+typedef MapIntInt::const_iterator MapIntIntCI;
+// map keyed on a double value
+typedef std::map< double, int> MapDoubleToInt;
+typedef MapDoubleToInt::iterator MapDoubleToIntI;
+typedef MapDoubleToInt::const_iterator MapDoubleToIntCI;
+// string key -> vector of vector of double
+typedef std::map< std::string, VectorOfVectors > MapStringVectorOfVectors;
+typedef MapStringVectorOfVectors::iterator MapStringVectorOfVectorsI;
+typedef MapStringVectorOfVectors::const_iterator MapStringVectorOfVectorsCI;
+// string key -> vector of strings
+typedef std::map< std::string, StringVector > MapStringStringVector;
+typedef MapStringStringVector::iterator MapStringStringVectorI;
+typedef MapStringStringVector::const_iterator MapStringStringVectorCI;
+// string key -> string value; argument type of the set*FromMap helpers defined in this header
+typedef std::map< std::string, std::string > MapStringString;
+typedef MapStringString::iterator MapStringStringI;
+typedef MapStringString::const_iterator MapStringStringCI;
+// NOTE(review): despite its name the next typedef maps string -> string (same type as MapStringString); MapStringInt below maps to int -- confirm which was intended
+typedef std::map< std::string, std::string > MapStringToInt;
+typedef MapStringToInt::iterator MapStringToIntI;
+typedef MapStringToInt::const_iterator MapStringToIntCI;
+// vector of string -> string maps
+typedef std::vector< std::map< std::string, std::string > > VectorMapStringString;
+typedef VectorMapStringString::iterator VectorMapStringStringI;
+typedef VectorMapStringString::const_iterator VectorMapStringStringCI;
+// string key -> int; used by getStringForceMap to record force indices
+typedef std::map< std::string, int > MapStringInt;
+typedef MapStringInt::iterator MapStringIntI;
+typedef MapStringInt::const_iterator MapStringIntCI;
+// string key -> vector of Vec3
+typedef std::map< std::string,  std::vector<OpenMM::Vec3> > MapStringVec3;
+typedef MapStringVec3::iterator MapStringVec3I;
+typedef MapStringVec3::const_iterator MapStringVec3CI;
+// string key -> double; argument type of the set*FromMapStringToDouble helpers
+typedef std::map< std::string, double > MapStringToDouble;
+typedef MapStringToDouble::iterator MapStringToDoubleI;
+typedef MapStringToDouble::const_iterator MapStringToDoubleCI;
+typedef std::vector< MapStringToDouble > VectorOfMapStringToDouble;
+// string key -> vector of double
+typedef std::map< std::string, DoubleVector> MapStringToDoubleVector;
+typedef MapStringToDoubleVector::iterator MapStringToDoubleVectorI;
+typedef MapStringToDoubleVector::const_iterator MapStringToDoubleVectorCI;
+
+typedef std::map< std::string, DoubleVector  > MapStringToDoubleVector; // identical redeclaration of the typedef just above
+// string key -> non-owning Force pointer (no ownership semantics are expressed by the type)
+typedef std::map< std::string, Force*> MapStringForce;
+typedef MapStringForce::iterator MapStringForceI;
+typedef MapStringForce::const_iterator MapStringForceCI;
+// int key -> vector of int (only a const_iterator alias is declared)
+typedef std::map< int, IntVector> MapIntIntVector;
+typedef MapIntIntVector::const_iterator MapIntIntVectorCI;
+// (int,int) pairs and vectors of them
+typedef std::pair<int, int> IntIntPair;
+typedef std::vector<IntIntPair> IntIntPairVector;
+typedef IntIntPairVector::iterator IntIntPairVectorI;
+typedef IntIntPairVector::const_iterator IntIntPairVectorCI;
+// (int,double) pairs; PositionGenerator::getSortedDistances fills a vector of these as (index, distance)
+typedef std::pair<int, double> IntDoublePair;
+typedef std::vector<IntDoublePair> IntDoublePairVector;
+typedef IntDoublePairVector::iterator IntDoublePairVectorI;
+typedef IntDoublePairVector::const_iterator IntDoublePairVectorCI;
+
+/**
+ * Generates particle positions for test systems made of numMolecules molecules,
+ * each with numParticlesPerMolecule particles, inside a cubic box of edge boxSize.
+ * Positions can be placed at random or on a simple grid.
+ */
+class PositionGenerator {
+
+public:
+
+    /** 
+     * Placement methods accepted by setPositions()
+     */
+    enum GenerationMethod {
+        /** 
+         * Random positions
+         */
+        Random = 0,
+        /** 
+         * On grid
+         */
+        SimpleGrid = 1
+    };  
+
+    /** 
+     * Constructor
+     *
+     * @param numMolecules              number of molecules
+     * @param numParticlesPerMolecule   number of particles in each molecule
+     * @param boxSize                   edge length of the (cubic) box
+     */
+     
+    PositionGenerator( int numMolecules, int numParticlesPerMolecule, double boxSize );
+
+    ~PositionGenerator( );
+
+    /** 
+     * Set logging file reference
+     *
+     * @param log                       log (may be NULL, in which case nothing is logged)
+     *
+     */
+     
+    void setLog( FILE* log );
+    
+    /** 
+     * Set positions using a random number generator seeded with the internal seed
+     *
+     * @param method                    method placement
+     * @param positions                 output vector of positions
+     *
+     * @return value returned by setParticlesOnGrid() for SimpleGrid; 0 otherwise
+     */
+     
+    int setPositions( GenerationMethod method, std::vector<Vec3>& positions ) const;
+    
+    /** 
+     * Set positions
+     *
+     * @param method                    method placement
+     * @param sfmt                      input random number generator
+     * @param positions                 output vector of positions (resized to the number of particles)
+     *
+     * @return value returned by setParticlesOnGrid() for SimpleGrid; 0 otherwise
+     */
+     
+    int setPositions( GenerationMethod method, OpenMM_SFMT::SFMT& sfmt, std::vector<Vec3>& positions ) const;
+    
+    /** 
+     * Place particles on a grid
+     *
+     * @param origin                    origin
+     * @param boxDimensions             box dimensions
+     * @param spacing                   spacing
+     * @param sfmt                      input random number generator
+     * @param array                     output vector of grid values; its size must equal the number of particles
+     *
+     * @return 0 on success
+     *
+     * @throws OpenMMException if array has the wrong size or the grid settings are inconsistent
+     */
+     
+    int setParticlesOnGrid( const Vec3& origin, const Vec3& boxDimensions, const Vec3& spacing, 
+                            OpenMM_SFMT::SFMT& sfmt, std::vector<Vec3>& array ) const;
+    
+    /** 
+     * Set bond distance
+     *
+     * @param bondDistance bond distance
+     */
+     
+    void setBondDistance( double bondDistance );
+    
+    /** 
+     * Get bond distance
+     *
+     * @return bond distance
+     */
+     
+    double getBondDistance( void ) const;
+
+    /** 
+     * Get distance
+     *
+     * @param index1 index of first particle
+     * @param index2 index of second particle
+     * @param positions particle positions
+     *
+     * @return distance
+     */
+     
+    double getDistance( int index1, int index2, const std::vector<Vec3>& positions ) const;
+
+    /** 
+     * Get distance assuming periodic boundary conditions (cubic box of edge boxSize)
+     *
+     * @param index1 index of first particle
+     * @param index2 index of second particle
+     * @param positions particle positions
+     *
+     * @return distance
+     */
+     
+    double getPeriodicDistance( int index1, int index2, const std::vector<Vec3>& positions ) const;
+
+    /** 
+     * Get settings
+     *
+     * @return info string
+     */
+     
+    std::string getSettings( void ) const;
+
+    /** 
+     * Get enclosing box
+     *
+     * @param positions    input vector of positions
+     * @param enclosingBox output: enclosingBox[0] holds the per-axis minima, enclosingBox[1] the per-axis maxima
+     *
+     */
+     
+    void getEnclosingBox( const std::vector<Vec3>& positions, Vec3 enclosingBox[2] ) const;
+
+    /** 
+     * Get sorted distances from particular position
+     *
+     * @param periodicBoundaryConditions if set, apply PBC
+     * @param positionIndex              input position index
+     * @param positions                  input vector of positions
+     * @param sortVector                 on output sorted IntDoublePairVector
+     *
+     */
+     
+    void getSortedDistances( int periodicBoundaryConditions, int positionIndex, const std::vector<Vec3>& positions, IntDoublePairVector& sortVector ) const;
+
+private:
+
+    // system size; _numParticles is numMolecules*numParticlesPerMolecule
+    int _numMolecules;
+    int _numParticlesPerMolecule;
+    int _numParticles;
+
+    // seed used by the setPositions() overload that creates its own generator
+    int _seed;
+
+    // cubic box edge and scale of the random intramolecular offsets
+    double _boxSize;
+    double _bondDistance;
+
+    // default grid description used by SimpleGrid placement
+    Vec3 _origin;
+    Vec3 _boxDimensions;
+    Vec3 _spacing;
+
+    // optional log; not owned by this class (never closed here)
+    FILE* _log;
+};
+
+/** 
+ * Constructor
+ *
+ * Defaults: seed 0, no log, bond distance 0.1, origin (0,0,0) and a cubic box of edge boxSize.
+ * The default grid spacing is boxSize divided by the cube root of the particle count.
+ *
+ * @param numMolecules              number of molecules
+ * @param numParticlesPerMolecule   number of particles in each molecule
+ * @param boxSize                   edge length of the (cubic) box
+ */
+ 
+PositionGenerator::PositionGenerator( int numMolecules, int numParticlesPerMolecule, double boxSize ) :
+               _numMolecules(numMolecules), 
+               _numParticlesPerMolecule(numParticlesPerMolecule),
+               _numParticles(numMolecules*numParticlesPerMolecule),
+               _seed(0),
+               _boxSize(boxSize),
+               _bondDistance(0.1),
+               _origin(Vec3(0.0,0.0,0.0)),
+               _boxDimensions(Vec3(boxSize,boxSize,boxSize)),
+               _log(NULL) {
+
+    // initializers above are listed in member declaration order, which is the
+    // order in which they are actually executed
+
+    double particlesPerDimension  = pow( static_cast<double>(_numParticles), (1.0/3.0) ); 
+    double spacingPerDimension    = _boxSize/particlesPerDimension;
+
+    _spacing[0]                   = spacingPerDimension;
+    _spacing[1]                   = spacingPerDimension;
+    _spacing[2]                   = spacingPerDimension;
+
+}
+
+// Destructor: nothing to release (the log file is not closed here)
+PositionGenerator::~PositionGenerator( ){}
+
+// Store the bond distance used to scale the random intramolecular offsets
+void PositionGenerator::setBondDistance( double bondDistance ){
+    this->_bondDistance = bondDistance;
+}
+
+// Store the log file reference (NULL disables logging in setPositions)
+void PositionGenerator::setLog( FILE* log ){
+    this->_log = log;
+}
+
+// Return the currently stored bond distance
+double PositionGenerator::getBondDistance( void ) const {
+    return this->_bondDistance;
+}
+
+// Return the Euclidean distance between positions[index1] and positions[index2]
+// (no bounds checking is done on the indices)
+double PositionGenerator::getDistance( int index1, int index2, const std::vector<Vec3>& positions ) const {
+
+    const Vec3 separation = positions[index2] - positions[index1];
+    const double squaredLength = separation.dot( separation );
+    return sqrt( squaredLength );
+}
+
+// Return the distance between positions[index1] and positions[index2] after
+// wrapping each component of the separation by the box edge _boxSize;
+// when _boxSize is not positive the plain distance is returned
+double PositionGenerator::getPeriodicDistance( int index1, int index2, const std::vector<Vec3>& positions ) const {
+
+    Vec3 separation = positions[index2] - positions[index1];
+    if( _boxSize > 0.0 ){
+        // shift each component by the nearest whole number of box lengths
+        for( int axis = 0; axis < 3; axis++ ){
+            separation[axis] -= floor(separation[axis]/_boxSize+0.5f)*_boxSize;
+        }
+    }
+    return sqrt( separation.dot( separation ) );
+}
+
+/** 
+ * Set positions using a local random number generator initialized from the internal seed
+ *
+ * @param method                    method placement
+ * @param positions                 output vector of positions
+ *
+ * @return result of the setPositions() overload that takes a generator
+ */
+ 
+int PositionGenerator::setPositions( GenerationMethod method, std::vector<Vec3>& positions ) const {
+
+    // local generator, seeded from _seed on every call
+    OpenMM_SFMT::SFMT localGenerator;
+    init_gen_rand( _seed, localGenerator );
+
+    const int status = setPositions( method, localGenerator, positions );
+    return status;
+}
+
+/** 
+ * Get settings
+ *
+ * Builds a multi-line string with one "name value" line for each of: number of
+ * molecules, particles per molecule, box size, first spacing component and seed
+ *
+ * @return info string
+ */
+ 
+std::string PositionGenerator::getSettings( void ) const {
+
+    std::stringstream settings;
+    settings << "numMolecules            " << _numMolecules            << std::endl
+             << "numParticlesPerMolecule " << _numParticlesPerMolecule << std::endl
+             << "boxSize                 " << _boxSize                 << std::endl
+             << "spacing                 " << _spacing[0]              << std::endl
+             << "seed                    " << _seed                    << std::endl;
+
+    return settings.str();
+}
+
+/** 
+ * Set positions
+ *
+ * Random:     the first particle of each molecule is placed uniformly in the box; the
+ *             remaining particles of the molecule are offset from it by random amounts
+ *             in [0, bondDistance) along each axis.
+ * SimpleGrid: molecules are placed by setParticlesOnGrid(); if molecules have more than
+ *             one particle and the bond distance is positive, the grid is inset by the
+ *             bond distance on each side, otherwise the defaults from the constructor are used.
+ * Any other method value leaves the (resized) positions unset and returns 0.
+ *
+ * @param method                    method placement
+ * @param sfmt                      input random number generator
+ * @param positions                 output vector of positions (resized to the number of particles)
+ *
+ * @return value returned by setParticlesOnGrid() for SimpleGrid; 0 otherwise
+ */
+ 
+int PositionGenerator::setPositions( GenerationMethod method, OpenMM_SFMT::SFMT& sfmt, std::vector<Vec3>& positions ) const {
+
+    int errorFlag = 0;
+    positions.resize( _numParticles );
+    if( method == Random ){
+        // signed loop indices: the particle counts are stored as int
+        // NOTE(review): the order in which the three genrand_real2() calls inside each
+        // Vec3(...) are evaluated is unspecified by the language, so the x/y/z assignment
+        // of the random stream may differ between compilers
+        for( int ii = 0; ii < _numParticles; ii += _numParticlesPerMolecule ){ 
+            positions[ii]    = Vec3(_boxSize*genrand_real2(sfmt), _boxSize*genrand_real2(sfmt), _boxSize*genrand_real2(sfmt));
+            for( int jj = 1; jj < _numParticlesPerMolecule; jj++) { 
+                positions[ii+jj]  = positions[ii] + Vec3(_bondDistance*genrand_real2(sfmt), _bondDistance*genrand_real2(sfmt), _bondDistance*genrand_real2(sfmt));
+            }
+        }
+    } else if( method == SimpleGrid ){
+
+        Vec3 origin, boxDimensions, spacing;
+        std::stringstream msg;
+        if( _numParticlesPerMolecule > 1 && _bondDistance > 0.0 ){
+
+            // inset the grid by the bond distance on each side
+            origin                        = Vec3(_bondDistance, _bondDistance, _bondDistance );
+            double particlesPerDimension  = pow( static_cast<double>(_numParticles), (1.0/3.0) ); 
+            double boxSize                = (_boxSize-2.0*_bondDistance);
+            double spacingPerDimension    = boxSize/particlesPerDimension;
+            spacing                       = Vec3(spacingPerDimension, spacingPerDimension, spacingPerDimension );
+            boxDimensions                 = Vec3(boxSize, boxSize, boxSize );
+
+            msg << "Bond distance " << _bondDistance << std::endl;
+            msg << "particlesPerDimension " << particlesPerDimension << std::endl;
+            msg << "boxSize " << boxSize << std::endl;
+            msg << "spacingPerDimension " << spacingPerDimension << std::endl;
+
+        } else {
+           origin                         = _origin;
+           spacing                        = _spacing;
+           boxDimensions                  = _boxDimensions;
+
+        }
+        msg << getSettings() << std::endl;
+
+        if( _log ){
+            (void) fprintf( _log, "SimpleGrid %s\n", msg.str().c_str() );
+        }
+
+        errorFlag = setParticlesOnGrid( origin, boxDimensions, spacing, sfmt, positions );
+    }
+
+    return errorFlag;
+}
+
+/** 
+ * Place particles on a grid
+ *
+ * The first particle of each molecule is placed on successive grid points, advancing
+ * along x first, then y, then z. The remaining particles of each molecule are placed at
+ * the first particle plus a fixed offset of 0.05 per axis plus a random amount in
+ * [0, bondDistance) per axis.
+ *
+ * @param origin                    origin
+ * @param boxDimensions             box dimensions
+ * @param spacing                   spacing
+ * @param sfmt                      input random number generator
+ * @param array                     output vector of grid values; its size must equal the number of particles
+ *
+ * @return 0 on success
+ *
+ * @throws OpenMMException if array has the wrong size or if the grid is exhausted
+ *                         while molecules remain to be placed
+ */
+ 
+int PositionGenerator::setParticlesOnGrid( const Vec3& origin, const Vec3& boxDimensions, const Vec3& spacing, OpenMM_SFMT::SFMT& sfmt,
+                                           std::vector<Vec3>& array ) const {
+
+    Vec3 start(origin);
+
+    if( array.size() != static_cast<std::vector<Vec3>::size_type>(_numParticles) ){
+        std::stringstream msg;
+        msg << "PositionGenerator::setParticlesOnGrid position vector size=" << array.size() << " != numParticles=" << _numParticles;
+        msg << getSettings();
+        throw OpenMMException( msg.str() );
+    }
+
+    // place molecule centers on grid
+
+    for( int ii = 0; ii < _numParticles; ii += _numParticlesPerMolecule ){
+        array[ii]  = Vec3(start);
+
+        // advance the grid cursor: step along x; on overflow reset x and carry into y, then z
+        bool done  = false;
+        for( int jj = 0; jj < 3 && !done; jj++ ){
+            start[jj]  += spacing[jj];
+            if( start[jj] > boxDimensions[jj] ){
+                start[jj] = origin[jj];
+            } else {
+                done = true;
+            }
+        }
+
+        // running off the grid is only an error if another molecule still needs a grid
+        // point; previously an exactly-full grid threw after the last molecule was placed
+        const bool moleculesRemaining = (ii + _numParticlesPerMolecule) < _numParticles;
+        if( !done && moleculesRemaining ){
+            std::stringstream msg;
+            msg << "PositionGenerator::setParticlesOnGrid error in grid settings";
+            throw OpenMMException( msg.str() );
+        }
+    }
+
+    // add molecule atoms
+
+    Vec3 bondOffset( 0.05, 0.05, 0.05 );
+    for( int ii = 0; ii < _numMolecules; ii++ ){
+        int molecularIndex = ii*_numParticlesPerMolecule;
+        for( int jj = 1; jj < _numParticlesPerMolecule; jj++ ){
+            array[molecularIndex+jj] = array[molecularIndex] + bondOffset + Vec3(_bondDistance*genrand_real2(sfmt), _bondDistance*genrand_real2(sfmt), _bondDistance*genrand_real2(sfmt));
+        }
+    }
+
+    return 0;
+}
+
+/** 
+ * Get enclosing box
+ *
+ * @param positions    input vector of positions
+ * @param enclosingBox output Vec3[2] of minimum enclosing box ranges:
+ *                     enclosingBox[0] holds the per-axis minima and
+ *                     enclosingBox[1] the per-axis maxima; both are set to
+ *                     (0,0,0) if positions is empty
+ *
+ */
+ 
+void PositionGenerator::getEnclosingBox( const std::vector<Vec3>& positions, Vec3 enclosingBox[2] ) const {
+
+    // an empty input has no first element to seed the ranges with
+    if( positions.empty() ){
+        enclosingBox[0] = Vec3( 0.0, 0.0, 0.0 );
+        enclosingBox[1] = Vec3( 0.0, 0.0, 0.0 );
+        return;
+    }
+
+    // seed both corners with the first position, then widen per axis
+    enclosingBox[0] = positions[0];
+    enclosingBox[1] = positions[0];
+ 
+    for( unsigned int ii = 1; ii < positions.size(); ii++ ){
+        for( int axis = 0; axis < 3; axis++ ){
+            if( enclosingBox[0][axis] > positions[ii][axis] ){
+                enclosingBox[0][axis] = positions[ii][axis];
+            }    
+            if( enclosingBox[1][axis] < positions[ii][axis] ){
+                enclosingBox[1][axis] = positions[ii][axis];
+            }    
+        }    
+    }    
+ 
+    return;
+}
+
+
+/** 
+ * Ordering predicate for <int,double> pairs: compares the double (second) members only
+ *
+ * @param d1 first  IntDoublePair to compare
+ * @param d2 second IntDoublePair to compare
+ *
+ * @return true if d1.second is strictly less than d2.second
+ *
+ */
+ 
+bool TestIntDoublePair( const IntDoublePair& d1, const IntDoublePair& d2 ){
+   const double lhsValue = d1.second;
+   const double rhsValue = d2.second;
+   return lhsValue < rhsValue;
+}
+
+/** 
+ * Get sorted distances from particular position
+ *
+ * @param periodicBoundaryConditions if set, apply PBC
+ * @param positionIndex              input position index
+ * @param positions                  input vector of positions
+ * @param sortVector                 on output, one (index, distance) pair for every position
+ *                                   other than positionIndex, sorted by increasing distance
+ *
+ */
+ 
+void PositionGenerator::getSortedDistances( int periodicBoundaryConditions, int positionIndex, const std::vector<Vec3>& positions,
+                                            IntDoublePairVector& sortVector ) const {
+
+    sortVector.resize( 0 );
+    for( unsigned int ii = 0; ii < positions.size(); ii++ ){
+        if( static_cast<int>(ii) == positionIndex )continue;
+
+        // getPeriodicDistance()/getDistance() already return a distance (not a squared
+        // distance), so the value is stored as is; taking another square root here
+        // would report the square root of the distance
+        double distance = periodicBoundaryConditions ? getPeriodicDistance( positionIndex, ii, positions) :  getDistance( positionIndex, ii, positions);
+        sortVector.push_back( IntDoublePair( static_cast<int>(ii), distance ) );
+    }    
+
+    std::sort( sortVector.begin(), sortVector.end(), TestIntDoublePair );
+
+    return;
+}
+
+/**---------------------------------------------------------------------------------------
+ *
+ * Copy the string stored under a key into the output field, if the key is present
+ * 
+ * @param  argumentMap            map to check
+ * @param  fieldToCheck           key
+ * @param  fieldToSet             field to set (left untouched if the key is absent)
+ *
+ * @return 1 if argument set, else 0
+ *
+   --------------------------------------------------------------------------------------- */
+
+static int setStringFromMap( MapStringString& argumentMap, std::string fieldToCheck, std::string& fieldToSet ){
+
+   MapStringStringCI entry = argumentMap.find( fieldToCheck );
+   if( entry == argumentMap.end() ){
+      return 0;
+   }
+
+   fieldToSet = entry->second;
+   return 1;
+}
+
+/**---------------------------------------------------------------------------------------
+ *
+ * Convert the string stored under a key with atoi() and store it in the output field,
+ * if the key is present
+ * 
+ * @param  argumentMap            map to check
+ * @param  fieldToCheck           key
+ * @param  fieldToSet             field to set (left untouched if the key is absent)
+ *
+ * @return 1 if argument set, else 0
+ *
+   --------------------------------------------------------------------------------------- */
+
+static int setIntFromMap( MapStringString& argumentMap, std::string fieldToCheck, int& fieldToSet ){
+
+   MapStringStringCI entry = argumentMap.find( fieldToCheck );
+   if( entry == argumentMap.end() ){
+      return 0;
+   }
+
+   fieldToSet = atoi( entry->second.c_str() );
+   return 1;
+}
+
+/**---------------------------------------------------------------------------------------
+ *
+ * Set int field from a string->double map, if the key is present; the stored double
+ * is nudged up by 1e-7 before being truncated to int
+ * 
+ * @param  argumentMap            map to check
+ * @param  fieldToCheck           key
+ * @param  fieldToSet             field to set (left untouched if the key is absent)
+ *
+ * @return 1 if argument set, else 0
+ *
+   --------------------------------------------------------------------------------------- */
+
+static int setIntFromMapStringToDouble( MapStringToDouble& argumentMap, std::string fieldToCheck, int& fieldToSet ){
+
+   MapStringToDoubleCI entry = argumentMap.find( fieldToCheck );
+   if( entry == argumentMap.end() ){
+      return 0;
+   }
+
+   const double storedValue = entry->second;
+   fieldToSet               = static_cast<int>(storedValue+0.0000001);
+   return 1;
+}
+
+/**---------------------------------------------------------------------------------------
+ *
+ * Set float field if in map; the stored string is converted with atof()
+ * 
+ * @param  argumentMap            map to check
+ * @param  fieldToCheck           key
+ * @param  fieldToSet             field to set (left untouched if the key is absent)
+ *
+ * @return 1 if argument set, else 0
+ *
+   --------------------------------------------------------------------------------------- */
+
+static int setFloatFromMap( MapStringString& argumentMap, std::string fieldToCheck, float& fieldToSet ){
+
+// ---------------------------------------------------------------------------------------
+
+   MapStringStringCI check = argumentMap.find( fieldToCheck );
+   if( check != argumentMap.end() ){
+      fieldToSet = static_cast<float>(atof( (*check).second.c_str() )); 
+      return 1;
+   }
+   return 0;
+}
+
+/**---------------------------------------------------------------------------------------
+ *
+ * Convert the string stored under a key with atof() and store it in the output field,
+ * if the key is present
+ * 
+ * @param  argumentMap            map to check
+ * @param  fieldToCheck           key
+ * @param  fieldToSet             field to set (left untouched if the key is absent)
+ *
+ * @return 1 if argument set, else 0
+ *
+   --------------------------------------------------------------------------------------- */
+
+static int setDoubleFromMap( MapStringString& argumentMap, std::string fieldToCheck, double& fieldToSet ){
+
+   MapStringStringCI entry = argumentMap.find( fieldToCheck );
+   if( entry == argumentMap.end() ){
+      return 0;
+   }
+
+   fieldToSet = atof( entry->second.c_str() );
+   return 1;
+}
+
+/**---------------------------------------------------------------------------------------
+ *
+ * Copy the double stored under a key of a string->double map into the output field,
+ * if the key is present
+ * 
+ * @param  argumentMap            map to check
+ * @param  fieldToCheck           key
+ * @param  fieldToSet             field to set (left untouched if the key is absent)
+ *
+ * @return 1 if argument set, else 0
+ *
+   --------------------------------------------------------------------------------------- */
+
+static int setDoubleFromMapStringToDouble( MapStringToDouble& argumentMap, std::string fieldToCheck, double& fieldToSet ){
+
+   MapStringToDoubleCI entry = argumentMap.find( fieldToCheck );
+   if( entry == argumentMap.end() ){
+      return 0;
+   }
+
+   fieldToSet = entry->second;
+   return 1;
+}
+
+/**---------------------------------------------------------------------------------------
+ *
+ * Compare forces from two states
+ *
+ * For each particle the relative difference is 2*|f1 - f2|/(|f1| + |f2|), taken as 0 when
+ * both forces have zero norm.
+ * 
+ * @param  state1               state1
+ * @param  state2               state2; must hold at least as many forces as state1
+ * @param  relativeTolerance    relative tolerance
+ * @param  stats                output, resized to 4 entries:
+ *                                  stats[0] average relative difference
+ *                                  stats[1] maximum relative difference (-1.0e+30 if there are no forces)
+ *                                  stats[2] index of the maximum relative difference (-1 if there are no forces)
+ *                                  stats[3] smaller of the two median force norms (0 if there are no forces)
+ * @param  log                  if set, output forces
+ *
+ * @return number of entries whose relative difference is not within the tolerance
+ *         (entries with a NaN relative difference are counted)
+ *
+ * @throws OpenMMException if state2 holds fewer forces than state1
+ *
+   --------------------------------------------------------------------------------------- */
+
+int compareForcesOfTwoStates( State& state1, State& state2, double relativeTolerance,
+                              DoubleVector& stats, FILE* log ) {
+
+    int error                             = 0;
+    const std::vector<Vec3>& force1       = state1.getForces();
+    const std::vector<Vec3>& force2       = state2.getForces();
+
+    // force2 is indexed over the range of force1 below
+    if( force2.size() < force1.size() ){
+        std::stringstream msg;
+        msg << "compareForcesOfTwoStates: state2 has " << force2.size() << " forces but state1 has " << force1.size();
+        throw OpenMMException( msg.str() );
+    }
+
+    double maxRelativeDifference          = -1.0e+30;
+    double maxRelativeDifferenceIndex     = -1.0;
+    double averageRelativeDifference      = 0.0;
+    double count                          = 0.0;
+    DoubleVector medians1( force1.size() );
+    DoubleVector medians2( force1.size() );
+    for( unsigned int ii = 0; ii < force1.size(); ii++ ){
+
+        const Vec3& f1         = force1[ii];
+        const Vec3& f2         = force2[ii];
+
+        double diff            = (f1[0] - f2[0])*(f1[0] - f2[0]) +
+                                 (f1[1] - f2[1])*(f1[1] - f2[1]) +
+                                 (f1[2] - f2[2])*(f1[2] - f2[2]); 
+
+        double denom1          = sqrt( f1[0]*f1[0] + f1[1]*f1[1] + f1[2]*f1[2] );
+        double denom2          = sqrt( f2[0]*f2[0] + f2[1]*f2[1] + f2[2]*f2[2] );
+        medians1[ii]            = denom1;
+        medians2[ii]            = denom2;
+        double relativeDiff;
+        if( denom1 > 0.0 || denom2 > 0.0 ){
+            relativeDiff = 2.0*sqrt( diff )/(denom1+denom2);
+        } else {
+            relativeDiff = 0.0;
+        }
+
+        if( relativeDiff > maxRelativeDifference ){
+            maxRelativeDifference      = relativeDiff;
+            maxRelativeDifferenceIndex = static_cast<double>(ii);
+        }
+        averageRelativeDifference += relativeDiff;
+        count                     += 1.0;
+
+        // a single test drives both the error count and the log marker; written as a
+        // negated <= so that a NaN relative difference is reported as a failure
+        const bool outOfTolerance = !(relativeDiff <= relativeTolerance);
+        if( outOfTolerance ){
+           error++;
+        }
+        if( log ){
+            (void) fprintf( log, "F %6u %15.7e [%15.7e %15.7e %15.7e] [%15.7e %15.7e %15.7e] %15.7e %15.7e %s\n", static_cast<unsigned int>(ii), 
+                            relativeDiff, f1[0], f1[1], f1[2], f2[0], f2[1], f2[2], denom1, denom2, (outOfTolerance ? "XXXXXX":"") );
+        }
+    }
+
+    if( count > 0.0 ){
+        averageRelativeDifference /= count;
+    }
+
+    // median force norms; with no forces there is no element to pick, so report 0
+    double median1 = 0.0;
+    double median2 = 0.0;
+    if( !medians1.empty() ){
+        std::sort( medians1.begin(), medians1.end() );
+        std::sort( medians2.begin(), medians2.end() );
+        median1 = medians1[medians1.size()/2];
+        median2 = medians2[medians2.size()/2];
+    }
+
+    stats.resize( 4 );
+    stats[0] = averageRelativeDifference;
+    stats[1] = maxRelativeDifference;
+    stats[2] = maxRelativeDifferenceIndex;
+    stats[3] = median1 < median2 ? median1 : median2;
+    
+    return error;
+}
+
+
+/** 
+ * Get forces in system
+ *
+ * @param system                   system to serialize
+ * @param stringForceVector        output stringForceVector[forceName] = force index
+ * @param log                      logging file (optional -- may be NULL)
+ *
+ */
+ 
+static void getStringForceMap( System& system, MapStringInt& stringForceVector, FILE* log ){
+
+    // print active forces and relevant parameters
+
+    for( int ii = 0; ii < system.getNumForces(); ii++ ) {
+
+        int hit                 = 0;
+        Force& force            = system.getForce(ii);
+        if( !hit ){
+
+            try {
+               CMAPTorsionForce& castForce = dynamic_cast<CMAPTorsionForce&>(force);
+               stringForceVector["CMAPTorsion"] = ii;
+               hit++;
+            } catch( std::bad_cast ){
+            }    
+        }
+
+        if( !hit ){
+
+            try {
+               CustomAngleForce& castForce = dynamic_cast<CustomAngleForce&>(force);
+               stringForceVector["CustomAngle"] = ii;
+               hit++;
+            } catch( std::bad_cast ){
+            }    
+        }
+
+
+        if( !hit ){
+
+            try {
+               CustomBondForce& castForce = dynamic_cast<CustomBondForce&>(force);
+               stringForceVector["CustomBond"] = ii;
+               hit++;
+            } catch( std::bad_cast ){
+            }    
+        }
+
+        if( !hit ){
+
+            try {
+               CustomExternalForce& castForce = dynamic_cast<CustomExternalForce&>(force);
+               stringForceVector["CustomExternal"] = ii;
+               hit++;
+            } catch( std::bad_cast ){
+            }    
+        }
+
+        if( !hit ){
+
+            try {
+               CustomGBForce& castForce = dynamic_cast<CustomGBForce&>(force);
+               stringForceVector["CustomGB"] = ii;
+               hit++;
+            } catch( std::bad_cast ){
+            }    
+        }
+
+        if( !hit ){
+
+            try {
+               CustomHbondForce& castForce = dynamic_cast<CustomHbondForce&>(force);
+               stringForceVector["CustomHbond"] = ii;
+               hit++;
+            } catch( std::bad_cast ){
+            }    
+        }
+
+        if( !hit ){
+
+            try {
+               CustomNonbondedForce& castForce = dynamic_cast<CustomNonbondedForce&>(force);
+               stringForceVector["CustomNonbonded"] = ii;
+               hit++;
+            } catch( std::bad_cast ){
+            }    
+        }
+
+
+        if( !hit ){
+
+            try {
+               CustomTorsionForce& castForce = dynamic_cast<CustomTorsionForce&>(force);
+               stringForceVector["CustomTorsion"] = ii;
+               hit++;
+            } catch( std::bad_cast ){
+            }    
+        }
+
+
+        if( !hit ){
+
+            try {
+               GBSAOBCForce& castForce = dynamic_cast<GBSAOBCForce&>(force);
+               stringForceVector["GBSAOBC"] = ii;
+               hit++;
+            } catch( std::bad_cast ){
+            }    
+        }
+
+        if( !hit ){
+
+            try {
+               GBVIForce& castForce = dynamic_cast<GBVIForce&>(force);
+               stringForceVector["GBVI"] = ii;
+               hit++;
+            } catch( std::bad_cast ){
+            }    
+        }
+
+        if( !hit ){
+
+            try {
+               HarmonicAngleForce& castForce = dynamic_cast<HarmonicAngleForce&>(force);
+               stringForceVector["HarmonicAngle"] = ii;
+               hit++;
+            } catch( std::bad_cast ){
+            }    
+        }
+
+
+        if( !hit ){
+
+            try {
+               HarmonicBondForce& castForce = dynamic_cast<HarmonicBondForce&>(force);
+               stringForceVector["HarmonicBond"] = ii;
+               hit++;
+            } catch( std::bad_cast ){
+            }    
+        }
+
+        if( !hit ){
+
+            try {
+               NonbondedForce& castForce = dynamic_cast<NonbondedForce&>(force);
+               stringForceVector["Nonbonded"] = ii;
+               hit++;
+            } catch( std::bad_cast ){
+            }    
+        }
+
+        if( !hit ){
+
+            try {
+               PeriodicTorsionForce& castForce = dynamic_cast<PeriodicTorsionForce&>(force);
+               stringForceVector["PeriodicTorsion"] = ii;
+               hit++;
+            } catch( std::bad_cast ){
+            }    
+        }
+
+        if( !hit ){
+
+            try {
+               RBTorsionForce& castForce = dynamic_cast<RBTorsionForce&>(force);
+               stringForceVector["RBTorsion"] = ii;
+               hit++;
+            } catch( std::bad_cast ){
+            }    
+        }
+
+        if( !hit ){
+
+            try {
+               MonteCarloBarostat& castForce = dynamic_cast<MonteCarloBarostat&>(force);
+               stringForceVector["MonteCarloBarostat"] = ii;
+               hit++;
+            } catch( std::bad_cast ){
+            }    
+        }
+
+        if( !hit ){
+
+            try {
+               AndersenThermostat& castForce = dynamic_cast<AndersenThermostat&>(force);
+               stringForceVector["AndersenThermostat"] = ii;
+               hit++;
+            } catch( std::bad_cast ){
+            }    
+        }
+
+#ifdef USE_SOFTCORE
+        if( !hit ){
+
+            try {
+               GBSAOBCSoftcoreForce& castForce = dynamic_cast<GBSAOBCSoftcoreForce&>(force);
+               stringForceVector["GBSAOBCSoftcore"] = ii;
+               hit++;
+            } catch( std::bad_cast ){
+            }    
+        }
+
+        if( !hit ){
+
+            try {
+               GBVISoftcoreForce& castForce = dynamic_cast<GBVISoftcoreForce&>(force);
+               stringForceVector["GBVISoftcore"] = ii;
+               hit++;
+            } catch( std::bad_cast ){
+            }    
+        }
+
+        if( !hit ){
+
+            try {
+               NonbondedSoftcoreForce& castForce = dynamic_cast<NonbondedSoftcoreForce&>(force);
+               stringForceVector["NonbondedSoftcore"] = ii;
+               hit++;
+            } catch( std::bad_cast ){
+            }    
+        }
+#endif
+
+#ifdef INCLUDE_AMOEBA_FORCES
+
+        if( !hit ){
+
+            try {
+               AmoebaHarmonicBondForce& castForce = dynamic_cast<AmoebaHarmonicBondForce&>(force);
+               stringForceVector["AmoebaHarmonicBond"] = ii;
+               hit++;
+            } catch( std::bad_cast ){
+            }    
+        }
+
+        if( !hit ){
+
+            try {
+               AmoebaHarmonicAngleForce& castForce = dynamic_cast<AmoebaHarmonicAngleForce&>(force);
+               stringForceVector["AmoebaHarmonicAngle"] = ii;
+               hit++;
+            } catch( std::bad_cast ){
+            }    
+        }
+
+        if( !hit ){
+
+            try {
+               AmoebaHarmonicInPlaneAngleForce& castForce = dynamic_cast<AmoebaHarmonicInPlaneAngleForce&>(force);
+               stringForceVector["AmoebaHarmonicInPlaneAngle"] = ii;
+               hit++;
+            } catch( std::bad_cast ){
+            }    
+        }
+
+        if( !hit ){
+
+            try {
+               AmoebaMultipoleForce& castForce = dynamic_cast<AmoebaMultipoleForce&>(force);
+               stringForceVector["AmoebaMultipole"] = ii;
+               hit++;
+            } catch( std::bad_cast ){
+            }    
+        }
+
+        if( !hit ){
+
+            try {
+               AmoebaOutOfPlaneBendForce& castForce = dynamic_cast<AmoebaOutOfPlaneBendForce&>(force);
+               stringForceVector["AmoebaOutOfPlaneBend"] = ii;
+               hit++;
+            } catch( std::bad_cast ){
+            }    
+        }
+
+        if( !hit ){
+
+            try {
+               AmoebaPiTorsionForce& castForce = dynamic_cast<AmoebaPiTorsionForce&>(force);
+               stringForceVector["AmoebaPiTorsion"] = ii;
+               hit++;
+            } catch( std::bad_cast ){
+            }    
+        }
+
+        if( !hit ){
+
+            try {
+               AmoebaStretchBendForce& castForce = dynamic_cast<AmoebaStretchBendForce&>(force);
+               stringForceVector["AmoebaStretchBend"] = ii;
+               hit++;
+            } catch( std::bad_cast ){
+            }    
+        }
+
+        if( !hit ){
+
+            try {
+               AmoebaTorsionForce& castForce = dynamic_cast<AmoebaTorsionForce&>(force);
+               stringForceVector["AmoebaTorsion"] = ii;
+               hit++;
+            } catch( std::bad_cast ){
+            }    
+        }
+
+        if( !hit ){
+
+            try {
+               AmoebaTorsionTorsionForce& castForce = dynamic_cast<AmoebaTorsionTorsionForce&>(force);
+               stringForceVector["AmoebaTorsionTorsion"] = ii;
+               hit++;
+            } catch( std::bad_cast ){
+            }    
+        }
+
+        if( !hit ){
+
+            try {
+               AmoebaUreyBradleyForce& castForce = dynamic_cast<AmoebaUreyBradleyForce&>(force);
+               stringForceVector["AmoebaUreyBradley"] = ii;
+               hit++;
+            } catch( std::bad_cast ){
+            }    
+        }
+
+        if( !hit ){
+
+            try {
+               AmoebaVdwForce& castForce = dynamic_cast<AmoebaVdwForce&>(force);
+               stringForceVector["AmoebaVdw"] = ii;
+               hit++;
+            } catch( std::bad_cast ){
+            }    
+        }
+
+        if( !hit ){
+
+            try {
+               AmoebaWcaDispersionForce& castForce = dynamic_cast<AmoebaWcaDispersionForce&>(force);
+               stringForceVector["AmoebaWcaDispersion"] = ii;
+               hit++;
+            } catch( std::bad_cast ){
+            }    
+        }
+
+        if( !hit ){
+
+            try {
+               AmoebaGeneralizedKirkwoodForce& castForce = dynamic_cast<AmoebaGeneralizedKirkwoodForce&>(force);
+               stringForceVector["AmoebaGeneralizedKirkwood"] = ii;
+               hit++;
+            } catch( std::bad_cast ){
+            }    
+        }
+
+        if( !hit ){
+
+            try {
+               AmoebaTorsionTorsionForce& castForce = dynamic_cast<AmoebaTorsionTorsionForce&>(force);
+               stringForceVector["AmoebaTorsionTorsionForce"] = ii;
+               hit++;
+            } catch( std::bad_cast ){
+            }    
+        }
+
+#endif
+
+        // COM
+
+        if( !hit ){
+    
+            try {
+               CMMotionRemover& cMMotionRemover = dynamic_cast<CMMotionRemover&>(force);
+               hit++;
+            } catch( std::bad_cast ){
+            }
+        }
+
+        if( !hit && log ){
+           (void) fprintf( log, "   entry=%2d force not recognized.\n", ii );
+        }
+
+    }
+}
+
+/**---------------------------------------------------------------------------------------
+ *
+ * Create a duplicate of a NonbondedSoftcoreForce
+ * 
+ * @param  nonbondedSoftcoreForce  NonbondedSoftcoreForce to duplicate
+ *
+ * @return pointer to a new NonbondedSoftcoreForce allocated w/ 'new' from the
+ *         copy constructor of the input force
+ *
+   --------------------------------------------------------------------------------------- */
+
+static NonbondedSoftcoreForce* copyNonbondedSoftcoreForce( const NonbondedSoftcoreForce& nonbondedSoftcoreForce ){
+
+    // the whole object is duplicated through the copy constructor; a member-by-member
+    // copy (nonbonded method, cutoff, reaction field dielectric, particle parameters,
+    // exceptions) that used to be sketched here was disabled and is not needed
+
+    return new NonbondedSoftcoreForce( nonbondedSoftcoreForce );
+}
+
+/**---------------------------------------------------------------------------------------
+ *
+ * Create a duplicate of a NonbondedForce
+ * 
+ * @param  nonbondedForce  NonbondedForce to duplicate
+ *
+ * @return pointer to a new NonbondedForce allocated w/ 'new' from the
+ *         copy constructor of the input force
+ *
+   --------------------------------------------------------------------------------------- */
+
+static NonbondedForce* copyNonbondedForce( const NonbondedForce& nonbondedForce ){
+
+    // the whole object is duplicated through the copy constructor; a member-by-member
+    // copy that used to be sketched here was disabled and is not needed
+
+    return new NonbondedForce( nonbondedForce );
+}
+
+/** 
+ * Return copy of system (but not forces)
+ *
+ * The particle masses, the default periodic box vectors and the constraints
+ * of the input system are transferred to a newly allocated System; no forces
+ * are added to the copy.
+ *
+ * @param inputSystem               system to copy
+ *
+ * @return pointer to the system copy (allocated w/ 'new')
+ *
+ */
+ 
+static System* copySystem( const System& inputSystem ){
+
+    System* systemCopy = new System();
+
+    // particle masses; a signed index is used since the System particle index is an int
+
+    const int numberOfParticles = inputSystem.getNumParticles();
+    for( int ii = 0; ii < numberOfParticles; ii++ ){
+        systemCopy->addParticle( inputSystem.getParticleMass( ii ) );
+    }
+
+    // default periodic box
+
+    Vec3 a;
+    Vec3 b;
+    Vec3 c;
+    inputSystem.getDefaultPeriodicBoxVectors( a, b, c );
+    systemCopy->setDefaultPeriodicBoxVectors( a, b, c );
+
+    // constraints
+
+    const int numberOfConstraints = inputSystem.getNumConstraints();
+    for( int ii = 0; ii < numberOfConstraints; ii++ ){
+        int particle1, particle2;
+        double distance;
+        inputSystem.getConstraintParameters( ii, particle1, particle2, distance);
+        systemCopy->addConstraint( particle1, particle2, distance);
+    }
+
+    return systemCopy;
+}
+
+/** 
+ * Fill a parameter vector w/ random values lying between per-entry bounds
+ *
+ * Entry i is set to lower[i] + (upper[i] - lower[i])*u, where u is the value
+ * returned by genrand_real2(sfmt) for that entry.
+ *
+ * @param parametersLowerBound      vector of parameter lower bounds
+ * @param parametersUpperBound      vector of parameter upper bounds
+ * @param sfmt                      SFMT random number generator
+ * @param parameters                output vector of randomized parameter values;
+ *                                  must already have the same size as the bound vectors
+ *
+ * @throws OpenMMException if the three vectors do not all have the same size
+ *
+ */
+ 
+static void randomizeParameters( const std::vector<double>& parametersLowerBound, 
+                                 const std::vector<double>& parametersUpperBound,
+                                 OpenMM_SFMT::SFMT& sfmt, std::vector<double>& parameters ){
+
+    const std::vector<double>::size_type numberOfBounds = parametersLowerBound.size();
+
+    // the two bound vectors must pair up entry by entry
+
+    if( numberOfBounds != parametersUpperBound.size() ){
+        std::stringstream msg;
+        msg << " randomizeParameters parametersLowerBound size=" << numberOfBounds << " != parametersUpperBound size=" << parametersUpperBound.size();
+        throw OpenMMException( msg.str() );
+    }
+
+    // the output vector is not resized here; it has to be presized by the caller
+
+    if( numberOfBounds != parameters.size() ){
+        std::stringstream msg;
+        msg << " randomizeParameters parametersLowerBound size=" << numberOfBounds << " != parameter size=" << parameters.size();
+        throw OpenMMException( msg.str() );
+    }
+
+    for( std::vector<double>::size_type kk = 0; kk < numberOfBounds; kk++ ){
+        const double lower  = parametersLowerBound[kk];
+        const double upper  = parametersUpperBound[kk];
+        parameters[kk]      = lower + (upper - lower)*(genrand_real2(sfmt));
+    }
+}
+
+/** 
+ * Randomize Vec3 vector
+ *
+ * Every component of every entry is set to average + 2*range*(u - 0.5),
+ * where u is a value returned by genrand_real2(sfmt); three values are
+ * drawn per entry, in the order x, y, z.
+ *
+ * @param average                   mean value
+ * @param range                     +/- range
+ * @param sfmt                      SFMT random number generator
+ * @param array                     output vector of randomized values; its current
+ *                                  size determines how many entries are generated
+ *
+ */
+ 
+static void randomizeVec3( double average, double range, 
+                           OpenMM_SFMT::SFMT& sfmt, std::vector<Vec3>& array ){
+
+    range *= 2.0;
+    for( unsigned int ii = 0; ii < array.size(); ii++ ){
+
+        // draw the components in separate statements: the order in which function
+        // arguments are evaluated is unspecified, so drawing inside the Vec3(...)
+        // argument list could hand the random stream to the components in a
+        // different order from one compiler to the next
+
+        const double x = average + range*(genrand_real2(sfmt) - 0.5);
+        const double y = average + range*(genrand_real2(sfmt) - 0.5);
+        const double z = average + range*(genrand_real2(sfmt) - 0.5);
+        array[ii]      = Vec3( x, y, z );
+    }
+    return;
+}
+
+/** 
+ * Output contents of MapStringString 
+ *
+ * One line is written per entry: six blanks, the key right-justified in a
+ * field of at least 30 characters, a blank, the value right-justified in a
+ * field of at least 40 characters, and a newline. Keys/values longer than
+ * their field are written in full.
+ *
+ * @param inputArgumentMap          map to output
+ * @param outputStream              output stream
+ *
+ */
+ 
+static void streamArgumentMap( const MapStringString& inputArgumentMap, std::stringstream& outputStream ){ 
+
+    // minimum field widths (same layout as the format "      %30s %40s\n")
+
+    const std::string::size_type keyWidth   = 30;
+    const std::string::size_type valueWidth = 40;
+
+    for( MapStringStringCI ii = inputArgumentMap.begin(); ii != inputArgumentMap.end(); ii++ ){
+
+        const std::string& key   = ii->first;
+        const std::string& value = ii->second;
+
+        // the line is assembled in a std::string rather than w/ sprintf() into a
+        // fixed-size buffer, so arbitrarily long keys/values cannot overrun it
+
+        std::string line( "      " );
+        if( key.size() < keyWidth ){
+            line.append( keyWidth - key.size(), ' ' );
+        }
+        line += key;
+        line += ' ';
+        if( value.size() < valueWidth ){
+            line.append( valueWidth - value.size(), ' ' );
+        }
+        line += value;
+        line += '\n';
+
+        outputStream << line;
+    }    
+
+    return;
+}
+
+/** 
+ * Format argument/value
+ *
+ * The output consists of the key (or, if call is non-zero, blanks of the same
+ * width as the key), one blank, and the value printed w/ the supplied format.
+ *
+ * @param buffer                    formatted output (null-terminated); no bounds checking is
+ *                                  performed, so the caller must supply a buffer large enough
+ *                                  for the key, the blank and the formatted value
+ * @param key                       argument name
+ * @param value                     argument value
+ * @param format                    printf-style format string for the value; must expect an
+ *                                  int if type == 0 and a double otherwise
+ * @param call                      if call is non-zero, the key name is replaced by blanks
+ * @param type                      type == 0, then use int value; else double
+ *
+ */
+ 
+static void formatArgument( char* buffer, const std::string& key, double value, const char* format, int call, int type ){
+
+    // if call is non-zero, skip key name (blanks keep the columns aligned)
+
+    std::string::size_type index = 0;
+    while( index < key.size() ){
+        buffer[index] = call ? ' ' : key[index];
+        index++;
+    }
+
+    // add blank and terminate the string; a character literal is used since
+    // NULL is a pointer constant and not a portable spelling of the terminator
+
+    buffer[index++]        = ' ';
+    buffer[index]          = '\0';
+
+    if( type == 0 ){
+
+        // small offset guards against values such as 2.9999999 truncating to 2
+
+        int valueInt       = static_cast<int>(value+0.00001);
+        (void) sprintf( buffer + index, format, valueInt );
+    } else {
+        (void) sprintf( buffer + index, format, value );
+    }
+    return;
+}
+
+/** 
+ * Output contents of a MapStringToDouble w/ all arguments on one line 
+ *
+ * Entries whose keys are listed in printFirst are written first (through
+ * formatArgument()); the remaining entries follow as key=value pairs.
+ * The line is terminated w/ std::endl.
+ *
+ * @param inputArgumentMap          map to output
+ * @param exclude                   map of keys to exclude from output
+ * @param printFirst                keys to output first, in this order, if present in the map
+ * @param callId                    forwarded to formatArgument() as its 'call' argument
+ * @param outputStream              output stream
+ *
+ */
+ 
+static void streamArgumentMapOneLine( const MapStringToDouble& inputArgumentMap, const MapStringToInt& exclude,
+                                      const StringVector& printFirst, int callId, std::stringstream& outputStream ){ 
+
+    char buffer[2048];
+
+    // keys written in the first pass are added to this copy of the exclusion
+    // map so that the second pass does not output them again
+
+    MapStringToInt skipKeys(exclude);
+
+    // first pass: leading keys
+
+    for( unsigned int kk = 0; kk < printFirst.size(); kk++ ){
+
+        MapStringToDoubleCI entry = inputArgumentMap.find( printFirst[kk] );
+        if( entry == inputArgumentMap.end() ){
+            continue;
+        }
+
+        const std::string name = entry->first;
+        if( exclude.find( name ) != exclude.end() ){
+            continue;
+        }
+
+        const double number = entry->second;
+        if( name == "numMolecules" ){
+            formatArgument( buffer, name, number, "%6d ", callId, 0 );
+        } else if( name == "nonbondedMethod" ){
+            formatArgument( buffer, name, number, "%1d ", callId, 0 );
+        } else if( name == "lambda1" || name == "lambda2" ){
+            formatArgument( buffer, name, number, "%4.2f ", callId, 1 );
+        } else if( name == "boxSize" ){
+            formatArgument( buffer, name, number, "%6.2f ", callId, 1 );
+        } else {
+            formatArgument( buffer, name, number, "%15.7e ", callId, 1 );
+        }
+        outputStream << buffer;
+        skipKeys[name] = 1;
+    }    
+
+    // second pass: everything not excluded and not yet written, as key=value
+
+    for( MapStringToDoubleCI entry = inputArgumentMap.begin(); entry != inputArgumentMap.end(); ++entry ){
+
+        const std::string name = entry->first;
+        if( skipKeys.find( name ) != skipKeys.end() ){
+            continue;
+        }
+
+        const double number   = entry->second;
+        const int    whole    = static_cast<int>(number+0.00001);
+        const bool   isWhole  = (static_cast<double>(whole) == number);
+
+        if( name == "numMolecules" ){
+            (void) sprintf( buffer, "%s=%6d ", name.c_str(), whole );
+        } else if( name == "nonbondedMethod" ){
+            (void) sprintf( buffer, "%s=%1d ", name.c_str(), whole );
+        } else if( name == "lambda1" || name == "lambda2" ){
+            (void) sprintf( buffer, "%s=%4.2f ", name.c_str(), number );
+        } else if( name == "boxSize" ){
+            (void) sprintf( buffer, "%s=%6.2f ", name.c_str(), number );
+        } else if( isWhole ){
+
+            // integer-valued entries are printed as integers
+
+            (void) sprintf( buffer, "%s=%6d ", name.c_str(), whole );
+        } else {
+            (void) sprintf( buffer, "%s=%15.7e ", name.c_str(), number );
+        }
+        outputStream << buffer;
+    }    
+    outputStream << std::endl;
+
+    return;
+}
+
+/** 
+ * Get signature of a MapStringToDouble object
+ *
+ * The signature is the sum, taken in map iteration order, of (offset + value)
+ * over all entries, where offset starts at 0.1 and grows by 0.1 per entry.
+ * Only the values and their positions enter the sum; the key strings do not.
+ *
+ * @param inputArgumentMap          map
+ * @return signature
+ *
+ */
+ 
+static double getMapStringToDoubleSignature( const MapStringToDouble& inputArgumentMap ){
+
+    double total     = 0.0;
+    double increment = 0.1;
+
+    MapStringToDoubleCI entry = inputArgumentMap.begin();
+    while( entry != inputArgumentMap.end() ){
+        total     += (increment + entry->second);
+        increment += 0.1;
+        ++entry;
+    }
+
+    return total;
+}
+
+/** 
+ * Compare two MapStringToDouble to see if they have the same (key,value) pairs
+ *
+ * The maps are equal if they have the same number of entries and every key of
+ * map 1 is present in map 2 w/ an exactly equal (operator==) value.
+ *
+ * @param inputArgumentMap1 map 1
+ * @param inputArgumentMap2 map 2
+ *
+ * @return true if maps have  same (key,value) pairs; otherwise false
+ *
+ */
+ 
+static bool compareMapStringToDoubles( const MapStringToDouble& inputArgumentMap1, const MapStringToDouble& inputArgumentMap2 ){
+
+    // sizes must agree: the loop below only walks map 1, so without this check
+    // a map 2 holding additional keys would be reported as equal
+
+    if( inputArgumentMap1.size() != inputArgumentMap2.size() ){
+        return false;
+    }
+
+    for( MapStringToDoubleCI ii = inputArgumentMap1.begin(); ii != inputArgumentMap1.end(); ii++ ){
+        MapStringToDoubleCI jj = inputArgumentMap2.find( ii->first );
+        if( jj == inputArgumentMap2.end() || jj->second != ii->second ){
+            return false;
+        }
+    }
+    return true;
+}
+
+/** 
+ * Generate collection of inputArguments maps given
+ * list of DoubleVectors for each argument
+ *
+ * For every argument key, each map that is in argumentMaps when that key is
+ * processed is copied once per value w/ map[key] = value; a copy is appended
+ * unless one of those already-present maps has identical (key,value) pairs.
+ * If argumentMaps is empty on entry, nothing is generated.
+ *
+ * @param inputArguments            map[argumentKey] = vector of double parameter values
+ * @param argumentMaps              input/output vector of maps; generated maps are appended
+ *
+ */
+ 
+static void generateInputArgumentMapsFromStringVectors( const MapStringToDoubleVector& inputArguments, 
+                                                        VectorOfMapStringToDouble& argumentMaps ){
+
+    for( MapStringToDoubleVectorCI argument = inputArguments.begin(); argument != inputArguments.end(); ++argument ){
+
+        const std::string  name             = argument->first;
+        const DoubleVector values           = argument->second;
+
+        // only the maps present before this key is expanded are used as templates
+        // and as candidates for the duplicate check
+
+        const unsigned int numberOfExisting = argumentMaps.size();
+
+        // signatures of the existing maps: a cheap prefilter for the duplicate check
+
+        MapDoubleToInt knownSignatures;
+        for( unsigned int kk = 0; kk < numberOfExisting; kk++ ){
+            knownSignatures[ getMapStringToDoubleSignature( argumentMaps[kk] ) ] = 1;
+        }
+
+        for( unsigned int kk = 0; kk < numberOfExisting; kk++ ){
+            for( unsigned int jj = 0; jj < values.size(); jj++ ){
+
+               // candidate = existing map kk w/ this argument set to value jj
+
+               MapStringToDouble candidate( argumentMaps[kk] );
+               candidate[name]             = values[jj];
+
+               // a full comparison is only needed if some existing map shares the signature
+
+               bool duplicate              = false;
+               if( knownSignatures.find( getMapStringToDoubleSignature( candidate ) ) != knownSignatures.end() ){
+                   for( unsigned int mm = 0; mm < numberOfExisting && !duplicate; mm++ ){
+                       duplicate = compareMapStringToDoubles( candidate, argumentMaps[mm] );
+                   }
+               }
+
+               if( !duplicate ){
+                   argumentMaps.push_back( candidate );
+               }
+            }
+        }
+    }
+}
+
+/** 
+ * Predicate for sorting map[string] = double
+ *
+ * The maps are compared on the keys numMolecules, nonbondedMethod, lambda2 and
+ * boxSize, in that order; a key is skipped unless both maps contain it. The
+ * first key whose values differ decides the ordering.
+ *
+ * @param d1 first  MapStringToDouble to compare
+ * @param d2 second MapStringToDouble to compare
+ *
+ * @return true if d1 sorts before d2; false otherwise (including when no
+ *         compared key differs)
+ *
+ */
+ 
+bool TestMapSortPredicate( const MapStringToDouble& d1, const MapStringToDouble& d2 ){
+
+    // keys in order of decreasing sort priority
+
+    static const char* const sortKeys[] = { "numMolecules", "nonbondedMethod", "lambda2", "boxSize" };
+    const unsigned int numberOfSortKeys = sizeof( sortKeys )/sizeof( sortKeys[0] );
+
+    for( unsigned int kk = 0; kk < numberOfSortKeys; kk++ ){
+
+        const std::string sortKey( sortKeys[kk] );
+        MapStringToDoubleCI entry1 = d1.find( sortKey );
+        MapStringToDoubleCI entry2 = d2.find( sortKey );
+
+        // the key only takes part in the comparison if both maps have it
+
+        if( entry1 == d1.end() || entry2 == d2.end() ){
+            continue;
+        }
+        if( entry1->second != entry2->second ){
+            return entry1->second < entry2->second;
+        }
+    }
+    return false;
+}
+
+
+/** 
+ * Build a CustomNonbondedForce from the particle parameters of a NonbondedSoftcoreForce
+ *
+ * The energy expression is a softcore Lennard-Jones term plus a Coulomb term
+ * (q/r for NoCutoff; 1/r + krf*r*r - crf w/ the global parameters krf and crf
+ * derived from the cutoff distance and reaction field dielectric otherwise).
+ * Per-particle parameters are, in order: q, sigma, eps, lambda.
+ * Only the particles are transferred; the exceptions of the input force are
+ * not handled here.
+ *
+ * @param nonbondedSoftcoreForce    force supplying method, cutoff, dielectric and particle parameters
+ *
+ * @return pointer to the new CustomNonbondedForce (allocated w/ 'new')
+ *
+ */
+ 
+static CustomNonbondedForce* buildCustomNonbondedSoftcoreForce(  const NonbondedSoftcoreForce& nonbondedSoftcoreForce ){
+
+    // definitions shared by both energy expressions; each intermediate is
+    // defined exactly once (the rsig definition used to be listed twice)
+
+    const std::string definitions( "q=q1*q2;"
+                                   "dem=1.0/(soft+rsig);"
+                                   "rsig=(r/sigma)^6;"
+                                   "soft=0.5*(1.0-lambda);"
+                                   "sigma=0.5*(sigma1+sigma2);"
+                                   "eps=sqrt(eps1*eps2);"
+                                   "lambda=min(lambda1,lambda2)" );
+
+    CustomNonbondedForce* customNonbonded;
+    if( nonbondedSoftcoreForce.getNonbondedMethod() == NoCutoff ){
+
+        customNonbonded          = new CustomNonbondedForce( std::string( "lambda*4*eps*(dem^2-dem)+138.935456*q/r;" ) + definitions );
+
+        customNonbonded->setNonbondedMethod( CustomNonbondedForce::NoCutoff );
+
+    } else {
+
+        customNonbonded          = new CustomNonbondedForce( std::string( "lambda*4*eps*(dem^2-dem)+138.935456*q*(1.0/r+(krf*r*r)-crf);" ) + definitions );
+
+        double cutoffDistance           = nonbondedSoftcoreForce.getCutoffDistance();
+        double reactionFieldDielectric  = nonbondedSoftcoreForce.getReactionFieldDielectric();
+
+        // any method other than NoCutoff/CutoffNonPeriodic is mapped to CutoffPeriodic
+
+        customNonbonded->setCutoffDistance( cutoffDistance );
+        if( nonbondedSoftcoreForce.getNonbondedMethod() == CutoffNonPeriodic ){
+            customNonbonded->setNonbondedMethod( CustomNonbondedForce::CutoffNonPeriodic );
+        } else {
+            customNonbonded->setNonbondedMethod( CustomNonbondedForce::CutoffPeriodic );
+        }
+
+        // reaction field constants: krf = (eps - 1)/((2*eps + 1)*rc^3), crf = 3*eps/((2*eps + 1)*rc)
+
+        double eps2                     = (reactionFieldDielectric - 1.0)/(2.0*reactionFieldDielectric+1.0);
+        double kValue                   = eps2/(cutoffDistance*cutoffDistance*cutoffDistance);
+        customNonbonded->addGlobalParameter("krf", kValue );
+
+        double cValue                   = (1.0/cutoffDistance)*(3.0*reactionFieldDielectric)/(2.0*reactionFieldDielectric + 1.0); 
+        customNonbonded->addGlobalParameter("crf", cValue );
+    }
+
+    // the order of these calls fixes the layout of the per-particle parameter vector
+
+    customNonbonded->addPerParticleParameter("q");
+    customNonbonded->addPerParticleParameter("sigma");
+    customNonbonded->addPerParticleParameter("eps");
+    customNonbonded->addPerParticleParameter("lambda");
+
+    vector<double> nonbondedParams(4);
+    const int numberOfParticles = nonbondedSoftcoreForce.getNumParticles();
+    for( int ii = 0; ii < numberOfParticles; ii++ ){
+
+        double charge;
+        double sigma;
+        double epsilon;
+        double softcoreLJLambda;
+        nonbondedSoftcoreForce.getParticleParameters(ii, charge, sigma, epsilon, softcoreLJLambda);
+
+        nonbondedParams[0] = charge;
+        nonbondedParams[1] = sigma;
+        nonbondedParams[2] = epsilon;
+        nonbondedParams[3] = softcoreLJLambda;
+        customNonbonded->addParticle( nonbondedParams );
+    }
+
+    return customNonbonded;
+}
+
+/** 
+ * Build a CustomBondForce carrying the exceptions of a NonbondedSoftcoreForce
+ *
+ * One bond is added per exception, w/ the per-bond parameters (in order)
+ * q, sigma, eps, lambda taken from the exception's chargeProd, sigma, epsilon
+ * and softcoreLJLambda. For methods other than NoCutoff the energy is
+ * multiplied by step(cutoff-r) and uses the global parameters krf, crf and cutoff.
+ *
+ * @param nonbondedSoftcoreForce    force supplying method, cutoff, dielectric and exceptions
+ *
+ * @return pointer to the new CustomBondForce (allocated w/ 'new')
+ *
+ */
+ 
+CustomBondForce* buildCustomBondForceForNonbondedExceptions( const NonbondedSoftcoreForce& nonbondedSoftcoreForce ){
+
+    CustomBondForce* exceptionBonds;
+    if( nonbondedSoftcoreForce.getNonbondedMethod() == NoCutoff ){
+
+        exceptionBonds           = new CustomBondForce("lambda*4*eps*(dem^2-dem)+138.935456*q/r;"
+                                                       "dem=1.0/(soft+rsig);"
+                                                       "rsig=(r/sigma)^6;"
+                                                       "soft=0.5*(1.0-lambda)");
+
+    } else {
+
+        exceptionBonds           = new CustomBondForce("withinCutoff*(lambda*4*eps*(dem^2-dem)+138.935456*q*(1.0/r+(krf*r*r)-crf));"
+                                                       "withinCutoff=step(cutoff-r);"
+                                                       "dem=1.0/(soft+rsig);"
+                                                       "rsig=(r/sigma)^6;"
+                                                       "soft=0.5*(1.0-lambda)");
+
+        // reaction field constants from the cutoff distance (rc) and dielectric (eps)
+
+        const double rc                 = nonbondedSoftcoreForce.getCutoffDistance();
+        const double dielectric         = nonbondedSoftcoreForce.getReactionFieldDielectric();
+        const double eps2               = (dielectric - 1.0)/(2.0*dielectric+1.0);
+        const double krf                = eps2/(rc*rc*rc);
+        const double crf                = (1.0/rc)*(3.0*dielectric)/(2.0*dielectric + 1.0); 
+
+        exceptionBonds->addGlobalParameter("krf", krf );
+        exceptionBonds->addGlobalParameter("crf", crf );
+        exceptionBonds->addGlobalParameter("cutoff", rc );
+    }
+
+    // the order of these calls fixes the layout of the per-bond parameter vector
+
+    exceptionBonds->addPerBondParameter("q");
+    exceptionBonds->addPerBondParameter("sigma");
+    exceptionBonds->addPerBondParameter("eps");
+    exceptionBonds->addPerBondParameter("lambda");
+
+    // one bond per exception; the parameter vector is reused for every bond
+
+    vector<double> parameters(4);
+    for( unsigned int exception = 0; exception < nonbondedSoftcoreForce.getNumExceptions(); exception++ ){
+
+        int atom1, atom2;
+        double chargeProduct;
+        double sigma;
+        double epsilon;
+        double lambda;
+        nonbondedSoftcoreForce.getExceptionParameters( exception, atom1, atom2, chargeProduct, sigma, epsilon, lambda );
+
+        parameters[0] = chargeProduct;
+        parameters[1] = sigma;
+        parameters[2] = epsilon;
+        parameters[3] = lambda;
+        exceptionBonds->addBond( atom1, atom2, parameters );
+    }
+
+    return exceptionBonds;
+}
+
+/** 
+ * Perform comparison of energies/forces for two systems
+ *
+ * @param system1                  first  system
+ * @param system2                  second system
+ * @param platform1                first  platform name (Reference, Cuda, OpenCL)
+ * @param platform2                second platform name (Reference, Cuda, OpenCL)
+ * @param positions                positions
+ * @param inputArgumentMap         arguments/flags (relativeTolerance, applyAssert, ...)
+ * @param idString                 id string
+ * @param log                      logging file (optional -- may be NULL)
+ *
+ */
+ 
+void runSystemComparisonTest( System& system1, System& system2, 
+                              const std::string& platform1, const std::string& platform2,
+                              const std::vector<Vec3>& positions, MapStringToDouble& inputArgumentMap,
+                              const std::string& idString, FILE* log ){
+
+    // defaults; either may be overridden by an entry in inputArgumentMap
+
+    int applyAssert                      = 0;
+    double relativeTolerance             = 1.0e-04;
+
+    setDoubleFromMapStringToDouble( inputArgumentMap, "relativeTolerance",            relativeTolerance );
+    setIntFromMapStringToDouble(    inputArgumentMap, "applyAssert",                  applyAssert ) ;
+
+    // each Context requires its own integrator; no steps are taken here
+
+    VerletIntegrator integrator1(0.01);
+    VerletIntegrator integrator2(0.01);
+
+    // report sizes and the forces contained in each system
+
+    if( log ){
+        (void) fprintf( log, "System1: particles=%d forces=%d    System2: particles=%d forces=%d\n",
+                        system1.getNumParticles(), system1.getNumForces(),
+                        system2.getNumParticles(), system2.getNumForces() );
+        (void) fprintf( log, "Positions=%u\n",
+                        static_cast<unsigned int>(positions.size()) );
+
+        MapStringInt stringForceVector1;
+        MapStringInt stringForceVector2;
+        getStringForceMap( system1, stringForceVector1, log );
+        (void) fprintf( log, "Forces in system 1: [" );
+        for( MapStringIntCI ii = stringForceVector1.begin(); ii != stringForceVector1.end(); ii++ ){
+            (void) fprintf( log, " %s ", ii->first.c_str() );
+        }
+
+        getStringForceMap( system2, stringForceVector2, log );
+        (void) fprintf( log, "]\nForces in system 2: [" );
+        for( MapStringIntCI ii = stringForceVector2.begin(); ii != stringForceVector2.end(); ii++ ){
+            (void) fprintf( log, " %s ", ii->first.c_str() );
+        }
+        (void) fprintf( log, "]\n" );
+    }
+
+    // the two systems and the position array must all have the same number of particles
+
+    if( system1.getNumParticles() != system2.getNumParticles() ){
+        std::stringstream msg;
+        msg << "Number of particles for systems to be compared are unequal: " << system1.getNumParticles() << " != " << system2.getNumParticles();
+        throw OpenMMException( msg.str() );
+    }
+ 
+    if( system1.getNumParticles() != static_cast<int>(positions.size()) ){
+        std::stringstream msg;
+        msg << "Number of particles for system does not equal size of position array: " << system1.getNumParticles() << " != " << positions.size();
+        throw OpenMMException( msg.str() );
+    }
+ 
+    // evaluate energy/forces for system1 on platform1
+
+    Context context1( system1, integrator1, Platform::getPlatformByName( platform1 ));
+    context1.setPositions(positions);
+    State state1 = context1.getState(State::Forces | State::Energy);
+
+    // evaluate energy/forces for system2 on platform2
+    // (previously hard-wired to "Reference", which silently ignored the platform2 argument)
+
+    Context context2( system2, integrator2, Platform::getPlatformByName( platform2 ));
+    context2.setPositions(positions);
+    State state2 = context2.getState(State::Forces | State::Energy);
+
+    // relative energy difference: |E1 - E2|/(|E1| + |E2|); left at zero if both energies are zero
+
+    const double energy1 = state1.getPotentialEnergy();
+    const double energy2 = state2.getPotentialEnergy();
+
+    double energyDiff    = 0.0;
+    if( fabs( energy1 ) > 0.0 || fabs( energy2 ) > 0.0 ){
+        energyDiff = fabs( energy1 - energy2 )/( fabs( energy1 ) + fabs( energy2 ) );
+    }
+
+    if( log ){
+        DoubleVector stats;
+        compareForcesOfTwoStates( state1, state2, relativeTolerance, stats, log );
+        (void) fprintf( log, "%s %6d eDff=%15.7e fMx=%15.7e fAvg=%15.7e fMed=%15.7e eCd=%15.7e eRf=%15.7e mxFIdx=%d\n",
+                        idString.c_str(), system1.getNumParticles(), energyDiff,
+                        stats[1], stats[0], stats[3], energy1, energy2, static_cast<int>(stats[2]+0.0001));
+        (void) fflush( log );
+    }
+
+    if( applyAssert ){
+
+        ASSERT( energyDiff < relativeTolerance );
+
+        // fetch the force arrays once instead of once per particle
+
+        const std::vector<Vec3>& forces1 = state1.getForces();
+        const std::vector<Vec3>& forces2 = state2.getForces();
+
+        for( int ii = 0; ii < system1.getNumParticles(); ii++ ){
+    
+            Vec3 f1     = forces1[ii];
+            Vec3 f2     = forces2[ii];
+    
+            double f1N  = sqrt( (f1[0]*f1[0]) + (f1[1]*f1[1]) + (f1[2]*f1[2]) );
+            double f2N  = sqrt( (f2[0]*f2[0]) + (f2[1]*f2[1]) + (f2[2]*f2[2]) );
+    
+            double diff = (f1[0]-f2[0])*(f1[0]-f2[0]) +
+                          (f1[1]-f2[1])*(f1[1]-f2[1]) +
+                          (f1[2]-f2[2])*(f1[2]-f2[2]);
+
+            // normalize by the mean norm when either force is nonzero
+            // (the second test previously repeated f1N, so f2N was never checked)
+
+            if( f1N > 0.0 || f2N > 0.0 ){
+                diff    = 2.0*sqrt( diff )/(f1N + f2N);
+            }
+            ASSERT( diff < relativeTolerance );
+        }
+    }
+
+}
+
+/** 
+ * Serialize system to an xml file
+ *
+ * The body is only compiled if OPENMM_SERIALIZE is defined; otherwise the call does nothing.
+ * If the output file cannot be opened, a message is written to the log (if non-NULL)
+ * and the function returns without writing anything.
+ *
+ * @param system                   system to serialize
+ * @param serializeFileName        file name for xml output
+ * @param log                      logging file (optional -- may be NULL)
+ *
+ */
+ 
+void serializeSystem( System& system, const std::string& serializeFileName, FILE* log ){
+
+#ifdef OPENMM_SERIALIZE
+    //registerAmoebaSerializationProxies();
+    std::stringstream buffer;
+    XmlSerializer::serialize<System>(&system, "System", buffer);
+    FILE* filePtr = fopen( serializeFileName.c_str(), "w" );
+    if( filePtr == NULL ){
+        if( log ){
+            (void) fprintf( log, "Unable to open xml file %s\n", serializeFileName.c_str() );
+        }
+        // return whether or not a log is available: writing to a NULL FILE* is undefined behavior
+        return;
+    }
+    (void) fprintf( filePtr, "%s", buffer.str().c_str() );
+    (void) fclose( filePtr );
+    if( log ){
+        (void) fprintf( log, "Wrote system to xml file %s\n", serializeFileName.c_str() );
+    }
+#endif
+    return;
+}
+
+/** 
+ * Output vector of Vec3 to a text file
+ *
+ * The file has a header line 'Positions  <count>' followed by one line per entry
+ * containing the entry index and its three components.
+ * The body is only compiled if OPENMM_SERIALIZE is defined; otherwise the call does nothing.
+ * If the output file cannot be opened, a message is written to the log (if non-NULL)
+ * and the function returns without writing anything.
+ *
+ * @param positions                vector of Vec3 to output
+ * @param fileName                 file name for output
+ * @param log                      logging file (optional -- may be NULL)
+ *
+ */
+ 
+void serializeVectorOfVec3( const std::vector<Vec3>& positions, std::string fileName, FILE* log ){
+#ifdef OPENMM_SERIALIZE
+    FILE* filePtr = fopen( fileName.c_str(), "w" );
+    if( filePtr == NULL ){
+        if( log ){
+            (void) fprintf( log, "Unable to open Vec3 file %s\n", fileName.c_str() );
+        }
+        // return whether or not a log is available: writing to a NULL FILE* is undefined behavior
+        return;
+    }
+    (void) fprintf( filePtr, "Positions  %u\n", static_cast<unsigned int>(positions.size()) );
+    for( unsigned int ii = 0; ii < positions.size(); ii++ ){
+        (void) fprintf( filePtr, "%9u %17.10e %17.10e %17.10e\n", ii, positions[ii][0], positions[ii][1], positions[ii][2] );
+    }
+    (void) fclose( filePtr );
+    if( log ){
+        (void) fprintf( log, "Wrote to file %s\n", fileName.c_str() );
+    }
+#endif
+    return;
+}
+
+/** 
+ * Write a system and its positions to a pair of files sharing a base name:
+ * the system goes to <baseFileName>.xml and the positions to <baseFileName>.txt
+ *
+ * @param system                   system to serialize
+ * @param positions                positions to output
+ * @param baseFileName             base file name; '.xml' and '.txt' are appended
+ * @param log                      logging file (optional -- may be NULL)
+ *
+ */
+ 
+void serializeSystemAndPositions( System& system, const std::vector<Vec3>& positions, const std::string& baseFileName, FILE* log ){
+
+    // system -> <baseFileName>.xml
+
+    const std::string systemFileName   = baseFileName + ".xml";
+    serializeSystem( system, systemFileName, log );
+
+    // positions -> <baseFileName>.txt
+
+    const std::string positionFileName = baseFileName + ".txt";
+    serializeVectorOfVec3( positions, positionFileName, log );
+}
+#endif // TEST_CUDA_SOFTCORE_FORCE_H_
--- a/plugins/freeEnergy/platforms/cuda/tests/TstFreeEnergyCudaUsingParameterFile.cpp
+++ b/plugins/freeEnergy/platforms/cuda/tests/TstFreeEnergyCudaUsingParameterFile.cpp