Added GB/VI to Cuda platform

Free energy plugin added. The plugin will not run with Obc or GB/VI forces unless line 2004 of gpu.cpp (gpu->sim.totalNonbondOutputBuffers = 2*gpu->sim.nonbondOutputBuffers;) is commented in -- working on removing this constraint. Also, unit tests for GB/VI currently fail.

Added GB/VI to Cuda platform
Free energy plugin added. The plugin will not run with Obc or GB/VI forces unless line 2004 of gpu.cpp (gpu->sim.totalNonbondOutputBuffers = 2*gpu->sim.nonbondOutputBuffers;) is commented in -- working on removing this constraint. Also, unit tests for GB/VI currently fail.
7a36f461 · Mark Friedrichs · 43ebedfb · 7a36f461 · 7a36f461 · 7a36f461
Commit 7a36f461 authored Nov 03, 2009 by Mark Friedrichs
20 changed files
--- a/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateGBVISoftcoreBornSum.h
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateGBVISoftcoreBornSum.h
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+/**
+ * This file contains the kernel for calculating Born sums.  It is included
+ * several times in kCalculateGBVIBornSum.cu with different #defines to generate
+ * different versions of the kernels.
+ */
+
+#include "kCalculateGBVIAux.h"
+
+/**
+ * Accumulate GB/VI softcore Born sums for the tiles listed in workUnit.
+ *
+ * Each group of GRID threads (one warp) processes a contiguous slice
+ * [pos, end) of the cSim.pInteractionCount[0] work units.  A work unit packs
+ * two tile indices: bits 17 and up give the x tile, bits 2..16 the y tile;
+ * both are scaled by GRID (<< GRIDBITS) to obtain the first atom of the tile.
+ *
+ * For every atom pair (i in x tile, j in y tile) the kernel adds
+ *     scale_j * getGBVI_Volume(r, R_i, S_j)   to atom i's sum and
+ *     scale_i * getGBVI_Volume(r, R_j, S_i)   to atom j's sum,
+ * where R, S and scale are the .x, .y and .w components of cSim.pGBVIData.
+ *
+ * @param workUnit  packed tile descriptors, indexed by work-unit position
+ *
+ * NOTE(review): data is exchanged between threads through shared memory
+ * (sA / tempBuffer) without any barrier, i.e. the code relies on the GRID
+ * threads of a warp running in lock step -- confirm this holds for the
+ * target architecture before reusing the kernel.
+ * NOTE(review): tempBuffer is placed right after
+ * cSim.nonbond_threads_per_block Atom entries of the dynamic shared array;
+ * presumably the launch allocates room for it -- verify against the caller.
+ */
+__global__ void METHOD_NAME(kCalculateGBVISoftcore, BornSum_kernel)(unsigned int* workUnit)
+{
+    extern __shared__ Atom sA[];
+
+    // Split the work units evenly across all warps of the launch
+    unsigned int totalWarps   = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
+    unsigned int warp         = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
+    unsigned int numWorkUnits = cSim.pInteractionCount[0];
+    unsigned int pos          = warp*numWorkUnits/totalWarps;
+    unsigned int end          = (warp+1)*numWorkUnits/totalWarps;
+
+#ifdef USE_CUTOFF
+    // Per-thread scratch used for the warp-wide reduction in the partial-tile path
+    float* tempBuffer = (float*) &sA[cSim.nonbond_threads_per_block];
+#endif
+
+    while ( pos < end )
+    {
+        // Extract cell coordinates from appropriate work unit
+        unsigned int x = workUnit[pos];
+        unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
+        x              = (x >> 17)           << GRIDBITS;
+
+        float       dx;
+        float       dy;
+        float       dz;
+        float       r2;
+        float       r;
+
+        // tgx: index of this thread within its warp, in [0, GRID-1]
+        // tbx: index (within the block) of the first thread of this warp
+        unsigned int tgx = threadIdx.x & (GRID - 1);
+        unsigned int tbx = threadIdx.x - tgx;
+        unsigned int tj  = tgx;
+        Atom* psA        = &sA[tbx];
+
+        if (x == y) // Handle diagonals uniquely at 50% efficiency
+        {
+            // Read fixed atom data into registers and GRF
+            unsigned int i                          = x + tgx;
+            float4 apos                             = cSim.pPosq[i];      // Local atom x, y, z, sum
+            float4 ar                               = cSim.pGBVIData[i];  // Local atom r, sr, -, scale factor
+            sA[threadIdx.x].x                       = apos.x;
+            sA[threadIdx.x].y                       = apos.y;
+            sA[threadIdx.x].z                       = apos.z;
+            sA[threadIdx.x].r                       = ar.x;
+            sA[threadIdx.x].sr                      = ar.y;
+            sA[threadIdx.x].bornRadiusScaleFactor   = ar.w;
+            apos.w                                  = 0.0f;
+
+            for (unsigned int j = 0; j < GRID; j++)
+            {
+                dx                                  = psA[j].x - apos.x;
+                dy                                  = psA[j].y - apos.y;
+                dz                                  = psA[j].z - apos.z;
+#ifdef USE_PERIODIC
+                dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
+                dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
+                dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
+#endif
+                r2                      = dx * dx + dy * dy + dz * dz;
+#if defined USE_PERIODIC
+                if (i < cSim.atoms && x+j < cSim.atoms && r2 < cSim.nonbondedCutoffSqr)
+#elif defined USE_CUTOFF
+                if (r2 < cSim.nonbondedCutoffSqr)
+#endif
+                {
+                    r                       = sqrt(r2);
+                    // Skip the self interaction
+                    if ((j != tgx) )
+                    {
+                        apos.w             += psA[j].bornRadiusScaleFactor*getGBVI_Volume( r, ar.x, psA[j].sr );
+                    }
+                }
+            }
+
+            // Write results
+#ifdef USE_OUTPUT_BUFFER_PER_WARP
+            unsigned int offset = x + tgx + warp*cSim.stride;
+            cSim.pBornSum[offset] += apos.w;
+#else
+            unsigned int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
+            cSim.pBornSum[offset] = apos.w;
+#endif
+        }
+        else        // 100% utilization
+        {
+            // Read fixed atom data into registers and GRF
+            unsigned int jAtom                          = y + tgx;
+            unsigned int i                              = x + tgx;
+
+            float4 temp                                 = cSim.pPosq[jAtom];
+            float4 temp1                                = cSim.pGBVIData[jAtom];
+            float4 apos                                 = cSim.pPosq[i];        // Local atom x, y, z, sum
+            float4 ar                                   = cSim.pGBVIData[i];    // Local atom r, sr, -, scale factor
+            sA[threadIdx.x].x                           = temp.x;
+            sA[threadIdx.x].y                           = temp.y;
+            sA[threadIdx.x].z                           = temp.z;
+            sA[threadIdx.x].r                           = temp1.x;
+            sA[threadIdx.x].sr                          = temp1.y;
+            sA[threadIdx.x].bornRadiusScaleFactor       = temp1.w;
+            sA[threadIdx.x].sum             = apos.w    = 0.0f;
+
+#ifdef USE_CUTOFF
+            unsigned int flags = cSim.pInteractionFlag[pos];
+            if (flags == 0)
+            {
+                // No interactions in this block.
+            }
+            else if (flags == 0xFFFFFFFF)
+#endif
+            {
+                // Compute all interactions within this block.
+                // Each thread starts at a different y atom (tj == tgx) and
+                // steps backwards, so no two threads of the warp update the
+                // same psA[tj].sum in the same iteration.
+
+                for (unsigned int step = 0; step < GRID; step++)
+                {
+                    dx                      = psA[tj].x - apos.x;
+                    dy                      = psA[tj].y - apos.y;
+                    dz                      = psA[tj].z - apos.z;
+#ifdef USE_PERIODIC
+                    dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
+                    dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
+                    dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
+#endif
+                    r2                      = dx * dx + dy * dy + dz * dz;
+#ifdef USE_PERIODIC
+                    if (i < cSim.atoms && y+tj < cSim.atoms && r2 < cSim.nonbondedCutoffSqr)
+#elif defined USE_CUTOFF
+                    if (r2 < cSim.nonbondedCutoffSqr)
+#endif
+                    {
+                        r                       = sqrt(r2);
+
+                        // psA[tj].sr = Sj
+                        // ar.x       = Ri
+
+                        apos.w                 += psA[tj].bornRadiusScaleFactor*getGBVI_Volume( r, ar.x,      psA[tj].sr );
+                        psA[tj].sum            += ar.w*getGBVI_Volume( r, psA[tj].r, ar.y );
+                    }
+                    tj = (tj - 1) & (GRID - 1);
+                }
+            }
+#ifdef USE_CUTOFF
+            else
+            {
+                // Compute only a subset of the interactions in this block:
+                // bit j of flags selects y atom j, which every thread of the
+                // warp then pairs with its own x atom.
+
+                for (unsigned int j = 0; j < GRID; j++)
+                {
+                    // Unsigned literal: j can reach 31, where 1<<j would overflow int
+                    if ((flags&(1u<<j)) != 0)
+                    {
+                        tempBuffer[threadIdx.x] = 0.0f;
+                        dx                      = psA[j].x - apos.x;
+                        dy                      = psA[j].y - apos.y;
+                        dz                      = psA[j].z - apos.z;
+#ifdef USE_PERIODIC
+                        dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
+                        dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
+                        dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
+#endif
+                        r2                      = dx * dx + dy * dy + dz * dz;
+#ifdef USE_PERIODIC
+                        if (i < cSim.atoms && y+j < cSim.atoms && r2 < cSim.nonbondedCutoffSqr)
+#elif defined USE_CUTOFF
+                        if (r2 < cSim.nonbondedCutoffSqr)
+#endif
+                        {
+                            r                       = sqrt(r2);
+                            // Contribution of y atom j to this thread's x atom
+                            // (same term as in the full-tile path above)
+                            apos.w                 += psA[j].bornRadiusScaleFactor*getGBVI_Volume( r, ar.x, psA[j].sr );
+                            // Contribution of this thread's x atom to y atom j;
+                            // the partner radius is that of atom j, not of tj
+                            tempBuffer[threadIdx.x] = ar.w*getGBVI_Volume( r, psA[j].r, ar.y );
+                        }
+
+                        // Sum the terms.
+                        // Pairwise tree reduction over the warp's GRID entries;
+                        // thread 0 of the warp adds the total to atom j.
+
+                        if (tgx % 2 == 0)
+                            tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+1];
+                        if (tgx % 4 == 0)
+                            tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+2];
+                        if (tgx % 8 == 0)
+                            tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+4];
+                        if (tgx % 16 == 0)
+                            tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+8];
+                        if (tgx == 0)
+                            psA[j].sum += tempBuffer[threadIdx.x] + tempBuffer[threadIdx.x+16];
+                    }
+                }
+            }
+#endif
+
+            // Write results
+#ifdef USE_OUTPUT_BUFFER_PER_WARP
+            unsigned int offset = x + tgx + warp*cSim.stride;
+            cSim.pBornSum[offset] += apos.w;
+            offset = y + tgx + warp*cSim.stride;
+            cSim.pBornSum[offset] += sA[threadIdx.x].sum;
+#else
+            unsigned int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
+            cSim.pBornSum[offset] = apos.w;
+            offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
+            cSim.pBornSum[offset] = sA[threadIdx.x].sum;
+#endif
+        }
+
+        pos++;
+    }
+}
--- a/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateLocalSoftcoreForces.cu
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateLocalSoftcoreForces.cu
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+#include "gputypes.h"
+#include "GpuLJ14Softcore.h"
+
+#include <cuda.h>
+
+extern __shared__ Vectors sV[];
+static __constant__ cudaGmxSimulation cSim;
+static __constant__ cudaFreeEnergySimulationNonbonded14 feSim;
+
+/* Cuda compiler on Windows does not recognize "static const float" values. */
+/* The f suffix keeps the angle arithmetic below in single precision.        */
+#define LOCAL_HACK_PI 3.1415926535897932384626433832795f
+
+/* Dot product of two 3-component vectors (anything exposing .x/.y/.z). */
+#define DOT3(v1, v2) ((v1).x * (v2).x + (v1).y * (v2).y + (v1).z * (v2).z)
+
+/* dp = cosine of the angle between v1 and v2, clamped to [-1, 1]. */
+#define GETNORMEDDOTPRODUCT(v1, v2, dp) \
+{ \
+    dp          = DOT3(v1, v2); \
+    float norm1 = DOT3(v1, v1); \
+    float norm2 = DOT3(v2, v2); \
+    dp /= sqrtf(norm1 * norm2); \
+    dp = min(dp, 1.0f); \
+    dp = max(dp, -1.0f); \
+}
+
+/* c = v1 x v2.  Braced so the macro expands to a single statement. */
+#define CROSS_PRODUCT(v1, v2, c) \
+{ \
+    (c).x = (v1).y * (v2).z - (v1).z * (v2).y; \
+    (c).y = (v1).z * (v2).x - (v1).x * (v2).z; \
+    (c).z = (v1).x * (v2).y - (v1).y * (v2).x; \
+}
+
+/* dEdR = param.y * (angle - ideal), where angle = acos(cosine) and */
+/* param.x is the ideal angle, converted here from degrees.         */
+#define GETPREFACTORSGIVENANGLECOSINE(cosine, param, dEdR) \
+{ \
+   float angle          = acosf(cosine); \
+   float deltaIdeal     = angle - ((param).x * (LOCAL_HACK_PI / 180.0f)); \
+   dEdR                 = (param).y * deltaIdeal; \
+}
+
+/* dEdR = param.y * (angle - ideal)^2; the result is stored in the */
+/* third argument even though it is an energy term.                */
+#define GETENERGYGIVENANGLECOSINE(cosine, param, dEdR) \
+{ \
+   float angle          = acosf(cosine); \
+   float deltaIdeal     = angle - ((param).x * (LOCAL_HACK_PI / 180.0f)); \
+   dEdR                 = (param).y * deltaIdeal * deltaIdeal; \
+}
+
+/* angle = unsigned angle (radians) between v1 and v2. */
+#define GETANGLEBETWEENTWOVECTORS(v1, v2, angle) \
+{ \
+    float dp; \
+    GETNORMEDDOTPRODUCT(v1, v2, dp); \
+    angle = acosf(dp); \
+}
+
+/* Same as GETANGLEBETWEENTWOVECTORS, additionally returning the cosine. */
+#define GETANGLECOSINEBETWEENTWOVECTORS(v1, v2, angle, cosine) \
+{ \
+    GETNORMEDDOTPRODUCT(v1, v2, cosine); \
+    angle = acosf(cosine); \
+}
+
+/* angle = signed angle between cp0 = vector1 x vector2 and             */
+/* cp1 = vector2 x vector3; negative when signVector . cp1 < 0.         */
+/* cp0 and cp1 are outputs and remain available to the caller.          */
+#define GETDIHEDRALANGLEBETWEENTHREEVECTORS(vector1, vector2, vector3, signVector, cp0, cp1, angle) \
+{ \
+    CROSS_PRODUCT(vector1, vector2, cp0); \
+    CROSS_PRODUCT(vector2, vector3, cp1); \
+    GETANGLEBETWEENTWOVECTORS(cp0, cp1, angle); \
+    float dp = DOT3(signVector, cp1); \
+    angle = (dp >= 0) ? angle : -angle; \
+}
+
+/* Same as GETDIHEDRALANGLEBETWEENTHREEVECTORS, additionally returning */
+/* the cosine of the (unsigned) angle between cp0 and cp1.             */
+#define GETDIHEDRALANGLECOSINEBETWEENTHREEVECTORS(vector1, vector2, vector3, signVector, cp0, cp1, angle, cosine) \
+{ \
+    CROSS_PRODUCT(vector1, vector2, cp0); \
+    CROSS_PRODUCT(vector2, vector3, cp1); \
+    GETANGLECOSINEBETWEENTWOVECTORS(cp0, cp1, angle, cosine); \
+    float dp = DOT3(signVector, cp1); \
+    angle = (dp >= 0) ? angle : -angle; \
+}
+
+/**
+ * Copy the host-side simulation description (gpu->sim) into this
+ * translation unit's constant-memory copy, cSim.
+ *
+ * @param gpu  context whose sim member is uploaded
+ *
+ * Failure of the copy is reported through RTERROR.
+ */
+extern "C"
+void SetCalculateLocalSoftcoreGpuSim(gpuContext gpu)
+{
+    // Trace and error text name this function so log output can be matched to it
+    (void) fprintf( stderr, "SetCalculateLocalSoftcoreGpuSim called\n" );
+    cudaError_t status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
+    RTERROR(status, "cudaMemcpyToSymbol: SetCalculateLocalSoftcoreGpuSim copy to cSim failed");
+}
+
+/**
+ * Copy the softcore LJ 1-4 description (gpuLJ14Softcore->feSim) into this
+ * translation unit's constant-memory copy, feSim.
+ *
+ * @param gpuLJ14Softcore  object whose feSim member is uploaded
+ *
+ * Failure of the copy is reported through RTERROR.
+ */
+static void SetCalculateLocalSoftcoreSim( GpuLJ14Softcore* gpuLJ14Softcore)
+{
+    (void) fprintf( stderr, "SetCalculateLocalSoftcoreSim called\n" );
+    cudaError_t status = cudaMemcpyToSymbol(feSim, &gpuLJ14Softcore->feSim, sizeof(cudaFreeEnergySimulationNonbonded14));
+    // The destination symbol is feSim, so the message must say so
+    RTERROR(status, "cudaMemcpyToSymbol: SetCalculateLocalSoftcoreSim copy to feSim failed");
+}
+
+/**
+ * Copy this translation unit's constant-memory simulation description
+ * (cSim) back into gpu->sim.
+ *
+ * @param gpu  context whose sim member is overwritten
+ *
+ * Failure of the copy is reported through RTERROR.
+ */
+void GetCalculateLocalSoftcoreForcesSim(gpuContext gpu)
+{
+    cudaError_t status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
+    // Name the actual function in the diagnostic
+    RTERROR(status, "cudaMemcpyFromSymbol: GetCalculateLocalSoftcoreForcesSim copy from cSim failed");
+}
+    
+#define USE_SOFTCORE_LJ
+#ifdef USE_SOFTCORE_LJ
+#include "kSoftcoreLJ.h"
+#endif
+
+//__global__ void METHOD_NAME(kCalculateLocalSoftcore, Forces_kernel)()
+__global__ void kCalculateLocalSoftcoreForces_kernel()
+{
+    unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
+    //Vectors* A       = &sV[threadIdx.x];
+
+    float energy = 0.0f;
+
+#if 0
+    while (pos < cSim.bond_offset)
+    {
+        if (pos < cSim.bonds)
+        {
+            int4   atom         = cSim.pBondID[pos];
+            float4 atomA        = cSim.pPosq[atom.x];
+            float4 atomB        = cSim.pPosq[atom.y];
+            float2 bond         = cSim.pBondParameter[pos];
+            float dx            = atomB.x - atomA.x;
+            float dy            = atomB.y - atomA.y;
+            float dz            = atomB.z - atomA.z;
+            float r2            = dx * dx + dy * dy + dz * dz;
+            float r             = sqrt(r2);
+            float deltaIdeal    = r - bond.x;
+/* E */     energy             += 0.5f * bond.y * deltaIdeal * deltaIdeal;
+            float dEdR          = bond.y * deltaIdeal;
+            dEdR                = (r > 0.0f) ? (dEdR / r) : 0.0f;
+//            printf("D: %11.4f %11.4f %11.4f %11.4f %11.4f %11.4f\n", dx, dy, dz, r, deltaIdeal, dEdR);
+            dx                 *= dEdR;
+            dy                 *= dEdR;
+            dz                 *= dEdR;
+            unsigned int offsetA                = atom.x + atom.z * cSim.stride;
+            unsigned int offsetB                = atom.y + atom.w * cSim.stride;
+            float4 forceA                       = cSim.pForce4[offsetA];
+            float4 forceB                       = cSim.pForce4[offsetB];
+            forceA.x                           += dx;
+            forceA.y                           += dy;
+            forceA.z                           += dz;
+            forceB.x                           -= dx;
+            forceB.y                           -= dy;
+            forceB.z                           -= dz;
+            cSim.pForce4[offsetA]               = forceA;
+            cSim.pForce4[offsetB]               = forceB;    
+        }
+        pos += blockDim.x * gridDim.x;
+    }
+
+    while (pos < cSim.bond_angle_offset)
+    {
+        unsigned int pos1   = pos - cSim.bond_offset;
+        if (pos1 < cSim.bond_angles)
+        {
+            int4   atom1            = cSim.pBondAngleID1[pos1];  
+            float2 bond_angle       = cSim.pBondAngleParameter[pos1];
+            float4 a1               = cSim.pPosq[atom1.x];
+            float4 a2               = cSim.pPosq[atom1.y];
+            float4 a3               = cSim.pPosq[atom1.z];
+            A->v0.x                 = a2.x - a1.x;
+            A->v0.y                 = a2.y - a1.y;
+            A->v0.z                 = a2.z - a1.z;
+            A->v1.x                 = a2.x - a3.x;
+            A->v1.y                 = a2.y - a3.y;
+            A->v1.z                 = a2.z - a3.z;
+            float3 cp;
+            CROSS_PRODUCT(A->v0, A->v1, cp);
+            float rp                = DOT3(cp, cp); //cx * cx + cy * cy + cz * cz;
+            rp                      = max(sqrt(rp), 1.0e-06f);
+            float r21               = DOT3(A->v0, A->v0); // dx1 * dx1 + dy1 * dy1 + dz1 * dz1;
+            float r23               = DOT3(A->v1, A->v1); // dx2 * dx2 + dy2 * dy2 + dz2 * dz2;
+            float dot               = DOT3(A->v0, A->v1); // dx1 * dx2 + dy1 * dy2 + dz1 * dz2;
+            float cosine            = dot / sqrt(r21 * r23);
+
+            float angle_energy;
+/* E */     GETENERGYGIVENANGLECOSINE(cosine, bond_angle, angle_energy);
+            energy                 += 0.5f*angle_energy;
+
+            float dEdR;
+            GETPREFACTORSGIVENANGLECOSINE(cosine, bond_angle, dEdR);
+            //printf("%11.4f %11.4f\n", cosine, dEdR);
+            float termA             =  dEdR / (r21 * rp);
+            float termC             = -dEdR / (r23 * rp);
+            float3 c21;
+            float3 c23;
+            CROSS_PRODUCT(A->v0, cp, c21);
+            CROSS_PRODUCT(A->v1, cp, c23);
+            c21.x                  *= termA;
+            c21.y                  *= termA;
+            c21.z                  *= termA;
+            c23.x                  *= termC;
+            c23.y                  *= termC;
+            c23.z                  *= termC;
+            int2 atom2              = cSim.pBondAngleID2[pos1];
+            unsigned int offset     = atom1.x + atom1.w * cSim.stride;
+            float4 force            = cSim.pForce4[offset]; 
+            force.x                += c21.x;
+            force.y                += c21.y;
+            force.z                += c21.z;
+            cSim.pForce4[offset]    = force;
+            offset                  = atom1.y + atom2.x * cSim.stride;
+            force                   = cSim.pForce4[offset];
+            force.x                -= (c21.x + c23.x);
+            force.y                -= (c21.y + c23.y);
+            force.z                -= (c21.z + c23.z);
+            cSim.pForce4[offset]    = force;
+            offset                  = atom1.z + atom2.y * cSim.stride;
+            force                   = cSim.pForce4[offset];
+            force.x                += c23.x;
+            force.y                += c23.y;
+            force.z                += c23.z;
+            cSim.pForce4[offset]    = force;
+        }
+        pos += blockDim.x * gridDim.x;
+    }
+
+    while (pos < cSim.dihedral_offset)
+    {
+        unsigned int pos1 = pos - cSim.bond_angle_offset;
+        if (pos1 < cSim.dihedrals)
+        {
+            int4   atom1        = cSim.pDihedralID1[pos1];  
+            float4 atomA        = cSim.pPosq[atom1.x];
+            float4 atomB        = cSim.pPosq[atom1.y];
+            float4 atomC        = cSim.pPosq[atom1.z];
+            float4 atomD        = cSim.pPosq[atom1.w];            
+            A->v0.x             = atomA.x - atomB.x;
+            A->v0.y             = atomA.y - atomB.y;
+            A->v0.z             = atomA.z - atomB.z;
+            A->v1.x             = atomC.x - atomB.x;
+            A->v1.y             = atomC.y - atomB.y;
+            A->v1.z             = atomC.z - atomB.z;
+            A->v2.x             = atomC.x - atomD.x;
+            A->v2.y             = atomC.y - atomD.y;
+            A->v2.z             = atomC.z - atomD.z; 
+            float3 cp0, cp1;
+            float dihedralAngle;
+            GETDIHEDRALANGLEBETWEENTHREEVECTORS(A->v0, A->v1, A->v2, A->v0, cp0, cp1, dihedralAngle);
+            float4 dihedral         = cSim.pDihedralParameter[pos1];
+            float deltaAngle        = dihedral.z * dihedralAngle - (dihedral.y * PI / 180.0f);
+
+	    // ATTENTION: This section leads to a divergent deltaAngle values wrt
+	    // forces and energies. We separate the case dihedral.z = n = 0, which
+	    // is treated by the calculation of energies via a harmonic potential
+/* E */     if (dihedral.z) energy += dihedral.x * (1.0f + cos(deltaAngle));
+/* E */     else
+	    {
+		float deltaAngle    = dihedralAngle - dihedral.y;
+		if (deltaAngle < -PI) deltaAngle += 2.0f * PI;
+		else if (deltaAngle > PI) deltaAngle -= 2.0f * PI;
+                energy             += dihedral.x * deltaAngle * deltaAngle;
+	    }
+
+            float sinDeltaAngle     = sin(deltaAngle);
+            float dEdAngle          = -dihedral.x * dihedral.z * sinDeltaAngle;
+            float normCross1        = DOT3(cp0, cp0);
+            float normBC            = sqrt(DOT3(A->v1, A->v1));
+            float4 ff;
+            ff.x                    = (-dEdAngle * normBC) / normCross1;
+            float normCross2        = DOT3(cp1, cp1);
+            ff.w                    = (dEdAngle * normBC) / normCross2;
+            float dp                = 1.0f / DOT3(A->v1, A->v1);
+            ff.y                    = DOT3(A->v0, A->v1) * dp;
+            ff.z                    = DOT3(A->v2, A->v1) * dp;
+            int4  atom2             = cSim.pDihedralID2[pos1];   
+            float3 internalF0;
+            float3 internalF3;
+            float3 s;
+            
+//            printf("%4d: %9.4f %9.4f %9.4f %9.4f\n", pos1, ff.x, ff.y, ff.z, ff.w);  
+            unsigned int offset                 = atom1.x + atom2.x * cSim.stride;
+            float4 force                        = cSim.pForce4[offset]; 
+            internalF0.x                        = ff.x * cp0.x; 
+            force.x                            += internalF0.x;
+            internalF0.y                        = ff.x * cp0.y;
+            force.y                            += internalF0.y;
+            internalF0.z                        = ff.x * cp0.z;       
+            force.z                            += internalF0.z;
+            cSim.pForce4[offset]                = force;
+            
+            //printf("%4d - 0: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
+            offset                              = atom1.w + atom2.w * cSim.stride;
+            force                               = cSim.pForce4[offset];
+            internalF3.x                        = ff.w * cp1.x;
+            force.x                            += internalF3.x;
+            internalF3.y                        = ff.w * cp1.y;
+            force.y                            += internalF3.y;
+            internalF3.z                        = ff.w * cp1.z;
+            force.z                            += internalF3.z;
+            cSim.pForce4[offset]                = force;
+            
+           // printf("%4d - 3: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
+            s.x                                 = ff.y * internalF0.x - ff.z * internalF3.x;   
+            s.y                                 = ff.y * internalF0.y - ff.z * internalF3.y;  
+            s.z                                 = ff.y * internalF0.z - ff.z * internalF3.z;        
+            offset                              = atom1.y + atom2.y * cSim.stride;
+            force                               = cSim.pForce4[offset];
+            force.x                            += -internalF0.x + s.x;
+            force.y                            += -internalF0.y + s.y;
+            force.z                            += -internalF0.z + s.z;
+            cSim.pForce4[offset]                = force;
+            
+            //printf("%4d - 1: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
+            offset                              = atom1.z + atom2.z * cSim.stride;
+            force                               = cSim.pForce4[offset];
+            force.x                            += -internalF3.x - s.x;
+            force.y                            += -internalF3.y - s.y;
+            force.z                            += -internalF3.z - s.z;
+            cSim.pForce4[offset]                = force;
+            //printf("%4d - 2: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
+        }
+        pos += blockDim.x * gridDim.x;
+    }
+
+    // Ryckaert Bellemans dihedrals
+    while (pos < cSim.rb_dihedral_offset)
+    {
+        unsigned int pos1 = pos - cSim.dihedral_offset;
+        if (pos1 < cSim.rb_dihedrals)
+        {
+            int4   atom1        = cSim.pRbDihedralID1[pos1];
+            float4 atomA        = cSim.pPosq[atom1.x];
+            float4 atomB        = cSim.pPosq[atom1.y];
+            float4 atomC        = cSim.pPosq[atom1.z];
+            float4 atomD        = cSim.pPosq[atom1.w];
+            A->v0.x             = atomA.x - atomB.x;
+            A->v0.y             = atomA.y - atomB.y;
+            A->v0.z             = atomA.z - atomB.z;
+            A->v1.x             = atomC.x - atomB.x;
+            A->v1.y             = atomC.y - atomB.y;
+            A->v1.z             = atomC.z - atomB.z;
+            A->v2.x             = atomC.x - atomD.x;
+            A->v2.y             = atomC.y - atomD.y;
+            A->v2.z             = atomC.z - atomD.z;
+            float3 cp0, cp1;
+            float dihedralAngle, cosPhi;
+      //      printf("%4d - 0 : %9.4f %9.4f %9.4f\n", pos1, A->v0.x, A->v0.y, A->v0.z);
+      //      printf("%4d - 1 : %9.4f %9.4f %9.4f\n", pos1, A->v1.x, A->v1.y, A->v1.z);
+      //      printf("%4d - 2 : %9.4f %9.4f %9.4f\n", pos1, A->v2.x, A->v2.y, A->v2.z);
+            GETDIHEDRALANGLECOSINEBETWEENTHREEVECTORS(A->v0, A->v1, A->v2, A->v0, cp0, cp1, dihedralAngle, cosPhi);
+            if (dihedralAngle < 0.0f )
+            {
+                dihedralAngle += PI;
+            }
+            else
+            {
+                dihedralAngle -= PI;
+            }
+            cosPhi                  = -cosPhi;
+         //   printf("%4d: %9.4f %9.4f\n", pos1, dihedralAngle, cosPhi);
+            float4 dihedral1        = cSim.pRbDihedralParameter1[pos1];
+            float2 dihedral2        = cSim.pRbDihedralParameter2[pos1];
+            float cosFactor         = cosPhi;
+            float dEdAngle          = -dihedral1.y;
+
+/* E */     float rb_energy         = dihedral1.x;
+            rb_energy              += dihedral1.y * cosFactor;
+        //    printf("%4d - 1: %9.4f %9.4f\n", pos1, dEdAngle, 1.0f);
+            dEdAngle               -= 2.0f * dihedral1.z * cosFactor;
+       //     printf("%4d - 2: %9.4f %9.4f\n", pos1, dEdAngle, cosFactor);
+            cosFactor              *= cosPhi;
+            dEdAngle               -= 3.0f * dihedral1.w * cosFactor;
+            rb_energy              += dihedral1.z * cosFactor;
+    //       printf("%4d - 3: %9.4f %9.4f\n", pos1, dEdAngle, cosFactor);
+            cosFactor              *= cosPhi;
+            dEdAngle               -= 4.0f * dihedral2.x * cosFactor;
+            rb_energy              += dihedral1.w * cosFactor;
+  //         printf("%4d - 4: %9.4f %9.4f\n", pos1, dEdAngle, cosFactor);
+            cosFactor              *= cosPhi;
+            dEdAngle               -= 5.0f * dihedral2.y * cosFactor;
+            rb_energy              += dihedral2.x * cosFactor;
+            rb_energy              += dihedral2.y * cosFactor * cosPhi;
+/* E */     energy                 += rb_energy;
+ //           printf("%4d - 5: %9.4f %9.4f\n", pos1, dEdAngle, cosFactor);
+            dEdAngle               *= sin(dihedralAngle);
+//            printf("%4d - f: %9.4f\n", pos1, dEdAngle);
+
+            float normCross1        = DOT3(cp0, cp0);
+            float normBC            = sqrt(DOT3(A->v1, A->v1));
+            float4 ff;
+            ff.x                    = (-dEdAngle * normBC) / normCross1;
+            float normCross2        = DOT3(cp1, cp1);
+            ff.w                    = (dEdAngle * normBC) / normCross2;
+            float dp                = 1.0f / DOT3(A->v1, A->v1);
+            ff.y                    = DOT3(A->v0, A->v1) * dp;
+            ff.z                    = DOT3(A->v2, A->v1) * dp;
+            int4  atom2             = cSim.pRbDihedralID2[pos1];
+            float3 internalF0;
+            float3 internalF3;
+            float3 s;
+
+//            printf("%4d: %9.4f %9.4f %9.4f %9.4f\n", pos1, ff.x, ff.y, ff.z, ff.w);
+            unsigned int offset                 = atom1.x + atom2.x * cSim.stride;
+            float4 force                        = cSim.pForce4[offset];
+            internalF0.x                        = ff.x * cp0.x;
+            force.x                            += internalF0.x;
+            internalF0.y                        = ff.x * cp0.y;
+            force.y                            += internalF0.y;
+            internalF0.z                        = ff.x * cp0.z;
+            force.z                            += internalF0.z;
+            cSim.pForce4[offset]                = force;
+
+ //           printf("%4d - 0: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
+            offset                              = atom1.w + atom2.w * cSim.stride;
+            force                               = cSim.pForce4[offset];
+            internalF3.x                        = ff.w * cp1.x;
+            force.x                            += internalF3.x;
+            internalF3.y                        = ff.w * cp1.y;
+            force.y                            += internalF3.y;
+            internalF3.z                        = ff.w * cp1.z;
+            force.z                            += internalF3.z;
+            cSim.pForce4[offset]                = force;
+
+   //         printf("%4d - 3: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
+            s.x                                 = ff.y * internalF0.x - ff.z * internalF3.x;
+            s.y                                 = ff.y * internalF0.y - ff.z * internalF3.y;
+            s.z                                 = ff.y * internalF0.z - ff.z * internalF3.z;
+            offset                              = atom1.y + atom2.y * cSim.stride;
+            force                               = cSim.pForce4[offset];
+            force.x                            += -internalF0.x + s.x;
+            force.y                            += -internalF0.y + s.y;
+            force.z                            += -internalF0.z + s.z;
+            cSim.pForce4[offset]                = force;
+     //       printf("%4d - 1: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
+            offset                              = atom1.z + atom2.z * cSim.stride;
+            force                               = cSim.pForce4[offset];
+            force.x                            += -internalF3.x - s.x;
+            force.y                            += -internalF3.y - s.y;
+            force.z                            += -internalF3.z - s.z;
+            cSim.pForce4[offset]                = force;
+     //       printf("%4d - 2: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
+        }         
+        pos += blockDim.x * gridDim.x;
+    }   
+#endif
+
+    if (cSim.nonbondedMethod == NO_CUTOFF)
+    {
+        while (pos < feSim.LJ14_offset)
+        {
+            //unsigned int pos1       = pos - feSim.rb_dihedral_offset;
+            unsigned int pos1       = pos;
+            if (pos1 < feSim.LJ14s)
+            {
+                int4 atom               = feSim.pLJ14ID[pos1];
+                float4 LJ14             = feSim.pLJ14Parameter[pos1];
+                float4 a1               = cSim.pPosq[atom.x];
+                float4 a2               = cSim.pPosq[atom.y];
+                float3 d;
+                d.x                     = a1.x - a2.x;
+                d.y                     = a1.y - a2.y;
+                d.z                     = a1.z - a2.z;
+                float r2                = DOT3(d, d);
+                float inverseR          = 1.0f / sqrt(r2);
+#ifdef USE_SOFTCORE_LJ
+                float CDLJ_energy       = 0.0f;
+                float dEdR              = getSoftCoreLJ( r2, LJ14.y, LJ14.x, LJ14.w, LJ14.w, &CDLJ_energy );
+                energy                 += CDLJ_energy;
+#else
+                float sig2              = inverseR * LJ14.y;
+                sig2                   *= sig2;
+                float sig6              = sig2 * sig2 * sig2;
+                float dEdR              = LJ14.x * (12.0f * sig6 - 6.0f) * sig6;
+                energy                 += LJ14.x * (sig6 - 1.0f) * sig6;
+#endif
+                energy                 += LJ14.z * inverseR;
+                dEdR                   += LJ14.z * inverseR;
+                dEdR                   *= inverseR * inverseR;
+                unsigned int offsetA    = atom.x + atom.z * cSim.stride;
+                unsigned int offsetB    = atom.y + atom.w * cSim.stride;
+                float4 forceA           = cSim.pForce4[offsetA];
+                float4 forceB           = cSim.pForce4[offsetB];
+                d.x                    *= dEdR;
+                d.y                    *= dEdR;
+                d.z                    *= dEdR;
+                forceA.x               += d.x;
+                forceA.y               += d.y;
+                forceA.z               += d.z;
+                forceB.x               -= d.x;
+                forceB.y               -= d.y;
+                forceB.z               -= d.z;
+                cSim.pForce4[offsetA]   = forceA;
+                cSim.pForce4[offsetB]   = forceB;
+            }
+            pos                    += blockDim.x * gridDim.x;
+        }
+    }
+    else if (cSim.nonbondedMethod == CUTOFF)
+    {
+        float LJ14_energy;
+        while (pos < feSim.LJ14_offset)
+        {
+            //unsigned int pos1       = pos - feSim.rb_dihedral_offset;
+            unsigned int pos1       = pos;
+            if (pos1 < feSim.LJ14s)
+            {
+                int4 atom               = feSim.pLJ14ID[pos1];
+                float4 LJ14             = feSim.pLJ14Parameter[pos1];
+                float4 a1               = cSim.pPosq[atom.x];
+                float4 a2               = cSim.pPosq[atom.y];
+                float3 d;
+                d.x                     = a1.x - a2.x;
+                d.y                     = a1.y - a2.y;
+                d.z                     = a1.z - a2.z;
+                float r2                = DOT3(d, d);
+                float inverseR          = 1.0f / sqrt(r2);
+#ifdef USE_SOFTCORE_LJ
+                float dEdR              = getSoftCoreLJ( r2, LJ14.y, LJ14.x, LJ14.w, LJ14.w, &LJ14_energy);
+#else
+                float sig2              = inverseR * LJ14.y;
+                sig2                   *= sig2;
+                float sig6              = sig2 * sig2 * sig2;
+                float dEdR              = LJ14.x * (12.0f * sig6 - 6.0f) * sig6;                
+                /* E */
+                LJ14_energy             = LJ14.x * (sig6 - 1.0f) * sig6;
+#endif
+                LJ14_energy            += LJ14.z * (inverseR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
+                dEdR                   += LJ14.z * (inverseR - 2.0f * cSim.reactionFieldK * r2);
+                dEdR                   *= inverseR * inverseR;
+                if (r2 > cSim.nonbondedCutoffSqr)
+                {                   
+                    dEdR = 0.0f;
+                    /* E */
+                    LJ14_energy = 0.0f;
+                }
+                /* E */
+                energy                 += LJ14_energy;
+ 
+                unsigned int offsetA    = atom.x + atom.z * cSim.stride;
+                unsigned int offsetB    = atom.y + atom.w * cSim.stride;
+                float4 forceA           = cSim.pForce4[offsetA];
+                float4 forceB           = cSim.pForce4[offsetB];
+                d.x                    *= dEdR;
+                d.y                    *= dEdR;
+                d.z                    *= dEdR;
+                forceA.x               += d.x;
+                forceA.y               += d.y;
+                forceA.z               += d.z;
+                forceB.x               -= d.x;
+                forceB.y               -= d.y;
+                forceB.z               -= d.z;
+                cSim.pForce4[offsetA]   = forceA;
+                cSim.pForce4[offsetB]   = forceB;
+            }
+            pos                    += blockDim.x * gridDim.x;
+        }
+    }
+    else if (cSim.nonbondedMethod == PERIODIC)
+    {
+        float LJ14_energy;
+        while (pos < feSim.LJ14_offset)
+        {
+            //unsigned int pos1       = pos - feSim.rb_dihedral_offset;
+            unsigned int pos1       = pos;
+            if (pos1 < feSim.LJ14s)
+            {
+                int4 atom               = feSim.pLJ14ID[pos1];
+                float4 LJ14             = feSim.pLJ14Parameter[pos1];
+                float4 a1               = cSim.pPosq[atom.x];
+                float4 a2               = cSim.pPosq[atom.y];
+                float3 d;
+                d.x                     = a1.x - a2.x;
+                d.y                     = a1.y - a2.y;
+                d.z                     = a1.z - a2.z;
+                d.x                     -= floor(d.x/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
+                d.y                     -= floor(d.y/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
+                d.z                     -= floor(d.z/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
+                float r2                = DOT3(d, d);
+                float inverseR          = 1.0f / sqrt(r2);
+#ifdef USE_SOFTCORE_LJ
+                float dEdR              = getSoftCoreLJ( r2, LJ14.y, LJ14.x, LJ14.w, LJ14.w, &LJ14_energy);
+#else
+                float sig2              = inverseR * LJ14.y;
+                sig2                   *= sig2;
+                float sig6              = sig2 * sig2 * sig2;
+                float dEdR              = LJ14.x * (12.0f * sig6 - 6.0f) * sig6;
+                /* E */
+                LJ14_energy             = LJ14.x * (sig6 - 1.0f) * sig6;
+#endif
+                LJ14_energy            += LJ14.z * (inverseR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
+
+                dEdR                   += LJ14.z * (inverseR - 2.0f * cSim.reactionFieldK * r2);
+                dEdR                   *= inverseR * inverseR;
+                if (r2 > cSim.nonbondedCutoffSqr)
+                {
+                    dEdR = 0.0f;
+                    /* E */
+                    LJ14_energy = 0.0f;
+                }
+                /* E */
+                energy                 += LJ14_energy;
+
+                unsigned int offsetA    = atom.x + atom.z * cSim.stride;
+                unsigned int offsetB    = atom.y + atom.w * cSim.stride;
+                float4 forceA           = cSim.pForce4[offsetA];
+                float4 forceB           = cSim.pForce4[offsetB];
+                d.x                    *= dEdR;
+                d.y                    *= dEdR;
+                d.z                    *= dEdR;
+                forceA.x               += d.x;
+                forceA.y               += d.y;
+                forceA.z               += d.z;
+                forceB.x               -= d.x;
+                forceB.y               -= d.y;
+                forceB.z               -= d.z;
+                cSim.pForce4[offsetA]   = forceA;
+                cSim.pForce4[offsetB]   = forceB;
+            }
+            pos                    += blockDim.x * gridDim.x;
+        }
+    }
+    cSim.pEnergy[blockIdx.x * blockDim.x + threadIdx.x] += energy;
+}
+
+/*
+ * Build the host/device tables describing softcore LJ 1-4 pairs and register
+ * them with the local-forces kernel through SetCalculateLocalSoftcoreSim().
+ *
+ * Per pair i the tables hold:
+ *   ID.x, ID.y        atom1[i], atom2[i]
+ *   ID.z, ID.w        value of gpu->pOutputBufferCounter[] for each atom, read
+ *                     before the counter is post-incremented (the kernel in this
+ *                     file addresses forces as atom + z * cSim.stride)
+ *   Parameter.x       c6^2 / c12               (0 when c12 == 0)
+ *   Parameter.y       (c12 / c6)^(1/6)         (1 when c12 == 0)
+ *   Parameter.z       epsfac * fudge * q1 * q2
+ *   Parameter.w       softcoreLJLambdaArray[i]
+ *
+ * The number of pairs is atom1.size(); every other vector is indexed up to
+ * that count without a length check, so callers must pass equally sized
+ * vectors.
+ *
+ * Returns a heap-allocated GpuLJ14Softcore; ownership passes to the caller.
+ */
+extern "C"
+GpuLJ14Softcore* gpuSetLJ14SoftcoreParameters(gpuContext gpu, float epsfac, float fudge, const std::vector<int>& atom1, const std::vector<int>& atom2,
+                                              const std::vector<float>& c6, const std::vector<float>& c12, const std::vector<float>& q1,
+                                              const std::vector<float>& q2, const std::vector<float>& softcoreLJLambdaArray)
+{
+    int LJ14s                                   = static_cast<int>(atom1.size());
+    float scale                                 = epsfac * fudge;
+
+    GpuLJ14Softcore* gpuLJ14Softcore            = new GpuLJ14Softcore();
+    gpuLJ14Softcore->feSim.LJ14s                = LJ14s;
+
+    CUDAStream<int4>* psLJ14ID                  = new CUDAStream<int4>(LJ14s, 1, "LJ14SoftcoreID");
+    gpuLJ14Softcore->psLJ14SoftcoreID           = psLJ14ID;
+    gpuLJ14Softcore->feSim.pLJ14ID              = psLJ14ID->_pDevStream[0];
+
+    CUDAStream<float4>* psLJ14Parameter         = new CUDAStream<float4>(LJ14s, 1, "LJ14SoftcoreParameter");
+    gpuLJ14Softcore->psLJ14SoftcoreParameter    = psLJ14Parameter;
+    gpuLJ14Softcore->feSim.pLJ14Parameter       = psLJ14Parameter->_pDevStream[0];
+    gpuLJ14Softcore->feSim.LJ14_offset          = LJ14s;
+
+    for (int i = 0; i < LJ14s; i++)
+    {
+        (*psLJ14ID)[i].x          = atom1[i];
+        (*psLJ14ID)[i].y          = atom2[i];
+
+        // Reserve one output-buffer slot per atom of the pair.
+        psLJ14ID->_pSysData[i].z  = gpu->pOutputBufferCounter[psLJ14ID->_pSysData[i].x]++;
+        psLJ14ID->_pSysData[i].w  = gpu->pOutputBufferCounter[psLJ14ID->_pSysData[i].y]++;
+
+        float p0, p1, p2, p3;
+        if (c12[i] == 0.0f)
+        {
+            // No repulsive term: avoid dividing by zero and disable the LJ part.
+            p0 = 0.0f;
+            p1 = 1.0f;
+        }
+        else
+        {
+            // NOTE(review): c6[i] == 0 with c12[i] != 0 still divides by zero
+            // here and produces a non-finite p1 -- confirm callers never pass it.
+            p0 = c6[i] * c6[i] / c12[i];
+            p1 = pow(c12[i] / c6[i], 1.0f / 6.0f);
+        }
+        p2 = scale * q1[i] * q2[i];
+        p3 = softcoreLJLambdaArray[i];
+        (*psLJ14Parameter)[i].x = p0;
+        (*psLJ14Parameter)[i].y = p1;
+        (*psLJ14Parameter)[i].z = p2;
+        (*psLJ14Parameter)[i].w = p3;
+
+#if (DUMP_PARAMETERS == 1)
+        // The dump reads i and p0..p3, which only exist inside this loop body;
+        // it therefore has to live here rather than after the closing brace.
+        cout << 
+            i << " " <<
+            (*psLJ14ID)[i].x << " " <<
+            (*psLJ14ID)[i].y << " " <<
+            (*psLJ14ID)[i].z << " " <<
+            (*psLJ14ID)[i].w << " " <<
+            (*psLJ14Parameter)[i].x << " " <<
+            (*psLJ14Parameter)[i].y << " " <<
+            (*psLJ14Parameter)[i].z << " " <<
+            (*psLJ14Parameter)[i].w << " " <<
+            p0 << " " << 
+            p1 << " " << 
+            p2 << " " << 
+            p3 << " " << 
+            endl;
+#endif
+    }
+
+    // Push both tables to the device, then publish the pointers/counts.
+    psLJ14ID->Upload();
+    psLJ14Parameter->Upload();
+    SetCalculateLocalSoftcoreSim( gpuLJ14Softcore );
+
+    return gpuLJ14Softcore;
+}
+
+void kCalculateLocalSoftcoreForces(gpuContext gpu)
+{
+    // Dynamic shared memory request: one Vectors record for every thread
+    // of a block.
+    const size_t sharedMemoryBytes = gpu->sim.localForces_threads_per_block * sizeof(Vectors);
+
+    // Launch with the block count and local-forces block size stored in gpu->sim.
+    kCalculateLocalSoftcoreForces_kernel<<<gpu->sim.blocks, gpu->sim.localForces_threads_per_block, sharedMemoryBytes>>>();
+    LAUNCHERROR("kCalculateLocalSoftcoreForces");
+}
+
--- a/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateNonbondedSoftcore.cu
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateNonbondedSoftcore.cu
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+#include "GpuNonbondedSoftcore.h"
+#include "GpuFreeEnergyCudaKernels.h"
+
+// Structure holding the pointer to the array of per-particle softcore LJ lambdas.
+// The same type is used for the host-side instance (feSim) and for the constant-memory instance (feSimDev) declared below.
+struct cudaFreeEnergySimulationNonBonded {
+    float* pParticleSoftCoreLJLambda;   // assigned in SetCalculateCDLJSoftcoreSupplementarySim(); presumably a device address -- TODO confirm against the caller
+};
+struct cudaFreeEnergySimulationNonBonded feSim;   // host-side copy; it is the source of the cudaMemcpyToSymbol() into feSimDev
+// 
+// Constant-memory symbols read by the kernels compiled into this translation unit.
+// cSim is filled by SetCalculateCDLJSoftcoreGpuSim(); feSimDev is filled by SetCalculateCDLJSoftcoreSupplementarySim().
+static __constant__ cudaGmxSimulation cSim;
+static __constant__ cudaFreeEnergySimulationNonBonded feSimDev;
+
+// write address of structs to devices
+
+/*
+ * Copy the host-side simulation descriptor gpu->sim into the cSim
+ * constant-memory symbol of this translation unit, then log the addresses
+ * and the size that was copied to stderr.
+ *
+ * A failed copy is reported through RTERROR.
+ */
+void SetCalculateCDLJSoftcoreGpuSim( gpuContext gpu )
+{
+    cudaError_t status;
+    status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
+    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
+
+    // %p expects void* and %u expects unsigned int; sizeof yields size_t, which is
+    // wider than unsigned int on 64-bit hosts, so convert the arguments explicitly.
+    (void) fprintf( stderr, "SetCalculateCDLJSoftcoreGpuSim gpu=%p cSim=%p sizeof=%u\n",
+                    static_cast<void*>(gpu), static_cast<void*>(&gpu->sim),
+                    static_cast<unsigned int>(sizeof(cudaGmxSimulation)) );
+    fflush( stderr );
+}
+
+void SetCalculateCDLJSoftcoreSupplementarySim( float* gpuParticleSoftCoreLJLambda)
+{
+    // Record the lambda-array pointer in the host-side struct first ...
+    feSim.pParticleSoftCoreLJLambda = gpuParticleSoftCoreLJLambda;
+
+    // ... then mirror the whole struct into the feSimDev constant-memory symbol.
+    cudaError_t copyStatus = cudaMemcpyToSymbol(feSimDev, &feSim, sizeof(feSim));
+    RTERROR(copyStatus, "cudaMemcpyToSymbol: SetCalculateCDLJSoftcoreSupplementarySim");
+
+    fprintf( stderr, "SetCalculateCDLJSoftcoreSupplementarySim\n" );
+}
+
+void GetCalculateCDLJSoftcoreForcesSim(float* gpuParticleSoftCoreLJLambda)
+{
+// Currently a no-op: the argument is ignored and nothing is read back. The disabled read-back is kept below for reference; it names particleSoftCoreLJLambdaDev, which is not declared in the visible part of this file.
+//    cudaError_t status; status = cudaMemcpyFromSymbol(gpuParticleSoftCoreLJLambda, particleSoftCoreLJLambdaDev, sizeof(float*));
+//    RTERROR(status, "cudaMemcpyFromSymbol: GetCalculateCDLJSoftcoreForcesSim failed");
+}
+
+// create, initialize and enter SoftCoreLJLambda values
+// return handle to GpuNonbondedSoftcore object
+
+/*
+ * Load per-particle nonbonded softcore parameters into the GPU context and
+ * return a newly allocated GpuNonbondedSoftcore handle.
+ *
+ *   gpu                     context whose psPosq4 / psSigEps2 streams and sim fields are filled
+ *   epsfac                  stored in gpu->sim.epsfac
+ *   atom                    not used by this function
+ *   c6, c12                 per-particle LJ coefficients; c6.size() defines the particle count
+ *   q                       per-particle value stored in psPosq4[i].w
+ *   softcoreLJLambdaArray   per-particle softcore lambda; the minimum is also recorded
+ *   symbol                  optional per-particle symbols; entries beyond its length are skipped
+ *   exclusions              forwarded to setExclusions() when there is at least one particle
+ *   method                  stored in gpu->sim.nonbondedMethod
+ *
+ * For each particle psSigEps2[i] receives
+ *   x = 0.5 * (c12/c6)^(1/6)   and   y = c6 * sqrt(1/c12)
+ * when both c6 and c12 are positive, otherwise x = 0.5 and y = 0.
+ *
+ * c12, q and softcoreLJLambdaArray are indexed up to c6.size() - 1 without a
+ * length check. The returned object is owned by the caller; see
+ * gpuDeleteNonbondedSoftcoreParameters().
+ */
+extern "C"
+GpuNonbondedSoftcore* gpuSetNonbondedSoftcoreParameters(gpuContext gpu, float epsfac, const std::vector<int>& atom, const std::vector<float>& c6,
+                                                        const std::vector<float>& c12, const std::vector<float>& q,
+                                                        const std::vector<float>& softcoreLJLambdaArray, const std::vector<char>& symbol,
+                                                        const std::vector<std::vector<int> >& exclusions, CudaNonbondedMethod method)
+{
+    unsigned int numberOfParticles     = static_cast<unsigned int>(c6.size());
+    gpu->sim.epsfac                    = epsfac;
+    gpu->sim.nonbondedMethod           = method;
+    if (numberOfParticles > 0)
+        setExclusions(gpu, exclusions);
+    
+    // create gpuNonbondedSoftcore
+
+    GpuNonbondedSoftcore* gpuNonbondedSoftcore = new GpuNonbondedSoftcore();
+    gpuNonbondedSoftcore->initializeParticleSoftCoreLJLambda( numberOfParticles );
+    float minSoftcore                          = 1.0e+10f;
+    for (unsigned int i = 0; i < numberOfParticles; i++)
+    {
+            float p0               = q[i];
+
+            // track min softcore value
+
+            float softcoreLJLambda = softcoreLJLambdaArray[i];
+            if( minSoftcore > softcoreLJLambda ){
+                minSoftcore = softcoreLJLambda;
+            }
+            gpuNonbondedSoftcore->setParticleSoftCoreLJLambda( i, softcoreLJLambda );
+
+            // Defaults apply when either coefficient is non-positive, which also
+            // keeps the divisions below away from zero.
+            float p1 = 0.5f, p2 = 0.0f;               
+            if ((c6[i] > 0.0f) && (c12[i] > 0.0f))
+            {
+                p1 = 0.5f * pow(c12[i] / c6[i], 1.0f / 6.0f);
+                p2 = c6[i] * sqrt(1.0f / c12[i]);
+            }
+
+            // Only copy a symbol that actually exists; a symbol vector shorter
+            // than the particle count must not be read past its end.
+            if (i < symbol.size())
+                gpu->pAtomSymbol[i] = symbol[i];
+
+            (*gpu->psPosq4)[i].w          = p0;
+            (*gpu->psSigEps2)[i].x        = p1;
+            (*gpu->psSigEps2)[i].y        = p2;
+    }
+    gpuNonbondedSoftcore->setSoftCoreLJLambda( minSoftcore );
+
+    // Dummy out extra atom data
+    for (unsigned int i = numberOfParticles; i < gpu->sim.paddedNumberOfAtoms; i++)
+    {
+        (*gpu->psPosq4)[i].x                = 100000.0f + i * 10.0f;
+        (*gpu->psPosq4)[i].y                = 100000.0f + i * 10.0f;
+        (*gpu->psPosq4)[i].z                = 100000.0f + i * 10.0f;
+        (*gpu->psPosq4)[i].w                = 0.0f;
+        (*gpu->psSigEps2)[i].x              = 0.0f;
+        (*gpu->psSigEps2)[i].y              = 0.0f;
+    }
+
+    // NOTE(review): the dump below is forced on for the rest of this
+    // translation unit by redefining DUMP_PARAMETERS here.
+#undef DUMP_PARAMETERS
+#define DUMP_PARAMETERS 1
+#if (DUMP_PARAMETERS == 1)
+    // One conversion specifier per argument: particle count, padded count, epsfac, method.
+    (void) fprintf( stderr,"gpuSetNonbondedSoftcoreParameters: %5u padded=%5u epsfac=%14.7e method=%d\n",
+                    numberOfParticles, static_cast<unsigned int>(gpu->sim.paddedNumberOfAtoms), epsfac, static_cast<int>(method) );
+
+    // Print the leading entries, then jump ahead so that only the trailing
+    // entries of a long table are printed as well.
+    const unsigned int maxPrint = 31;
+    for (unsigned int ii = 0; ii < gpu->sim.paddedNumberOfAtoms; ii++){
+        (void) fprintf( stderr,"%6u x[%14.7e %14.7e %14.7e %14.7e] sig[%14.7e %14.7e]\n",
+                        ii, (*gpu->psPosq4)[ii].x, (*gpu->psPosq4)[ii].y, (*gpu->psPosq4)[ii].z, (*gpu->psPosq4)[ii].w,
+                        (*gpu->psSigEps2)[ii].x, (*gpu->psSigEps2)[ii].y );
+        if( ii == maxPrint && ii < gpu->sim.paddedNumberOfAtoms - maxPrint ){
+           ii = gpu->sim.paddedNumberOfAtoms - maxPrint;
+        }
+    }
+#endif
+ 
+    // upload data to board
+
+    gpuNonbondedSoftcore->upload( gpu );
+
+    gpu->psPosq4->Upload();
+    gpu->psSigEps2->Upload();
+
+    return gpuNonbondedSoftcore;
+}
+
+// delete gpuNonbondedSoftcore
+
+extern "C"
+void gpuDeleteNonbondedSoftcoreParameters( void* gpuNonbondedSoftcore)
+{
+    // The handle arrives as an untyped pointer; restore its real type so
+    // that delete is applied to a GpuNonbondedSoftcore object.
+    delete static_cast<GpuNonbondedSoftcore*>(gpuNonbondedSoftcore);
+}
+
+struct Atom {   // Per-thread record: the launcher in this file requests sizeof(Atom) * nonbond_threads_per_block bytes of dynamic shared memory. Field meanings below are inferred from their names -- TODO confirm against kCalculateNonbondedSoftcore.h.
+    float x;                  // presumably position, x component
+    float y;                  // presumably position, y component
+    float z;                  // presumably position, z component
+    float q;                  // presumably the charge-like value kept in psPosq4[i].w
+    float sig;                // presumably the value kept in psSigEps2[i].x
+    float eps;                // presumably the value kept in psSigEps2[i].y
+    float softCoreLJLambda;   // presumably the per-particle softcore LJ lambda
+    float fx;                 // presumably accumulated force, x component
+    float fy;                 // presumably accumulated force, y component
+    float fz;                 // presumably accumulated force, z component
+};
+
+#if 0   // Everything up to the matching #endif is compiled out: the table lookup helper and the plain (non-softcore) N^2 kernel variants.
+texture<float, 1, cudaReadModeElementType> tabulatedErfcRef;
+// fastErfc: scales r by cSim.tabulatedErfcScale, truncates to a table index and linearly interpolates between that entry and the next one of tabulatedErfcRef.
+__device__ float fastErfc(float r)
+{
+    float normalized = cSim.tabulatedErfcScale*r;
+    int index = (int) normalized;
+    float fract2 = normalized-index;   // fractional distance past entry index
+    float fract1 = 1.0f-fract2;        // complementary weight for entry index
+    return fract1*tex1Dfetch(tabulatedErfcRef, index) + fract2*tex1Dfetch(tabulatedErfcRef, index+1);
+}
+// 
+// Include versions of the kernels for N^2 calculations.
+// The kernel header is included twice; METHOD_NAME pastes a different infix (N2, N2ByWarp) between its two arguments each time, and the second pass has USE_OUTPUT_BUFFER_PER_WARP defined.
+#define METHOD_NAME(a, b) a##N2##b
+#include "kCalculateNonbondedSoftcore.h"
+#define USE_OUTPUT_BUFFER_PER_WARP
+#undef METHOD_NAME
+#define METHOD_NAME(a, b) a##N2ByWarp##b
+#include "kCalculateNonbondedSoftcore.h"
+// 
+#endif
+
+// Include versions of the kernels for N^2 calculations with softcore LJ.
+// First pass: USE_SOFTCORE_LJ defined, USE_OUTPUT_BUFFER_PER_WARP undefined, names get the N2SoftcoreLJ infix (the launcher in this file calls kCalculateCDLJSoftcoreN2SoftcoreLJForces_kernel).
+#undef METHOD_NAME
+#define METHOD_NAME(a, b) a##N2SoftcoreLJ##b
+#undef USE_OUTPUT_BUFFER_PER_WARP
+#define USE_SOFTCORE_LJ
+#include "kCalculateNonbondedSoftcore.h"
+// Second pass: same header with USE_OUTPUT_BUFFER_PER_WARP defined and the N2SoftcoreLJByWarp infix. USE_SOFTCORE_LJ is undefined afterwards; USE_OUTPUT_BUFFER_PER_WARP and METHOD_NAME stay defined.
+#define USE_OUTPUT_BUFFER_PER_WARP
+#undef METHOD_NAME
+#define METHOD_NAME(a, b) a##N2SoftcoreLJByWarp##b
+#include "kCalculateNonbondedSoftcore.h"
+#undef USE_SOFTCORE_LJ
+
+// Include versions of the kernels with cutoffs.
+// NOTE: the whole region down to the matching #endif is compiled out, so no cutoff, periodic or Ewald kernel variants are built from this file.
+#if 0
+#undef METHOD_NAME
+#undef USE_OUTPUT_BUFFER_PER_WARP
+#define USE_CUTOFF
+#define METHOD_NAME(a, b) a##Cutoff##b
+#include "kCalculateNonbondedSoftcore.h"
+#include "kFindInteractingBlocks.h"
+#define USE_OUTPUT_BUFFER_PER_WARP
+#undef METHOD_NAME
+#define METHOD_NAME(a, b) a##CutoffByWarp##b
+#include "kCalculateNonbondedSoftcore.h"
+// 
+// Include versions of the kernels with periodic boundary conditions.
+// USE_CUTOFF is not undefined before this point, so these passes would see both USE_CUTOFF and USE_PERIODIC.
+#undef METHOD_NAME
+#undef USE_OUTPUT_BUFFER_PER_WARP
+#define USE_PERIODIC
+#define METHOD_NAME(a, b) a##Periodic##b
+#include "kCalculateNonbondedSoftcore.h"
+#include "kFindInteractingBlocks.h"
+#define USE_OUTPUT_BUFFER_PER_WARP
+#undef METHOD_NAME
+#define METHOD_NAME(a, b) a##PeriodicByWarp##b
+#include "kCalculateNonbondedSoftcore.h"
+// 
+// Include versions of the kernels for Ewald
+// USE_PERIODIC is defined again here without an intervening #undef; USE_EWALD is added on top of it.
+#undef METHOD_NAME
+#undef USE_OUTPUT_BUFFER_PER_WARP
+#define USE_PERIODIC
+#define USE_EWALD
+#define METHOD_NAME(a, b) a##Ewald##b
+#include "kCalculateNonbondedSoftcore.h"
+#define USE_OUTPUT_BUFFER_PER_WARP
+#undef METHOD_NAME
+#define METHOD_NAME(a, b) a##EwaldByWarp##b
+#include "kCalculateNonbondedSoftcore.h"
+// 
+// Reciprocal Space Ewald summation is in a separate kernel
+#include "kCalculateCDLJEwaldFastReciprocal.h"
+// Forward declaration only; no definition of kCalculatePME appears in the visible part of this file.
+void kCalculatePME(gpuContext gpu);
+#endif
+
+void kCalculateCDLJSoftcoreForces(gpuContext gpu )
+{
+
+    //printf("kCalculateCDLJCutoffForces %d\n", gpu->sim.nonbondedMethod); fflush( stdout );
+    switch (gpu->sim.nonbondedMethod)
+    {
+        case NO_CUTOFF:
+           if (gpu->bOutputBufferPerWarp)
+                kCalculateCDLJSoftcoreN2SoftcoreLJByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                         sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit);
+           else
+                   kCalculateCDLJSoftcoreN2SoftcoreLJForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                           sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit );
+//(gpu->sim.pWorkUnit, gpuNonbondedSoftcore->getGpuParticleSoftCoreLJLambda());
+            LAUNCHERROR("kCalculateCDLJSoftcoreN2Forces");
+
+#if 0
+int maxPrint = 31; 
+gpu->psWorkUnit->Download();
+fprintf( stderr, "kCalculateCDLJSoftcoreForces: bOutputBufferPerWarp=%u blks=%u th/blk=%u wu=%u %u shrd=%u\n", gpu->bOutputBufferPerWarp,
+                 gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, gpu->sim.workUnits, gpu->psWorkUnit->_pSysStream[0][0],
+        sizeof(Atom)*gpu->sim.nonbond_threads_per_block );
+
+               gpu->psPosq4->Download();
+
+                (void) fprintf( stderr, "\nkCalculateGBVISoftcoreBornSum: pre BornSum %s Born radii & params\n",
+                               (gpu->bIncludeGBVI ? "GBVI" : "Obc") );
+                for( int ii = 0; ii < gpu->natoms; ii++ ){
+                   (void) fprintf( stderr, "%6d bSum=%14.6e param[%14.6e %14.6e %14.6e] x[%14.6f %14.6f %14.6f %14.6f]\n",
+                                   ii,
+                                   gpu->psBornSum->_pSysStream[0][ii],
+                                   gpu->psGBVIData->_pSysStream[0][ii].x,
+                                   gpu->psGBVIData->_pSysStream[0][ii].y,
+                                   gpu->psGBVIData->_pSysStream[0][ii].z,
+                                   gpu->psPosq4->_pSysStream[0][ii].x, gpu->psPosq4->_pSysStream[0][ii].y,
+                                   gpu->psPosq4->_pSysStream[0][ii].z, gpu->psPosq4->_pSysStream[0][ii].w
+                                 );
+                   if( (ii == maxPrint) && ( ii < (gpu->natoms - maxPrint)) ){
+                      ii = gpu->natoms - maxPrint;
+                   }
+                }
+
+#endif
+#undef GBVI
+
+
+            break;
+#if 0
+        case CUTOFF:
+            kFindBlockBoundsCutoff_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
+            LAUNCHERROR("kFindBlockBoundsCutoff");
+            kFindBlocksWithInteractionsCutoff_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
+            LAUNCHERROR("kFindBlocksWithInteractionsCutoff");
+            compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
+            kFindInteractionsWithinBlocksCutoff_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                    sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
+#if 0
+    static int iteration = 0;
+    if (iteration >= 0)
+    {
+        gpu->psInteractingWorkUnit->Download();
+        gpu->psInteractionCount->Download();
+/*
+    unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
+    unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
+    unsigned int numWorkUnits = cSim.pInteractionCount[0];
+    unsigned int pos = warp*numWorkUnits/totalWarps;
+    unsigned int end = (warp+1)*numWorkUnits/totalWarps;
+*/
+ 
+        printf("# Post kCalculateCDLJCutoffForces %d atoms warps=%d cnt=%u bOutputBufferPerWarp=%d zC=%d\n", 
+                gpu->natoms, ((gpu->sim.nonbond_blocks*gpu->sim.nonbond_threads_per_block)/GRID),
+                gpu->psInteractionCount->_pSysStream[0][0], gpu->bOutputBufferPerWarp,
+                (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block);
+        fflush( stdout );
+        for (int i = 0; i < gpu->psInteractingWorkUnit->_stride; i++)
+        {
+            printf("%5d %u\n", i, gpu->psInteractingWorkUnit->_pSysStream[0][i] );
+            fflush( stdout );
+        }
+    }
+    iteration++;
+#endif
+
+
+            if (gpu->bOutputBufferPerWarp)
+                kCalculateCDLJCutoffByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
+            else
+                kCalculateCDLJCutoffForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
+            LAUNCHERROR("kCalculateCDLJCutoffForces");
+            break;
+        case PERIODIC:
+            kFindBlockBoundsPeriodic_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
+            LAUNCHERROR("kFindBlockBoundsPeriodic");
+            kFindBlocksWithInteractionsPeriodic_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
+            LAUNCHERROR("kFindBlocksWithInteractionsPeriodic");
+            compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
+            kFindInteractionsWithinBlocksPeriodic_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                    sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
+            if (gpu->bOutputBufferPerWarp)
+                kCalculateCDLJPeriodicByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
+            else
+                kCalculateCDLJPeriodicForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
+            LAUNCHERROR("kCalculateCDLJPeriodicForces");
+            break;
+        case EWALD:
+        case PARTICLE_MESH_EWALD:
+            kFindBlockBoundsPeriodic_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
+            LAUNCHERROR("kFindBlockBoundsPeriodic");
+            kFindBlocksWithInteractionsPeriodic_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
+            LAUNCHERROR("kFindBlocksWithInteractionsPeriodic");
+            compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
+            kFindInteractionsWithinBlocksPeriodic_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                    sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
+            LAUNCHERROR("kFindInteractionsWithinBlocksPeriodic");
+            cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+            cudaBindTexture(NULL, &tabulatedErfcRef, gpu->psTabulatedErfc->_pDevData, &channelDesc, gpu->psTabulatedErfc->_length*sizeof(float));
+            if (gpu->bOutputBufferPerWarp)
+                kCalculateCDLJEwaldByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
+            else
+                kCalculateCDLJEwaldForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
+            LAUNCHERROR("kCalculateCDLJEwaldForces");
+            if (gpu->sim.nonbondedMethod == EWALD)
+            {
+                kCalculateEwaldFastCosSinSums_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
+                LAUNCHERROR("kCalculateEwaldFastCosSinSums");
+                kCalculateEwaldFastForces_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
+                LAUNCHERROR("kCalculateEwaldFastForces");
+            }
+            else
+                kCalculatePME(gpu);
+#endif
+    }
+}
--- a/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateNonbondedSoftcore.h
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateNonbondedSoftcore.h
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+/**
+ * This file contains the kernels for evaluating nonbonded forces.  It is included
+ * several times in kCalculateCDLJForces.cu with different #defines to generate
+ * different versions of the kernels.
+ */
+
+#ifdef USE_SOFTCORE_LJ
+#include "kSoftcoreLJ.h"
+#endif
+
+/* Cuda compiler on Windows does not recognize "static const float" values */
+#define LOCAL_HACK_PI 3.1415926535897932384626433832795f
+
+/**
+ * Nonbonded force and energy kernel.  The kernel name is generated by METHOD_NAME and the
+ * body is specialised by whichever of USE_SOFTCORE_LJ, USE_CUTOFF, USE_PERIODIC, USE_EWALD
+ * and USE_OUTPUT_BUFFER_PER_WARP are defined when this file is included.
+ *
+ * Threads are handled in groups of GRID (called "warp" in the code).  Each group takes a
+ * contiguous slice of the cSim.pInteractionCount[0] entries of workUnit; every entry names
+ * an x tile and a y tile of GRID atoms each.  Diagonal entries (x == y) accumulate forces
+ * for the x atoms only; off-diagonal entries accumulate forces for both tiles.  Forces are
+ * written to cSim.pForce4a and each thread adds its energy total to cSim.pEnergy.
+ *
+ * Shared memory is dynamic: sA is indexed with one Atom per thread, and with USE_CUTOFF a
+ * float3 per thread is addressed starting at sA[cSim.nonbond_threads_per_block].
+ *
+ * The shared-memory exchanges and the pairwise reduction below contain no explicit barriers.
+ * NOTE(review): this presumably relies on all GRID threads of a group executing in
+ * lockstep -- confirm for the target hardware.
+ *
+ * Statements preceded by an "E" marker comment are the energy bookkeeping.
+ *
+ * @param workUnit  packed work units; bit 0 is the exclusion flag, bits 2-16 the y tile
+ *                  index and bits 17 and up the x tile index (bit 1 is not read here)
+ *
+ * A previous signature also took "float* softCoreLJLambdaArray"; the per-particle lambda
+ * is now read from feSimDev.pParticleSoftCoreLJLambda.
+ */
+__global__ void METHOD_NAME(kCalculateCDLJSoftcore, Forces_kernel)(unsigned int* workUnit )
+{
+    extern __shared__ Atom sA[];    // dynamically sized; indexed by threadIdx.x below
+    unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;    // number of GRID-sized thread groups according to cSim
+    unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;    // index of this thread's group
+    unsigned int numWorkUnits = cSim.pInteractionCount[0];    // number of work units to distribute
+    unsigned int pos = warp*numWorkUnits/totalWarps;    // first work unit of this group's slice
+    unsigned int end = (warp+1)*numWorkUnits/totalWarps;    // one past the last work unit of the slice
+    float CDLJ_energy;    // energy of the pair currently being evaluated
+    float energy = 0.0f;    // running energy total of this thread
+#ifdef USE_CUTOFF
+    float3* tempBuffer = (float3*) &sA[cSim.nonbond_threads_per_block];    // per-thread float3 scratch placed after the Atom entries
+#endif
+
+#ifdef USE_EWALD
+    const float TWO_OVER_SQRT_PI = 2.0f/sqrt(LOCAL_HACK_PI);
+#endif
+
+    unsigned int lasty = 0xFFFFFFFF;    // y tile last loaded into sA by the off-diagonal branch; 0xFFFFFFFF before any load
+    while (pos < end)
+    {
+
+        // Extract cell coordinates from appropriate work unit
+        unsigned int x = workUnit[pos];
+        unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;    // bits 2-16: y tile index, scaled to its first atom index
+        bool bExclusionFlag = (x & 0x1);    // bit 0: selects the exclusion-aware loops
+        x = (x >> 17) << GRIDBITS;    // bits 17 and up: x tile index, scaled to its first atom index
+        float4      apos;   // Local atom x, y, z, q
+        float3      af;     // Local atom fx, fy, fz
+        float dx;
+        float dy;
+        float dz;
+        float r2;
+        float invR;
+        float sig;
+        float eps;
+        float dEdR;
+        unsigned int tgx = threadIdx.x & (GRID - 1);    // position of this thread within its group
+        unsigned int tbx = threadIdx.x - tgx;    // block-local index of the group's first thread
+        unsigned int tj = tgx;    // rotating partner index used by the off-diagonal loops
+        Atom* psA = &sA[tbx];    // this group's window of the shared atom array
+        unsigned int i               = x + tgx;    // atom of the x tile owned by this thread
+        apos                         = cSim.pPosq[i];
+        float2 a                     = cSim.pAttr[i];    // a.x feeds sig and a.y feeds eps below
+        // Earlier sources for the lambda, kept for reference:
+        //float softCoreLJLambda       = cSim.pSoftCoreLJLambda[i];
+        //float softCoreLJLambda       = softCoreLJLambdaArray[i];
+        float softCoreLJLambda       = feSimDev.pParticleSoftCoreLJLambda[i];
+        af.x                         = 0.0f;
+        af.y                         = 0.0f;
+        af.z                         = 0.0f;
+        if (x == y) // Handle diagonals uniquely at 50% efficiency
+        {
+            // Read fixed atom data into registers and GRF
+            // NOTE(review): this overwrites sA without updating lasty; presumably safe because of the work-unit ordering -- confirm
+            sA[threadIdx.x].x                     = apos.x;
+            sA[threadIdx.x].y                     = apos.y;
+            sA[threadIdx.x].z                     = apos.z;
+            sA[threadIdx.x].q                     = apos.w;
+            sA[threadIdx.x].sig                   = a.x;
+            sA[threadIdx.x].eps                   = a.y;
+            sA[threadIdx.x].softCoreLJLambda      = softCoreLJLambda;
+            apos.w                               *= cSim.epsfac;    // scale the local charge after the unscaled value was stored in sA
+            if (!bExclusionFlag)
+            {
+                // NOTE(review): no guard for the self pair (j == tgx, r2 == 0) in this loop; presumably diagonal tiles always carry the exclusion flag -- confirm
+                for (unsigned int j = 0; j < GRID; j++)
+                {
+                    dx              = psA[j].x - apos.x;
+                    dy              = psA[j].y - apos.y;
+                    dz              = psA[j].z - apos.z;
+#ifdef USE_PERIODIC
+                    dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
+                    dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
+                    dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
+#endif
+                    r2              = dx * dx + dy * dy + dz * dz;
+                    invR            = 1.0f / sqrt(r2);
+                    sig             = a.x + psA[j].sig;    // pair value is the sum of the two per-atom values
+                    eps             = a.y * psA[j].eps;    // pair value is the product of the two per-atom values
+#ifdef USE_SOFTCORE_LJ
+                    dEdR            = getSoftCoreLJ( r2, sig, eps, softCoreLJLambda, psA[j].softCoreLJLambda, &CDLJ_energy );
+#else
+                    float sig2      = invR * sig;
+                    sig2           *= sig2;
+                    float sig6      = sig2 * sig2 * sig2;
+                    dEdR            = eps * (12.0f * sig6 - 6.0f) * sig6;
+	          	     CDLJ_energy     = eps * (sig6 - 1.0f) * sig6;
+#endif
+
+#ifdef USE_CUTOFF
+    #ifdef USE_EWALD
+                    float r         = sqrt(r2);
+                    float alphaR    = cSim.alphaEwald * r;
+                    float erfcAlphaR = fastErfc(alphaR);
+                    dEdR           += apos.w * psA[j].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI );
+		              /* E */
+                    CDLJ_energy    += apos.w * psA[j].q * invR * erfcAlphaR;
+    #else
+                    dEdR           += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2);    // cutoff without Ewald: charge term uses cSim.reactionFieldK
+		              /* E */
+		              CDLJ_energy    += apos.w * psA[j].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
+    #endif
+#else
+                    dEdR           += apos.w * psA[j].q * invR;
+		              /* E */
+		              CDLJ_energy    += apos.w * psA[j].q * invR;
+#endif
+                    dEdR           *= invR * invR;    // dEdR is multiplied into (dx, dy, dz) below
+#ifdef USE_CUTOFF
+                    if (r2 > cSim.nonbondedCutoffSqr)    // pair beyond the cutoff contributes nothing
+                    {
+                        dEdR = 0.0f;
+                        /* E */
+                        CDLJ_energy = 0.0f;
+                    }
+#endif
+		              /* E: half weight, because in a diagonal tile each pair is visited once from each of its two atoms */
+		              energy         += 0.5f*CDLJ_energy;
+                    dx             *= dEdR;
+                    dy             *= dEdR;
+                    dz             *= dEdR;
+                    af.x           -= dx;
+                    af.y           -= dy;
+                    af.z           -= dz;
+                }
+            }
+            else  // bExclusion
+            {
+                unsigned int xi   = x>>GRIDBITS;    // tile index of x
+                unsigned int cell          = xi+xi*cSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;    // index of this tile pair in cSim.pExclusionIndex
+                unsigned int excl = cSim.pExclusion[cSim.pExclusionIndex[cell]+tgx];    // per-thread mask; bit 0 is tested and then shifted out each iteration
+                for (unsigned int j = 0; j < GRID; j++)
+                {
+                    dx              = psA[j].x - apos.x;
+                    dy              = psA[j].y - apos.y;
+                    dz              = psA[j].z - apos.z;
+#ifdef USE_PERIODIC
+                    dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
+                    dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
+                    dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
+#endif
+                    r2              = dx * dx + dy * dy + dz * dz;
+                    invR            = 1.0f / sqrt(r2);
+                    sig             = a.x + psA[j].sig;
+                    eps             = a.y * psA[j].eps;
+#ifdef USE_SOFTCORE_LJ
+                    dEdR            = getSoftCoreLJ( r2, sig, eps, softCoreLJLambda, psA[j].softCoreLJLambda, &CDLJ_energy );
+#else
+                    float sig2      = invR * sig;
+                    sig2           *= sig2;
+                    float sig6      = sig2 * sig2 * sig2;
+                    dEdR            = eps * (12.0f * sig6 - 6.0f) * sig6;
+		              /* E */
+                    CDLJ_energy     = eps * (sig6 - 1.0f) * sig6;
+#endif
+
+#ifdef USE_CUTOFF
+    #ifdef USE_EWALD
+                    float r         = sqrt(r2);
+                    float alphaR    = cSim.alphaEwald * r;
+                    float erfcAlphaR = fastErfc(alphaR);
+                    dEdR           += apos.w * psA[j].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
+                    /* E */
+		              CDLJ_energy    += apos.w * psA[j].q * invR * erfcAlphaR;
+                    bool needCorrection = !(excl & 0x1) && x+tgx != y+j && x+tgx < cSim.atoms && y+j < cSim.atoms;    // excluded pair of two distinct atoms, both with index below cSim.atoms
+                    if (needCorrection)
+                    {   
+                        // Subtract off the part of this interaction that was included in the reciprocal space contribution.
+                        // Both values are overwritten here, which discards the LJ and erfc terms computed above.
+                        dEdR        = -apos.w * psA[j].q * invR * ((1.0f-erfcAlphaR) - alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
+                        CDLJ_energy = -apos.w * psA[j].q * invR * (1.0f-erfcAlphaR);
+                    }   
+
+    #else
+                    dEdR           += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2);
+                    /* E */
+		              CDLJ_energy    += apos.w * psA[j].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
+    #endif
+#else
+                    dEdR           += apos.w * psA[j].q * invR;
+                    /* E */
+		              CDLJ_energy    += apos.w * psA[j].q * invR;
+#endif
+                    dEdR           *= invR * invR;
+#ifdef USE_CUTOFF
+    #ifdef USE_EWALD
+                    if ((!(excl & 0x1) && !needCorrection) || r2 > cSim.nonbondedCutoffSqr)    // cleared bit without a correction, or beyond the cutoff
+    #else
+                    if (!(excl & 0x1) || r2 > cSim.nonbondedCutoffSqr)    // cleared bit, or beyond the cutoff
+    #endif
+#else
+                    if (!(excl & 0x1))    // a cleared bit means this pair is skipped
+#endif
+                    {
+                        dEdR = 0.0f;
+                			/* E */
+		                  CDLJ_energy  = 0.0f;
+                    }
+		              /* E: half weight, as in the non-exclusion diagonal loop */
+                    energy         += 0.5f*CDLJ_energy;
+                    dx             *= dEdR;
+                    dy             *= dEdR;
+                    dz             *= dEdR;
+                    af.x           -= dx;
+                    af.y           -= dy;
+                    af.z           -= dz;
+                    excl          >>= 1;    // expose the bit for the next j
+                }
+            }
+
+            // Write results
+            float4 of;
+#ifdef USE_OUTPUT_BUFFER_PER_WARP
+            unsigned int offset                          = x + tgx + warp*cSim.stride;    // buffer selected by the group index
+            of                                  = cSim.pForce4a[offset];    // read-modify-write: add to what is already stored
+            of.x                               += af.x;
+            of.y                               += af.y;
+            of.z                               += af.z;
+            cSim.pForce4a[offset]               = of;
+#else
+            of.x                                = af.x;
+            of.y                                = af.y;
+            of.z                                = af.z;
+            of.w                                = 0.0f;
+            unsigned int offset                          = x + tgx + (x >> GRIDBITS) * cSim.stride;    // buffer selected by the tile index
+            cSim.pForce4a[offset]               = of;    // plain store, previous content is overwritten
+#endif
+        }
+        else        // 100% utilization
+        {
+            // Read fixed atom data into registers and GRF
+            if (lasty != y)    // reload only when the y tile differs from the one recorded in lasty
+            {
+                unsigned int j                   = y + tgx;    // atom of the y tile loaded by this thread
+                float4 temp                      = cSim.pPosq[j];
+                float2 temp1                     = cSim.pAttr[j];
+                // Earlier sources for the lambda, kept for reference:
+                //float  temp3                     = cSim.pSoftCoreLJLambda[j];
+                //float  temp3                     = softCoreLJLambdaArray[j];
+                float temp3                      = feSimDev.pParticleSoftCoreLJLambda[j];
+                sA[threadIdx.x].x                = temp.x;
+                sA[threadIdx.x].y                = temp.y;
+                sA[threadIdx.x].z                = temp.z;
+                sA[threadIdx.x].q                = temp.w;
+                sA[threadIdx.x].sig              = temp1.x;
+                sA[threadIdx.x].eps              = temp1.y;
+                sA[threadIdx.x].softCoreLJLambda = temp3;
+            }
+            sA[threadIdx.x].fx      = 0.0f;    // force accumulators for the y atoms are reset for every work unit
+            sA[threadIdx.x].fy      = 0.0f;
+            sA[threadIdx.x].fz      = 0.0f;
+            apos.w                 *= cSim.epsfac;
+            if (!bExclusionFlag)
+            {
+#ifdef USE_CUTOFF
+                unsigned int flags = cSim.pInteractionFlag[pos];    // 0 = nothing to do, all ones = full tile, otherwise bit j selects y atom j
+                if (flags == 0)
+                {
+                    // No interactions in this block.
+                }
+                else if (flags == 0xFFFFFFFF)
+#endif
+                {
+                    // Compute all interactions within this block.
+                    // tj starts at tgx and advances by one modulo GRID, so at each step the threads of a group address different entries of psA.
+                    for (unsigned int j = 0; j < GRID; j++)
+                    {
+                        dx              = psA[tj].x - apos.x;
+                        dy              = psA[tj].y - apos.y;
+                        dz              = psA[tj].z - apos.z;
+#ifdef USE_PERIODIC
+                        dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
+                        dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
+                        dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
+#endif
+                        r2              = dx * dx + dy * dy + dz * dz;
+                        invR            = 1.0f / sqrt(r2);
+                        sig             = a.x + psA[tj].sig;
+                        eps             = a.y * psA[tj].eps;
+#ifdef USE_SOFTCORE_LJ
+                        dEdR            = getSoftCoreLJ( r2, sig, eps, softCoreLJLambda, psA[tj].softCoreLJLambda, &CDLJ_energy );
+#else
+                        float sig2      = invR * sig;
+                        sig2           *= sig2;
+                        float sig6      = sig2 * sig2 * sig2;
+                        dEdR            = eps * (12.0f * sig6 - 6.0f) * sig6;
+			               CDLJ_energy     = eps * (sig6 - 1.0f) * sig6;
+#endif
+#ifdef USE_CUTOFF
+    #ifdef USE_EWALD
+                        float r         = sqrt(r2);
+                        float alphaR    = cSim.alphaEwald * r;
+                        float erfcAlphaR = fastErfc(alphaR);
+                        dEdR           += apos.w * psA[tj].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
+                        /* E */
+                        CDLJ_energy    += apos.w * psA[tj].q * invR * erfcAlphaR;
+    #else
+                        dEdR           += apos.w * psA[tj].q * (invR - 2.0f * cSim.reactionFieldK * r2);
+			/* E */
+                        CDLJ_energy    += apos.w * psA[tj].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
+    #endif
+#else
+                        dEdR           += apos.w * psA[tj].q * invR;
+                        /* E */
+                        CDLJ_energy    += apos.w * psA[tj].q * invR;
+#endif
+                        dEdR           *= invR * invR;
+#ifdef USE_CUTOFF
+                        if (r2 > cSim.nonbondedCutoffSqr)
+                        {
+                            dEdR = 0.0f;
+			                   /* E */
+       			             CDLJ_energy = 0.0f;
+                        }
+#endif
+			               /* E: full weight, because an off-diagonal pair is visited only once */
+			               energy         += CDLJ_energy;
+                        dx             *= dEdR;
+                        dy             *= dEdR;
+                        dz             *= dEdR;
+                        af.x           -= dx;
+                        af.y           -= dy;
+                        af.z           -= dz;
+                        psA[tj].fx     += dx;    // opposite contribution goes to the y atom
+                        psA[tj].fy     += dy;
+                        psA[tj].fz     += dz;
+                        tj              = (tj + 1) & (GRID - 1);
+                    }
+                }
+#ifdef USE_CUTOFF
+                else
+                {
+                    // Compute only a subset of the interactions in this block.
+                    // Here every thread addresses the same y atom j, so the per-thread contributions are summed through tempBuffer.
+                    for (unsigned int j = 0; j < GRID; j++)
+                    {
+                        if ((flags&(1<<j)) != 0)
+                        {
+                            dx              = psA[j].x - apos.x;
+                            dy              = psA[j].y - apos.y;
+                            dz              = psA[j].z - apos.z;
+#ifdef USE_PERIODIC
+                            dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
+                            dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
+                            dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
+#endif
+                            r2              = dx * dx + dy * dy + dz * dz;
+                            invR            = 1.0f / sqrt(r2);
+                            sig             = a.x + psA[j].sig;
+                            eps             = a.y * psA[j].eps;
+#ifdef USE_SOFTCORE_LJ
+                            dEdR            = getSoftCoreLJ( r2, sig, eps, softCoreLJLambda, psA[j].softCoreLJLambda, &CDLJ_energy );
+#else
+                            float sig2      = invR * sig;
+                            sig2           *= sig2;
+                            float sig6      = sig2 * sig2 * sig2;
+                            dEdR            = eps * (12.0f * sig6 - 6.0f) * sig6;
+			                   /* E */
+                			    CDLJ_energy     = eps * (sig6 - 1.0f) * sig6;
+#endif
+#ifdef USE_CUTOFF
+    #ifdef USE_EWALD
+                            float r         = sqrt(r2);
+                            float alphaR    = cSim.alphaEwald * r;
+                            float erfcAlphaR = fastErfc(alphaR);
+                            dEdR           += apos.w * psA[j].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
+                            CDLJ_energy    += apos.w * psA[j].q * invR * erfcAlphaR;
+    #else
+                            dEdR           += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2);
+                            /* E */
+                            CDLJ_energy    += apos.w * psA[j].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
+    #endif
+#else
+                            dEdR           += apos.w * psA[j].q * invR;
+                            /* E */
+                            CDLJ_energy    += apos.w * psA[j].q * invR;
+#endif
+                            dEdR           *= invR * invR;
+#ifdef USE_CUTOFF
+                            if (r2 > cSim.nonbondedCutoffSqr)
+                            {
+                                dEdR = 0.0f;
+				                    /* E */
+				                    CDLJ_energy = 0.0f;
+                            }
+#endif
+			                   /* E */
+			                   energy         += CDLJ_energy;
+                            dx             *= dEdR;
+                            dy             *= dEdR;
+                            dz             *= dEdR;
+                            af.x           -= dx;
+                            af.y           -= dy;
+                            af.z           -= dz;
+                            tempBuffer[threadIdx.x].x = dx;
+                            tempBuffer[threadIdx.x].y = dy;
+                            tempBuffer[threadIdx.x].z = dz;
+
+                            // Sum the forces on atom j.
+                            // Pairwise tree over offsets 1, 2, 4, 8 and 16, i.e. written for groups of 32 threads.
+                            // NOTE(review): there is no barrier between the steps; presumably relies on lockstep execution within the group -- confirm
+                            if (tgx % 2 == 0)
+                            {
+                                tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+1].x;
+                                tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+1].y;
+                                tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+1].z;
+                            }
+                            if (tgx % 4 == 0)
+                            {
+                                tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+2].x;
+                                tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+2].y;
+                                tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+2].z;
+                            }
+                            if (tgx % 8 == 0)
+                            {
+                                tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+4].x;
+                                tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+4].y;
+                                tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+4].z;
+                            }
+                            if (tgx % 16 == 0)
+                            {
+                                tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+8].x;
+                                tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+8].y;
+                                tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+8].z;
+                            }
+                            if (tgx == 0)    // the group's first thread folds the two halves and updates atom j
+                            {
+                                psA[j].fx += tempBuffer[threadIdx.x].x + tempBuffer[threadIdx.x+16].x;
+                                psA[j].fy += tempBuffer[threadIdx.x].y + tempBuffer[threadIdx.x+16].y;
+                                psA[j].fz += tempBuffer[threadIdx.x].z + tempBuffer[threadIdx.x+16].z;
+                            }
+                        }
+                    }
+                }
+#endif
+            }
+            else  // bExclusion
+            {
+                // Read fixed atom data into registers and GRF
+                unsigned int xi   = x>>GRIDBITS;    // tile index of x
+                unsigned int yi   = y>>GRIDBITS;    // tile index of y
+                unsigned int cell          = xi+yi*cSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;    // index of this tile pair in cSim.pExclusionIndex
+                unsigned int excl = cSim.pExclusion[cSim.pExclusionIndex[cell]+tgx];
+                excl              = (excl >> tgx) | (excl << (GRID - tgx));    // rotate right by tgx so bit 0 matches the starting tj; NOTE(review): for tgx == 0 the left shift count is GRID -- confirm this behaves as intended
+                for (unsigned int j = 0; j < GRID; j++)
+                {
+                    dx              = psA[tj].x - apos.x;
+                    dy              = psA[tj].y - apos.y;
+                    dz              = psA[tj].z - apos.z;
+#ifdef USE_PERIODIC
+                    dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
+                    dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
+                    dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
+#endif
+                    r2              = dx * dx + dy * dy + dz * dz;
+                    invR            = 1.0f / sqrt(r2);
+                    sig             = a.x + psA[tj].sig;
+                    eps             = a.y * psA[tj].eps;
+#ifdef USE_SOFTCORE_LJ
+                    dEdR            = getSoftCoreLJ( r2, sig, eps, softCoreLJLambda, psA[tj].softCoreLJLambda, &CDLJ_energy );
+#else
+                    float sig2      = invR * sig;
+                    sig2           *= sig2;
+                    float sig6      = sig2 * sig2 * sig2;
+                    dEdR            = eps * (12.0f * sig6 - 6.0f) * sig6;
+		              /* E */
+		              CDLJ_energy     = eps * (sig6 - 1.0f) * sig6;
+#endif
+
+#ifdef USE_CUTOFF
+    #ifdef USE_EWALD
+                    float r         = sqrt(r2);
+                    float alphaR    = cSim.alphaEwald * r;
+                    float erfcAlphaR = fastErfc(alphaR);
+                    dEdR           += apos.w * psA[tj].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
+                    /* E */
+                    CDLJ_energy    += apos.w * psA[tj].q * invR * erfcAlphaR;
+                    bool needCorrection = !(excl & 0x1) && x+tgx != y+tj && x+tgx < cSim.atoms && y+tj < cSim.atoms;    // excluded pair of two distinct atoms, both with index below cSim.atoms
+                    if (needCorrection)
+                    {
+                        // Subtract off the part of this interaction that was included in the reciprocal space contribution.
+                        // Both values are overwritten here, which discards the LJ and erfc terms computed above.
+                        dEdR        = -apos.w * psA[tj].q * invR * ((1.0f-erfcAlphaR) - alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
+                        CDLJ_energy = -apos.w * psA[tj].q * invR * (1.0f-erfcAlphaR);
+                    }
+    #else
+                    dEdR           += apos.w * psA[tj].q * (invR - 2.0f * cSim.reactionFieldK * r2);
+                    /* E */
+	                 CDLJ_energy    += apos.w * psA[tj].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
+    #endif
+#else
+                    dEdR           += apos.w * psA[tj].q * invR;
+                    /* E */
+                    CDLJ_energy    += apos.w * psA[tj].q * invR;
+#endif
+                    dEdR           *= invR * invR;
+#ifdef USE_CUTOFF
+    #ifdef USE_EWALD
+                    if ((!(excl & 0x1) && !needCorrection) || r2 > cSim.nonbondedCutoffSqr)
+    #else
+                    if (!(excl & 0x1) || r2 > cSim.nonbondedCutoffSqr)
+    #endif
+#else
+                    if (!(excl & 0x1))
+#endif
+                    {
+                        dEdR = 0.0f;			
+                        /* E */
+	                     CDLJ_energy  = 0.0f;
+                    }
+             	    /* E: full weight, because an off-diagonal pair is visited only once */
+		              energy         += CDLJ_energy;
+                    dx             *= dEdR;
+                    dy             *= dEdR;
+                    dz             *= dEdR;
+                    af.x           -= dx;
+                    af.y           -= dy;
+                    af.z           -= dz;
+                    psA[tj].fx     += dx;
+                    psA[tj].fy     += dy;
+                    psA[tj].fz     += dz;
+                    excl          >>= 1;
+                    tj              = (tj + 1) & (GRID - 1);
+                }
+            }
+
+            // Write results
+            float4 of;
+#ifdef USE_OUTPUT_BUFFER_PER_WARP
+            unsigned int offset                          = x + tgx + warp*cSim.stride;    // x atom, buffer selected by the group index
+            of                                  = cSim.pForce4a[offset];
+            of.x                               += af.x;
+            of.y                               += af.y;
+            of.z                               += af.z;
+            cSim.pForce4a[offset]               = of;
+            offset                              = y + tgx + warp*cSim.stride;    // y atom, same buffer
+            of                                  = cSim.pForce4a[offset];
+            of.x                               += sA[threadIdx.x].fx;
+            of.y                               += sA[threadIdx.x].fy;
+            of.z                               += sA[threadIdx.x].fz;
+            cSim.pForce4a[offset]               = of;
+#else
+            of.x                                = af.x;
+            of.y                                = af.y;
+            of.z                                = af.z;
+            of.w                                = 0.0f;
+            unsigned int offset                          = x + tgx + (y >> GRIDBITS) * cSim.stride;    // x atom goes to the buffer indexed by the y tile
+            cSim.pForce4a[offset]               = of;
+            of.x                                = sA[threadIdx.x].fx;
+            of.y                                = sA[threadIdx.x].fy;
+            of.z                                = sA[threadIdx.x].fz;
+            offset                              = y + tgx + (x >> GRIDBITS) * cSim.stride;    // y atom goes to the buffer indexed by the x tile
+            cSim.pForce4a[offset]               = of;
+#endif
+            lasty = y;    // remember which y tile is held in sA
+        }
+
+        pos++;
+    }
+    cSim.pEnergy[blockIdx.x*blockDim.x+threadIdx.x] += energy;    // one energy slot per thread, indexed by the global thread id
+}
--- a/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateObcGbsaSoftcoreBornSum.cu
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateObcGbsaSoftcoreBornSum.cu
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+#include "gputypes.h"
+#include "cudaKernels.h"
+#include "GpuObcGbsaSoftcore.h"
+
+#include <cuda.h>
+#include <string>
+
+// Per-thread atom record; the Born-sum kernels keep one of these per thread in
+// their dynamically sized shared array.  Field order fixes the memory layout.
+struct Atom {
+    float x, y, z;          // position (x, y, z components of pPosq)
+    float r;                // pObcData.x for this atom
+    float sr;               // pObcData.y for this atom
+    float sum;              // Born-sum contribution accumulated for this atom
+    float polarScaleData;   // per-atom factor read from pNonPolarScalingFactors
+};
+
+// Softcore-specific simulation state.  The kernels index
+// pNonPolarScalingFactors by atom through the constant-memory copy below.
+struct cudaFreeEnergySimulationObcGbsaSoftcore {
+    float* pNonPolarScalingFactors;
+};
+
+// Host-side instance; copied into gbsaSimDev by
+// SetCalculateObcGbsaSoftcoreNonPolarScalingFactorsSim().
+cudaFreeEnergySimulationObcGbsaSoftcore gbsaSim;
+
+// Constant-memory copies read by the kernels compiled into this file.
+static __constant__ cudaGmxSimulation                       cSim;
+static __constant__ cudaFreeEnergySimulationObcGbsaSoftcore gbsaSimDev;
+
+// Copies the host simulation struct (gpu->sim) into this file's constant-memory
+// cSim; any CUDA error is passed to RTERROR.
+extern "C"
+void SetCalculateObcGbsaSoftcoreBornSumSim(gpuContext gpu)
+{
+    cudaError_t copyStatus = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
+    RTERROR(copyStatus, "SetCalculateObcGbsaSoftcoreBornSumSim: cudaMemcpyToSymbol: SetSim copy to cSim failed");
+}
+
+// Stores the scaling-factor pointer in the host-side gbsaSim and copies that
+// struct into constant memory (gbsaSimDev).  A trace line is always written to
+// stderr after the copy.
+extern "C"
+void SetCalculateObcGbsaSoftcoreNonPolarScalingFactorsSim( float* nonPolarScalingFactors )
+{
+    gbsaSim.pNonPolarScalingFactors = nonPolarScalingFactors;
+
+    cudaError_t copyStatus = cudaMemcpyToSymbol(gbsaSimDev, &gbsaSim, sizeof(gbsaSim));
+    RTERROR(copyStatus, "cudaMemcpyToSymbol: SetCalculateObcGbsaSoftcoreNonPolarScalingFactorsSim");
+
+    (void) fprintf( stderr, "In SetCalculateObcGbsaSoftcoreNonPolarScalingFactorsSim\n" );
+}
+
+// Reads this file's constant-memory cSim back into the host simulation struct
+// (gpu->sim); any CUDA error is passed to RTERROR.
+void GetCalculateObcGbsaSoftcoreBornSumSim(gpuContext gpu)
+{
+    cudaError_t copyStatus = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
+    RTERROR(copyStatus, "GetCalculateObcGbsaSoftcoreBornSumSim: cudaMemcpyFromSymbol: SetSim copy from cSim failed");
+}
+
+// Kernel instantiation.  Each #include below compiles one more copy of the
+// kernel in kCalculateObcGbsaSoftcoreBornSum.h.  METHOD_NAME pastes a variant
+// tag between "kCalculateObcGbsa" and "BornSum_kernel" to form the kernel name,
+// and the USE_* macros defined at that point select the code paths compiled in.
+// The order of the #define/#undef lines is significant.
+
+// Include versions of the kernels for N^2 calculations.
+
+#define METHOD_NAME(a, b) a##N2##b
+#include "kCalculateObcGbsaSoftcoreBornSum.h"
+#define USE_OUTPUT_BUFFER_PER_WARP
+#undef METHOD_NAME
+#define METHOD_NAME(a, b) a##N2ByWarp##b
+#include "kCalculateObcGbsaSoftcoreBornSum.h"
+
+// Include versions of the kernels with cutoffs.
+
+#undef METHOD_NAME
+#undef USE_OUTPUT_BUFFER_PER_WARP
+#define USE_CUTOFF
+#define METHOD_NAME(a, b) a##Cutoff##b
+#include "kCalculateObcGbsaSoftcoreBornSum.h"
+#define USE_OUTPUT_BUFFER_PER_WARP
+#undef METHOD_NAME
+#define METHOD_NAME(a, b) a##CutoffByWarp##b
+#include "kCalculateObcGbsaSoftcoreBornSum.h"
+
+// Include versions of the kernels with periodic boundary conditions.
+// USE_CUTOFF is not undefined here, so the periodic variants are compiled with
+// both USE_CUTOFF and USE_PERIODIC set.
+
+#undef METHOD_NAME
+#undef USE_OUTPUT_BUFFER_PER_WARP
+#define USE_PERIODIC
+#define METHOD_NAME(a, b) a##Periodic##b
+#include "kCalculateObcGbsaSoftcoreBornSum.h"
+#define USE_OUTPUT_BUFFER_PER_WARP
+#undef METHOD_NAME
+#define METHOD_NAME(a, b) a##PeriodicByWarp##b
+#include "kCalculateObcGbsaSoftcoreBornSum.h"
+
+// Everything between this #if 0 and the matching #endif is compiled out.
+// It holds a Born-sum clear kernel, a Born-sum reduction kernel and a host
+// wrapper for the reduction; none of them are built from this file.
+#if 0
+// Zeroes cSim.stride * cSim.nonbondOutputBuffers floats of pBornSum.
+__global__ void kClearObcGbsaBornSum_kernel()
+{
+    unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
+    while (pos < cSim.stride * cSim.nonbondOutputBuffers)
+    {
+        ((float*)cSim.pBornSum)[pos] = 0.0f;
+        pos += gridDim.x * blockDim.x;
+    }
+}
+
+// Sums the per-buffer Born sums of each atom, then writes the Born radius and
+// the OBC chain term for that atom.
+__global__ void kReduceObcGbsaBornSum_kernel()
+{
+    unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
+    
+    while (pos < cSim.atoms)
+    {
+        float sum = 0.0f;
+        float* pSt = cSim.pBornSum + pos;
+        float2 atom = cSim.pObcData[pos];
+        
+        // Get summed Born data
+        for (int i = 0; i < cSim.nonbondOutputBuffers; i++)
+        {
+            sum += *pSt;
+       //     printf("%4d %4d A: %9.4f\n", pos, i, *pSt);
+            pSt += cSim.stride;
+        }
+        
+        
+        // Now calculate Born radius and OBC term.
+        sum                    *= 0.5f * atom.x;
+        float sum2              = sum * sum;
+        float sum3              = sum * sum2;
+        float tanhSum           = tanh(cSim.alphaOBC * sum - cSim.betaOBC * sum2 + cSim.gammaOBC * sum3);
+        float nonOffsetRadii    = atom.x + cSim.dielectricOffset;
+        float bornRadius        = 1.0f / (1.0f / atom.x - tanhSum / nonOffsetRadii); 
+        float obcChain          = atom.x * (cSim.alphaOBC - 2.0f * cSim.betaOBC * sum + 3.0f * cSim.gammaOBC * sum2);
+        obcChain                = (1.0f - tanhSum * tanhSum) * obcChain / nonOffsetRadii;        
+        cSim.pBornRadii[pos] = bornRadius;
+        cSim.pObcChain[pos]  = obcChain;
+        pos += gridDim.x * blockDim.x;
+    }   
+}
+
+// Host wrapper: launches the reduction kernel and clears the
+// bRecalculateBornRadii flag on the context.
+void kReduceObcGbsaBornSum(gpuContext gpu)
+{
+//    printf("kReduceObcGbsaBornSum\n");
+    kReduceObcGbsaBornSum_kernel<<<gpu->sim.blocks, 384>>>();
+    gpu->bRecalculateBornRadii = false;
+    LAUNCHERROR("kReduceObcGbsaBornSum");
+}
+#endif
+
+/**
+ * Initialize parameters for Cuda Obc softcore and upload them to the board.
+ *
+ * For every index i < radius.size() this stores
+ *   psObcData[i].x = radius[i] - dielectricOffset,
+ *   psObcData[i].y = scale[i] * psObcData[i].x,
+ *   psPosq4[i].w   = charge[i],
+ * and records nonPolarScalingFactors[i] in a newly allocated GpuObcGbsaSoftcore.
+ * Entries from gpu->natoms up to gpu->sim.paddedNumberOfAtoms receive dummy
+ * values.  The data are then uploaded and gpu->sim.preFactor is set from the
+ * two dielectric constants.
+ *
+ * scale, charge and nonPolarScalingFactors are indexed with the same index as
+ * radius and no size check is made, so each must hold at least radius.size()
+ * entries.
+ *
+ * @param gpu                    gpu context
+ * @param innerDielectric        solute dielectric
+ * @param solventDielectric      solvent dielectric
+ * @param radius                 intrinsic Born radii
+ * @param scale                  Obc scaling factors
+ * @param charge                 atomic charges (possibly overwritten by other methods?)
+ * @param nonPolarScalingFactors non-polar scaling factors
+ *
+ */
+
+extern "C"
+void gpuSetObcSoftcoreParameters(gpuContext gpu, float innerDielectric, float solventDielectric, const std::vector<float>& radius, const std::vector<float>& scale,
+                                 const std::vector<float>& charge, const std::vector<float>& nonPolarScalingFactors)
+{
+// ---------------------------------------------------------------------------------------
+
+   static const float dielectricOffset    =    0.009f;
+   static const float electricConstant    = -166.02691f;
+   static const std::string methodName    = "gpuSetObcSoftcoreParameters";
+
+// ---------------------------------------------------------------------------------------
+
+
+    unsigned int atoms                     = radius.size();
+
+    // initialize parameters
+
+//    gpu->bIncludeGBSA = true;
+    // NOTE(review): this object is never deleted in this function; presumably
+    // upload() leaves its buffer in use by the kernels -- confirm who owns it.
+    GpuObcGbsaSoftcore* gpuObcGbsaSoftcore = new GpuObcGbsaSoftcore();
+    gpuObcGbsaSoftcore->initializeNonPolarScalingFactors( gpu->sim.paddedNumberOfAtoms );
+    for (unsigned int i = 0; i < atoms; i++)
+    {
+            (*gpu->psObcData)[i].x = radius[i] - dielectricOffset;
+            (*gpu->psObcData)[i].y = scale[i] * (*gpu->psObcData)[i].x;
+            (*gpu->psPosq4)[i].w   = charge[i];
+            gpuObcGbsaSoftcore->setNonPolarScalingFactors( i, nonPolarScalingFactors[i] );
+
+    }
+
+    // diagnostics
+
+#if (DUMP_PARAMETERS == 1)
+    (void) fprintf( stderr, "%s %u %u\n", methodName.c_str(), gpu->natoms, gpu->sim.paddedNumberOfAtoms );
+    for (unsigned int i = 0; i < atoms; i++)
+    {
+       (void) fprintf( stderr, "%6u %13.6e %13.6e %8.3f %8.3f\n", i,  (*gpu->psObcData)[i].x, (*gpu->psObcData)[i].y, (*gpu->psPosq4)[i].w , nonPolarScalingFactors[i] );
+    }
+#endif
+
+    // dummy out extra atom data
+
+    for (unsigned int i = gpu->natoms; i < gpu->sim.paddedNumberOfAtoms; i++)
+    {
+        (*gpu->psBornRadii)[i]     = 0.2f;
+        (*gpu->psObcData)[i].x     = 0.01f;
+        (*gpu->psObcData)[i].y     = 0.01f;
+    }
+
+    // load data to board
+
+    gpuObcGbsaSoftcore->upload( gpu );
+    gpu->psBornRadii->Upload();
+    gpu->psObcData->Upload();
+    gpu->psPosq4->Upload();
+
+    gpu->sim.preFactor = 2.0f*electricConstant*((1.0f/innerDielectric)-(1.0f/solventDielectric))*gpu->sim.forceConversionFactor;
+}
+
+// Host launcher for the Born-sum kernels.  Calls kClearObcGbsaBornSum(), then
+// launches the kernel variant chosen by gpu->sim.nonbondedMethod and
+// gpu->bOutputBufferPerWarp.  Launch errors are reported through LAUNCHERROR.
+void kCalculateObcGbsaSoftcoreBornSum(gpuContext gpu)
+{
+  //  printf("kCalculateObcGbsaSoftcoreBornSum\n");
+    kClearObcGbsaBornSum( gpu );
+    LAUNCHERROR("kClearBornSum from kCalculateObcGbsaSoftcoreBornSum");
+
+    // Dynamic shared memory per block: one Atom per thread; the cutoff and
+    // periodic kernels additionally use one float per thread.
+    const size_t atomBytes           = sizeof(Atom)*gpu->sim.nonbond_threads_per_block;
+    const size_t atomAndScratchBytes = (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block;
+    const bool   bufferPerWarp       = gpu->bOutputBufferPerWarp;
+
+    switch (gpu->sim.nonbondedMethod)
+    {
+        case NO_CUTOFF:
+#define GBSA 0
+#if GBSA == 1
+gpu->psWorkUnit->Download();
+fprintf( stderr, "kCalculateObcGbsaSoftcoreBornSum: bOutputBufferPerWarp=%u blks=%u th/blk=%u wu=%u %u\n", gpu->bOutputBufferPerWarp,
+                 gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, gpu->sim.workUnits, gpu->psWorkUnit->_pSysData[0] );
+#endif
+#undef GBSA
+
+            if (!bufferPerWarp)
+                kCalculateObcGbsaN2BornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        atomBytes>>>(gpu->sim.pWorkUnit);
+            else
+                kCalculateObcGbsaN2ByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        atomBytes>>>(gpu->sim.pWorkUnit);
+            break;
+
+        case CUTOFF:
+            if (!bufferPerWarp)
+                kCalculateObcGbsaCutoffBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        atomAndScratchBytes>>>(gpu->sim.pInteractingWorkUnit);
+            else
+                kCalculateObcGbsaCutoffByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        atomAndScratchBytes>>>(gpu->sim.pInteractingWorkUnit);
+            break;
+
+        case PERIODIC:
+            if (!bufferPerWarp)
+                kCalculateObcGbsaPeriodicBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        atomAndScratchBytes>>>(gpu->sim.pInteractingWorkUnit);
+            else
+                kCalculateObcGbsaPeriodicByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        atomAndScratchBytes>>>(gpu->sim.pInteractingWorkUnit);
+            break;
+    }
+    LAUNCHERROR("kCalculateObcGbsaSoftcoreBornSum");
+}
--- a/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateObcGbsaSoftcoreBornSum.h
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateObcGbsaSoftcoreBornSum.h
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+/**
+ * This file contains the kernel for calculating Born sums.  It is included
+ * several times in kCalculateObcGbsaSoftcoreBornSum.cu with different #defines to generate
+ * different versions of the kernels.
+ */
+
+/*
+ * Born-sum kernel template (the name is completed by METHOD_NAME).
+ *
+ * Every group of GRID consecutive threads takes a contiguous range of work
+ * units.  A work unit word encodes two tile indices: bits 2..16 give the y tile
+ * and bits 17 and up give the x tile; each is shifted left by GRIDBITS to get
+ * the first atom of the tile.  For a tile pair the kernel accumulates, per
+ * atom, the pairwise OBC descreening terms weighted by the partner atom's
+ * entry in gbsaSimDev.pNonPolarScalingFactors, and writes them to cSim.pBornSum.
+ *
+ * Dynamic shared memory: one Atom per thread; with USE_CUTOFF one additional
+ * float per thread placed directly after the Atom array (tempBuffer).
+ *
+ * No barrier is issued between the shared-memory writes and the reads of other
+ * lanes' entries, so the code depends on the GRID threads of a group executing
+ * in lockstep.
+ *
+ * Compile-time switches: USE_CUTOFF, USE_PERIODIC, USE_OUTPUT_BUFFER_PER_WARP.
+ *
+ * @param workUnit  array of encoded tile pairs, indexed by work-unit position
+ */
+__global__ void METHOD_NAME(kCalculateObcGbsa, BornSum_kernel)(unsigned int* workUnit)
+{
+    extern __shared__ Atom sA[];
+    unsigned int totalWarps   = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
+    unsigned int warp         = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
+    unsigned int numWorkUnits = cSim.pInteractionCount[0];
+    unsigned int pos          = warp*numWorkUnits/totalWarps;
+    unsigned int end          = (warp+1)*numWorkUnits/totalWarps;
+
+#ifdef USE_CUTOFF
+    // Scratch floats live directly behind the per-thread Atom records.
+    float* tempBuffer = (float*) &sA[cSim.nonbond_threads_per_block];
+#endif
+
+    while (pos < end)
+    {
+        // Extract cell coordinates from appropriate work unit
+        
+        //unsigned int x = workUnit[pos + (blockIdx.x*numWorkUnits)/gridDim.x];
+        unsigned int x = workUnit[pos];
+        unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
+        x = (x >> 17) << GRIDBITS;
+        float       dx;
+        float       dy;
+        float       dz;
+        float       r2;
+        float       r;
+
+        // tgx: lane within the GRID-wide group; tbx: first thread of the group.
+        unsigned int tgx = threadIdx.x & (GRID - 1);
+        unsigned int tbx = threadIdx.x - tgx;
+        unsigned int tj = tgx;
+        Atom* psA = &sA[tbx];
+
+        if (x == y) // Handle diagonals uniquely at 50% efficiency
+        {
+            // Read fixed atom data into registers and GRF
+            unsigned int i = x + tgx;
+            float4 apos                             = cSim.pPosq[i];    // Local atom x, y, z, sum
+            float2 ar                               = cSim.pObcData[i];   // Local atom vr, sr
+            float polarScaleData                    = gbsaSimDev.pNonPolarScalingFactors[i];  // scale contribution
+            sA[threadIdx.x].x                       = apos.x;
+            sA[threadIdx.x].y                       = apos.y;
+            sA[threadIdx.x].z                       = apos.z;
+            sA[threadIdx.x].r                       = ar.x;
+            sA[threadIdx.x].sr                      = ar.y;
+            sA[threadIdx.x].polarScaleData          = polarScaleData;
+            apos.w                                  = 0.0f;   // apos.w is reused as the accumulator
+
+            for (unsigned int j = 0; j < GRID; j++)
+            {
+                dx                      = psA[j].x - apos.x;
+                dy                      = psA[j].y - apos.y;
+                dz                      = psA[j].z - apos.z;
+#ifdef USE_PERIODIC
+                dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
+                dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
+                dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
+#endif
+                r2                      = dx * dx + dy * dy + dz * dz;
+#if defined USE_PERIODIC
+                if (i < cSim.atoms && x+j < cSim.atoms && r2 < cSim.nonbondedCutoffSqr)
+#elif defined USE_CUTOFF
+                if (r2 < cSim.nonbondedCutoffSqr)
+#endif
+                {
+                    r                       = sqrt(r2);
+                    float rInverse          = 1.0f / r;
+                    float rScaledRadiusJ    = r + psA[j].sr;
+                    if ((j != tgx) && (ar.x < rScaledRadiusJ))
+                    {
+                        float l_ij     = 1.0f / max(ar.x, fabs(r - psA[j].sr));
+                        float u_ij     = 1.0f / rScaledRadiusJ;
+                        float l_ij2    = l_ij * l_ij;
+                        float u_ij2    = u_ij * u_ij;
+                        float ratio    = log(u_ij / l_ij);
+                        float sum      = l_ij -
+                                         u_ij +
+                                         0.25f * r * (u_ij2 - l_ij2) +
+                                         (0.50f * rInverse * ratio) +
+                                         (0.25f * psA[j].sr * psA[j].sr * rInverse) *
+                                         (l_ij2 - u_ij2);
+                        // Extra term when atom i lies inside the scaled sphere of
+                        // atom j.  The test uses the scaled radius (sr), matching
+                        // l_ij/u_ij above and the off-diagonal branches below.
+                        float srj = psA[j].sr;
+                        if (ar.x < (srj - r))
+                        {
+                            sum += 2.0f * ((1.0f / ar.x) - l_ij);
+                        }
+                        apos.w +=  psA[j].polarScaleData*sum;
+                    }
+                }
+            }
+
+            // Write results
+#ifdef USE_OUTPUT_BUFFER_PER_WARP
+            unsigned int offset = x + tgx + warp*cSim.stride;
+            cSim.pBornSum[offset] += apos.w;
+#else
+            unsigned int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
+            cSim.pBornSum[offset] = apos.w;
+#endif
+        }
+        else        // 100% utilization
+        {
+            // Read fixed atom data into registers and GRF
+            unsigned int j                  = y + tgx;
+            unsigned int i                  = x + tgx;
+
+            float4 temp                     = cSim.pPosq[j];
+            float2 temp1                    = cSim.pObcData[j];
+            float polarScaleDataJ           = gbsaSimDev.pNonPolarScalingFactors[j];  // scale contribution
+            float4 apos                     = cSim.pPosq[i];        // Local atom x, y, z, sum
+            float2 ar                       = cSim.pObcData[i];    // Local atom vr, sr
+            float polarScaleDataI           = gbsaSimDev.pNonPolarScalingFactors[i];  // scale contribution
+            sA[threadIdx.x].x               = temp.x;
+            sA[threadIdx.x].y               = temp.y;
+            sA[threadIdx.x].z               = temp.z;
+            sA[threadIdx.x].r               = temp1.x;
+            sA[threadIdx.x].sr              = temp1.y;
+            sA[threadIdx.x].polarScaleData  = polarScaleDataJ;
+            sA[threadIdx.x].sum = apos.w    = 0.0f;
+
+#ifdef USE_CUTOFF
+            //unsigned int flags = cSim.pInteractionFlag[pos + (blockIdx.x*numWorkUnits)/gridDim.x];
+            unsigned int flags = cSim.pInteractionFlag[pos];
+            if (flags == 0)
+            {
+                // No interactions in this block.
+            }
+            else if (flags == 0xFFFFFFFF)
+#endif
+            {
+                // Compute all interactions within this block.
+                // tj starts at this thread's own lane and is stepped each
+                // iteration, so every lane addresses a different y atom.
+
+                for (unsigned int j = 0; j < GRID; j++)
+                {
+                    dx                      = psA[tj].x - apos.x;
+                    dy                      = psA[tj].y - apos.y;
+                    dz                      = psA[tj].z - apos.z;
+#ifdef USE_PERIODIC
+                    dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
+                    dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
+                    dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
+#endif
+                    r2                      = dx * dx + dy * dy + dz * dz;
+#ifdef USE_PERIODIC
+                    if (i < cSim.atoms && y+tj < cSim.atoms && r2 < cSim.nonbondedCutoffSqr)
+#elif defined USE_CUTOFF
+                    if (r2 < cSim.nonbondedCutoffSqr)
+#endif
+                    {
+                        r                       = sqrt(r2);
+                        float rInverse          = 1.0f / r;
+
+                        // Contribution of atom tj to atom i.
+                        float rScaledRadiusJ    = r + psA[tj].sr;
+                        if (ar.x < rScaledRadiusJ)
+                        {
+                            float l_ij     = 1.0f / max(ar.x, fabs(r - psA[tj].sr));
+                            float u_ij     = 1.0f / rScaledRadiusJ;
+                            float l_ij2    = l_ij * l_ij;
+                            float u_ij2    = u_ij * u_ij;
+                            float ratio    = log(u_ij / l_ij);
+                            float term     = l_ij -
+                                             u_ij +
+                                             0.25f * r * (u_ij2 - l_ij2) +
+                                             (0.50f * rInverse * ratio) +
+                                             (0.25f * psA[tj].sr * psA[tj].sr * rInverse) *
+                                             (l_ij2 - u_ij2);
+                            float srj = psA[tj].sr;
+                            float scale = psA[tj].polarScaleData;
+                            if (ar.x < (srj - r))
+                            {
+                                term += 2.0f * ((1.0f / ar.x) - l_ij);
+                            }
+                            //apos.w        += term;
+                            apos.w        += (scale*term);
+                        }
+
+                        // Contribution of atom i to atom tj.
+                        float rScaledRadiusI    = r + ar.y;
+                        if (psA[tj].r < rScaledRadiusI)
+                        {
+                            float l_ij     = 1.0f / max(psA[tj].r, fabs(r - ar.y));
+                            float u_ij     = 1.0f / rScaledRadiusI;
+                            float l_ij2    = l_ij * l_ij;
+                            float u_ij2    = u_ij * u_ij;
+                            float ratio    = log(u_ij / l_ij);
+                            float term     = l_ij -
+                                             u_ij +
+                                             0.25f * r * (u_ij2 - l_ij2) +
+                                             (0.50f * rInverse * ratio) +
+                                             (0.25f * ar.y * ar.y * rInverse) *
+                                             (l_ij2 - u_ij2);
+                            float rj = psA[tj].r;
+                            if (rj < (ar.y - r))
+                            {
+                                term += 2.0f * ((1.0f / psA[tj].r) - l_ij);
+                            }
+                            psA[tj].sum    += polarScaleDataI*term;
+                        }
+                    }
+                    tj = (tj - 1) & (GRID - 1);
+                }
+            }
+#ifdef USE_CUTOFF
+            else
+            {
+                // Compute only a subset of the interactions in this block.
+                // Bit j of flags selects y atom j; all lanes handle the same j
+                // and their contributions to it are summed through tempBuffer.
+
+                for (unsigned int j = 0; j < GRID; j++)
+                {
+                    if ((flags&(1<<j)) != 0)
+                    {
+                        tempBuffer[threadIdx.x] = 0.0f;
+                        dx                      = psA[j].x - apos.x;
+                        dy                      = psA[j].y - apos.y;
+                        dz                      = psA[j].z - apos.z;
+#ifdef USE_PERIODIC
+                        dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
+                        dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
+                        dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
+#endif
+                        r2                      = dx * dx + dy * dy + dz * dz;
+#ifdef USE_PERIODIC
+                        if (i < cSim.atoms && y+j < cSim.atoms && r2 < cSim.nonbondedCutoffSqr)
+#elif defined USE_CUTOFF
+                        if (r2 < cSim.nonbondedCutoffSqr)
+#endif
+                        {
+                            r                       = sqrt(r2);
+                            float rInverse          = 1.0f / r;
+
+                            // Contribution of atom j to atom i.
+                            float rScaledRadiusJ    = r + psA[j].sr;
+                            if (ar.x < rScaledRadiusJ)
+                            {
+                                float l_ij     = 1.0f / max(ar.x, fabs(r - psA[j].sr));
+                                float u_ij     = 1.0f / rScaledRadiusJ;
+                                float l_ij2    = l_ij * l_ij;
+                                float u_ij2    = u_ij * u_ij;
+                                float ratio    = log(u_ij / l_ij);
+                                float term     = l_ij -
+                                                 u_ij +
+                                                 0.25f * r * (u_ij2 - l_ij2) +
+                                                 (0.50f * rInverse * ratio) +
+                                                 (0.25f * psA[j].sr * psA[j].sr * rInverse) *
+                                                 (l_ij2 - u_ij2);
+                                float srj = psA[j].sr;
+                                if (ar.x < (srj - r))
+                                {
+                                    term += 2.0f * ((1.0f / ar.x) - l_ij);
+                                }
+                                apos.w        += psA[j].polarScaleData*term;
+                            }
+
+                            // Contribution of atom i to atom j.
+                            float rScaledRadiusI    = r + ar.y;
+                            if (psA[j].r < rScaledRadiusI)
+                            {
+                                float l_ij     = 1.0f / max(psA[j].r, fabs(r - ar.y));
+                                float u_ij     = 1.0f / rScaledRadiusI;
+                                float l_ij2    = l_ij * l_ij;
+                                float u_ij2    = u_ij * u_ij;
+                                float ratio    = log(u_ij / l_ij);
+                                float term     = l_ij -
+                                                 u_ij +
+                                                 0.25f * r * (u_ij2 - l_ij2) +
+                                                 (0.50f * rInverse * ratio) +
+                                                 (0.25f * ar.y * ar.y * rInverse) *
+                                                 (l_ij2 - u_ij2);
+                                float rj = psA[j].r;
+                                if (rj < (ar.y - r))
+                                {
+                                    term += 2.0f * ((1.0f / psA[j].r) - l_ij);
+                                }
+                                tempBuffer[threadIdx.x] = polarScaleDataI*term;
+                            }
+                        }
+
+                        // Sum the terms.
+                        // Pairwise tree over the group's tempBuffer entries;
+                        // lane 0 adds the total to atom j's sum.
+
+                        if (tgx % 2 == 0)
+                            tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+1];
+                        if (tgx % 4 == 0)
+                            tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+2];
+                        if (tgx % 8 == 0)
+                            tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+4];
+                        if (tgx % 16 == 0)
+                            tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+8];
+                        if (tgx == 0)
+                            psA[j].sum += tempBuffer[threadIdx.x] + tempBuffer[threadIdx.x+16];
+                    }
+                }
+            }
+#endif
+
+            // Write results
+#ifdef USE_OUTPUT_BUFFER_PER_WARP
+            unsigned int offset = x + tgx + warp*cSim.stride;
+            cSim.pBornSum[offset] += apos.w;
+            offset = y + tgx + warp*cSim.stride;
+            cSim.pBornSum[offset] += sA[threadIdx.x].sum;
+#else
+            unsigned int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
+            cSim.pBornSum[offset] = apos.w;
+            offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
+            cSim.pBornSum[offset] = sA[threadIdx.x].sum;
+#endif
+        }
+
+        pos++;
+    }
+}
--- a/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateObcGbsaSoftcoreForces2.cu
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateObcGbsaSoftcoreForces2.cu
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#include <cuda.h>
+#include <vector_functions.h>
+#include <cstdlib>
+#include <string>
+#include <iostream>
+#include <fstream>
+using namespace std;
+
+#include "gputypes.h"
+#include "cudaKernels.h"
+
+// Per-thread scratch record kept in dynamic shared memory by the Forces2 kernels
+// (one entry per thread; see kCalculateObcGbsaSoftcoreForces2.h).
+struct Atom {
+    float x, y, z;      // atom position, copied from the x/y/z components of cSim.pPosq
+    float r;            // first component of the atom's cSim.pObcData entry (compared against r + sr as a radius)
+    float sr;           // second component of the atom's cSim.pObcData entry (added to the pair distance)
+    float fx, fy, fz;   // force accumulated on this atom while a tile is processed
+    float fb;           // the atom's cSim.pBornForce value
+};
+
+
+static __constant__ cudaGmxSimulation cSim;
+
+extern "C"
+void SetCalculateObcGbsaSoftcoreForces2Sim(gpuContext gpu)
+{
+    // Upload the host-side simulation description (gpu->sim) into this
+    // translation unit's __constant__ copy, cSim, which the kernels read.
+    cudaError_t status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cSim));
+    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
+}
+
+/**
+ * Copy this translation unit's __constant__ simulation block (cSim) back
+ * into the host-side gpu->sim structure.
+ *
+ * @param gpu  context whose `sim` member is overwritten with the contents of cSim
+ *
+ * Any CUDA error is reported through RTERROR.
+ */
+void GetCalculateObcGbsaSoftcoreForces2Sim(gpuContext gpu)
+{
+    cudaError_t status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cSim));
+    // The message previously said "SetSim", which pointed at the wrong function
+    // when this device-to-host copy failed.
+    RTERROR(status, "cudaMemcpyFromSymbol: GetSim copy from cSim failed");
+}
+
+// Include versions of the kernels for N^2 calculations (this file has defined neither USE_CUTOFF nor USE_PERIODIC at this point).
+
+#define METHOD_NAME(a, b) a##N2##b // pastes to kCalculateObcGbsaSoftcoreN2Forces2_kernel
+#include "kCalculateObcGbsaSoftcoreForces2.h"
+#define USE_OUTPUT_BUFFER_PER_WARP // the next include selects the output buffer by warp index
+#undef METHOD_NAME
+#define METHOD_NAME(a, b) a##N2ByWarp##b // pastes to kCalculateObcGbsaSoftcoreN2ByWarpForces2_kernel
+#include "kCalculateObcGbsaSoftcoreForces2.h"
+
+// Include versions of the kernels with cutoffs (USE_CUTOFF is defined here and is never #undef'd later in this file).
+
+#undef METHOD_NAME
+#undef USE_OUTPUT_BUFFER_PER_WARP
+#define USE_CUTOFF
+#define METHOD_NAME(a, b) a##Cutoff##b // pastes to kCalculateObcGbsaSoftcoreCutoffForces2_kernel
+#include "kCalculateObcGbsaSoftcoreForces2.h"
+#define USE_OUTPUT_BUFFER_PER_WARP
+#undef METHOD_NAME
+#define METHOD_NAME(a, b) a##CutoffByWarp##b // pastes to kCalculateObcGbsaSoftcoreCutoffByWarpForces2_kernel
+#include "kCalculateObcGbsaSoftcoreForces2.h"
+
+// Include versions of the kernels with periodic boundary conditions (USE_CUTOFF is still defined, so these variants also compile the cutoff code paths).
+
+#undef METHOD_NAME
+#undef USE_OUTPUT_BUFFER_PER_WARP
+#define USE_PERIODIC
+#define METHOD_NAME(a, b) a##Periodic##b // pastes to kCalculateObcGbsaSoftcorePeriodicForces2_kernel
+#include "kCalculateObcGbsaSoftcoreForces2.h"
+#define USE_OUTPUT_BUFFER_PER_WARP
+#undef METHOD_NAME
+#define METHOD_NAME(a, b) a##PeriodicByWarp##b // pastes to kCalculateObcGbsaSoftcorePeriodicByWarpForces2_kernel
+#include "kCalculateObcGbsaSoftcoreForces2.h"
+
+/**
+ * Launch the Forces2 kernel variant that matches the context's nonbonded
+ * method (NO_CUTOFF, CUTOFF or PERIODIC) and output-buffer layout
+ * (gpu->bOutputBufferPerWarp).  Any other nonbonded method launches nothing.
+ *
+ * @param gpu  context supplying the launch geometry (bornForce2_blocks,
+ *             bornForce2_threads_per_block) and the work-unit lists
+ */
+void kCalculateObcGbsaSoftcoreForces2(gpuContext gpu)
+{
+    // Dynamic shared memory per block: one Atom per thread, plus one float3 per
+    // thread for the cutoff/periodic variants.
+    const size_t atomBytes   = sizeof(Atom)*gpu->sim.bornForce2_threads_per_block;
+    const size_t cutoffBytes = (sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block;
+    const bool perWarp       = gpu->bOutputBufferPerWarp;
+
+    if (gpu->sim.nonbondedMethod == NO_CUTOFF)
+    {
+        // The N^2 variants walk the full work-unit list.
+        if (perWarp)
+            kCalculateObcGbsaSoftcoreN2ByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block, atomBytes>>>(gpu->sim.pWorkUnit);
+        else
+            kCalculateObcGbsaSoftcoreN2Forces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block, atomBytes>>>(gpu->sim.pWorkUnit);
+    }
+    else if (gpu->sim.nonbondedMethod == CUTOFF)
+    {
+        // The cutoff variants walk the interacting work-unit list.
+        if (perWarp)
+            kCalculateObcGbsaSoftcoreCutoffByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block, cutoffBytes>>>(gpu->sim.pInteractingWorkUnit);
+        else
+            kCalculateObcGbsaSoftcoreCutoffForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block, cutoffBytes>>>(gpu->sim.pInteractingWorkUnit);
+    }
+    else if (gpu->sim.nonbondedMethod == PERIODIC)
+    {
+        // The periodic variants use the same list and shared-memory size as cutoff.
+        if (perWarp)
+            kCalculateObcGbsaSoftcorePeriodicByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block, cutoffBytes>>>(gpu->sim.pInteractingWorkUnit);
+        else
+            kCalculateObcGbsaSoftcorePeriodicForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block, cutoffBytes>>>(gpu->sim.pInteractingWorkUnit);
+    }
+    LAUNCHERROR("kCalculateObcGbsaSoftcoreForces2");
+}
--- a/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateObcGbsaSoftcoreForces2.h
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateObcGbsaSoftcoreForces2.h
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+/**
+ * This file contains the kernel for evaluating the second stage of GBSA.  It is included
+ * several times in kCalculateObcGbsaSoftcoreForces2.cu with different #defines to generate
+ * different versions of the kernels.
+ *
+ * Preprocessor symbols read by this file:
+ *   METHOD_NAME(a, b)           builds the kernel name from the two fragments passed below.
+ *   USE_CUTOFF                  adds the per-work-unit interaction flags (cSim.pInteractionFlag),
+ *                               the float3 scratch buffer and the r2 > cSim.nonbondedCutoffSqr test.
+ *   USE_PERIODIC                wraps dx/dy/dz with cSim.periodicBoxSize{X,Y,Z} and zeroes pairs
+ *                               whose atom index is >= cSim.atoms.
+ *   USE_OUTPUT_BUFFER_PER_WARP  picks the force output buffer by warp index instead of by the
+ *                               block index of the partner tile.
+ *
+ * Parameter:
+ *   workUnit  array of packed tile descriptors; entry bits 17 and up give the x block and
+ *             bits 2..16 give the y block (both shifted left by GRIDBITS to get an atom index).
+ *
+ * Work distribution: threads are grouped in runs of GRID consecutive threads (called a "warp"
+ * in this code); each group handles a contiguous slice [pos, end) of the work units.  The
+ * number of work units is read from cSim.pInteractionCount[0] in every variant.
+ * NOTE(review): that includes the non-cutoff variants - confirm the host fills that counter
+ * when no cutoff is used.
+ *
+ * Shared memory: the dynamic shared array holds one Atom per thread; with USE_CUTOFF a float3
+ * per thread follows it (starting at sA[cSim.bornForce2_threads_per_block]).
+ *
+ * Output: forces are accumulated by read-modify-write into cSim.pForce4b.
+ *
+ * NOTE(review): there is no __syncthreads()/__syncwarp() anywhere in this kernel although
+ * lanes read shared-memory entries written by other lanes of the same group; it presumably
+ * relies on the group executing in lockstep - confirm for the target hardware.
+ */
+
+__global__ void METHOD_NAME(kCalculateObcGbsaSoftcore, Forces2_kernel)(unsigned int* workUnit)
+{
+    extern __shared__ Atom sA[];
+    unsigned int totalWarps = cSim.bornForce2_blocks*cSim.bornForce2_threads_per_block/GRID; // number of GRID-sized thread groups in the configured launch
+    unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID; // index of this thread's group across the whole grid
+    unsigned int numWorkUnits = cSim.pInteractionCount[0];
+    unsigned int pos = warp*numWorkUnits/totalWarps; // first work unit of this group's slice
+    unsigned int end = (warp+1)*numWorkUnits/totalWarps; // one past the last work unit of this group's slice
+#ifdef USE_CUTOFF
+    float3* tempBuffer = (float3*) &sA[cSim.bornForce2_threads_per_block]; // scratch area placed directly after the Atom array
+#endif
+
+    unsigned int lasty = -0xFFFFFFFF; // NOTE(review): converts to 1 for a 32-bit unsigned int; acts as a "no previous y" marker because y is a value shifted left by GRIDBITS (assuming GRIDBITS > 0)
+    while (pos < end)
+    {
+
+        // Decode the x and y block start indices packed into this work unit
+        unsigned int x                  = workUnit[pos];
+        unsigned int y                  = ((x >> 2) & 0x7fff) << GRIDBITS;
+        x                               = (x >> 17) << GRIDBITS;
+        unsigned int tgx                = threadIdx.x & (GRID - 1); // lane index of this thread inside its group
+        unsigned int i                  = x + tgx; // atom of the x block handled by this thread
+        float4 apos                     = cSim.pPosq[i];
+        float2 a                        = cSim.pObcData[i]; // a.x is used as the radius, a.y as the scaled radius
+        float fb                        = cSim.pBornForce[i];
+        unsigned int tbx                = threadIdx.x - tgx; // index of the group's first thread within the block
+        unsigned int tj                 = tgx;
+        Atom* psA                       = &sA[tbx]; // this group's GRID-entry window of the shared array
+        float3 af; // force accumulated on atom i in registers
+        sA[threadIdx.x].fx = af.x   = 0.0f;
+        sA[threadIdx.x].fy = af.y   = 0.0f;
+        sA[threadIdx.x].fz = af.z   = 0.0f;
+        if (x == y) // Handle diagonals uniquely at 50% efficiency
+        {
+            // Stage this thread's own atom in shared memory so the other lanes of the group can read it
+
+            sA[threadIdx.x].x           = apos.x;
+            sA[threadIdx.x].y           = apos.y;
+            sA[threadIdx.x].z           = apos.z;
+            sA[threadIdx.x].r           = a.x;
+            sA[threadIdx.x].sr          = a.y;
+            sA[threadIdx.x].fb          = fb;
+
+            for (unsigned int j = (tgx+1)&(GRID-1); j != tgx; j = (j+1)&(GRID-1)) // visit every other lane's atom once, starting at tgx+1 and wrapping
+            {
+                float dx                = psA[j].x - apos.x;
+                float dy                = psA[j].y - apos.y;
+                float dz                = psA[j].z - apos.z;
+#ifdef USE_PERIODIC
+                dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
+                dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
+                dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
+#endif
+                float r2                = dx * dx + dy * dy + dz * dz;
+                float r                 = sqrt(r2);
+
+
+                // Atom I Born forces and sum (only the term driven by atom i's fb is evaluated per pair here)
+                float rScaledRadiusJ    = r + psA[j].sr;
+
+                float l_ij          = 1.0f / max(a.x, fabs(r - psA[j].sr));
+                float u_ij          = 1.0f / rScaledRadiusJ;
+                float rInverse      = 1.0f / r;
+                float l_ij2         = l_ij * l_ij;
+                float u_ij2         = u_ij * u_ij;
+                float r2Inverse     = rInverse * rInverse;
+                float t1            = log (u_ij / l_ij);
+                float t2            = (l_ij2 - u_ij2);
+                float t3            = t2 * rInverse;
+                t1                 *= rInverse;
+
+                // Born Forces term, scaled by atom i's Born force value
+                float term          =  0.125f *
+                                      (1.000f + psA[j].sr * psA[j].sr * r2Inverse) * t3 +
+                                       0.250f * t1 * r2Inverse;
+                float dE            = fb * term;
+
+#if defined USE_PERIODIC
+                if (a.x >= rScaledRadiusJ || i >= cSim.atoms || x+j >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
+#elif defined USE_CUTOFF
+                if (a.x >= rScaledRadiusJ || r2 > cSim.nonbondedCutoffSqr)
+#else
+                if (a.x >= rScaledRadiusJ)
+#endif
+                {
+                    dE              = 0.0f; // excluded pair: contributes no force
+                }
+                float d             = dx * dE;
+                af.x               -= d;
+                psA[j].fx          += d; // equal and opposite contribution stored on atom j's shared entry
+                d                   = dy * dE;
+                af.y               -= d;
+                psA[j].fy          += d;
+                d                   = dz * dE;
+                af.z               -= d;
+                psA[j].fz          += d;
+            }
+
+            // Write results: add the register and shared-memory accumulators for atom i to the output buffer
+            float4 of;
+#ifdef USE_OUTPUT_BUFFER_PER_WARP
+            unsigned int offset         = x + tgx + warp*cSim.stride;
+#else
+            unsigned int offset         = x + tgx + (x >> GRIDBITS) * cSim.stride;
+#endif
+            of                          = cSim.pForce4b[offset];
+            of.x                       += af.x + sA[threadIdx.x].fx;
+            of.y                       += af.y + sA[threadIdx.x].fy;
+            of.z                       += af.z + sA[threadIdx.x].fz;
+            of.w                        = 0.0f; // only the diagonal path resets w; the off-diagonal path leaves it as read
+            cSim.pForce4b[offset]       = of;
+        }
+        else
+        {
+            // Load the y block's atoms into shared memory, unless the previous work unit already loaded the same y block
+            if (lasty != y)
+            {
+                unsigned int j              = y + tgx;
+                float4 temp                 = cSim.pPosq[j];
+                float2 temp1                = cSim.pObcData[j];
+                sA[threadIdx.x].fb          = cSim.pBornForce[j];
+                sA[threadIdx.x].x           = temp.x;
+                sA[threadIdx.x].y           = temp.y;
+                sA[threadIdx.x].z           = temp.z;
+                sA[threadIdx.x].r           = temp1.x;
+                sA[threadIdx.x].sr          = temp1.y;
+            }
+            float sr2                   = a.y * a.y;
+#ifdef USE_CUTOFF
+            unsigned int flags = cSim.pInteractionFlag[pos]; // one bit per atom of the y block (tested below as flags & (1<<j))
+            if (flags == 0)
+            {
+                // No interactions in this block.
+            }
+            else if (flags == 0xFFFFFFFF)
+#endif
+            {
+                // Compute all interactions within this block; tj starts at this lane's index and advances by one each step, so lanes address different entries per step.
+
+                for (unsigned int j = 0; j < GRID; j++)
+                {
+                    float dx                = psA[tj].x - apos.x;
+                    float dy                = psA[tj].y - apos.y;
+                    float dz                = psA[tj].z - apos.z;
+#ifdef USE_PERIODIC
+                    dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
+                    dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
+                    dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
+#endif
+                    float r2                = dx * dx + dy * dy + dz * dz;
+                    float r                 = sqrt(r2);
+
+                    // Interleaved Atom I and J Born Forces and sum components (J-suffixed values use atom j's scaled radius, I-suffixed values use atom i's)
+                    float r2Inverse         = 1.0f / r2;
+                    float rScaledRadiusJ    = r + psA[tj].sr;
+                    float rScaledRadiusI    = r + a.y;
+                    float rInverse          = 1.0f / r;
+                    float l_ijJ             = 1.0f / max(a.x, fabs(r - psA[tj].sr));
+                    float l_ijI             = 1.0f / max(psA[tj].r, fabs(r - a.y));
+                    float u_ijJ             = 1.0f / rScaledRadiusJ;
+                    float u_ijI             = 1.0f / rScaledRadiusI;
+                    float l_ij2J            = l_ijJ * l_ijJ;
+                    float l_ij2I            = l_ijI * l_ijI;
+                    float u_ij2J            = u_ijJ * u_ijJ;
+                    float u_ij2I            = u_ijI * u_ijI;
+                    float t1J               = log (u_ijJ / l_ijJ);
+                    float t1I               = log (u_ijI / l_ijI);
+                    float t2J               = (l_ij2J - u_ij2J);
+                    float t2I               = (l_ij2I - u_ij2I);
+                    float t3J               = t2J * rInverse;
+                    float t3I               = t2I * rInverse;
+                    t1J                    *= rInverse;
+                    t1I                    *= rInverse;
+
+                    // Born Forces term scaled by atom i's Born force value
+                    float term              =  0.125f *
+                                              (1.000f + psA[tj].sr * psA[tj].sr * r2Inverse) * t3J +
+                                               0.250f * t1J * r2Inverse;
+                    float dE                = fb * term;
+
+#if defined USE_PERIODIC
+                    if (a.x >= rScaledRadiusJ || i >= cSim.atoms || y+tj >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
+#elif defined USE_CUTOFF
+                    if (a.x >= rScaledRadiusJ || r2 > cSim.nonbondedCutoffSqr)
+#else
+                    if (a.x >= rScaledRadiusJ)
+#endif
+                    {
+                        dE                  = 0.0f;
+                    }
+
+                    float d                 = dx * dE;
+                    af.x                   -= d;
+                    psA[tj].fx             += d;
+                    d                       = dy * dE;
+                    af.y                   -= d;
+                    psA[tj].fy             += d;
+                    d                       = dz * dE;
+                    af.z                   -= d;
+                    psA[tj].fz             += d;
+
+                    // Atom J Born sum term: the mirrored term, scaled by atom j's Born force value
+                    term                    =  0.125f *
+                                              (1.000f + sr2 * r2Inverse) * t3I +
+                                               0.250f * t1I * r2Inverse;
+                    dE                      = psA[tj].fb * term;
+
+                    float rj = psA[tj].r;
+#ifdef USE_PERIODIC
+                    if (rj >= rScaledRadiusI || i >= cSim.atoms || y+tj >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
+#elif defined USE_CUTOFF
+                    if (rj >= rScaledRadiusI || r2 > cSim.nonbondedCutoffSqr)
+#else
+                    if (rj >= rScaledRadiusI)
+#endif
+                    {
+                        dE                  = 0.0f;
+                    }
+                    dx                     *= dE; // dx/dy/dz are reused to hold the force components from here on
+                    dy                     *= dE;
+                    dz                     *= dE;
+                    psA[tj].fx             += dx;
+                    psA[tj].fy             += dy;
+                    psA[tj].fz             += dz;
+                    af.x                   -= dx;
+                    af.y                   -= dy;
+                    af.z                   -= dz;
+                    tj                      = (tj + 1) & (GRID - 1); // advance to the next atom of the y block, wrapping at GRID
+                }
+            }
+#ifdef USE_CUTOFF
+            else
+            {
+                // Compute only a subset of the interactions in this block: every lane works on the same atom j, so per-lane contributions go through tempBuffer and are summed before touching psA[j].
+
+                for (unsigned int j = 0; j < GRID; j++)
+                {
+                    if ((flags&(1<<j)) != 0)
+                    {
+                        float dx                = psA[j].x - apos.x;
+                        float dy                = psA[j].y - apos.y;
+                        float dz                = psA[j].z - apos.z;
+    #ifdef USE_PERIODIC
+                        dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
+                        dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
+                        dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
+    #endif
+                        float r2                = dx * dx + dy * dy + dz * dz;
+                        float r                 = sqrt(r2);
+
+                        // Interleaved Atom I and J Born Forces and sum components (same formulas as the full-block loop, indexed by j instead of tj)
+                        float r2Inverse         = 1.0f / r2;
+                        float rScaledRadiusJ    = r + psA[j].sr;
+                        float rScaledRadiusI    = r + a.y;
+                        float rInverse          = 1.0f / r;
+                        float l_ijJ             = 1.0f / max(a.x, fabs(r - psA[j].sr));
+                        float l_ijI             = 1.0f / max(psA[j].r, fabs(r - a.y));
+                        float u_ijJ             = 1.0f / rScaledRadiusJ;
+                        float u_ijI             = 1.0f / rScaledRadiusI;
+                        float l_ij2J            = l_ijJ * l_ijJ;
+                        float l_ij2I            = l_ijI * l_ijI;
+                        float u_ij2J            = u_ijJ * u_ijJ;
+                        float u_ij2I            = u_ijI * u_ijI;
+                        float t1J               = log (u_ijJ / l_ijJ);
+                        float t1I               = log (u_ijI / l_ijI);
+                        float t2J               = (l_ij2J - u_ij2J);
+                        float t2I               = (l_ij2I - u_ij2I);
+                        float t3J               = t2J * rInverse;
+                        float t3I               = t2I * rInverse;
+                        t1J                    *= rInverse;
+                        t1I                    *= rInverse;
+
+                        // Born Forces term scaled by atom i's Born force value
+                        float term              =  0.125f *
+                                                  (1.000f + psA[j].sr * psA[j].sr * r2Inverse) * t3J +
+                                                   0.250f * t1J * r2Inverse;
+                        float dE                = fb * term;
+
+    #if defined USE_PERIODIC
+                        if (a.x >= rScaledRadiusJ || i >= cSim.atoms || y+j >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
+    #elif defined USE_CUTOFF
+                        if (a.x >= rScaledRadiusJ || r2 > cSim.nonbondedCutoffSqr)
+    #else
+                        if (a.x >= rScaledRadiusJ)
+    #endif
+                        {
+                            dE                  = 0.0f;
+                        }
+
+                        float d                 = dx * dE;
+                        af.x                   -= d;
+                        tempBuffer[threadIdx.x].x = d; // assignment (not +=): starts this lane's contribution to atom j
+                        d                       = dy * dE;
+                        af.y                   -= d;
+                        tempBuffer[threadIdx.x].y = d;
+                        d                       = dz * dE;
+                        af.z                   -= d;
+                        tempBuffer[threadIdx.x].z = d;
+
+                        // Atom J Born sum term: the mirrored term, scaled by atom j's Born force value
+                        term                    =  0.125f *
+                                                  (1.000f + sr2 * r2Inverse) * t3I +
+                                                   0.250f * t1I * r2Inverse;
+                        dE                      = psA[j].fb * term;
+
+                        float rj = psA[j].r;
+    #ifdef USE_PERIODIC
+                        if (rj >= rScaledRadiusI || i >= cSim.atoms || y+j >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
+    #elif defined USE_CUTOFF
+                        if (rj >= rScaledRadiusI || r2 > cSim.nonbondedCutoffSqr)
+    #else
+                        if (rj >= rScaledRadiusI)
+    #endif
+                        {
+                            dE                  = 0.0f;
+                        }
+                        dx                     *= dE;
+                        dy                     *= dE;
+                        dz                     *= dE;
+                        tempBuffer[threadIdx.x].x += dx;
+                        tempBuffer[threadIdx.x].y += dy;
+                        tempBuffer[threadIdx.x].z += dz;
+                        af.x                   -= dx;
+                        af.y                   -= dy;
+                        af.z                   -= dz;
+
+                        // Sum the forces on atom j: pairwise tree over tempBuffer with offsets 1, 2, 4, 8, 16 (the fixed offsets assume GRID == 32 - NOTE(review): confirm).
+
+                        if (tgx % 2 == 0)
+                        {
+                            tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+1].x;
+                            tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+1].y;
+                            tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+1].z;
+                        }
+                        if (tgx % 4 == 0)
+                        {
+                            tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+2].x;
+                            tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+2].y;
+                            tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+2].z;
+                        }
+                        if (tgx % 8 == 0)
+                        {
+                            tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+4].x;
+                            tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+4].y;
+                            tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+4].z;
+                        }
+                        if (tgx % 16 == 0)
+                        {
+                            tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+8].x;
+                            tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+8].y;
+                            tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+8].z;
+                        }
+                        if (tgx == 0) // lane 0 folds the two half-sums into atom j's shared accumulator
+                        {
+                            psA[j].fx += tempBuffer[threadIdx.x].x + tempBuffer[threadIdx.x+16].x;
+                            psA[j].fy += tempBuffer[threadIdx.x].y + tempBuffer[threadIdx.x+16].y;
+                            psA[j].fz += tempBuffer[threadIdx.x].z + tempBuffer[threadIdx.x+16].z;
+                        }
+                    }
+                }
+            }
+#endif
+
+            // Write results: first the register accumulator for atom i (x block), then the shared accumulator for this lane's y-block atom
+            float4 of;
+#ifdef USE_OUTPUT_BUFFER_PER_WARP
+            unsigned int offset         = x + tgx + warp*cSim.stride;
+#else
+            unsigned int offset         = x + tgx + (y >> GRIDBITS) * cSim.stride;
+#endif
+            of                          = cSim.pForce4b[offset];
+            of.x                       += af.x;
+            of.y                       += af.y;
+            of.z                       += af.z;
+            cSim.pForce4b[offset]       = of;
+
+#ifdef USE_OUTPUT_BUFFER_PER_WARP
+            offset                      = y + tgx + warp*cSim.stride;
+#else
+            offset                      = y + tgx + (x >> GRIDBITS) * cSim.stride;
+#endif
+            of                          = cSim.pForce4b[offset];
+            of.x                       += sA[threadIdx.x].fx;
+            of.y                       += sA[threadIdx.x].fy;
+            of.z                       += sA[threadIdx.x].fz;
+            cSim.pForce4b[offset]       = of;
+        }
+        lasty = y; // remembered so the next off-diagonal work unit can skip reloading the same y block
+        pos++;
+    }
+}
--- a/plugins/freeEnergy/platforms/cuda/src/kernels/kSoftcoreLJ.h
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/kSoftcoreLJ.h
+#ifndef _K_SOFTCORE_LJ__H__
+#define _K_SOFTCORE_LJ__H__
+
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+/**
+ * This file contains kernel for calculating softcore LJ force prefactor
+ */
+
+#ifdef USE_SOFTCORE_LJ
+/**
+ * Evaluate the softcore Lennard-Jones expression for one pair.
+ *
+ * @param r2       squared separation of the pair
+ * @param sig      sigma; the separation is divided by it
+ * @param eps      epsilon; scaled internally by the smaller of the two lambdas
+ * @param lambdaI  softcore lambda of the first particle
+ * @param lambdaJ  softcore lambda of the second particle
+ * @param energy   output: receives eps*lambda*(1/D^2 - 1/D),
+ *                 where D = 0.5*(1 - lambda) + (r/sig)^6
+ * @return eps*lambda*(1/D^2)*(12/D - 6)*(r/sig)^6
+ */
+__device__ float getSoftCoreLJ( float r2, float sig, float  eps, float lambdaI, float lambdaJ, float* energy)
+{
+    const float dist          = sqrt(r2);
+
+    // The pair uses the smaller of the two lambdas.
+    float lambdaMin           = lambdaJ;
+    if (lambdaI < lambdaJ)
+    {
+        lambdaMin             = lambdaI;
+    }
+    eps                      *= lambdaMin;
+
+    // (r/sig)^2, then (r/sig)^6
+    const float ratio         = dist/sig;
+    const float ratio2        = ratio*ratio;
+    const float ratio6        = ratio2*ratio2*ratio2;
+
+    // Softcore denominator D and its inverse powers
+    const float denom         = 0.5f*( 1.0f -  lambdaMin) + ratio6;
+    const float denomInv      = 1.0f/denom;
+    const float denomInv2     = denomInv*denomInv;
+
+    *energy                   = eps*(denomInv2 - denomInv);
+
+    return eps*denomInv2*( 12.0f*denomInv - 6.0f )*ratio6;
+}
+#endif
+
+#endif
--- a/plugins/freeEnergy/platforms/cuda/staticTarget/CMakeLists.txt
+++ b/plugins/freeEnergy/platforms/cuda/staticTarget/CMakeLists.txt
+#
+# Include CUDA related files and build the static CUDA library for the free-energy plugin.
+#
+SET(OPENMM_BUILD_FREE_ENERGY_PATH ${CMAKE_SOURCE_DIR}/plugins/freeEnergy) # root of the free-energy plugin sources
+
+SET(CUDA_NVCC_BUILD_FLAGS "-DCUDPP_STATIC_LIB" ) # set before FindCuda.cmake is included below
+INCLUDE(${FINDCUDA_DIR}/FindCuda.cmake) # FINDCUDA_DIR is not set in this file - presumably provided by a parent CMakeLists
+INCLUDE_DIRECTORIES(${CUDA_INCLUDE})
+LINK_DIRECTORIES(${CUDA_TARGET_LINK})
+FOREACH(subdir ${OPENMM_SOURCE_SUBDIRS}) # collect the .cu sources and include paths of every listed subdirectory
+    FILE(GLOB src_files ${OPENMM_BUILD_FREE_ENERGY_PATH}/platforms/cuda/${subdir}/src/*.cu  ${OPENMM_BUILD_FREE_ENERGY_PATH}/platforms/cuda/${subdir}/src/*/*.cu)
+    FOREACH(file ${src_files})
+        FILE(RELATIVE_PATH file ${OPENMM_BUILD_FREE_ENERGY_PATH}/platforms/cuda ${file}) # store each source relative to platforms/cuda
+        SET(SOURCE_FILES         ${SOURCE_FILES}         ${file})   #append
+    ENDFOREACH(file)
+    CUDA_INCLUDE_DIRECTORIES(BEFORE ${OPENMM_BUILD_FREE_ENERGY_PATH}/platforms/cuda/${subdir}/include)
+    CUDA_INCLUDE_DIRECTORIES(BEFORE ${OPENMM_BUILD_FREE_ENERGY_PATH}/platforms/cuda/${subdir}/src)
+ENDFOREACH(subdir)
+#CUDA_INCLUDE_DIRECTORIES(BEFORE ${OPENMM_BUILD_FREE_ENERGY_PATH}/jama/include)
+CUDA_INCLUDE_DIRECTORIES(${OPENMM_BUILD_FREE_ENERGY_PATH}/platforms/cuda/../src
+                         ${OPENMM_DIR}/platforms/cuda/src
+                         ${OPENMM_DIR}/platforms/cuda/src/kernels
+                         ${OPENMM_DIR}/openmmapi/include ) 
+
+CUDA_ADD_LIBRARY(${STATIC_TARGET} STATIC ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES}) # STATIC_TARGET and the include-file lists are not set in this file
+
+# required for getting OPENMM_EXPORT to be set correctly in 'class OPENMM_EXPORT CudaStreamFactory', ...
+# see OpenMM/openmmapi/include/internal/windowsExport.h for details
+
+SET(CUDA_STATIC_COMPILE_FLAG "-DOPENMM_BUILDING_STATIC_LIBRARY -DOPENMM_USE_STATIC_LIBRARIES -DCUDPP_STATIC_LIB")
+SET_TARGET_PROPERTIES(${STATIC_TARGET} PROPERTIES COMPILE_FLAGS ${CUDA_STATIC_COMPILE_FLAG})
+
+TARGET_LINK_LIBRARIES(${STATIC_TARGET} optimized ${OPENMM_LIBRARY_NAME}_static) # NOTE(review): the next line repeats this same optimized entry - looks redundant, confirm before removing
+TARGET_LINK_LIBRARIES(${STATIC_TARGET} debug ${OPENMM_LIBRARY_NAME}_static_d optimized ${OPENMM_LIBRARY_NAME}_static ${CUFFT_TARGET_LINK}) # per-configuration OpenMM static library, plus CUFFT for all configurations
+
+INSTALL_TARGETS(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${STATIC_TARGET}) # NOTE(review): INSTALL_TARGETS is the legacy install command; INSTALL(TARGETS ...) is its documented replacement
--- a/plugins/freeEnergy/platforms/cuda/tests/CMakeLists.txt
+++ b/plugins/freeEnergy/platforms/cuda/tests/CMakeLists.txt
+#
+# Testing
+#
+
+ENABLE_TESTING()
+
+# CUDA find module and the CUDA platform headers needed by the test programs.
+INCLUDE(${CMAKE_SOURCE_DIR}/platforms/cuda/cuda-cmake/FindCuda.cmake)
+INCLUDE_DIRECTORIES(${CUDA_INCLUDE})
+INCLUDE_DIRECTORIES(${OPENMM_DIR}/platforms/cuda/include)
+INCLUDE_DIRECTORIES(${OPENMM_DIR}/platforms/cuda/src)
+INCLUDE_DIRECTORIES(${OPENMM_DIR}/platforms/cuda/src/kernels)
+
+# Base names of the libraries the tests link against.
+Set( SHARED_OPENMM_TARGET OpenMMFreeEnergy)
+Set( SHARED_CUDA_TARGET OpenMMCuda)
+Set( STATIC_CUDA_TARGET OpenMMCuda_static)
+
+# On UNIX Debug builds every library name carries a single "_d" suffix.
+# (STATIC_CUDA_TARGET was previously suffixed twice, producing
+# "OpenMMCuda_static_d_d".)
+IF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
+    SET(SHARED_CUDA_TARGET   ${SHARED_CUDA_TARGET}_d)
+    SET(SHARED_OPENMM_TARGET ${SHARED_OPENMM_TARGET}_d)
+    SET(STATIC_CUDA_TARGET   ${STATIC_CUDA_TARGET}_d)
+ENDIF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
+#LINK_DIRECTORIES
+
+
+# Automatically create tests using files named "Test*.cpp"
+FILE(GLOB TEST_PROGS "*Test*.cpp")
+FOREACH(TEST_PROG ${TEST_PROGS})
+    GET_FILENAME_COMPONENT(TEST_ROOT ${TEST_PROG} NAME_WE)
+
+    # Link with shared library
+    CUDA_ADD_EXECUTABLE(${TEST_ROOT} ${TEST_PROG})
+    TARGET_LINK_LIBRARIES(${TEST_ROOT} ${SHARED_TARGET} ${SHARED_OPENMM_TARGET} ${SHARED_CUDA_TARGET})
+    ADD_TEST(${TEST_ROOT} ${EXECUTABLE_OUTPUT_PATH}/${TEST_ROOT})
+
+
+
+    # Link with static library
+#     SET(TEST_STATIC ${TEST_ROOT}Static)
+#     CUDA_ADD_EXECUTABLE(${TEST_STATIC} ${TEST_PROG})
+#     SET_TARGET_PROPERTIES(${TEST_STATIC}
+#                 PROPERTIES
+#                 COMPILE_FLAGS "-DOPENMM_USE_STATIC_LIBRARIES"
+#                 )
+#     TARGET_LINK_LIBRARIES(${TEST_STATIC} ${STATIC_TARGET})
+#     ADD_TEST(${TEST_STATIC} ${EXECUTABLE_OUTPUT_PATH}/${TEST_STATIC})
+
+ENDFOREACH(TEST_PROG ${TEST_PROGS})
+
+# TestCudaUsingParameterFile customized w/ command-line argument (input file name used in test) 
+
+#ADD_EXECUTABLE(TestCudaUsingParameterFile TstCudaUsingParameterFile.cpp)
+#TARGET_LINK_LIBRARIES(TestCudaUsingParameterFile ${SHARED_TARGET})
+#ADD_TEST(TestCudaUsingParameterFile "${EXECUTABLE_OUTPUT_PATH}/TestCudaUsingParameterFile" "-parameterFileName" "${CMAKE_CURRENT_SOURCE_DIR}/lambdaSdObcParameters.txt")
+#ADD_TEST(TestCudaUsingParameterFile "${EXECUTABLE_OUTPUT_PATH}/TestCudaUsingParameterFile" "-parameterFileName" "${CMAKE_CURRENT_SOURCE_DIR}/bptiMdRfNoPbcParameters.txt")
+#
+#SET(TEST_ROOT TestCudaUsingParameterFile)
+#SET(TEST_PROG TstCudaUsingParameterFile.cpp)
+#SET(TEST_STATIC ${TEST_ROOT}Static)
+#SET(INCLUDE_CUDA_STATIC 1)
+#IF(INCLUDE_CUDA_STATIC)
+#   ADD_EXECUTABLE(${TEST_STATIC} ${TEST_PROG})
+#   SET_TARGET_PROPERTIES(${TEST_STATIC}
+#                         PROPERTIES
+#                         COMPILE_FLAGS "-DOPENMM_USE_STATIC_LIBRARIES"
+#                        )
+#   TARGET_LINK_LIBRARIES(${TEST_STATIC} ${STATIC_TARGET} ${STATIC_BROOK_TARGET})
+#   ADD_TEST(${TEST_STATIC} "${EXECUTABLE_OUTPUT_PATH}/TestCudaUsingParameterFileStatic" "-parameterFileName" "${CMAKE_CURRENT_SOURCE_DIR}/lambdaSdObcParameters.txt")
+#   ADD_TEST(${TEST_STATIC} "${EXECUTABLE_OUTPUT_PATH}/TestCudaUsingParameterFileStatic" "-parameterFileName" "${CMAKE_CURRENT_SOURCE_DIR}/bptiMdRfNoPbcParameters.txt")
+#  ADD_TEST(${TEST_STATIC} "${EXECUTABLE_OUTPUT_PATH}/TestCudaUsingParameterFileStatic" "-parameterFileName" "${CMAKE_CURRENT_SOURCE_DIR}/bptiMdRfPbcParameters.txt" " +checkEnergyForceConsistent -checkForces" )
+#ENDIF(INCLUDE_CUDA_STATIC)
+
+
--- a/plugins/freeEnergy/platforms/cuda/tests/TestCudaGBVISoftcoreForce.cpp
+++ b/plugins/freeEnergy/platforms/cuda/tests/TestCudaGBVISoftcoreForce.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008-2009 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+/**
+ * This tests the CUDA implementation of GBVISoftcoreForce, using the reference implementation for comparison.
+ */
+
+#include "../../../tests/AssertionUtilities.h"
+#include "openmm/Context.h"
+#include "CudaPlatform.h"
+#include "ReferencePlatform.h"
+#include "openmm/GBVISoftcoreForce.h"
+#include "openmm/GBSAOBCForce.h"
+#include "openmm/System.h"
+#include "openmm/LangevinIntegrator.h"
+#include "openmm/NonbondedForce.h"
+#include "openmm/NonbondedSoftcoreForce.h"
+#include "../src/SimTKUtilities/SimTKOpenMMRealType.h"
+#include "../src/sfmt/SFMT.h"
+#include "OpenMMFreeEnergy.h"
+#include "openmm/freeEnergyKernels.h"
+#include "ReferenceFreeEnergyKernelFactory.h"
+#include "CudaFreeEnergyKernelFactory.h"
+
+#include <iostream>
+#include <vector>
+
+using namespace OpenMM;
+using namespace std;
+
+const double TOL = 1e-5;
+
+#define PRINT_ON 1
+
+/**
+ * Compare the forces stored in two states, particle by particle.
+ *
+ * A particle is counted as a mismatch only when both of the following hold:
+ *   - the Euclidean norm of the force difference exceeds absoluteTolerance, and
+ *   - that norm, divided by the summed absolute components of both forces,
+ *     exceeds relativeTolerance.
+ * Particles for which both forces are exactly zero are never counted.
+ *
+ * When PRINT_ON is 1, one line per particle is written to stderr; mismatching
+ * particles are flagged with "XXXXXX".
+ *
+ * @param numParticles       number of particles to compare
+ * @param state1             first state; must have been retrieved with forces
+ * @param state2             second state; must have been retrieved with forces
+ * @param relativeTolerance  largest accepted relative deviation
+ * @param absoluteTolerance  deviations whose norm is at or below this value are always accepted
+ * @return number of particles whose forces disagree
+ */
+int compareForcesOfTwoStates( int numParticles, State& state1, State& state2, double relativeTolerance, double absoluteTolerance ) {
+
+    // fetch the force arrays once instead of on every iteration
+    const vector<Vec3>& forces1 = state1.getForces();
+    const vector<Vec3>& forces2 = state2.getForces();
+
+    int error = 0;
+    for (int i = 0; i < numParticles; ++i) {
+        Vec3 f1       = forces1[i];
+        Vec3 f2       = forces2[i];
+        double diff   = (f1[0] - f2[0])*(f1[0] - f2[0]) +
+                        (f1[1] - f2[1])*(f1[1] - f2[1]) +
+                        (f1[2] - f2[2])*(f1[2] - f2[2]); 
+        double denom1 = fabs( f1[0] ) + fabs( f1[1] ) +fabs( f1[2] );
+        double denom2 = fabs( f2[0] ) + fabs( f2[1] ) +fabs( f2[2] );
+
+        // magnitude of the deviation and the scale it is measured against
+        double deviation = sqrt( diff );
+        double scale     = denom1 + denom2;
+
+        int        ok = 1;
+        if( scale > 0.0 && deviation > absoluteTolerance && (deviation/scale) > relativeTolerance ){
+           error++;
+           ok = 0;
+        }
+#if PRINT_ON == 1
+        (void) fprintf( stderr, "F %d [%14.6e %14.6e %14.6e] [%14.6e %14.6e %14.6e] %s\n", i, 
+                        f1[0], f1[1], f1[2], f2[0], f2[1], f2[2], (ok ? "":"XXXXXX") );
+#endif
+    }
+
+    return error;
+}
+
+/**
+ * Build a one-particle system carrying a GBVISoftcoreForce on the CudaPlatform
+ * and check the reported potential energy against the Born + nonpolar
+ * expression evaluated below (the Born radius is taken equal to the particle radius).
+ */
+void testSingleParticle() {
+
+    // parameters of the single particle
+    const double charge = 1.0;
+    const double radius = 0.15;
+    const double gamma  = 1.0;
+
+    CudaPlatform platform;
+    System system;
+    system.addParticle(2.0);
+    LangevinIntegrator integrator(0, 0.1, 0.01);
+
+    GBVISoftcoreForce* forceField = new GBVISoftcoreForce;
+    forceField->addParticle(charge, radius, gamma);
+    system.addForce(forceField);
+
+    // evaluate the energy with the particle at the origin
+    Context context(system, integrator, platform);
+    vector<Vec3> positions(1, Vec3(0, 0, 0));
+    context.setPositions(positions);
+    State state = context.getState(State::Energy);
+
+    // expected energy
+    const double bornRadius = radius;
+    const double eps0       = EPSILON0;
+    const double tau        = (1.0/forceField->getSoluteDielectric()-1.0/forceField->getSolventDielectric());
+
+    const double bornEnergy     = (-charge*charge/(8*PI_M*eps0))*tau/bornRadius;
+    const double nonpolarEnergy = -0.1*CAL2JOULE*gamma*tau*std::pow( radius/bornRadius, 3.0);
+
+    const double expectedE = (bornEnergy+nonpolarEnergy);
+    const double obtainedE = state.getPotentialEnergy();
+
+#if PRINT_ON == 1
+    const double diff = fabs( obtainedE - expectedE );
+    (void) fprintf( stderr, "testSingleParticle expected=%14.6e obtained=%14.6e diff=%14.6e breakdown:[%14.6e %14.6e]\n",
+                    expectedE, obtainedE, diff, bornEnergy, nonpolarEnergy );
+#endif
+    ASSERT_EQUAL_TOL(expectedE, obtainedE, 0.01);
+}
+
+/**
+ * Two oppositely charged particles with a GBVISoftcoreForce and a NonbondedForce
+ * on the CudaPlatform. Forces are evaluated for the CutoffNonPeriodic and
+ * CutoffPeriodic methods, first with the particles 2 apart along x and then with
+ * the second particle shifted by one box length, and the four results are compared.
+ */
+void testCutoffAndPeriodic() {
+
+    const double cutoffDistance = 3.0;
+    const double boxSize        = 10.0;
+
+    CudaPlatform cudaPlatform;
+    LangevinIntegrator integrator(0, 0.1, 0.01);
+
+    System system;
+    system.addParticle(1.0);
+    system.addParticle(1.0);
+
+    // both forces get the same two particles with charges -1 and +1
+    GBVISoftcoreForce* gbvi      = new GBVISoftcoreForce();
+    NonbondedForce*    nonbonded = new NonbondedForce();
+
+    gbvi->addParticle(-1, 0.15, 1.0);
+    nonbonded->addParticle(-1, 1, 0);
+
+    gbvi->addParticle(1, 0.15, 1.0);
+    nonbonded->addParticle(1, 1, 0);
+
+    nonbonded->setCutoffDistance(cutoffDistance);
+    system.setPeriodicBoxVectors(Vec3(boxSize, 0, 0), Vec3(0, boxSize, 0), Vec3(0, 0, boxSize));
+    system.addForce(gbvi);
+    system.addForce(nonbonded);
+
+    vector<Vec3> coordinates(2);
+    coordinates[0] = Vec3(0, 0, 0);
+    coordinates[1] = Vec3(2, 0, 0);
+
+    // Calculate the forces for both cutoff and periodic with two different atom positions.
+
+    // original separation, non-periodic cutoff
+    nonbonded->setNonbondedMethod(NonbondedForce::CutoffNonPeriodic);
+    Context context(system, integrator, cudaPlatform);
+    context.setPositions(coordinates);
+    State nearNonPeriodic = context.getState(State::Forces);
+
+    // original separation, periodic cutoff
+    nonbonded->setNonbondedMethod(NonbondedForce::CutoffPeriodic);
+    context.reinitialize();
+    context.setPositions(coordinates);
+    State nearPeriodic = context.getState(State::Forces);
+
+    // second particle moved by one box length, non-periodic cutoff
+    coordinates[1][0] += boxSize;
+    nonbonded->setNonbondedMethod(NonbondedForce::CutoffNonPeriodic);
+    context.reinitialize();
+    context.setPositions(coordinates);
+    State farNonPeriodic = context.getState(State::Forces);
+
+    // second particle moved by one box length, periodic cutoff
+    nonbonded->setNonbondedMethod(NonbondedForce::CutoffPeriodic);
+    context.reinitialize();
+    context.setPositions(coordinates);
+    State farPeriodic = context.getState(State::Forces);
+
+    // All forces should be identical, except the shifted non-periodic case, which should be zero.
+
+#if PRINT_ON == 1
+    (void) fprintf( stderr, "testCutoffAndPeriodic\n" );
+#endif
+    ASSERT_EQUAL_VEC(nearNonPeriodic.getForces()[0], nearPeriodic.getForces()[0], 0.01);
+    ASSERT_EQUAL_VEC(nearNonPeriodic.getForces()[1], nearPeriodic.getForces()[1], 0.01);
+    ASSERT_EQUAL_VEC(nearNonPeriodic.getForces()[0], farPeriodic.getForces()[0], 0.01);
+    ASSERT_EQUAL_VEC(nearNonPeriodic.getForces()[1], farPeriodic.getForces()[1], 0.01);
+    ASSERT_EQUAL_VEC(farNonPeriodic.getForces()[0], Vec3(0, 0, 0), 0.01);
+    ASSERT_EQUAL_VEC(farNonPeriodic.getForces()[1], Vec3(0, 0, 0), 0.01);
+}
+
+/**
+ * Eight-particle ethane-like system with a GB/VI softcore force (or GBSA OBC
+ * when the local VI flag is cleared) plus a NonbondedForce.
+ *
+ * Two checks are made:
+ *   1. forces from the context under test agree with those of a reference context;
+ *   2. after moving every particle a small step against its force, the energy
+ *      change divided by the step length matches the norm of the force vector.
+ *
+ * NOTE: while the '#if 0' below is in place, 'platform' is a ReferencePlatform,
+ * so both contexts run the reference kernels; switch the '#if' to exercise CudaPlatform.
+ */
+void testEnergyEthane() {
+
+    std::string methodName = "testEnergyEthane";
+
+#if 0
+    CudaPlatform platform;
+    CudaFreeEnergyKernelFactory* factory  = new CudaFreeEnergyKernelFactory();
+    platform.registerKernelFactory(CalcNonbondedSoftcoreForceKernel::Name(), factory);
+    platform.registerKernelFactory(CalcGBVISoftcoreForceKernel::Name(), factory);
+    platform.registerKernelFactory(CalcGBSAOBCSoftcoreForceKernel::Name(), factory);
+#else
+
+    ReferencePlatform platform;
+
+    ReferenceFreeEnergyKernelFactory* referenceFactoryT  = new ReferenceFreeEnergyKernelFactory();
+    platform.registerKernelFactory(CalcNonbondedSoftcoreForceKernel::Name(), referenceFactoryT);
+    platform.registerKernelFactory(CalcGBVISoftcoreForceKernel::Name(), referenceFactoryT);
+    platform.registerKernelFactory(CalcGBSAOBCSoftcoreForceKernel::Name(), referenceFactoryT);
+#endif
+
+    ReferencePlatform referencePlatform;
+
+    ReferenceFreeEnergyKernelFactory* referenceFactory  = new ReferenceFreeEnergyKernelFactory();
+
+    referencePlatform.registerKernelFactory(CalcNonbondedSoftcoreForceKernel::Name(), referenceFactory);
+    referencePlatform.registerKernelFactory(CalcGBVISoftcoreForceKernel::Name(), referenceFactory);
+    referencePlatform.registerKernelFactory(CalcGBSAOBCSoftcoreForceKernel::Name(), referenceFactory);
+
+    System system;
+    const int numParticles = 8;
+    for( int i = 0; i < numParticles; i++ ){
+       system.addParticle(1.0);
+    }
+    LangevinIntegrator integrator(0, 0.1, 0.01);
+
+    double C_HBondDistance   = 0.1097;
+    double C_CBondDistance   = 0.1504;
+
+    NonbondedForce* nonbonded = new NonbondedForce();
+    nonbonded->setNonbondedMethod(NonbondedForce::NoCutoff);
+
+    double C_radius, C_gamma, C_charge, H_radius, H_gamma, H_charge;
+
+    int AM1_BCC = 1;
+    H_charge    = -0.053;
+    C_charge    = -3.0*H_charge;
+    if( AM1_BCC ){
+       // both gammas are fixed at 1.0 for this test; the values -0.2863 (C) and
+       // 0.2437 (H) that used to be assigned first were immediately overwritten
+       C_radius =  0.180;
+       C_gamma  =  1.0;
+       H_radius =  0.125;
+       H_gamma  =  1.0;
+    } else {
+       C_radius =  0.215;
+       C_gamma  = -1.1087;
+       H_radius =  0.150;
+       H_gamma  =  0.1237;
+    }
+
+    // for ethane all Coulomb forces are excluded since all atoms 3 or
+    // fewer bonds away from all other atoms -- is this true for H's on
+    // difference carbons? -- should be computed in 14 ixn 
+  
+    int VI = 1;
+    if( VI ){
+
+       double bornRadiusScaleFactorsEven = 1.0;
+       double bornRadiusScaleFactorsOdd  = 1.0;
+#if PRINT_ON == 1
+       (void) fprintf( stderr, "%s: Applying GB/VI\n", methodName.c_str() );
+       (void) fprintf( stderr, "C[%14.7e %14.7e %14.7e] H[%14.7e %14.7e %14.7e] scale[%.1f %.1f]\n",
+                       C_charge, C_radius, C_gamma, H_charge, H_radius, H_gamma,
+                       bornRadiusScaleFactorsEven, bornRadiusScaleFactorsOdd);
+#endif
+
+       // start with hydrogen parameters everywhere, then overwrite the two carbons (1 and 4)
+       GBVISoftcoreForce* forceField             = new GBVISoftcoreForce();
+       for( int i = 0; i < numParticles; i++ ){
+          forceField->addParticle( H_charge, H_radius, H_gamma, (i%2) ? bornRadiusScaleFactorsOdd : bornRadiusScaleFactorsEven);
+          nonbonded->addParticle(  H_charge, H_radius, 0.0);
+       }
+
+       forceField->setParticleParameters( 1, C_charge, C_radius, C_gamma, bornRadiusScaleFactorsOdd);
+       nonbonded->setParticleParameters(  1, C_charge, C_radius, 0.0);
+
+       forceField->setParticleParameters( 4, C_charge, C_radius, C_gamma, bornRadiusScaleFactorsEven);
+       nonbonded->setParticleParameters(  4, C_charge, C_radius, 0.0);
+
+       // bonded particle pairs; 1-4 is the only carbon-carbon bond
+       const int numBonds = 7;
+       const int bondedPairs[numBonds][2] = { {0, 1}, {2, 1}, {3, 1}, {1, 4}, {5, 4}, {6, 4}, {7, 4} };
+
+       std::vector<pair<int, int> > bonds;
+       for( int i = 0; i < numBonds; i++ ){
+          const int atomA       = bondedPairs[i][0];
+          const int atomB       = bondedPairs[i][1];
+          const double distance = (atomA == 1 && atomB == 4) ? C_CBondDistance : C_HBondDistance;
+          forceField->addBond( atomA, atomB, distance );
+          bonds.push_back(pair<int, int>(atomA, atomB));
+       }
+
+       nonbonded->createExceptionsFromBonds(bonds, 0.0, 0.0);
+
+       system.addForce(forceField);
+
+    } else {
+
+#if PRINT_ON == 1
+       (void) fprintf( stderr, "%s: Applying GBSA OBC\n", methodName.c_str() );
+#endif
+       GBSAOBCForce* forceField = new GBSAOBCForce();
+       double H_scale           =  0.85;
+       double C_scale           =  0.72;
+       for( int i = 0; i < numParticles; i++ ){
+          forceField->addParticle( H_charge, H_radius, H_scale );
+          nonbonded->addParticle(  H_charge, 1, 0);
+       }
+
+       forceField->setParticleParameters(1, C_charge, C_radius, C_scale);
+       forceField->setParticleParameters(4, C_charge, C_radius, C_scale);
+
+       nonbonded->setParticleParameters( 1, C_charge, C_radius, 0.0);
+       nonbonded->setParticleParameters( 4, C_charge, C_radius, 0.0);
+
+       system.addForce(forceField);
+    }
+
+    system.addForce(nonbonded);
+
+    Context referenceContext(system, integrator, referencePlatform);
+    Context context(system, integrator, platform);
+    
+    vector<Vec3> positions(numParticles);
+    positions[0] = Vec3(0.5480,    1.7661,    0.0000);
+    positions[1] = Vec3(0.7286,    0.8978,    0.6468);
+    positions[2] = Vec3(0.4974,    0.0000,    0.0588);
+    positions[3] = Vec3(0.0000,    0.9459,    1.4666);
+    positions[4] = Vec3(2.1421,    0.8746,    1.1615);
+    positions[5] = Vec3(2.3239,    0.0050,    1.8065);
+    positions[6] = Vec3(2.8705,    0.8295,    0.3416);
+    positions[7] = Vec3(2.3722,    1.7711,    1.7518);
+    context.setPositions(positions);
+    referenceContext.setPositions(positions);
+
+    State state           = context.getState(State::Forces | State::Energy);
+    State referenceState  = referenceContext.getState(State::Forces | State::Energy);
+
+#if PRINT_ON == 1
+    (void) fprintf( stderr, "cudaE=%14.7e refE=%14.7e\n", state.getPotentialEnergy(), referenceState.getPotentialEnergy() );
+#endif
+    
+    // Fail the test if any particle's force differs between the two contexts.
+    
+    if( compareForcesOfTwoStates( numParticles, state, referenceState, 0.001, 0.001 ) ){
+       ASSERT_EQUAL_TOL(0.0, 1.0, 0.01)
+    }
+
+    // Norm of the full force vector and the net force on the system.
+
+    const vector<Vec3>& forces = state.getForces();
+    double norm        = 0.0;
+    double forceSum[3] = { 0.0, 0.0, 0.0 };
+    for (int i = 0; i < numParticles; ++i) {
+        Vec3 f       = forces[i];
+        norm        += f[0]*f[0] + f[1]*f[1] + f[2]*f[2];
+        forceSum[0] += f[0];
+        forceSum[1] += f[1];
+        forceSum[2] += f[2];
+    }
+    norm               = std::sqrt(norm);
+
+#if PRINT_ON == 1
+    (void) fprintf( stderr, "Fsum [%14.7e %14.7e %14.7e] norm=%14.7e\n", forceSum[0], forceSum[1], forceSum[2], norm );
+#endif
+
+    // Take a small step in the direction of the energy gradient.
+
+    const double delta = 1e-3;
+    double step        = delta/norm;
+    for (int i = 0; i < numParticles; ++i) {
+        Vec3 p = positions[i];
+        Vec3 f = forces[i];
+        positions[i] = Vec3(p[0]-f[0]*step, p[1]-f[1]*step, p[2]-f[2]*step);
+    }
+    context.setPositions(positions);
+    
+    State state2 = context.getState(State::Energy);
+
+    // finite-difference estimate of the gradient norm
+    double diff  = (state2.getPotentialEnergy()-state.getPotentialEnergy())/delta;
+
+#if PRINT_ON == 1
+    double off   = fabs( diff - norm );
+    (void) fprintf( stderr, "X Energies %.8e %.8e norms[%14.7e %14.7e] deltaNorms=%14.7e delta=%.2e\n",
+                    state.getPotentialEnergy(), state2.getPotentialEnergy(), diff, norm, off, delta );
+#endif
+
+    // See whether the potential energy changed by the expected amount.
+    
+    ASSERT_EQUAL_TOL(norm, diff, 0.01)
+}
+
+/**
+ * Nine-particle variant of the ethane test: the eight ethane particles plus a
+ * ninth particle (index 8) that is moved along z between iterations.
+ *
+ * For each of several placements of particle 8 the test
+ *   1. compares the forces of the context under test with a reference context, and
+ *   2. checks that the energy change after a small step against the forces,
+ *      divided by the step length, matches the norm of the force vector.
+ *
+ * NOTE: while the '#if 0' below is in place, 'platform' is a ReferencePlatform,
+ * so both contexts run the reference kernels; switch the '#if' to exercise CudaPlatform.
+ */
+void testEnergyEthaneSwitchingFunction() {
+
+    std::string methodName = "testEnergyEthaneSwitchingFunction";
+
+#if 0
+    CudaPlatform platform;
+    CudaFreeEnergyKernelFactory* factory  = new CudaFreeEnergyKernelFactory();
+    platform.registerKernelFactory(CalcNonbondedSoftcoreForceKernel::Name(), factory);
+    platform.registerKernelFactory(CalcGBVISoftcoreForceKernel::Name(), factory);
+    platform.registerKernelFactory(CalcGBSAOBCSoftcoreForceKernel::Name(), factory);
+
+#else
+
+    ReferencePlatform platform;
+
+    ReferenceFreeEnergyKernelFactory* referenceFactoryT  = new ReferenceFreeEnergyKernelFactory();
+    platform.registerKernelFactory(CalcNonbondedSoftcoreForceKernel::Name(), referenceFactoryT);
+    platform.registerKernelFactory(CalcGBVISoftcoreForceKernel::Name(), referenceFactoryT);
+    platform.registerKernelFactory(CalcGBSAOBCSoftcoreForceKernel::Name(), referenceFactoryT);
+#endif
+
+    ReferencePlatform referencePlatform;
+    ReferenceFreeEnergyKernelFactory* referenceFactory  = new ReferenceFreeEnergyKernelFactory();
+
+    referencePlatform.registerKernelFactory(CalcNonbondedSoftcoreForceKernel::Name(), referenceFactory);
+    referencePlatform.registerKernelFactory(CalcGBVISoftcoreForceKernel::Name(), referenceFactory);
+    referencePlatform.registerKernelFactory(CalcGBSAOBCSoftcoreForceKernel::Name(), referenceFactory);
+
+    System system;
+    const int numParticles = 9;
+    for( int i = 0; i < numParticles; i++ ){
+       system.addParticle(1.0);
+    }
+    LangevinIntegrator integrator(0, 0.1, 0.01);
+
+    double C_HBondDistance   = 0.1097;
+    double C_CBondDistance   = 0.1504;
+
+    NonbondedSoftcoreForce* nonbonded = new NonbondedSoftcoreForce();
+    nonbonded->setNonbondedMethod(NonbondedSoftcoreForce::NoCutoff);
+
+    double C_radius, C_gamma, C_charge, H_radius, H_gamma, H_charge;
+
+    int AM1_BCC = 1;
+    H_charge    = -0.053;
+    C_charge    = -3.0*H_charge;
+    if( AM1_BCC ){
+       // both gammas are fixed at 1.0 for this test; the values -0.2863 (C) and
+       // 0.2437 (H) that used to be assigned first were immediately overwritten
+       C_radius =  0.180;
+       C_gamma  =  1.0;
+       H_radius =  0.125;
+       H_gamma  =  1.0;
+    } else {
+       C_radius =  0.215;
+       C_gamma  = -1.1087;
+       H_radius =  0.150;
+       H_gamma  =  0.1237;
+    }
+
+    // for ethane all Coulomb forces are excluded since all atoms 3 or
+    // fewer bonds away from all other atoms -- is this true for H's on
+    // difference carbons? -- should be computed in 14 ixn 
+  
+    int VI = 1;
+    if( VI ){
+
+       double bornRadiusScaleFactorsEven = 1.0;
+       double bornRadiusScaleFactorsOdd  = 1.0;
+#if PRINT_ON == 1
+       (void) fprintf( stderr, "%s: Applying GB/VI\n", methodName.c_str() );
+       (void) fprintf( stderr, "C[%14.7e %14.7e %14.7e] H[%14.7e %14.7e %14.7e] scale[%.1f %.1f]\n",
+                       C_charge, C_radius, C_gamma, H_charge, H_radius, H_gamma,
+                       bornRadiusScaleFactorsEven, bornRadiusScaleFactorsOdd);
+#endif
+
+       // start with hydrogen parameters everywhere, then overwrite particles 1, 4 and 8
+       GBVISoftcoreForce* forceField             = new GBVISoftcoreForce();
+       for( int i = 0; i < numParticles; i++ ){
+          forceField->addParticle( H_charge, H_radius, H_gamma, (i%2) ? bornRadiusScaleFactorsOdd : bornRadiusScaleFactorsEven);
+          nonbonded->addParticle(  H_charge, H_radius, 0.0);
+       }
+
+       forceField->setParticleParameters( 1, C_charge, C_radius, C_gamma, bornRadiusScaleFactorsOdd);
+       nonbonded->setParticleParameters(  1, C_charge, C_radius, 0.0);
+
+       forceField->setParticleParameters( 4, C_charge, C_radius, C_gamma, bornRadiusScaleFactorsEven);
+       nonbonded->setParticleParameters(  4, C_charge, C_radius, 0.0);
+
+       // particle 8 gets an enlarged radius in the GB/VI force only
+       forceField->setParticleParameters( 8, C_charge, (C_radius+0.5), C_gamma, bornRadiusScaleFactorsEven);
+       nonbonded->setParticleParameters(  8, C_charge, C_radius, 0.0);
+
+       forceField->setBornRadiusScalingMethod( GBVISoftcoreForce::NoScaling );
+//       forceField->setBornRadiusScalingMethod( GBVISoftcoreForce::QuinticSpline );
+
+       // bonded particle pairs; 1-4 is the only carbon-carbon bond
+       const int numBonds = 7;
+       const int bondedPairs[numBonds][2] = { {0, 1}, {2, 1}, {3, 1}, {1, 4}, {5, 4}, {6, 4}, {7, 4} };
+
+       std::vector<pair<int, int> > bonds;
+       for( int i = 0; i < numBonds; i++ ){
+          const int atomA       = bondedPairs[i][0];
+          const int atomB       = bondedPairs[i][1];
+          const double distance = (atomA == 1 && atomB == 4) ? C_CBondDistance : C_HBondDistance;
+          forceField->addBond( atomA, atomB, distance );
+          bonds.push_back(pair<int, int>(atomA, atomB));
+       }
+
+       nonbonded->createExceptionsFromBonds(bonds, 0.0, 0.0);
+
+       system.addForce(forceField);
+
+    } else {
+
+#if PRINT_ON == 1
+       // report this test's own name (the message used to say "testEnergyEthane")
+       (void) fprintf( stderr, "%s: Applying GBSA OBC\n", methodName.c_str() );
+#endif
+       GBSAOBCForce* forceField = new GBSAOBCForce();
+       double H_scale           =  0.85;
+       double C_scale           =  0.72;
+       for( int i = 0; i < numParticles; i++ ){
+          forceField->addParticle( H_charge, H_radius, H_scale );
+          nonbonded->addParticle(  H_charge, 1, 0);
+       }
+
+       forceField->setParticleParameters(1, C_charge, C_radius, C_scale);
+       forceField->setParticleParameters(4, C_charge, C_radius, C_scale);
+
+       nonbonded->setParticleParameters( 1, C_charge, C_radius, 0.0);
+       nonbonded->setParticleParameters( 4, C_charge, C_radius, 0.0);
+
+       system.addForce(forceField);
+    }
+
+    system.addForce(nonbonded);
+
+    Context referenceContext(system, integrator, referencePlatform);
+    Context context(system, integrator, platform);
+    
+    vector<Vec3> positions(numParticles);
+    positions[0] = Vec3(0.5480,    1.7661,    0.0000);
+    positions[1] = Vec3(0.7286,    0.8978,    0.6468);
+    positions[2] = Vec3(0.4974,    0.0000,    0.0588);
+    positions[3] = Vec3(0.0000,    0.9459,    1.4666);
+    positions[4] = Vec3(2.1421,    0.8746,    1.1615);
+    positions[5] = Vec3(2.3239,    0.0050,    1.8065);
+    positions[6] = Vec3(2.8705,    0.8295,    0.3416);
+    positions[7] = Vec3(2.3722,    1.7711,    1.7518);
+
+    positions[8] = Vec3(2.1421,    0.8746,    2.1615);
+
+    // keep the starting geometry so every iteration can restart from it
+    const vector<Vec3> originalPositions(positions);
+
+    int tries                = 7;
+    for( int ii = 0; ii < tries; ii++ ){
+
+       context.setPositions(positions);
+       referenceContext.setPositions(positions);
+   
+       State state           = context.getState(State::Forces | State::Energy);
+       State referenceState  = referenceContext.getState(State::Forces | State::Energy);
+   
+#if PRINT_ON == 1
+       (void) fprintf( stderr, "cudaE=%14.7e refE=%14.7e\n", state.getPotentialEnergy(), referenceState.getPotentialEnergy() );
+#endif
+       
+       // Fail the test if any particle's force differs between the two contexts.
+       
+       if( compareForcesOfTwoStates( numParticles, state, referenceState, 0.001, 0.001 ) ){
+          ASSERT_EQUAL_TOL(0.0, 1.0, 0.01)
+       }
+   
+       // Norm of the full force vector and the net force on the system.
+
+       const vector<Vec3>& forces = state.getForces();
+       double norm        = 0.0;
+       double forceSum[3] = { 0.0, 0.0, 0.0 };
+       for (int i = 0; i < numParticles; ++i) {
+           Vec3 f       = forces[i];
+           norm        += f[0]*f[0] + f[1]*f[1] + f[2]*f[2];
+           forceSum[0] += f[0];
+           forceSum[1] += f[1];
+           forceSum[2] += f[2];
+       }
+       norm               = std::sqrt(norm);
+   
+#if PRINT_ON == 1
+       (void) fprintf( stderr, "Fsum [%14.7e %14.7e %14.7e] norm=%14.7e\n", forceSum[0], forceSum[1], forceSum[2], norm );
+#endif
+   
+       // Take a small step in the direction of the energy gradient.
+
+       const double delta = 1e-03;
+       double step        = delta/norm;
+       for (int i = 0; i < numParticles; ++i) {
+           Vec3 p = positions[i];
+           Vec3 f = forces[i];
+           positions[i] = Vec3(p[0]-f[0]*step, p[1]-f[1]*step, p[2]-f[2]*step);
+       }
+       context.setPositions(positions);
+       
+       State state2 = context.getState(State::Energy);
+   
+       // finite-difference estimate of the gradient norm
+       double diff  = (state2.getPotentialEnergy()-state.getPotentialEnergy())/delta;
+   
+#if PRINT_ON == 1
+       double off   = fabs( diff - norm )/norm;
+       (void) fprintf( stderr, "%2d Energies %.8e %.8e norms[%13.7e %13.7e] deltaNorms=%13.7e delta=%.2e\n",
+                       ii, state.getPotentialEnergy(), state2.getPotentialEnergy(), diff, norm, off, delta );
+#endif
+   
+       // See whether the potential energy changed by the expected amount.
+       // fabs() is used rather than abs() so the tolerance can never be
+       // computed with the integer overload and truncated.
+       
+       ASSERT_EQUAL_TOL(norm, diff, 1e-3*fabs(state.getPotentialEnergy()) );
+
+       // Restore the starting geometry and move particle 8 closer along z for the next try.
+
+       if( ii < (tries-1) ){
+           positions = originalPositions;
+       
+           positions[8][2] -=  static_cast<double>(ii+1)*0.1;
+           positions[8][2] -=  0.001;
+#if PRINT_ON == 1
+           (void) fprintf( stderr, "r48=%14.6e r28=%14.6e r24=%14.6e\n", positions[8][2]-positions[4][2], positions[8][2], positions[4][2] );
+#endif
+       }
+#if 0
+       // disabled experiment: pull every hydrogen toward its carbon
+       double positionIncrement = 0.15;
+       int carbonIndex    = 1;
+       int hydrogenIndex  = 0;
+       while( hydrogenIndex < 8 ){
+          Vec3 carbonDelta;
+          for( int kk = 0; kk < 3; kk++ ){
+             positions[hydrogenIndex][kk] += positionIncrement*(positions[carbonIndex][kk] - positions[hydrogenIndex][kk] );
+          }
+          double dist = 0.0;
+          for( int kk = 0; kk < 3; kk++ ){
+             dist += (positions[carbonIndex][kk] - positions[hydrogenIndex][kk] )*(positions[carbonIndex][kk] - positions[hydrogenIndex][kk]);
+          }
+           (void) fprintf( stderr, "H=%d C=%d r=%14.6e\n", hydrogenIndex, carbonIndex, dist );
+          hydrogenIndex++;
+          if( hydrogenIndex == carbonIndex ){
+             hydrogenIndex++;
+          }
+          if( carbonIndex == 1 && hydrogenIndex == 4 ){
+             carbonIndex    = 4;
+             hydrogenIndex  = 5;
+          }
+       }
+#endif
+
+   }
+}
+
+/**
+ * Check softcore GB/VI (or, when VI is cleared, GBSA OBC) plus nonbonded forces and energies
+ * for a linear three-particle system while the two outer particles are moved stepwise along x.
+ *
+ * Two contexts are built from the same System: one on 'platform' and one on 'referencePlatform'.
+ * For each of 'tries' configurations the forces of the two contexts are compared with
+ * compareForcesOfTwoStates(); a nonzero return triggers a deliberately failing ASSERT_EQUAL_TOL.
+ * A finite-difference estimate of the energy gradient is also evaluated and is reported
+ * when PRINT_ON == 1 (it is not asserted).
+ */
+void testTwoParticleEnergyEthaneSwitchingFunction() {
+
+    std::string methodName = "testTwoParticleEnergyEthaneSwitchingFunction";
+
+    // Platform under test.  The Cuda branch is compiled out, so 'platform' is currently a second
+    // ReferencePlatform and the comparison below is reference against reference.
+#if 0
+    CudaPlatform platform;
+    CudaFreeEnergyKernelFactory* factory  = new CudaFreeEnergyKernelFactory();
+    platform.registerKernelFactory(CalcNonbondedSoftcoreForceKernel::Name(), factory);
+    platform.registerKernelFactory(CalcGBVISoftcoreForceKernel::Name(), factory);
+    platform.registerKernelFactory(CalcGBSAOBCSoftcoreForceKernel::Name(), factory);
+#else
+
+    ReferencePlatform platform;
+
+    ReferenceFreeEnergyKernelFactory* referenceFactoryT  = new ReferenceFreeEnergyKernelFactory();
+    platform.registerKernelFactory(CalcNonbondedSoftcoreForceKernel::Name(), referenceFactoryT);
+    platform.registerKernelFactory(CalcGBVISoftcoreForceKernel::Name(), referenceFactoryT);
+    platform.registerKernelFactory(CalcGBSAOBCSoftcoreForceKernel::Name(), referenceFactoryT);
+#endif
+
+    ReferencePlatform referencePlatform;
+    ReferenceFreeEnergyKernelFactory* referenceFactory  = new ReferenceFreeEnergyKernelFactory();
+
+    referencePlatform.registerKernelFactory(CalcNonbondedSoftcoreForceKernel::Name(), referenceFactory);
+    referencePlatform.registerKernelFactory(CalcGBVISoftcoreForceKernel::Name(), referenceFactory);
+    referencePlatform.registerKernelFactory(CalcGBSAOBCSoftcoreForceKernel::Name(), referenceFactory);
+
+    System system;
+    const int numParticles = 3;
+    for( int i = 0; i < numParticles; i++ ){
+       system.addParticle(1.0);
+    }
+    LangevinIntegrator integrator(0, 0.1, 0.01);
+
+    NonbondedForce* nonbonded = new NonbondedForce();
+    nonbonded->setNonbondedMethod(NonbondedForce::NoCutoff);
+
+    double C_radius, C_gamma, H_radius, H_gamma;
+
+    const double H_charge = -0.053;
+    const double C_charge = -3.0*H_charge;
+
+    int AM1_BCC = 1;
+    if( AM1_BCC ){
+       // Only the effective values are kept: the gammas -0.2863 (C) and 0.2437 (H) used to be
+       // assigned here and were immediately overwritten with 1.0.
+       C_radius =  0.180;
+       C_gamma  =  1.0;
+       H_radius =  0.125;
+       H_gamma  =  1.0;
+    } else {
+       C_radius =  0.215;
+       C_gamma  = -1.1087;
+       H_radius =  0.150;
+       H_gamma  =  0.1237;
+    }
+
+    // For ethane all Coulomb forces are excluded since all atoms are 3 or
+    // fewer bonds away from all other atoms -- is this true for H's on
+    // different carbons? -- should be computed in the 1-4 interaction.
+
+    int VI = 1;
+    if( VI ){
+
+       double bornRadiusScaleFactorsEven = 1.0;
+       double bornRadiusScaleFactorsOdd  = 1.0;
+#if PRINT_ON == 1
+       (void) fprintf( stderr, "%s: Applying GB/VI\n", methodName.c_str() );
+       (void) fprintf( stderr, "C[%14.7e %14.7e %14.7e] H[%14.7e %14.7e %14.7e] scale[%.1f %.1f]\n",
+                       C_charge, C_radius, C_gamma, H_charge, H_radius, H_gamma,
+                       bornRadiusScaleFactorsEven, bornRadiusScaleFactorsOdd);
+#endif
+
+       GBVISoftcoreForce* forceField             = new GBVISoftcoreForce();
+       for( int i = 0; i < numParticles; i++ ){
+          forceField->addParticle( H_charge, H_radius, H_gamma, (i%2) ? bornRadiusScaleFactorsOdd : bornRadiusScaleFactorsEven);
+          nonbonded->addParticle(  H_charge, H_radius, 0.0);
+       }
+
+       // every particle is then given the carbon parameters
+       for( int i = 0; i < numParticles; i++ ){
+          forceField->setParticleParameters( i, C_charge, C_radius, C_gamma, bornRadiusScaleFactorsOdd);
+          nonbonded->setParticleParameters(  i, C_charge, C_radius, 0.0);
+       }
+
+       system.addForce(forceField);
+
+    } else {
+
+#if PRINT_ON == 1
+       (void) fprintf( stderr, "testEnergyEthane: Applying GBSA OBC\n" );
+#endif
+       GBSAOBCForce* forceField = new GBSAOBCForce();
+       double H_scale           =  0.85;
+       double C_scale           =  0.72;
+       for( int i = 0; i < numParticles; i++ ){
+          forceField->addParticle( H_charge, H_radius, H_scale );
+          nonbonded->addParticle(  H_charge, 1, 0);
+       }
+
+       // This branch used to override particles 1 and 4 (the carbon indices of the 8-atom ethane
+       // test); index 4 does not exist in this 3-particle system.  Mirror the GB/VI branch instead
+       // and give every particle the carbon parameters.
+       for( int i = 0; i < numParticles; i++ ){
+          forceField->setParticleParameters(i, C_charge, C_radius, C_scale);
+          nonbonded->setParticleParameters( i, C_charge, C_radius, 0.0);
+       }
+
+       system.addForce(forceField);
+    }
+
+    system.addForce(nonbonded);
+
+    Context referenceContext(system, integrator, referencePlatform);
+    Context context(system, integrator, platform);
+
+    vector<Vec3> positions(numParticles);
+    positions[0] = Vec3( 0.0000,    0.0000,    0.0000);
+    positions[1] = Vec3( 1.0000,    0.0000,    0.0000);
+    positions[2] = Vec3(-1.0000,    0.0000,    0.0000);
+
+    // starting geometry, restored at the end of every iteration
+    const vector<Vec3> originalPositions(positions);
+
+    const int tries = 11;
+    for( int ii = 0; ii < tries; ii++ ){
+
+       context.setPositions(positions);
+       referenceContext.setPositions(positions);
+
+       State state           = context.getState(State::Forces | State::Energy);
+       State referenceState  = referenceContext.getState(State::Forces | State::Energy);
+
+#if PRINT_ON == 1
+       (void) fprintf( stderr, "cudaE=%14.7e refE=%14.7e\n", state.getPotentialEnergy(), referenceState.getPotentialEnergy() );
+#endif
+
+       // Force a test failure if the two platforms disagree on the forces.
+
+       if( compareForcesOfTwoStates( numParticles, state, referenceState, 0.001, 0.001 ) ){
+          ASSERT_EQUAL_TOL(0.0, 1.0, 0.01)
+       }
+
+       // Norm of the full force vector and the net force on the system.
+
+       const vector<Vec3>& forces = state.getForces();
+       double norm        = 0.0;
+       double forceSum[3] = { 0.0, 0.0, 0.0 };
+       for (int i = 0; i < numParticles; ++i) {
+           Vec3 f       = forces[i];
+           norm        += f[0]*f[0] + f[1]*f[1] + f[2]*f[2];
+           forceSum[0] += f[0];
+           forceSum[1] += f[1];
+           forceSum[2] += f[2];
+       }
+       norm               = std::sqrt(norm);
+
+#if PRINT_ON == 1
+       (void) fprintf( stderr, "Fsum [%14.7e %14.7e %14.7e] norm=%14.7e\n", forceSum[0], forceSum[1], forceSum[2], norm );
+#endif
+
+       // Take a small step in the direction of the energy gradient and re-evaluate the energy.
+
+       const double delta = 1e-3;
+       double step        = delta/norm;
+       for (int i = 0; i < numParticles; ++i) {
+           Vec3 p = positions[i];
+           Vec3 f = forces[i];
+           positions[i] = Vec3(p[0]-f[0]*step, p[1]-f[1]*step, p[2]-f[2]*step);
+       }
+       context.setPositions(positions);
+
+       State state2 = context.getState(State::Energy);
+
+       // The finite-difference slope is only reported; the corresponding assertion
+       //    ASSERT_EQUAL_TOL(norm, (state2.getPotentialEnergy()-state.getPotentialEnergy())/delta, 0.01)
+       // is currently disabled.
+
+#if PRINT_ON == 1
+       double diff  = (state2.getPotentialEnergy()-state.getPotentialEnergy())/delta;
+       double off   = fabs( diff - norm );
+       (void) fprintf( stderr, "X Energies %.8e %.8e norms[%14.7e %14.7e] deltaNorms=%14.7e delta=%.2e\n",
+                       state.getPotentialEnergy(), state2.getPotentialEnergy(), diff, norm, off, delta );
+#endif
+
+       // Restore the starting geometry, then move the two outer particles along x
+       // for the next iteration.
+
+       positions = originalPositions;
+
+       positions[1][0] -=  static_cast<double>(ii+1)*0.1;
+       positions[2][0] +=  static_cast<double>(ii+1)*0.1;
+       positions[1][0] -=  0.001;
+       positions[2][0] +=  0.001;
+#if PRINT_ON == 1
+       (void) fprintf( stderr, "r12=%14.6e\n", positions[1][0]);
+#endif
+   }
+}
+
+/**
+ * Finite-difference check on the Cuda platform for a two-particle system: after a small
+ * displacement along the computed forces, the change in potential energy divided by the
+ * displacement must match the norm of the force vector (ASSERT_EQUAL_TOL, tolerance 0.01).
+ *
+ * The system carries a GBVISoftcoreForce (or a GBSAOBCForce when useGBVI is cleared) and a
+ * NonbondedForce without cutoff.
+ */
+void testEnergyTwoParticle() {
+
+    CudaPlatform platform;
+    const int numParticles = 2;
+    System system;
+    for( int particle = 0; particle < numParticles; ++particle ){
+       system.addParticle(1.0);
+    }
+    LangevinIntegrator integrator(0, 0.1, 0.01);
+
+    // initial distance between the two particles along x
+    const double separation = 3.0;
+
+    // implicit-solvent parameters of the two particles
+    // (an alternative set with all magnitudes equal to 1.0 was tried previously)
+    const double H_charge = -0.5,  C_charge = 0.5;
+    const double H_gamma  =  0.5,  C_gamma  = 0.5;
+    const double H_radius =  0.15, C_radius = 0.15;
+
+    const int useGBVI = 1;
+    if( useGBVI ){
+       (void) fprintf( stderr, "Applying GB/VI\n" );
+       GBVISoftcoreForce* gbvi = new GBVISoftcoreForce();
+       gbvi->addParticle( H_charge, H_radius, H_gamma);
+       gbvi->addParticle( C_charge, C_radius, C_gamma);
+       system.addForce(gbvi);
+    } else {
+       (void) fprintf( stderr, "Applying GBSA OBC\n" );
+       GBSAOBCForce* obc = new GBSAOBCForce();
+       obc->addParticle( H_charge, H_radius, 0.8);
+       obc->addParticle( C_charge, C_radius, 0.8);
+       system.addForce(obc);
+    }
+
+    // alternating unit charges: even particles -1, odd particles +1
+    NonbondedForce* nonbonded = new NonbondedForce();
+    for( int particle = 0; particle < numParticles; ++particle ){
+        const double charge = (particle%2 == 0) ? -1.0 : 1.0;
+        nonbonded->addParticle( charge, 1.0, 0.0 );
+    }
+    nonbonded->setNonbondedMethod(NonbondedForce::NoCutoff);
+    system.addForce(nonbonded);
+
+    Context context(system, integrator, platform);
+
+    vector<Vec3> positions(numParticles);
+    positions[0] = Vec3(       0.0, 0.0, 0.0);
+    positions[1] = Vec3(separation, 0.0, 0.0);
+    context.setPositions(positions);
+
+    State before = context.getState(State::Forces | State::Energy);
+
+    // Accumulate the squared force norm and the net force.
+
+    double norm        = 0.0;
+    double forceSum[3] = { 0.0, 0.0, 0.0 };
+    for( int particle = 0; particle < numParticles; ++particle ){
+        Vec3 f  = before.getForces()[particle];
+#if PRINT_ON == 1
+        (void) fprintf( stderr, "F %d [%14.6e %14.6e %14.6e]\n", particle, f[0], f[1], f[2] );
+#endif
+        norm += f[0]*f[0] + f[1]*f[1] + f[2]*f[2];
+        for( int axis = 0; axis < 3; ++axis ){
+            forceSum[axis] += f[axis];
+        }
+    }
+    norm = std::sqrt(norm);
+
+#if PRINT_ON == 1
+    (void) fprintf( stderr, "Fsum [%14.6e %14.6e %14.6e] norm=%14.6e\n", forceSum[0], forceSum[1], forceSum[2], norm );
+#endif
+
+    // Take a small step in the direction of the energy gradient.
+
+    const double delta = 1e-4;
+    const double step  = delta/norm;
+    for( int particle = 0; particle < numParticles; ++particle ){
+        const Vec3 p = positions[particle];
+        const Vec3 f = before.getForces()[particle];
+        positions[particle] = Vec3(p[0]-f[0]*step, p[1]-f[1]*step, p[2]-f[2]*step);
+    }
+    context.setPositions(positions);
+
+    State after = context.getState(State::Energy);
+
+    const double energyBefore = before.getPotentialEnergy();
+    const double energyAfter  = after.getPotentialEnergy();
+    const double slope        = (energyAfter-energyBefore)/delta;
+    double diff               = fabs( norm - slope );
+#if PRINT_ON == 1
+    (void) fprintf( stderr, "Energies %14.6e %14.6e diff=%14.6e [%14.6e %14.6e]\n",
+                    energyBefore, energyAfter, diff, norm, slope );
+#endif
+
+    // See whether the potential energy changed by the expected amount.
+
+    ASSERT_EQUAL_TOL(norm, slope, 0.01)
+}
+
+
+/**
+ * Build a Cuda context for numParticles identical particles placed one unit apart on the
+ * x axis, with a NonbondedForce (no cutoff) plus either a GBVISoftcoreForce or a GBSAOBCForce.
+ *
+ * Only construction of the context and setPositions() are exercised: no state is requested,
+ * and the finite-difference energy/force check that used to follow is disabled.
+ *
+ * @param numParticles  number of particles added to the system
+ */
+void testEnergyManyParticles( int numParticles ) {
+
+    CudaPlatform platform;
+    System system;
+    int particle = 0;
+    while( particle < numParticles ){
+       system.addParticle(1.0);
+       ++particle;
+    }
+    LangevinIntegrator integrator(0, 0.1, 0.01);
+
+    // parameters shared by every particle
+    const double H_charge = -0.5;
+    const double H_gamma  =  0.5;
+    const double H_radius =  0.15;
+
+    NonbondedForce* nonbonded = new NonbondedForce();
+    nonbonded->setNonbondedMethod(NonbondedForce::NoCutoff);
+
+    // 0 selects GBSA OBC, nonzero selects GB/VI
+    const int useGBVI = 0;
+    if( useGBVI ){
+
+#if PRINT_ON == 1
+       (void) fprintf( stderr, "testEnergyManyParticles: Applying GB/VI\n" );
+#endif
+       GBVISoftcoreForce* gbvi = new GBVISoftcoreForce();
+       for( particle = 0; particle < numParticles; ++particle ){
+          gbvi->addParticle( H_charge, H_radius, H_gamma);
+          nonbonded->addParticle( H_charge, H_radius, 0.0);
+       }
+       system.addForce(gbvi);
+
+    } else {
+
+#if PRINT_ON == 1
+       (void) fprintf( stderr, "testEnergyManyParticles: Applying GBSA OBC\n" );
+#endif
+       GBSAOBCForce* obc = new GBSAOBCForce();
+       for( particle = 0; particle < numParticles; ++particle ){
+          obc->addParticle(H_charge, H_radius, 0.8);
+          nonbonded->addParticle( H_charge, H_radius, 0.0);
+       }
+       system.addForce(obc);
+    }
+    system.addForce(nonbonded);
+
+    Context context(system, integrator, platform);
+
+    // particle ii sits at x = ii on the x axis
+    vector<Vec3> positions(numParticles);
+    for( particle = 0; particle < numParticles; ++particle ){
+       positions[particle] = Vec3( static_cast<double>(particle), 0.0, 0.0 );
+    }
+    context.setPositions(positions);
+}
+
+
+/**
+ * Compare GBVISoftcoreForce + NonbondedForce results from the Cuda platform with the
+ * Reference platform for randomly perturbed lattice positions.
+ *
+ * Two checks are made:
+ *   1. the norm of the force difference between the platforms must be within 0.001 of the
+ *      norm of the Cuda forces;
+ *   2. for NonbondedForce::NoCutoff only, the energy change after a small step along the
+ *      forces must match the force norm (finite-difference check).
+ *
+ * @param numParticles  number of particles; the callers in this file pass perfect cubes
+ * @param method        nonbonded method applied to the NonbondedForce
+ */
+void testForce(int numParticles, NonbondedForce::NonbondedMethod method) {
+    CudaPlatform cuda;
+    ReferencePlatform reference;
+    System system;
+    LangevinIntegrator integrator(0, 0.1, 0.01);
+
+    GBVISoftcoreForce* gbsa    = new GBVISoftcoreForce();
+    NonbondedForce* nonbonded  = new NonbondedForce();
+
+    double radius              = 0.15;
+    double gamma               = 0.0;
+
+    for (int i = 0; i < numParticles; ++i) {
+        system.addParticle(1.0);
+        double charge = i%2 == 0 ? -1 : 1;
+        gbsa->addParticle(charge, radius, gamma);
+        nonbonded->addParticle(charge, 1, 0);
+    }
+
+    nonbonded->setNonbondedMethod(method);
+    nonbonded->setCutoffDistance(3.0);
+
+    // number of lattice points per edge: cube root of numParticles, rounded to nearest
+    int grid = (int) floor(0.5+pow(static_cast<double>(numParticles), 1.0/3.0));
+    if (method == NonbondedForce::CutoffPeriodic) {
+        double boxSize = (grid+1)*2.0;
+        system.setPeriodicBoxVectors(Vec3(boxSize, 0, 0), Vec3(0, boxSize, 0), Vec3(0, 0, boxSize));
+    }
+    system.addForce(gbsa);
+    system.addForce(nonbonded);
+
+    Context context(system, integrator, cuda);
+    Context refContext(system, integrator, reference);
+
+    // Set random (but uniformly distributed) positions for all the particles.
+
+    vector<Vec3> positions(numParticles);
+    init_gen_rand(0);
+    for (int i = 0; i < grid; i++)
+        for (int j = 0; j < grid; j++)
+            for (int k = 0; k < grid; k++) {
+                // grid^3 can exceed numParticles when numParticles is not a perfect cube;
+                // never write past the end of the vector
+                const int index = i*grid*grid+j*grid+k;
+                if (index < numParticles)
+                    positions[index] = Vec3(i*0.5, j*0.5, k*0.5);
+            }
+    for (int i = 0; i < numParticles; ++i)
+        positions[i] = positions[i] + Vec3(0.5*genrand_real2(), 0.5*genrand_real2(), 0.5*genrand_real2());
+    context.setPositions(positions);
+    refContext.setPositions(positions);
+    State state = context.getState(State::Forces | State::Energy);
+    State refState = refContext.getState(State::Forces | State::Energy);
+
+    // Make sure the Cuda and Reference platforms agree.
+
+    const vector<Vec3>& forces    = state.getForces();
+    const vector<Vec3>& refForces = refState.getForces();
+
+    double norm = 0.0;
+    double diff = 0.0;
+    for (int i = 0; i < numParticles; ++i) {
+        Vec3 f = forces[i];
+        Vec3 g = refForces[i];
+        Vec3 delta = f-g;
+        norm += f[0]*f[0] + f[1]*f[1] + f[2]*f[2];
+        // The difference must be accumulated whether or not printing is enabled; it used to
+        // sit inside the PRINT_ON block, which left diff at zero and made the assertion
+        // below pass unconditionally in non-printing builds.
+        diff += delta[0]*delta[0] + delta[1]*delta[1] + delta[2]*delta[2];
+#if PRINT_ON == 1
+        fprintf( stderr, "FFF %d fcud[%14.6e %14.6e %14.6e] [%14.6e %14.6e %14.6e]\n", i, f[0], f[1], f[2], g[0], g[1], g[2] );
+#endif
+    }
+    norm = std::sqrt(norm);
+    diff = std::sqrt(diff);
+
+#if PRINT_ON == 1
+    (void) fprintf( stderr, "F norm%14.6e diff w/ ref=%14.6e\n", norm, diff );
+#endif
+    ASSERT_EQUAL_TOL(0.0, diff, 0.001*norm);
+
+    // Take a small step in the direction of the energy gradient.  (This doesn't work with cutoffs, since the energy
+    // changes discontinuously at the cutoff distance.)
+
+    if (method == NonbondedForce::NoCutoff)
+    {
+        const double delta = 1e-2;
+        double step = delta/norm;
+        for (int i = 0; i < numParticles; ++i) {
+            Vec3 p = positions[i];
+            Vec3 f = forces[i];
+            positions[i] = Vec3(p[0]-f[0]*step, p[1]-f[1]*step, p[2]-f[2]*step);
+        }
+        context.setPositions(positions);
+
+        // See whether the potential energy changed by the expected amount.
+        // std::fabs is used so the tolerance is never computed with the integer abs() overload.
+
+        State state2 = context.getState(State::Energy);
+        ASSERT_EQUAL_TOL(norm, (state2.getPotentialEnergy()-state.getPotentialEnergy())/delta, 1e-3*std::fabs(state.getPotentialEnergy()));
+    }
+}
+
+/**
+ * Test driver.  Only testEnergyEthaneSwitchingFunction() is currently run; the other tests
+ * (testEnergyEthane, testTwoParticleEnergyEthaneSwitchingFunction, testSingleParticle,
+ * testCutoffAndPeriodic, testEnergyTwoParticle, and testForce(i*i*i, NonbondedForce::NoCutoff)
+ * for i = 2..7) are disabled.
+ *
+ * @return 0 after printing "Done", or 1 if the test threw an exception
+ */
+int main() {
+    bool failed = false;
+    try {
+        testEnergyEthaneSwitchingFunction();
+    }
+    catch(const exception& error) {
+        cout << "exception: " << error.what() << endl;
+        failed = true;
+    }
+    if (failed)
+        return 1;
+
+    cout << "Done" << endl;
+    return 0;
+}
--- a/plugins/freeEnergy/platforms/reference/include/ReferenceFreeEnergyKernelFactory.h
+++ b/plugins/freeEnergy/platforms/reference/include/ReferenceFreeEnergyKernelFactory.h
+#ifndef OPENMM_REFERENCE_FREE_ENERGY_KERNEL_FACTORY_H_
+#define OPENMM_REFERENCE_FREE_ENERGY_KERNEL_FACTORY_H_
+
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "openmm/KernelFactory.h"
+
+namespace OpenMM {
+
+/**
+ * This KernelFactory creates the reference-platform implementations of the free energy
+ * (softcore) kernels: CalcNonbondedSoftcoreForceKernel, CalcGBSAOBCSoftcoreForceKernel and
+ * CalcGBVISoftcoreForceKernel.
+ *
+ * createKernelImpl() returns a newly allocated kernel for one of these names; its definition
+ * in ReferenceFreeEnergyKernelFactory.cpp throws an OpenMMException for any other name.
+ */
+
+class ReferenceFreeEnergyKernelFactory : public KernelFactory {
+public:
+    KernelImpl* createKernelImpl(std::string name, const Platform& platform, ContextImpl& context) const;
+};
+
+} // namespace OpenMM
+
+#endif /*OPENMM_REFERENCE_FREE_ENERGY_KERNEL_FACTORY_H_*/
--- a/plugins/freeEnergy/platforms/reference/src/ReferenceFreeEnergyKernelFactory.cpp
+++ b/plugins/freeEnergy/platforms/reference/src/ReferenceFreeEnergyKernelFactory.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "ReferenceFreeEnergyKernelFactory.h"
+#include "ReferenceFreeEnergyKernels.h"
+#include "openmm/internal/ContextImpl.h"
+#include "openmm/OpenMMException.h"
+
+// using PluginInitializer.h and initOpenMMPlugin() does not seem to work
+//#include "openmm/PluginInitializer.h"
+
+#if defined(OPENMM_BUILDING_SHARED_LIBRARY) // shared-library builds only: hook the plugin initializer into library load
+    #if defined(WIN32)
+      #include <windows.h>
+        extern "C" void initOpenMMReferenceFreeEnergyPlugin();
+        BOOL WINAPI DllMain(HANDLE hModule, DWORD  ul_reason_for_call, LPVOID lpReserved) {
+            if (ul_reason_for_call == DLL_PROCESS_ATTACH) // run the initializer when the DLL is attached to a process
+                initOpenMMReferenceFreeEnergyPlugin();
+            return TRUE;
+        }
+    #else
+        extern "C" void __attribute__((constructor)) initOpenMMReferenceFreeEnergyPlugin(); // non-Windows: declare the initializer with the constructor attribute
+    #endif
+#endif
+
+using namespace OpenMM;
+
+/**
+ * Plugin entry point: creates a ReferencePlatform, attaches a ReferenceFreeEnergyKernelFactory
+ * to it for each of the three softcore kernel names, and registers the platform with
+ * Platform::registerPlatform().
+ */
+extern "C" void initOpenMMReferenceFreeEnergyPlugin() {
+
+    ReferencePlatform* platform                     = new ReferencePlatform();
+    ReferenceFreeEnergyKernelFactory* kernelFactory = new ReferenceFreeEnergyKernelFactory();
+
+    // kernel names served by the factory, in registration order
+    const std::string kernelNames[] = {
+        CalcNonbondedSoftcoreForceKernel::Name(),
+        CalcGBSAOBCSoftcoreForceKernel::Name(),
+        CalcGBVISoftcoreForceKernel::Name()
+    };
+    const int numKernels = static_cast<int>(sizeof(kernelNames)/sizeof(kernelNames[0]));
+
+    for (int kernel = 0; kernel < numKernels; ++kernel)
+        platform->registerKernelFactory(kernelNames[kernel], kernelFactory);
+
+    Platform::registerPlatform(platform);
+}
+
+/**
+ * Create the reference implementation of a free energy kernel.
+ *
+ * @param name      kernel name; must equal the Name() of CalcNonbondedSoftcoreForceKernel,
+ *                  CalcGBSAOBCSoftcoreForceKernel or CalcGBVISoftcoreForceKernel
+ * @param platform  platform handed to the kernel constructor
+ * @param context   context the kernel is created for (not used by any of the three kernels)
+ * @return a newly allocated kernel implementation
+ * @throws OpenMMException if name is not one of the supported kernel names
+ */
+KernelImpl* ReferenceFreeEnergyKernelFactory::createKernelImpl(std::string name, const Platform& platform, ContextImpl& context) const {
+
+    // The platform data of the context was previously fetched and dereferenced here without
+    // ever being used; none of the kernels below needs it, so the lookup has been dropped.
+
+    if (name == CalcNonbondedSoftcoreForceKernel::Name())
+        return new ReferenceFreeEnergyCalcNonbondedSoftcoreForceKernel(name, platform);
+
+    if (name == CalcGBSAOBCSoftcoreForceKernel::Name())
+        return new ReferenceFreeEnergyCalcGBSAOBCSoftcoreForceKernel(name, platform);
+
+    if (name == CalcGBVISoftcoreForceKernel::Name())
+        return new ReferenceFreeEnergyCalcGBVISoftcoreForceKernel(name, platform);
+
+    throw OpenMMException( (std::string("Tried to create kernel with illegal kernel name '") + name + "'").c_str() );
+}
--- a/plugins/freeEnergy/platforms/reference/src/ReferenceFreeEnergyKernels.cpp
+++ b/plugins/freeEnergy/platforms/reference/src/ReferenceFreeEnergyKernels.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008-2009 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "ReferenceFreeEnergyKernels.h"
+#include "gbsa/CpuGBVISoftcore.h"
+#include "gbsa/CpuObcSoftcore.h"
+#include "SimTKReference/ReferenceFreeEnergyLJCoulomb14Softcore.h"
+#include "SimTKReference/ReferenceFreeEnergyLJCoulombSoftcoreIxn.h"
+#include "ReferenceBondForce.h"
+#include "openmm/System.h"
+#include "openmm/internal/ContextImpl.h"
+#include "SimTKUtilities/SimTKOpenMMUtilities.h"
+#include <cmath>
+#include <limits>
+
+using namespace OpenMM;
+
+// Allocate 'length' rows of 'width' ints each.  The elements are left uninitialized;
+// the result is released with disposeIntArray(array, length).
+static int** allocateIntArray(int length, int width) {
+    int** rows      = new int*[length];
+    int** const end = rows + length;
+    for (int** row = rows; row != end; ++row)
+        *row = new int[width];
+    return rows;
+}
+
+// Allocate 'length' rows of 'width' RealOpenMM values each.  The elements are left
+// uninitialized; the result is released with disposeRealArray(array, length).
+static RealOpenMM** allocateRealArray(int length, int width) {
+    RealOpenMM** rows      = new RealOpenMM*[length];
+    RealOpenMM** const end = rows + length;
+    for (RealOpenMM** row = rows; row != end; ++row)
+        *row = new RealOpenMM[width];
+    return rows;
+}
+
+// Copy a vector of int rows into a newly allocated int** table (see allocateIntArray).
+//
+// The argument is taken by const reference so the nested vector is not deep-copied on every
+// call.  All rows are allocated as wide as the widest input row: the table used to be sized
+// from vec[0] alone, so any longer row was written past the end of its allocation.
+// An empty input yields a zero-length (non-null) table, as before.
+static int** copyToArray(const std::vector<std::vector<int> >& vec) {
+    size_t width = 0;
+    for (size_t i = 0; i < vec.size(); ++i)
+        if (vec[i].size() > width)
+            width = vec[i].size();
+
+    int** array = allocateIntArray(static_cast<int>(vec.size()), static_cast<int>(width));
+    for (size_t i = 0; i < vec.size(); ++i)
+        for (size_t j = 0; j < vec[i].size(); ++j)
+            array[i][j] = vec[i][j];
+    return array;
+}
+
+// Copy a vector-of-vectors of doubles into a freshly allocated RealOpenMM**
+// table, converting each value to RealOpenMM.
+//
+// Every row of the result is as wide as the longest input row, so a row that
+// is longer than row 0 can no longer be written past the end of its buffer.
+// Entries beyond a short row's own length are left uninitialized. An empty
+// input yields a zero-length (non-NULL) outer array. The caller owns the
+// returned table.
+static RealOpenMM** copyToArray(const std::vector<std::vector<double> >& vec) {
+    // Width of the widest row; equals vec[0].size() for rectangular input.
+    size_t width = 0;
+    for (size_t i = 0; i < vec.size(); ++i)
+        if (vec[i].size() > width)
+            width = vec[i].size();
+
+    RealOpenMM** array = new RealOpenMM*[vec.size()];
+    for (size_t i = 0; i < vec.size(); ++i) {
+        array[i] = new RealOpenMM[width];
+        for (size_t j = 0; j < vec[i].size(); ++j)
+            array[i][j] = static_cast<RealOpenMM>(vec[i][j]);
+    }
+    return array;
+}
+
+// Free a table whose rows were allocated with new[]: releases the first `size`
+// rows and then the row-pointer array itself. A NULL table is ignored.
+static void disposeIntArray(int** array, int size) {
+    if (!array)
+        return;
+    for (int row = size - 1; row >= 0; --row)
+        delete[] array[row];
+    delete[] array;
+}
+
+// Free a table whose rows were allocated with new[]: releases the first `size`
+// rows and then the row-pointer array itself. A NULL table is ignored.
+static void disposeRealArray(RealOpenMM** array, int size) {
+    if (!array)
+        return;
+    for (int row = size - 1; row >= 0; --row)
+        delete[] array[row];
+    delete[] array;
+}
+
+// Return the `positions` member of the context's ReferencePlatform data,
+// viewed as a RealOpenMM** table.
+static RealOpenMM** extractPositions(ContextImpl& context) {
+    return (RealOpenMM**) reinterpret_cast<ReferencePlatform::PlatformData*>(context.getPlatformData())->positions;
+}
+
+// Return the `velocities` member of the context's ReferencePlatform data,
+// viewed as a RealOpenMM** table.
+static RealOpenMM** extractVelocities(ContextImpl& context) {
+    return (RealOpenMM**) reinterpret_cast<ReferencePlatform::PlatformData*>(context.getPlatformData())->velocities;
+}
+
+// Return the `forces` member of the context's ReferencePlatform data,
+// viewed as a RealOpenMM** table.
+static RealOpenMM** extractForces(ContextImpl& context) {
+    return (RealOpenMM**) reinterpret_cast<ReferencePlatform::PlatformData*>(context.getPlatformData())->forces;
+}
+
+// Release the parameter/exclusion/1-4 tables built by initialize() and the
+// neighbor list (if one was created).
+ReferenceFreeEnergyCalcNonbondedSoftcoreForceKernel::~ReferenceFreeEnergyCalcNonbondedSoftcoreForceKernel() {
+    // per-particle tables
+    disposeRealArray(particleParamArray, numParticles);
+    disposeIntArray(exclusionArray, numParticles);
+
+    // per-1-4-pair tables
+    disposeIntArray(bonded14IndexArray, num14);
+    disposeRealArray(bonded14ParamArray, num14);
+
+    // deleting a null pointer is a no-op, so no explicit NULL test is needed
+    delete neighborList;
+}
+
+/**
+ * Initialize the kernel from a NonbondedSoftcoreForce.
+ *
+ * Builds the per-particle parameter table, the per-particle exclusion lists
+ * and the 1-4 exception tables, and caches the nonbonded method, cutoff
+ * distance, periodic box size and reaction-field dielectric.
+ *
+ * @param system     the System this kernel will be applied to
+ * @param force      the NonbondedSoftcoreForce this kernel will be used for
+ */
+void ReferenceFreeEnergyCalcNonbondedSoftcoreForceKernel::initialize(const System& system, const NonbondedSoftcoreForce& force) {
+
+    // Every exception excludes its pair from the regular pair loop; exceptions
+    // with a nonzero charge product or epsilon are additionally kept as 1-4
+    // interactions.
+
+    numParticles = force.getNumParticles();
+    exclusions.resize(numParticles);
+    std::vector<int> nb14s;
+    for (int i = 0; i < force.getNumExceptions(); i++) {
+        int particle1, particle2;
+        double chargeProd, sigma, epsilon, softcoreLJLambda;
+        force.getExceptionParameters(i, particle1, particle2, chargeProd, sigma, epsilon, softcoreLJLambda);
+        exclusions[particle1].insert(particle2);
+        exclusions[particle2].insert(particle1);
+        if (chargeProd != 0.0 || epsilon != 0.0)
+            nb14s.push_back(i);
+    }
+
+    // Per-particle parameters: [0] = radius/2, [1] = 2*sqrt(depth),
+    // [2] = charge*sqrt(138.935485), [3] = softcore LJ lambda.
+    // NOTE(review): 138.935485 is presumably the Coulomb constant in OpenMM
+    // units -- confirm.
+
+    particleParamArray = allocateRealArray(numParticles, 4);
+    RealOpenMM sqrtEps = static_cast<RealOpenMM>( std::sqrt(138.935485) );
+    for (int i = 0; i < numParticles; ++i) {
+        double charge, radius, depth, softcoreLJLambda;
+        force.getParticleParameters(i, charge, radius, depth,softcoreLJLambda);
+        particleParamArray[i][0] = static_cast<RealOpenMM>(0.5*radius);
+        particleParamArray[i][1] = static_cast<RealOpenMM>(2.0*sqrt(depth));
+        particleParamArray[i][2] = static_cast<RealOpenMM>(charge*sqrtEps);
+        particleParamArray[i][3] = static_cast<RealOpenMM>(softcoreLJLambda);
+    }
+
+    // Flatten the exclusion sets: row i holds the count in slot 0 followed by
+    // the excluded particle indices.
+
+    exclusionArray   = new int*[numParticles];
+    for (int i = 0; i < numParticles; ++i) {
+        exclusionArray[i]    = new int[exclusions[i].size()+1];
+        exclusionArray[i][0] = static_cast<int>(exclusions[i].size());
+        int index = 0;
+        for (std::set<int>::const_iterator iter = exclusions[i].begin(); iter != exclusions[i].end(); ++iter)
+            exclusionArray[i][++index] = *iter;
+    }
+
+    // 1-4 tables: [0] = sigma, [1] = 4*epsilon, [2] = chargeProd*138.935485,
+    // [3] = softcore LJ lambda.
+
+    num14              = static_cast<int>(nb14s.size());
+    bonded14IndexArray = allocateIntArray(num14, 2);
+    bonded14ParamArray = allocateRealArray(num14, 4);
+    for (int i = 0; i < num14; ++i) {
+        int particle1, particle2;
+        double charge, radius, depth, softcoreLJLambda;
+        force.getExceptionParameters(nb14s[i], particle1, particle2, charge, radius, depth, softcoreLJLambda);
+        bonded14IndexArray[i][0] = particle1;
+        bonded14IndexArray[i][1] = particle2;
+        bonded14ParamArray[i][0] = static_cast<RealOpenMM>(radius);
+        bonded14ParamArray[i][1] = static_cast<RealOpenMM>(4.0*depth);
+        bonded14ParamArray[i][2] = static_cast<RealOpenMM>(charge*sqrtEps*sqrtEps);
+        bonded14ParamArray[i][3] = static_cast<RealOpenMM>(softcoreLJLambda);
+    }
+
+    nonbondedMethod  = CalcNonbondedSoftcoreForceKernel::NonbondedSoftcoreMethod(force.getNonbondedMethod());
+    nonbondedCutoff  = (RealOpenMM) force.getCutoffDistance();
+    //softCoreLJLambda = (RealOpenMM) force.getSoftCoreLJLambda();
+
+    // Only the diagonal of the box vectors is kept.
+
+    Vec3 boxVectors[3];
+    system.getPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
+    periodicBoxSize[0] = (RealOpenMM) boxVectors[0][0];
+    periodicBoxSize[1] = (RealOpenMM) boxVectors[1][1];
+    periodicBoxSize[2] = (RealOpenMM) boxVectors[2][2];
+
+    if (nonbondedMethod == NoCutoff)
+        neighborList = NULL;
+    else
+        neighborList = new NeighborList();
+
+    // The Ewald/PME parameter setup below is compiled out, but executeForces()
+    // and executeEnergy() still read ewaldAlpha and kmax for those methods.
+    // Give them defined values so they are never read uninitialized.
+
+    ewaldAlpha = 0;
+    kmax[0]    = 0;
+    kmax[1]    = 0;
+    kmax[2]    = 0;
+
+#if 0
+    if (nonbondedMethod == Ewald || nonbondedMethod == PME) {
+        RealOpenMM ewaldErrorTol = (RealOpenMM) force.getEwaldErrorTolerance();
+        ewaldAlpha = (RealOpenMM) (std::sqrt(-std::log(ewaldErrorTol))/nonbondedCutoff);
+        RealOpenMM mx = periodicBoxSize[0]/nonbondedCutoff;
+        RealOpenMM my = periodicBoxSize[1]/nonbondedCutoff;
+        RealOpenMM mz = periodicBoxSize[2]/nonbondedCutoff;
+        RealOpenMM pi = (RealOpenMM) 3.1415926535897932385;
+        kmax[0] = (int)std::ceil(-(mx/pi)*std::log(ewaldErrorTol));
+        kmax[1] = (int)std::ceil(-(my/pi)*std::log(ewaldErrorTol));
+        kmax[2] = (int)std::ceil(-(mz/pi)*std::log(ewaldErrorTol));
+        if (kmax[0]%2 == 0)
+            kmax[0]++;
+        if (kmax[1]%2 == 0)
+            kmax[1]++;
+        if (kmax[2]%2 == 0)
+            kmax[2]++;
+    }
+#endif
+    rfDielectric = (RealOpenMM)force.getReactionFieldDielectric();
+}
+
+/**
+ * Execute the kernel to calculate the forces.
+ *
+ * Runs the softcore LJ/Coulomb pair interaction and then the 1-4 exception
+ * interaction, both writing into the context's force array.
+ *
+ * @param context    the context in which to execute this kernel
+ */
+void ReferenceFreeEnergyCalcNonbondedSoftcoreForceKernel::executeForces(ContextImpl& context) {
+
+    RealOpenMM** positions = extractPositions(context);
+    RealOpenMM** forces    = extractForces(context);
+
+    const bool useEwald       = (nonbondedMethod == Ewald);
+    const bool usePme         = (nonbondedMethod == PME);
+    const bool usePeriodicBox = (nonbondedMethod == CutoffPeriodic) || useEwald || usePme;
+
+    ReferenceFreeEnergyLJCoulombSoftcoreIxn pairIxn;
+
+    // every method except NoCutoff rebuilds the neighbor list first
+
+    if (nonbondedMethod != NoCutoff) {
+        computeNeighborListVoxelHash(*neighborList, numParticles, positions, exclusions, usePeriodicBox ? periodicBoxSize : NULL, nonbondedCutoff, 0.0);
+        pairIxn.setUseCutoff(nonbondedCutoff, *neighborList, rfDielectric);
+    }
+
+    if (usePeriodicBox) {
+        pairIxn.setPeriodic(periodicBoxSize);
+    }
+    if (useEwald) {
+        pairIxn.setUseEwald(ewaldAlpha, kmax[0], kmax[1], kmax[2]);
+    }
+    if (usePme) {
+        pairIxn.setUsePME(ewaldAlpha);
+    }
+
+    pairIxn.calculatePairIxn(numParticles, positions, particleParamArray, exclusionArray, 0, forces, 0, 0);
+
+    // 1-4 exceptions; the cutoff is applied only for the two plain cutoff methods
+
+    ReferenceBondForce bondForce;
+    ReferenceFreeEnergyLJCoulomb14Softcore ixn14;
+    const bool plainCutoff = (nonbondedMethod == CutoffNonPeriodic || nonbondedMethod == CutoffPeriodic);
+    if (plainCutoff) {
+        ixn14.setUseCutoff(nonbondedCutoff, rfDielectric);
+    }
+    bondForce.calculateForce(num14, bonded14IndexArray, positions, bonded14ParamArray, forces, 0, 0, 0, ixn14);
+}
+
+/**
+ * Execute the kernel to calculate the energy.
+ *
+ * Runs the same pair and 1-4 interactions as the force path, but directs the
+ * force output into a temporary buffer that is discarded, and accumulates
+ * the total energy.
+ *
+ * @param context    the context in which to execute this kernel
+ * @return the potential energy due to the NonbondedSoftcoreForce
+ */
+double ReferenceFreeEnergyCalcNonbondedSoftcoreForceKernel::executeEnergy(ContextImpl& context) {
+
+    RealOpenMM** posData   = extractPositions(context);
+
+    // Scratch force buffer. The interaction routines accumulate into it with
+    // +=, so it must start from defined values rather than raw heap contents.
+
+    RealOpenMM** forceData = allocateRealArray(numParticles, 3);
+    for (int i = 0; i < numParticles; ++i)
+        for (int j = 0; j < 3; ++j)
+            forceData[i][j] = 0;
+
+    RealOpenMM energy = 0;
+    ReferenceFreeEnergyLJCoulombSoftcoreIxn clj;
+   // clj.setSoftCoreLJLambda( softCoreLJLambda );
+    bool periodic = (nonbondedMethod == CutoffPeriodic);
+    bool ewald  = (nonbondedMethod == Ewald);
+    bool pme  = (nonbondedMethod == PME);
+    if (nonbondedMethod != NoCutoff) {
+        computeNeighborListVoxelHash(*neighborList, numParticles, posData, exclusions, (periodic || ewald || pme) ? periodicBoxSize : NULL, nonbondedCutoff, 0.0);
+        clj.setUseCutoff(nonbondedCutoff, *neighborList, rfDielectric);
+    }
+    if (periodic || ewald || pme)
+        clj.setPeriodic(periodicBoxSize);
+    if (ewald)
+        clj.setUseEwald(ewaldAlpha, kmax[0], kmax[1], kmax[2]);
+    if (pme)
+        clj.setUsePME(ewaldAlpha);
+    clj.calculatePairIxn(numParticles, posData, particleParamArray, exclusionArray, 0, forceData, 0, &energy);
+
+    // 1-4 exceptions; the cutoff is applied only for the two plain cutoff methods.
+
+    ReferenceBondForce refBondForce;
+    ReferenceFreeEnergyLJCoulomb14Softcore nonbonded14;
+    if (nonbondedMethod == CutoffNonPeriodic || nonbondedMethod == CutoffPeriodic)
+        nonbonded14.setUseCutoff(nonbondedCutoff, rfDielectric);
+
+    // Per-pair energy scratch array, zeroed before use.
+
+    RealOpenMM* energyArray = new RealOpenMM[num14];
+    for (int i = 0; i < num14; ++i)
+        energyArray[i] = 0;
+    refBondForce.calculateForce(num14, bonded14IndexArray, posData, bonded14ParamArray, forceData, energyArray, 0, &energy, nonbonded14);
+    disposeRealArray(forceData, numParticles);
+    delete[] energyArray;
+
+    return energy;
+}
+
+// Destroy the CpuObcSoftcore object created by initialize(), if any.
+ReferenceFreeEnergyCalcGBSAOBCSoftcoreForceKernel::~ReferenceFreeEnergyCalcGBSAOBCSoftcoreForceKernel() {
+    // deleting a null pointer is a no-op, so no explicit test is needed
+    delete obc;
+}
+
+/**
+ * Initialize the kernel from a GBSAOBCSoftcoreForce.
+ *
+ * Copies the per-particle parameters into an ObcSoftcoreParameters object,
+ * takes cutoff/periodicity settings from the first NonbondedForce found in
+ * the System (if any), and creates the CpuObcSoftcore object used by the
+ * execute methods.
+ *
+ * @param system     the System this kernel will be applied to
+ * @param force      the GBSAOBCSoftcoreForce this kernel will be used for
+ */
+void ReferenceFreeEnergyCalcGBSAOBCSoftcoreForceKernel::initialize(const System& system, const GBSAOBCSoftcoreForce& force) {
+
+    const int particleCount = system.getNumParticles();
+
+    charges.resize(particleCount);
+
+    std::vector<RealOpenMM> radii(particleCount);
+    std::vector<RealOpenMM> scales(particleCount);
+    std::vector<RealOpenMM> nonPolarScales(particleCount);
+
+    // gather per-particle parameters, narrowing to RealOpenMM
+
+    for (int p = 0; p < particleCount; ++p) {
+        double q, r, scale, nonPolarScale;
+        force.getParticleParameters(p, q, r, scale, nonPolarScale);
+
+        charges[p]        = static_cast<RealOpenMM>(q);
+        radii[p]          = static_cast<RealOpenMM>(r);
+        scales[p]         = static_cast<RealOpenMM>(scale);
+        nonPolarScales[p] = static_cast<RealOpenMM>(nonPolarScale);
+    }
+
+    ObcSoftcoreParameters* params = new ObcSoftcoreParameters(particleCount, ObcSoftcoreParameters::ObcTypeII);
+
+    params->setAtomicRadii(radii);
+    params->setScaledRadiusFactors(scales);
+    params->setNonPolarScaleFactors(nonPolarScales);
+
+    params->setSolventDielectric( static_cast<RealOpenMM>(force.getSolventDielectric()) );
+    params->setSoluteDielectric(  static_cast<RealOpenMM>(force.getSoluteDielectric()) );
+    params->setNonPolarPrefactor( static_cast<RealOpenMM>(force.getNonPolarPrefactor()) );
+
+    // Cutoff and periodic settings come from the first NonbondedForce in the
+    // System; later ones are not consulted.
+
+    for (int f = 0; f < system.getNumForces(); f++) {
+        const NonbondedForce* nb = dynamic_cast<const NonbondedForce*>(&system.getForce(f));
+        if (nb == NULL)
+            continue;
+
+        if (nb->getNonbondedMethod() != NonbondedForce::NoCutoff)
+            params->setUseCutoff(static_cast<RealOpenMM>(nb->getCutoffDistance()));
+
+        if (nb->getNonbondedMethod() == NonbondedForce::CutoffPeriodic) {
+            Vec3 box[3];
+            system.getPeriodicBoxVectors(box[0], box[1], box[2]);
+            // only the diagonal of the box vectors is used
+            RealOpenMM boxSize[3];
+            for (int axis = 0; axis < 3; ++axis)
+                boxSize[axis] = (RealOpenMM) box[axis][axis];
+            params->setPeriodic(boxSize);
+        }
+        break;
+    }
+
+    obc = new CpuObcSoftcore(params);
+    obc->setIncludeAceApproximation(true);
+}
+
+/**
+ * Execute the kernel to calculate the forces.
+ *
+ * Hands the context's positions and force array, plus the stored charges,
+ * to the CpuObcSoftcore object.
+ *
+ * @param context    the context in which to execute this kernel
+ */
+void ReferenceFreeEnergyCalcGBSAOBCSoftcoreForceKernel::executeForces(ContextImpl& context) {
+    RealOpenMM** positions = extractPositions(context);
+    RealOpenMM** forces    = extractForces(context);
+    RealOpenMM*  chargePtr = &charges[0];
+    obc->computeImplicitSolventForces(positions, chargePtr, forces, 1);
+}
+
+/**
+ * Execute the kernel to calculate the energy.
+ *
+ * The energy is obtained by running the force routine and then querying the
+ * CpuObcSoftcore object. The force output goes into a temporary, zeroed
+ * buffer so that asking for the energy does not modify the forces stored in
+ * the context.
+ *
+ * @param context    the context in which to execute this kernel
+ * @return the potential energy due to the GBSAOBCSoftcoreForce
+ */
+double ReferenceFreeEnergyCalcGBSAOBCSoftcoreForceKernel::executeEnergy(ContextImpl& context) {
+    RealOpenMM** posData = extractPositions(context);
+
+    // charges was sized to the particle count in initialize()
+    const int numParticles = static_cast<int>(charges.size());
+
+    // scratch force buffer (3 components per particle), zeroed before use
+    RealOpenMM** scratchForces = allocateRealArray(numParticles, 3);
+    for (int i = 0; i < numParticles; ++i)
+        for (int j = 0; j < 3; ++j)
+            scratchForces[i][j] = 0;
+
+    obc->computeImplicitSolventForces(posData, &charges[0], scratchForces, 1);
+    disposeRealArray(scratchForces, numParticles);
+    return obc->getEnergy();
+}
+
+// Destroy the CpuGBVISoftcore object created by initialize(), if any.
+ReferenceFreeEnergyCalcGBVISoftcoreForceKernel::~ReferenceFreeEnergyCalcGBVISoftcoreForceKernel() {
+    // deleting a null pointer is a no-op, so no explicit test is needed
+    delete gbviSoftcore;
+}
+
+/**
+ * Initialize the kernel from a GBVISoftcoreForce.
+ *
+ * Copies the per-particle parameters and the supplied scaled radii into a
+ * GBVISoftcoreParameters object, applies the Born-radius scaling, dielectric,
+ * cutoff and periodicity settings of the force, and creates the
+ * CpuGBVISoftcore object used by the execute methods.
+ *
+ * @param system            the System this kernel will be applied to
+ * @param force             the GBVISoftcoreForce this kernel will be used for
+ * @param inputScaledRadii  the scaled radii, indexed by particle
+ */
+void ReferenceFreeEnergyCalcGBVISoftcoreForceKernel::initialize(const System& system, const GBVISoftcoreForce& force, const std::vector<double> & inputScaledRadii ) {
+
+    const int particleCount = system.getNumParticles();
+
+    charges.resize(particleCount);
+    std::vector<RealOpenMM> radii(particleCount);
+    std::vector<RealOpenMM> scaled(particleCount);
+    std::vector<RealOpenMM> gammaValues(particleCount);
+    std::vector<RealOpenMM> bornScales(particleCount);
+
+    // gather per-particle parameters, narrowing to RealOpenMM
+
+    for (int p = 0; p < particleCount; ++p) {
+        double q, r, g, bornScale;
+        force.getParticleParameters(p, q, r, g, bornScale);
+        charges[p]     = static_cast<RealOpenMM>(q);
+        radii[p]       = static_cast<RealOpenMM>(r);
+        gammaValues[p] = static_cast<RealOpenMM>(g);
+        scaled[p]      = static_cast<RealOpenMM>(inputScaledRadii[p]);
+        bornScales[p]  = static_cast<RealOpenMM>(bornScale);
+    }
+
+    GBVISoftcoreParameters* params = new GBVISoftcoreParameters(particleCount);
+    params->setAtomicRadii(radii);
+    params->setGammaParameters(gammaValues);
+    params->setBornRadiusScaleFactors(bornScales);
+    params->setScaledRadii(scaled);
+
+    // Born-radius switching: only the quintic spline method needs extra setup
+
+    const bool useQuinticSpline = (force.getBornRadiusScalingMethod() == GBVISoftcoreForce::QuinticSpline);
+    if (useQuinticSpline) {
+        params->setBornRadiusScalingSoftcoreMethod( GBVISoftcoreParameters::QuinticSpline );
+        params->setQuinticLowerLimitFactor(         static_cast<RealOpenMM>(force.getQuinticLowerLimitFactor()) );
+        params->setQuinticUpperBornRadiusLimit(     static_cast<RealOpenMM>(force.getQuinticUpperBornRadiusLimit()) );
+    }
+
+    params->setSolventDielectric( static_cast<RealOpenMM>(force.getSolventDielectric()) );
+    params->setSoluteDielectric( static_cast<RealOpenMM>(force.getSoluteDielectric()) );
+
+    // cutoff and periodicity come from the GBVISoftcoreForce itself
+
+    if (force.getNonbondedMethod() != GBVISoftcoreForce::NoCutoff) {
+        params->setUseCutoff(static_cast<RealOpenMM>(force.getCutoffDistance()));
+    }
+    if (force.getNonbondedMethod() == GBVISoftcoreForce::CutoffPeriodic) {
+        Vec3 box[3];
+        system.getPeriodicBoxVectors(box[0], box[1], box[2]);
+        // only the diagonal of the box vectors is used
+        RealOpenMM boxSize[3];
+        for (int axis = 0; axis < 3; ++axis)
+            boxSize[axis] = (RealOpenMM) box[axis][axis];
+        params->setPeriodic(boxSize);
+    }
+    gbviSoftcore = new CpuGBVISoftcore(params);
+}
+
+/**
+ * Execute the kernel to calculate the forces.
+ *
+ * Computes the Born radii into a temporary array, then passes them with the
+ * positions, charges and the context's force array to computeBornForces().
+ *
+ * @param context    the context in which to execute this kernel
+ */
+void ReferenceFreeEnergyCalcGBVISoftcoreForceKernel::executeForces(ContextImpl& context) {
+
+    RealOpenMM** positions = extractPositions(context);
+    RealOpenMM** forces    = extractForces(context);
+
+    // one Born radius per particle in the System
+    const int particleCount = context.getSystem().getNumParticles();
+    RealOpenMM* radii       = new RealOpenMM[particleCount];
+
+    gbviSoftcore->computeBornRadii(positions, radii, NULL );
+    gbviSoftcore->computeBornForces(radii, positions, &charges[0], forces);
+
+    delete[] radii;
+}
+
+/**
+ * Execute the kernel to calculate the energy.
+ *
+ * Computes the Born radii into a temporary array and evaluates the Born
+ * energy from them.
+ *
+ * @param context    the context in which to execute this kernel
+ * @return the potential energy due to the GBVISoftcoreForce
+ */
+double ReferenceFreeEnergyCalcGBVISoftcoreForceKernel::executeEnergy(ContextImpl& context) {
+    RealOpenMM** positions = extractPositions(context);
+
+    // one Born radius per particle in the System
+    const int particleCount = context.getSystem().getNumParticles();
+    RealOpenMM* radii       = new RealOpenMM[particleCount];
+
+    gbviSoftcore->computeBornRadii(positions, radii, NULL );
+    const RealOpenMM bornEnergy = gbviSoftcore->computeBornEnergy(radii ,positions, &charges[0]);
+
+    delete[] radii;
+    return static_cast<double>(bornEnergy);
+}
--- a/plugins/freeEnergy/platforms/reference/src/ReferenceFreeEnergyKernels.h
+++ b/plugins/freeEnergy/platforms/reference/src/ReferenceFreeEnergyKernels.h
+#ifndef OPENMM_REFERENCE_FREE_ENERGY_KERNELS_H_
+#define OPENMM_REFERENCE_FREE_ENERGY_KERNELS_H_
+
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include "ReferencePlatform.h"
+#include "openmm/freeEnergyKernels.h"
+#include "SimTKUtilities/SimTKOpenMMRealType.h"
+#include "SimTKReference/ReferenceNeighborList.h"
+#include "gbsa/CpuGBVISoftcore.h"
+#include "gbsa/CpuObcSoftcore.h"
+
+namespace OpenMM {
+
+/**
+ * This kernel is invoked by NonbondedSoftcoreForce to calculate the forces acting on the system.
+ */
+class ReferenceFreeEnergyCalcNonbondedSoftcoreForceKernel : public CalcNonbondedSoftcoreForceKernel {
+public:
+    /**
+     * Create the kernel. All tables and the neighbor list start out null and
+     * the counts at zero, so that destroying a kernel on which initialize()
+     * was never called does not free indeterminate pointers.
+     *
+     * @param name       the name of the kernel
+     * @param platform   the Platform this kernel belongs to
+     */
+    ReferenceFreeEnergyCalcNonbondedSoftcoreForceKernel(std::string name, const Platform& platform) : CalcNonbondedSoftcoreForceKernel(name, platform),
+            numParticles(0), num14(0), exclusionArray(0), bonded14IndexArray(0),
+            particleParamArray(0), bonded14ParamArray(0),
+            nonbondedCutoff(0), rfDielectric(0), ewaldAlpha(0), neighborList(0) {
+        for (int i = 0; i < 3; ++i) {
+            periodicBoxSize[i] = 0;
+            kmax[i]            = 0;
+        }
+    }
+    ~ReferenceFreeEnergyCalcNonbondedSoftcoreForceKernel();
+    /**
+     * Initialize the kernel.
+     * 
+     * @param system     the System this kernel will be applied to
+     * @param force      the NonbondedSoftcoreForce this kernel will be used for
+     */
+    void initialize(const System& system, const NonbondedSoftcoreForce& force);
+    /**
+     * Execute the kernel to calculate the forces.
+     * 
+     * @param context    the context in which to execute this kernel
+     */
+    void executeForces(ContextImpl& context);
+    /**
+     * Execute the kernel to calculate the energy.
+     * 
+     * @param context    the context in which to execute this kernel
+     * @return the potential energy due to the NonbondedSoftcoreForce
+     */
+    double executeEnergy(ContextImpl& context);
+private:
+    // number of particles and number of 1-4 exception pairs
+    int numParticles, num14;
+    // per-particle exclusion lists and per-1-4-pair particle index pairs
+    int **exclusionArray, **bonded14IndexArray;
+    // per-particle and per-1-4-pair interaction parameters
+    RealOpenMM **particleParamArray, **bonded14ParamArray;
+    RealOpenMM nonbondedCutoff, periodicBoxSize[3], rfDielectric, ewaldAlpha;
+    int kmax[3];
+    std::vector<std::set<int> > exclusions;
+    NonbondedSoftcoreMethod nonbondedMethod;
+    // null when no neighbor list has been created
+    NeighborList* neighborList;
+};
+
+/**
+ * This kernel is invoked by GBSAOBCSoftcoreForce to calculate the forces acting on the system.
+ */
+class ReferenceFreeEnergyCalcGBSAOBCSoftcoreForceKernel : public CalcGBSAOBCSoftcoreForceKernel {
+public:
+    /**
+     * Create the kernel. The CpuObcSoftcore pointer starts out null so that
+     * the destructor's null test is meaningful even if initialize() was
+     * never called.
+     *
+     * @param name       the name of the kernel
+     * @param platform   the Platform this kernel belongs to
+     */
+    ReferenceFreeEnergyCalcGBSAOBCSoftcoreForceKernel(std::string name, const Platform& platform) : CalcGBSAOBCSoftcoreForceKernel(name, platform), obc(0) {
+    }
+    ~ReferenceFreeEnergyCalcGBSAOBCSoftcoreForceKernel();
+    /**
+     * Initialize the kernel.
+     * 
+     * @param system     the System this kernel will be applied to
+     * @param force      the GBSAOBCSoftcoreForce this kernel will be used for
+     */
+    void initialize(const System& system, const GBSAOBCSoftcoreForce& force);
+    /**
+     * Execute the kernel to calculate the forces.
+     * 
+     * @param context    the context in which to execute this kernel
+     */
+    void executeForces(ContextImpl& context);
+    /**
+     * Execute the kernel to calculate the energy.
+     * 
+     * @param context    the context in which to execute this kernel
+     * @return the potential energy due to the GBSAOBCSoftcoreForce
+     */
+    double executeEnergy(ContextImpl& context);
+private:
+    // created in initialize(); null until then
+    CpuObcSoftcore* obc;
+    // per-particle charges
+    std::vector<RealOpenMM> charges;
+};
+
+/**
+ * This kernel is invoked by GBVISoftcoreForce to calculate the forces acting on the system.
+ */
+class ReferenceFreeEnergyCalcGBVISoftcoreForceKernel : public CalcGBVISoftcoreForceKernel {
+public:
+    /**
+     * Create the kernel. The CpuGBVISoftcore pointer starts out null so that
+     * the destructor's null test is meaningful even if initialize() was
+     * never called.
+     *
+     * @param name       the name of the kernel
+     * @param platform   the Platform this kernel belongs to
+     */
+    ReferenceFreeEnergyCalcGBVISoftcoreForceKernel(std::string name, const Platform& platform) : CalcGBVISoftcoreForceKernel(name, platform), gbviSoftcore(0) {
+    }
+    ~ReferenceFreeEnergyCalcGBVISoftcoreForceKernel();
+    /**
+     * Initialize the kernel.
+     * 
+     * @param system       the System this kernel will be applied to
+     * @param force        the GBVISoftcoreForce this kernel will be used for
+     * @param scaledRadii  the scaled radii (Eq. 5 of Labute paper)
+     */
+    void initialize(const System& system, const GBVISoftcoreForce& force, const std::vector<double> & scaledRadii);
+    /**
+     * Execute the kernel to calculate the forces.
+     * 
+     * @param context    the context in which to execute this kernel
+     */
+    void executeForces(ContextImpl& context);
+    /**
+     * Execute the kernel to calculate the energy.
+     * 
+     * @param context    the context in which to execute this kernel
+     * @return the potential energy due to the GBVISoftcoreForce
+     */
+    double executeEnergy(ContextImpl& context);
+private:
+    // created in initialize(); null until then
+    CpuGBVISoftcore* gbviSoftcore;
+    // per-particle charges
+    std::vector<RealOpenMM> charges;
+};
+
+
+} // namespace OpenMM
+
+#endif /*OPENMM_REFERENCE_FREE_ENERGY_KERNELS_H_*/
--- a/plugins/freeEnergy/platforms/reference/src/SimTKReference/ReferenceFreeEnergyLJCoulomb14Softcore.cpp
+++ b/plugins/freeEnergy/platforms/reference/src/SimTKReference/ReferenceFreeEnergyLJCoulomb14Softcore.cpp
+
+/* Portions copyright (c) 2006 Stanford University and Simbios.
+ * Contributors: Pande Group
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject
+ * to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <string.h>
+#include <sstream>
+
+#include "../SimTKUtilities/SimTKOpenMMCommon.h"
+#include "../SimTKUtilities/SimTKOpenMMLog.h"
+#include "../SimTKUtilities/SimTKOpenMMUtilities.h"
+#include "ReferenceFreeEnergyLJCoulomb14Softcore.h"
+#include "ReferenceForce.h"
+
+/**---------------------------------------------------------------------------------------
+
+   ReferenceFreeEnergyLJCoulomb14Softcore constructor
+
+   --------------------------------------------------------------------------------------- */
+
+ReferenceFreeEnergyLJCoulomb14Softcore::ReferenceFreeEnergyLJCoulomb14Softcore( ) : cutoff(false) {
+
+   // Nothing else to set up: the cutoff flag starts off and is only turned
+   // on by setUseCutoff().
+
+}
+
+/**---------------------------------------------------------------------------------------
+
+   ReferenceFreeEnergyLJCoulomb14Softcore destructor
+
+   --------------------------------------------------------------------------------------- */
+
+ReferenceFreeEnergyLJCoulomb14Softcore::~ReferenceFreeEnergyLJCoulomb14Softcore( ){
+
+   // Intentionally empty: the destructor body releases nothing.
+
+}
+
+  /**---------------------------------------------------------------------------------------
+
+     Set the force to use a cutoff.
+
+     @param distance            the cutoff distance
+     @param solventDielectric   the dielectric constant of the bulk solvent
+
+     @return ReferenceForce::DefaultReturn
+
+     --------------------------------------------------------------------------------------- */
+
+  int ReferenceFreeEnergyLJCoulomb14Softcore::setUseCutoff( RealOpenMM distance, RealOpenMM solventDielectric ) {
+
+    // switch the cutoff on and remember its distance
+
+    cutoffDistance = distance;
+    cutoff         = true;
+
+    // krf and crf are derived from the cutoff distance and solvent dielectric
+    // and share the denominator (2*dielectric + 1)
+
+    const RealOpenMM denominator = 2.0f*solventDielectric+1.0f;
+    krf = pow(cutoffDistance, -3.0f)*(solventDielectric-1.0f)/denominator;
+    crf = (1.0f/cutoffDistance)*(3.0f*solventDielectric)/denominator;
+
+    return ReferenceForce::DefaultReturn;
+  }
+  
+/**---------------------------------------------------------------------------------------
+
+   Calculate parameters for LJ 1-4 ixn
+
+   @param c6               c6
+   @param c12              c12
+   @param q1               q1 charge atom 1
+   @param q2               q2 charge atom 2
+   @param epsfac           scale factor multiplied into the charge product q1*q2
+   @param parameters       output parameters:
+										parameter[0]= c6*c6/c12
+										parameter[1]= (c12/c6)**1/6
+										parameter[2]= epsfactor*q1*q2
+
+   @return ReferenceForce::DefaultReturn
+
+   --------------------------------------------------------------------------------------- */
+
+int ReferenceFreeEnergyLJCoulomb14Softcore::getDerivedParameters( RealOpenMM c6, RealOpenMM c12, RealOpenMM q1,
+                                                                  RealOpenMM q2, RealOpenMM epsfac,
+                                                                  RealOpenMM* parameters ) const {
+
+   // typed constants avoid float/double conversion warnings
+
+   const RealOpenMM zero       =  0.0;
+   const RealOpenMM one        =  1.0;
+   const RealOpenMM six        =  6.0;
+   const RealOpenMM oneSixth   =  one/six;
+
+   // fallback values, kept when c12 <= 0
+
+   parameters[0] = one;
+   parameters[1] = zero;
+
+   // otherwise: parameters[0] = c6*c6/c12, parameters[1] = (c12/c6)**(1/6)
+
+   if( !(c12 <= zero) ){
+      const RealOpenMM c6Squared = c6*c6;
+      parameters[0] = c6Squared/c12;
+      parameters[1] = POW( (c12/c6), oneSixth );
+   }
+
+   // scaled charge product
+
+   parameters[2] = epsfac*q1*q2;
+
+   return ReferenceForce::DefaultReturn;
+}
+
+/**---------------------------------------------------------------------------------------
+
+   Calculate LJ 1-4 ixn
+
+   @param atomIndices      atom indices of the 2 atoms in the 1-4 pair
+   @param atomCoordinates  atom coordinates
+   @param parameters       four parameters:
+                                        parameters[0]= (c12/c6)**1/6  (sigma)
+                                        parameters[1]= c6*c6/c12      (4*epsilon)
+                                        parameters[2]= epsfac*q1*q2
+                                        parameters[3]= softcore LJ lambda (values below one select the softcore LJ form)
+   @param forces           force array (forces added to current values)
+   @param energiesByBond   energies by bond: energiesByBond[bondIndex]
+   @param energiesByAtom   energies by atom: energiesByAtom[atomIndex]
+
+   @return ReferenceForce::DefaultReturn
+
+   --------------------------------------------------------------------------------------- */
+
+int ReferenceFreeEnergyLJCoulomb14Softcore::calculateBondIxn( int* atomIndices, RealOpenMM** atomCoordinates,
+                                                               RealOpenMM* parameters, RealOpenMM** forces,
+                                                               RealOpenMM* energiesByBond,
+                                                               RealOpenMM* energiesByAtom ) const {
+
+   // ---------------------------------------------------------------------------------------
+
+   // static const char* methodName = "\nReferenceFreeEnergyLJCoulomb14Softcore::calculateBondIxn";
+
+   // ---------------------------------------------------------------------------------------
+
+   static const std::string methodName = "\nReferenceFreeEnergyLJCoulomb14Softcore::calculateBondIxn";
+
+   // constants -- reduce Visual Studio warnings regarding conversions between float & double
+
+   static const RealOpenMM zero        =  0.0;
+   static const RealOpenMM one         =  1.0;
+   static const RealOpenMM two         =  2.0;
+   static const RealOpenMM three       =  3.0;
+   static const RealOpenMM six         =  6.0;
+   static const RealOpenMM twelve      = 12.0;
+   static const RealOpenMM oneM        = -1.0;
+
+   static const int threeI             = 3;
+
+   // number of parameters
+
+   static const int numberOfParameters = 3;
+
+   // debug flag
+
+   static const int debug              = 0;
+
+   static const int LastAtomIndex      = 2;
+
+   RealOpenMM deltaR[2][ReferenceForce::LastDeltaRIndex];
+
+   // ---------------------------------------------------------------------------------------
+
+   // get deltaR, R2, and R between 2 atoms
+
+   int atomAIndex = atomIndices[0];
+   int atomBIndex = atomIndices[1];
+   ReferenceForce::getDeltaR( atomCoordinates[atomBIndex], atomCoordinates[atomAIndex], deltaR[0] );  
+
+   if (cutoff && deltaR[0][ReferenceForce::RIndex] > cutoffDistance)
+       return ReferenceForce::DefaultReturn;
+   RealOpenMM r2        = deltaR[0][ReferenceForce::R2Index];
+   RealOpenMM inverseR  = one/(deltaR[0][ReferenceForce::RIndex]);
+
+   RealOpenMM sig                    = parameters[0];
+   RealOpenMM eps                    = parameters[1];
+   RealOpenMM minSoftCoreLJLambda    = parameters[3];
+   RealOpenMM energy                 = zero;
+   RealOpenMM dEdR                   = zero;
+
+   if( minSoftCoreLJLambda < one ){
+       calculateOneSoftCoreLJ14Ixn( deltaR[0][ReferenceForce::RIndex], sig, eps, minSoftCoreLJLambda, &dEdR, &energy );
+    } else {
+       calculateOneLJ14Ixn( inverseR, sig, eps, &dEdR, &energy );
+    }
+    if (cutoff)
+       dEdR += parameters[2]*(inverseR-2.0f*krf*r2);
+    else
+       dEdR += parameters[2]*inverseR;
+    dEdR     *= inverseR*inverseR;
+
+   // accumulate forces
+
+   for( int ii = 0; ii < 3; ii++ ){
+      RealOpenMM force        = dEdR*deltaR[0][ii];
+      forces[atomAIndex][ii] += force;
+      forces[atomBIndex][ii] -= force;
+   }
+
+   if (cutoff)
+       energy += parameters[2]*(inverseR+krf*r2-crf);
+   else
+       energy += parameters[2]*inverseR;
+
+   // accumulate energies
+
+   updateEnergy( energy, energiesByBond, LastAtomIndex, atomIndices, energiesByAtom );
+
+   // debug 
+
+   if( debug ){
+      static bool printHeader = false;
+      std::stringstream message;
+      message << methodName;
+      message << std::endl;
+      if( !printHeader  ){  
+         printHeader = true;
+         message << std::endl;
+         message << methodName.c_str() << " a0 k [c q p s] r1 r2  angle dt rp p[] dot cosine angle dEdR*r F[]" << std::endl;
+      }   
+
+      message << std::endl;
+      for( int ii = 0; ii < LastAtomIndex; ii++ ){
+         message << " Atm " << atomIndices[ii] << " [" << atomCoordinates[atomIndices[ii]][0] << " " << atomCoordinates[atomIndices[ii]][1] << "] ";
+      }
+      message << std::endl << " Delta:";
+      for( int ii = 0; ii < (LastAtomIndex - 1); ii++ ){
+         message << " [";
+         for( int jj = 0; jj < ReferenceForce::LastDeltaRIndex; jj++ ){
+            message << deltaR[ii][jj] << " ";
+         }
+         message << "]";
+      }
+      message << std::endl;
+
+      message << " p1="     << parameters[0];
+      message << " p2="     << parameters[1];
+      message << " p3="     << parameters[2];
+      message << std::endl << "  ";
+
+      message << " dEdR=" << dEdR;
+      message << " E=" << energy << " force factors: ";
+      message << "F=compute force; f=cumulative force";
+
+      message << std::endl << "  ";
+      for( int ii = 0; ii < LastAtomIndex; ii++ ){
+         message << " F" << (ii+1) << "[";
+         SimTKOpenMMUtilities::formatRealStringStream( message, deltaR[0], threeI, dEdR );
+         message << "]";
+      }   
+      message << std::endl << "  ";
+
+      for( int ii = 0; ii < LastAtomIndex; ii++ ){
+         message << " f" << (ii+1) << "[";
+         SimTKOpenMMUtilities::formatRealStringStream( message, forces[atomIndices[ii]], threeI );
+         message << "]";
+      }
+
+      SimTKOpenMMLog::printMessage( message );
+   }   
+
+   return ReferenceForce::DefaultReturn;
+}
+
+  /**---------------------------------------------------------------------------------------
+
+     Plain LJ ixn for one atom pair
+
+     With s6 = (sig/r)**6:
+        dEdR    = eps*( 12*s6 - 6 )*s6     (overwrites the caller's value)
+        energy += eps*( s6 - 1 )*s6        (added to the caller's value)
+
+     @param inverseR         1/r
+     @param sig              sigma
+     @param eps              epsilon
+     @param dEdR             output force factor
+     @param energy           LJ energy (accumulated)
+
+     @return ReferenceForce::DefaultReturn
+
+     --------------------------------------------------------------------------------------- */
+
+int ReferenceFreeEnergyLJCoulomb14Softcore::calculateOneLJ14Ixn( RealOpenMM inverseR, RealOpenMM sig, RealOpenMM eps,
+                                                                 RealOpenMM* dEdR, RealOpenMM* energy ) const {
+
+    // typed constants -- avoid float/double conversion warnings
+
+    static const RealOpenMM one         =  1.0;
+    static const RealOpenMM six         =  6.0;
+    static const RealOpenMM twelve      = 12.0;
+
+    // build (sig/r)**6 from (sig/r)**2
+
+    const RealOpenMM sigOverR           = inverseR*sig;
+    const RealOpenMM sigOverR2          = sigOverR*sigOverR;
+    const RealOpenMM sigOverR6          = sigOverR2*sigOverR2*sigOverR2;
+
+    *dEdR                               = eps*( twelve*sigOverR6 - six )*sigOverR6;
+    *energy                            += eps*(sigOverR6-one)*sigOverR6;
+
+    return ReferenceForce::DefaultReturn;
+}
+
+  /**---------------------------------------------------------------------------------------
+
+     Soft-core LJ ixn for one atom pair
+
+     With T = alphaLJ*(1-lambda) + (r/sig)**6 and alphaLJ = 0.5:
+        dEdR    = lambda*eps*( 12/T - 6 )*(r/sig)**6/T**2     (overwrites the caller's value)
+        energy += lambda*eps*( 1/T**2 - 1/T )                 (added to the caller's value)
+
+     @param r                r
+     @param sig              sigma
+     @param eps              epsilon
+     @param lambda           lambda
+     @param dEdR             output force factor
+     @param energy           LJ energy (accumulated)
+
+     @return ReferenceForce::DefaultReturn
+
+     --------------------------------------------------------------------------------------- */
+
+int ReferenceFreeEnergyLJCoulomb14Softcore::calculateOneSoftCoreLJ14Ixn( RealOpenMM r, RealOpenMM sig, RealOpenMM eps,
+                                                                         RealOpenMM lambda,
+                                                                         RealOpenMM* dEdR, RealOpenMM* energy ) const {
+
+    // typed constants -- avoid float/double conversion warnings
+
+    static const RealOpenMM one         =  1.0;
+    static const RealOpenMM six         =  6.0;
+    static const RealOpenMM twelve      = 12.0;
+    static const RealOpenMM alphaLJ     = 0.5;
+
+    // the well depth is scaled by lambda
+
+    const RealOpenMM scaledEps          = eps*lambda;
+
+    // (r/sig)**6, built from (r/sig)**2
+
+    const RealOpenMM rOverSig           = r/sig;
+    const RealOpenMM rOverSig2          = rOverSig*rOverSig;
+    const RealOpenMM rOverSig6          = rOverSig2*rOverSig2*rOverSig2;
+
+    // softened denominator T and its inverse powers
+
+    const RealOpenMM softTerm           = alphaLJ*(one -  lambda) + rOverSig6;
+    const RealOpenMM invSoftTerm        = one/softTerm;
+    const RealOpenMM invSoftTerm2       = invSoftTerm*invSoftTerm;
+
+    *dEdR                               = scaledEps*invSoftTerm2*( twelve*invSoftTerm - six )*rOverSig6;
+    *energy                            += scaledEps*invSoftTerm*( invSoftTerm - one );
+
+    return ReferenceForce::DefaultReturn;
+}
--- a/plugins/freeEnergy/platforms/reference/src/SimTKReference/ReferenceFreeEnergyLJCoulomb14Softcore.h
+++ b/plugins/freeEnergy/platforms/reference/src/SimTKReference/ReferenceFreeEnergyLJCoulomb14Softcore.h
+
+/* Portions copyright (c) 2006 Stanford University and Simbios.
+ * Contributors: Pande Group
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject
+ * to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __ReferenceFreeEnergyLJCoulomb14Softcore_H__
+#define __ReferenceFreeEnergyLJCoulomb14Softcore_H__
+
+#include "SimTKReference/ReferenceBondIxn.h"
+
+// ---------------------------------------------------------------------------------------
+
+// Reference-platform 1-4 pair interaction: LJ (plain or soft-core) plus Coulomb,
+// evaluated one atom pair at a time through calculateBondIxn().
+class ReferenceFreeEnergyLJCoulomb14Softcore : public ReferenceBondIxn {
+
+   private:
+
+        // true when a cutoff is applied in calculateBondIxn()
+        bool cutoff;
+        // pairs separated by more than this distance are skipped when cutoff is set
+        RealOpenMM cutoffDistance;
+        // coefficients of the extra Coulomb terms used in calculateBondIxn() when cutoff is set
+        RealOpenMM krf, crf;
+
+   public:
+
+      /**---------------------------------------------------------------------------------------
+      
+         Constructor
+      
+         --------------------------------------------------------------------------------------- */
+
+       ReferenceFreeEnergyLJCoulomb14Softcore( );
+
+      /**---------------------------------------------------------------------------------------
+      
+         Destructor
+      
+         --------------------------------------------------------------------------------------- */
+
+       ~ReferenceFreeEnergyLJCoulomb14Softcore( );
+
+      /**---------------------------------------------------------------------------------------
+      
+         Set the force to use a cutoff.
+      
+         @param distance            the cutoff distance
+         @param solventDielectric   the dielectric constant of the bulk solvent
+      
+         @return ReferenceForce::DefaultReturn
+      
+         --------------------------------------------------------------------------------------- */
+      
+      int setUseCutoff( RealOpenMM distance, RealOpenMM solventDielectric );
+       
+      /**---------------------------------------------------------------------------------------
+      
+         Calculate parameters for LJ 1-4 ixn
+      
+         @param c6               c6
+         @param c12              c12
+         @param q1               q1 charge atom 1
+         @param q2               q2 charge atom 2
+         @param epsfac           prefactor multiplying q1*q2 in parameter[2]
+         @param parameters       output parameters:
+                                    parameter[0]= c6*c6/c12
+                                    parameter[1]= (c12/c6)**1/6
+                                    parameter[2]= epsfactor*q1*q2
+
+         NOTE(review): calculateBondIxn() reads parameters[0] as sigma and parameters[1]
+         as epsilon, i.e. the opposite order to the one written here -- confirm which
+         convention callers rely on. parameters[3] (soft-core lambda) is not set here.
+      
+         @return ReferenceForce::DefaultReturn
+      
+         --------------------------------------------------------------------------------------- */
+      
+      int getDerivedParameters( RealOpenMM c6, RealOpenMM c12, RealOpenMM q1, 
+                                               RealOpenMM q2, RealOpenMM epsfac,
+                                               RealOpenMM* parameters ) const;
+      
+      /**---------------------------------------------------------------------------------------
+      
+         Calculate the LJ (plain or soft-core) + Coulomb 1-4 ixn for one atom pair
+      
+         @param atomIndices      atom indices of the 2 atoms in the pair
+         @param atomCoordinates  atom coordinates
+         @param parameters       four parameters:
+                                    parameters[0]= sigma
+                                    parameters[1]= epsilon
+                                    parameters[2]= Coulomb prefactor
+                                    parameters[3]= soft-core LJ lambda (soft-core form used when < 1)
+         @param forces           force array (forces added to current values)
+         @param energiesByBond   energies by bond: energiesByBond[bondIndex]
+         @param energiesByAtom   energies by atom: energiesByAtom[atomIndex]
+      
+         @return ReferenceForce::DefaultReturn
+            
+         --------------------------------------------------------------------------------------- */
+      
+      int calculateBondIxn( int* atomIndices, RealOpenMM** atomCoordinates,
+                            RealOpenMM* parameters, RealOpenMM** forces,
+                            RealOpenMM* energiesByBond, RealOpenMM* energiesByAtom ) const;
+      
+        /**---------------------------------------------------------------------------------------
+      
+           Calculate LJ pair ixn between two atoms
+      
+           @param inverseR         1/r
+           @param sig              sigma
+           @param eps              epsilon
+           @param dEdR             output force factor (overwritten)
+           @param energy           LJ energy (added to the value passed in)
+      
+           @return ReferenceForce::DefaultReturn
+      
+           --------------------------------------------------------------------------------------- */
+      
+      int calculateOneLJ14Ixn( RealOpenMM inverseR, RealOpenMM sig, RealOpenMM eps,
+                               RealOpenMM* dEdR, RealOpenMM* energy ) const;
+      
+      
+        /**---------------------------------------------------------------------------------------
+      
+           Calculate softcore LJ pair ixn between two atoms
+      
+           @param r                r
+           @param sig              sigma
+           @param eps              epsilon
+           @param lambda           lambda
+           @param dEdR             output force factor (overwritten)
+           @param energy           LJ energy (added to the value passed in)
+      
+           @return ReferenceForce::DefaultReturn
+      
+           --------------------------------------------------------------------------------------- */
+      
+      int calculateOneSoftCoreLJ14Ixn( RealOpenMM r, RealOpenMM sig, RealOpenMM eps,
+                                       RealOpenMM lambda, RealOpenMM* dEdR, RealOpenMM* energy ) const;
+      
+};
+
+// ---------------------------------------------------------------------------------------
+
+#endif // __ReferenceFreeEnergyLJCoulomb14Softcore_H__
--- a/plugins/freeEnergy/platforms/reference/src/SimTKReference/ReferenceFreeEnergyLJCoulombSoftcoreIxn.cpp
+++ b/plugins/freeEnergy/platforms/reference/src/SimTKReference/ReferenceFreeEnergyLJCoulombSoftcoreIxn.cpp
+
+/* Portions copyright (c) 2006 Stanford University and Simbios.
+ * Contributors: Pande Group
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject
+ * to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <string.h>
+#include <sstream>
+#include <complex>
+
+#include "../SimTKUtilities/SimTKOpenMMCommon.h"
+#include "../SimTKUtilities/SimTKOpenMMLog.h"
+#include "../SimTKUtilities/SimTKOpenMMUtilities.h"
+#include "ReferenceFreeEnergyLJCoulombSoftcoreIxn.h"
+#include "ReferenceForce.h"
+#include "PME.h"
+
+// In case we're using some primitive version of Visual Studio this will
+// make sure that erf() and erfc() are defined.
+//#include "MSVC_erfc.h"
+
+using std::vector;
+
+/**---------------------------------------------------------------------------------------
+
+   Construct a ReferenceFreeEnergyLJCoulombSoftcoreIxn with the cutoff, periodic, Ewald
+   and PME options all switched off and the soft-core LJ lambda set to 1
+
+   --------------------------------------------------------------------------------------- */
+
+ReferenceFreeEnergyLJCoulombSoftcoreIxn::ReferenceFreeEnergyLJCoulombSoftcoreIxn( )
+   : cutoff(false),
+     periodic(false),
+     ewald(false),
+     pme(false),
+     softCoreLJLambda(1.0) {
+
+   // no work beyond the initializer list
+
+}
+
+/**---------------------------------------------------------------------------------------
+
+   Destroy a ReferenceFreeEnergyLJCoulombSoftcoreIxn
+
+   --------------------------------------------------------------------------------------- */
+
+ReferenceFreeEnergyLJCoulombSoftcoreIxn::~ReferenceFreeEnergyLJCoulombSoftcoreIxn( ){
+
+   // empty body: nothing is released here
+
+}
+
+  /**---------------------------------------------------------------------------------------
+
+     Switch the cutoff on and store the quantities derived from it.
+
+     @param distance            the cutoff distance
+     @param neighbors           the neighbor list to use (only its address is stored)
+     @param solventDielectric   the dielectric constant of the bulk solvent
+
+     @return ReferenceForce::DefaultReturn
+
+     --------------------------------------------------------------------------------------- */
+
+  int ReferenceFreeEnergyLJCoulombSoftcoreIxn::setUseCutoff( RealOpenMM distance, const OpenMM::NeighborList& neighbors, RealOpenMM solventDielectric ) {
+
+    // record the neighbor list and the cutoff itself
+
+    neighborList   = &neighbors;
+    cutoffDistance = distance;
+    cutoff         = true;
+
+    // coefficients derived from the cutoff distance and the solvent dielectric:
+    //    krf = rc**-3 * (eps - 1)/(2*eps + 1)
+    //    crf = (1/rc) * (3*eps)/(2*eps + 1)
+
+    krf = pow(cutoffDistance, -3.0f)*(solventDielectric-1.0f)/(2.0f*solventDielectric+1.0f);
+    crf = (1.0f/cutoffDistance)*(3.0f*solventDielectric)/(2.0f*solventDielectric+1.0f);
+
+    return ReferenceForce::DefaultReturn;
+  }
+
+  /**---------------------------------------------------------------------------------------
+
+     Switch on periodic boundary conditions and store the box widths.  A cutoff must
+     already have been set, and no side of the periodic box may be shorter than twice
+     the cutoff distance; both conditions are checked with assert().
+
+     @param boxSize             the X, Y, and Z widths of the periodic box
+
+     @return ReferenceForce::DefaultReturn
+
+     --------------------------------------------------------------------------------------- */
+
+  int ReferenceFreeEnergyLJCoulombSoftcoreIxn::setPeriodic( RealOpenMM* boxSize ) {
+
+    // preconditions
+
+    assert(cutoff);
+    assert(boxSize[0] >= 2.0*cutoffDistance);
+    assert(boxSize[1] >= 2.0*cutoffDistance);
+    assert(boxSize[2] >= 2.0*cutoffDistance);
+
+    // copy the three box widths
+
+    for( int axis = 0; axis < 3; axis++ ){
+       periodicBoxSize[axis] = boxSize[axis];
+    }
+    periodic = true;
+
+    return ReferenceForce::DefaultReturn;
+
+  }
+
+  /**---------------------------------------------------------------------------------------
+
+     Switch on Ewald summation and store its settings.
+
+     @param alpha  the Ewald separation parameter
+     @param kmaxx  the largest wave vector in the x direction
+     @param kmaxy  the largest wave vector in the y direction
+     @param kmaxz  the largest wave vector in the z direction
+
+     --------------------------------------------------------------------------------------- */
+
+  void ReferenceFreeEnergyLJCoulombSoftcoreIxn::setUseEwald(RealOpenMM alpha, int kmaxx, int kmaxy, int kmaxz) {
+
+      ewald      = true;
+      alphaEwald = alpha;
+
+      // wave-vector limits, one per axis
+
+      numRx      = kmaxx;
+      numRy      = kmaxy;
+      numRz      = kmaxz;
+  }
+
+  /**---------------------------------------------------------------------------------------
+
+     Switch on Particle-Mesh Ewald (PME) summation and store the separation parameter.
+
+     @param alpha  the Ewald separation parameter
+
+     --------------------------------------------------------------------------------------- */
+
+  void ReferenceFreeEnergyLJCoulombSoftcoreIxn::setUsePME(RealOpenMM alpha) {
+
+      pme        = true;
+      alphaEwald = alpha;
+  }
+
+
+  /**---------------------------------------------------------------------------------------
+
+     Store the soft core LJ lambda
+
+     @param lambda the soft core LJ lambda
+
+     --------------------------------------------------------------------------------------- */
+
+  void ReferenceFreeEnergyLJCoulombSoftcoreIxn::setSoftCoreLJLambda(RealOpenMM lambda) {
+
+      this->softCoreLJLambda = lambda;
+  }
+
+/**---------------------------------------------------------------------------------------
+
+   Derive the per-atom LJ Coulomb parameters from c6, c12 and the charge
+
+   @param c6               c6
+   @param c12              c12
+   @param q1               q1 charge atom 1
+   @param epsfacSqrt       factor multiplying the charge
+   @param parameters       output parameters:
+                              parameter[SigIndex]  = 0.5*( (c12/c6)**1/6 ) (sigma/2)
+                              parameter[EpsIndex]  = c6*sqrt(1/c12)        (2*sqrt(epsilon))
+                              parameter[QIndex]    = epsfacSqrt*q1
+                           when c12 <= 0, parameter[SigIndex] = 0.5 and parameter[EpsIndex] = 0
+
+   @return ReferenceForce::DefaultReturn
+
+   --------------------------------------------------------------------------------------- */
+
+int ReferenceFreeEnergyLJCoulombSoftcoreIxn::getDerivedParameters( RealOpenMM c6, RealOpenMM c12, RealOpenMM q1,
+                                                                   RealOpenMM epsfacSqrt,
+                                                                   RealOpenMM* parameters ) const {
+
+   // typed constants -- avoid float/double conversion warnings
+
+   static const RealOpenMM zero          =  0.0;
+   static const RealOpenMM one           =  1.0;
+   static const RealOpenMM six           =  6.0;
+   static const RealOpenMM half          =  0.5;
+   static const RealOpenMM oneSixth      =  one/six;
+
+   // the charge entry does not depend on the LJ coefficients
+
+   parameters[QIndex]      = epsfacSqrt*q1;
+
+   // non-positive c12: fixed values, nothing to derive
+
+   if( c12 <= 0.0 ){
+      parameters[SigIndex] = half;
+      parameters[EpsIndex] = zero;
+      return ReferenceForce::DefaultReturn;
+   }
+
+   const RealOpenMM sigma  = POW( (c12/c6), oneSixth );
+   parameters[SigIndex]    = sigma*half;
+   parameters[EpsIndex]    = c6*SQRT( one/c12 );
+
+   return ReferenceForce::DefaultReturn;
+}
+
+/**---------------------------------------------------------------------------------------
+
+   Calculate Ewald ixn
+
+   @param numberOfAtoms    number of atoms
+   @param atomCoordinates  atom coordinates
+   @param atomParameters   atom parameters                             atomParameters[atomIndex][parameterIndex]
+   @param exclusions       atom exclusion indices                      exclusions[atomIndex][atomToExcludeIndex]
+                           exclusions[atomIndex][0] = number of exclusions
+                           exclusions[atomIndex][1-no.] = indices of the atoms excluded from
+                           interacting w/ atom atomIndex
+   @param fixedParameters  non atom parameters (not currently used)
+   @param forces           force array (forces added)
+   @param energyByAtom     atom energy
+   @param totalEnergy      total energy
+
+   @return ReferenceForce::DefaultReturn
+
+   --------------------------------------------------------------------------------------- */
+
+int ReferenceFreeEnergyLJCoulombSoftcoreIxn::calculateEwaldIxn( int numberOfAtoms, RealOpenMM** atomCoordinates,
+                                                                 RealOpenMM** atomParameters, int** exclusions,
+                                                                 RealOpenMM* fixedParameters, RealOpenMM** forces,
+                                                                 RealOpenMM* energyByAtom, RealOpenMM* totalEnergy) const {
+
+    #include "../SimTKUtilities/RealTypeSimTk.h"
+    typedef std::complex<RealOpenMM> d_complex;
+
+    static const RealOpenMM epsilon     =  1.0;
+    static const RealOpenMM one         =  1.0;
+    static const RealOpenMM six         =  6.0;
+    static const RealOpenMM twelve      = 12.0;
+
+    int kmax                            = (ewald ? std::max(numRx, std::max(numRy,numRz)) : 0);
+    RealOpenMM  factorEwald             = -1 / (4*alphaEwald*alphaEwald);
+    RealOpenMM SQRT_PI                  = sqrt(PI);
+    RealOpenMM TWO_PI                   = 2.0 * PI;
+    RealOpenMM recipCoeff               = (RealOpenMM)(4*PI/(periodicBoxSize[0] * periodicBoxSize[1] * periodicBoxSize[2]) /epsilon);
+
+    RealOpenMM totalSelfEwaldEnergy     = 0.0;
+    RealOpenMM realSpaceEwaldEnergy     = 0.0;
+    RealOpenMM recipEnergy              = 0.0;
+    RealOpenMM totalRecipEnergy         = 0.0;
+    RealOpenMM vdwEnergy                = 0.0;
+
+// **************************************************************************************
+// SELF ENERGY
+// **************************************************************************************
+
+    for( int atomID = 0; atomID < numberOfAtoms; atomID++ ){
+        RealOpenMM selfEwaldEnergy       = atomParameters[atomID][QIndex]*atomParameters[atomID][QIndex] * alphaEwald/SQRT_PI;
+        totalSelfEwaldEnergy            -= selfEwaldEnergy;
+
+        if( energyByAtom ){
+           energyByAtom[atomID]         -= selfEwaldEnergy;
+        }
+    }
+
+    if( totalEnergy ){
+        *totalEnergy += totalSelfEwaldEnergy;
+    }
+
+// **************************************************************************************
+// RECIPROCAL SPACE EWALD ENERGY AND FORCES
+// **************************************************************************************
+    // PME
+
+  if (pme) {
+	pme_t          pmedata; /* abstract handle for PME data */
+	int            ngrid[3];
+	RealOpenMM virial[3][3];
+
+	/* PME grid dimensions.
+	 * We typically want to set this as the spacing rather than absolute dimensions, but
+	 * to be able to reproduce results from other programs (e.g. Gromacs) we need to be
+	 * able to set exact grid dimenisions occasionally.
+	 */
+	ngrid[0] = 16;
+	ngrid[1] = 16;
+	ngrid[2] = 16;
+
+	pme_init(&pmedata,alphaEwald,numberOfAtoms,ngrid,4,1);
+
+	pme_exec(pmedata,atomCoordinates,forces,atomParameters,periodicBoxSize,&recipEnergy,virial);
+
+	if( totalEnergy )
+       *totalEnergy += recipEnergy;
+
+    if( energyByAtom )
+        for(int n = 0; n < numberOfAtoms; n++)
+            energyByAtom[n] += recipEnergy;
+
+        pme_destroy(pmedata);
+  }
+
+    // Ewald method
+
+  else if (ewald) {
+
+    // setup reciprocal box
+
+           RealOpenMM recipBoxSize[3] = { TWO_PI / periodicBoxSize[0], TWO_PI / periodicBoxSize[1], TWO_PI / periodicBoxSize[2]};
+
+
+    // setup K-vectors
+
+  #define EIR(x, y, z) eir[(x)*numberOfAtoms*3+(y)*3+z]
+  vector<d_complex> eir(kmax*numberOfAtoms*3);
+  vector<d_complex> tab_xy(numberOfAtoms);
+  vector<d_complex> tab_qxyz(numberOfAtoms);
+
+  if (kmax < 1) {
+      std::stringstream message;
+      message << " kmax < 1 , Aborting" << std::endl;
+      SimTKOpenMMLog::printError( message );
+  }
+
+  for(int i = 0; (i < numberOfAtoms); i++) {
+    for(int m = 0; (m < 3); m++)
+      EIR(0, i, m) = d_complex(1,0);
+
+    for(int m=0; (m<3); m++)
+      EIR(1, i, m) = d_complex(cos(atomCoordinates[i][m]*recipBoxSize[m]),
+                               sin(atomCoordinates[i][m]*recipBoxSize[m]));
+
+    for(int j=2; (j<kmax); j++)
+      for(int m=0; (m<3); m++)
+        EIR(j, i, m) = EIR(j-1, i, m) * EIR(1, i, m);
+  }
+
+    // calculate reciprocal space energy and forces
+
+    int lowry = 0;
+    int lowrz = 1;
+
+    for(int rx = 0; rx < numRx; rx++) {
+
+      RealOpenMM kx = rx * recipBoxSize[0];
+
+      for(int ry = lowry; ry < numRy; ry++) {
+
+        RealOpenMM ky = ry * recipBoxSize[1];
+
+        if(ry >= 0) {
+          for(int n = 0; n < numberOfAtoms; n++)
+            tab_xy[n] = EIR(rx, n, 0) * EIR(ry, n, 1);
+        }
+
+        else {
+          for(int n = 0; n < numberOfAtoms; n++)
+            tab_xy[n]= EIR(rx, n, 0) * conj (EIR(-ry, n, 1));
+        }
+
+        for (int rz = lowrz; rz < numRz; rz++) {
+
+          if( rz >= 0) {
+           for( int n = 0; n < numberOfAtoms; n++)
+             tab_qxyz[n] = atomParameters[n][QIndex] * (tab_xy[n] * EIR(rz, n, 2));
+          }
+
+          else {
+            for( int n = 0; n < numberOfAtoms; n++)
+              tab_qxyz[n] = atomParameters[n][QIndex] * (tab_xy[n] * conj(EIR(-rz, n, 2)));
+          }
+
+          RealOpenMM cs = 0.0f;
+          RealOpenMM ss = 0.0f;
+
+          for( int n = 0; n < numberOfAtoms; n++) {
+            cs += tab_qxyz[n].real();
+            ss += tab_qxyz[n].imag();
+          }
+
+          RealOpenMM kz = rz * recipBoxSize[2];
+          RealOpenMM k2 = kx * kx + ky * ky + kz * kz;
+          RealOpenMM ak = exp(k2*factorEwald) / k2;
+
+          for(int n = 0; n < numberOfAtoms; n++) {
+            RealOpenMM force = ak * (cs * tab_qxyz[n].imag() - ss * tab_qxyz[n].real());
+            forces[n][0] += 2 * recipCoeff * force * kx ;
+            forces[n][1] += 2 * recipCoeff * force * ky ;
+            forces[n][2] += 2 * recipCoeff * force * kz ;
+          }
+
+          recipEnergy       = recipCoeff * ak * ( cs * cs + ss * ss);
+          totalRecipEnergy += recipEnergy;
+
+          if( totalEnergy )
+             *totalEnergy += recipEnergy;
+
+          if( energyByAtom )
+             for(int n = 0; n < numberOfAtoms; n++)
+               energyByAtom[n] += recipEnergy;
+
+          lowrz = 1 - numRz;
+        }
+        lowry = 1 - numRy;
+      }
+    }
+  }
+
+  else {
+      std::stringstream message;
+      message << " Wrong method for Ewald summation, Aborting" << std::endl;
+      SimTKOpenMMLog::printError( message );
+  }
+
+
+// **************************************************************************************
+// SHORT-RANGE ENERGY AND FORCES
+// **************************************************************************************
+
+    RealOpenMM totalVdwEnergy            = 0.0f;
+    RealOpenMM totalRealSpaceEwaldEnergy = 0.0f;
+
+    for (int i = 0; i < (int) neighborList->size(); i++) {
+       OpenMM::AtomPair pair = (*neighborList)[i];
+       int ii = pair.first;
+       int jj = pair.second;
+
+       RealOpenMM deltaR[2][ReferenceForce::LastDeltaRIndex];
+       ReferenceForce::getDeltaRPeriodic( atomCoordinates[jj], atomCoordinates[ii], periodicBoxSize, deltaR[0] );
+       RealOpenMM r         = deltaR[0][ReferenceForce::RIndex];
+       RealOpenMM r2        = deltaR[0][ReferenceForce::R2Index];
+       RealOpenMM inverseR  = one/(deltaR[0][ReferenceForce::RIndex]);
+       RealOpenMM alphaR    = alphaEwald * r;
+
+
+       RealOpenMM dEdR      = atomParameters[ii][QIndex] * atomParameters[jj][QIndex] * inverseR * inverseR * inverseR;
+                  dEdR      = (RealOpenMM)(dEdR * (erfc(alphaR) + 2 * alphaR * exp ( - alphaR * alphaR) / SQRT_PI ));
+
+       RealOpenMM sig       = atomParameters[ii][SigIndex] +  atomParameters[jj][SigIndex];
+       RealOpenMM sig2      = inverseR*sig;
+                  sig2     *= sig2;
+       RealOpenMM sig6      = sig2*sig2*sig2;
+       RealOpenMM eps       = atomParameters[ii][EpsIndex]*atomParameters[jj][EpsIndex];
+                  dEdR     += eps*( twelve*sig6 - six )*sig6*inverseR*inverseR;
+
+       // accumulate forces
+
+       for( int kk = 0; kk < 3; kk++ ){
+          RealOpenMM force  = dEdR*deltaR[0][kk];
+          forces[ii][kk]   += force;
+          forces[jj][kk]   -= force;
+       }
+
+       // accumulate energies
+
+       realSpaceEwaldEnergy        = (RealOpenMM) (atomParameters[ii][QIndex]*atomParameters[jj][QIndex]*inverseR*erfc(alphaR));
+       vdwEnergy                   = eps*(sig6-one)*sig6;
+
+       totalVdwEnergy             += vdwEnergy;
+       totalRealSpaceEwaldEnergy  += realSpaceEwaldEnergy;
+
+        if( energyByAtom ){
+           energyByAtom[ii] += realSpaceEwaldEnergy + vdwEnergy;
+           energyByAtom[jj] += realSpaceEwaldEnergy + vdwEnergy;
+        }
+
+    }
+
+    if( totalEnergy )
+        *totalEnergy += totalRealSpaceEwaldEnergy + totalVdwEnergy;
+
+    // Now subtract off the exclusions, since they were implicitly included in the reciprocal space sum.
+
+    RealOpenMM totalExclusionEnergy = 0.0f;
+    for (int i = 0; i < numberOfAtoms; i++)
+        for (int j = 1; j <= exclusions[i][0]; j++)
+            if (exclusions[i][j] > i) {
+               int ii = i;
+               int jj = exclusions[i][j];
+
+               RealOpenMM deltaR[2][ReferenceForce::LastDeltaRIndex];
+               ReferenceForce::getDeltaRPeriodic( atomCoordinates[jj], atomCoordinates[ii], periodicBoxSize, deltaR[0] );
+               RealOpenMM r         = deltaR[0][ReferenceForce::RIndex];
+               RealOpenMM inverseR  = one/(deltaR[0][ReferenceForce::RIndex]);
+               RealOpenMM alphaR    = alphaEwald * r;
+               RealOpenMM dEdR      = atomParameters[ii][QIndex] * atomParameters[jj][QIndex] * inverseR * inverseR * inverseR;
+                          dEdR      = (RealOpenMM)(dEdR * (erf(alphaR) - 2 * alphaR * exp ( - alphaR * alphaR) / SQRT_PI ));
+
+               // accumulate forces
+
+               for( int kk = 0; kk < 3; kk++ ){
+                  RealOpenMM force  = dEdR*deltaR[0][kk];
+                  forces[ii][kk]   -= force;
+                  forces[jj][kk]   += force;
+               }
+
+               // accumulate energies
+
+               realSpaceEwaldEnergy = (RealOpenMM) (atomParameters[ii][QIndex]*atomParameters[jj][QIndex]*inverseR*erf(alphaR));
+
+               totalExclusionEnergy += realSpaceEwaldEnergy;
+               if( energyByAtom ){
+                   energyByAtom[ii] -= realSpaceEwaldEnergy;
+                   energyByAtom[jj] -= realSpaceEwaldEnergy;
+               }
+            }
+
+    if( totalEnergy )
+        *totalEnergy -= totalExclusionEnergy;
+
+   // ***********************************************************************
+
+   return ReferenceForce::DefaultReturn;
+}
+
+/**---------------------------------------------------------------------------------------
+
+   Calculate PME ixn
+
+   Steps performed by the body, in order:
+     1. Ewald self energy: (alphaEwald/sqrt(PI)) * sum_i q_i^2, kept in a local.
+     2. Reciprocal-space part: delegated to pme_init()/pme_exec() on a fixed 16x16x16 grid;
+        pme_exec() receives the force array and the address of the local recipEnergy.
+     3. Real-space erfc() Coulomb term: energy summed over all pairs, then energy and forces
+        summed over the non-excluded pairs.
+
+   NOTE(review): the three energy locals (selfEwaldEnergy, recipEnergy, realSpaceEwaldEnergy)
+   are never copied into energyByAtom or totalEnergy in this function; only 'forces' is
+   visibly updated. Confirm whether that is intended.
+
+   @param numberOfAtoms    number of atoms
+   @param atomCoordinates  atom coordinates
+   @param atomParameters   atom parameters                             atomParameters[atomIndex][parameterIndex]
+   @param exclusions       atom exclusion indices                      exclusions[atomIndex][atomToExcludeIndex]
+                           exclusions[atomIndex][0] = number of exclusions
+                           exclusions[atomIndex][1..n] = indices of the atoms excluded from
+                           interacting with atom atomIndex
+   @param fixedParameters  non atom parameters (not currently used)
+   @param forces           force array (forces added)
+   @param energyByAtom     atom energy (not written by this function as it stands)
+   @param totalEnergy      total energy (not written by this function as it stands)
+
+   @return ReferenceForce::DefaultReturn
+      
+   --------------------------------------------------------------------------------------- */
+ 
+int ReferenceFreeEnergyLJCoulombSoftcoreIxn::calculatePMEIxn( int numberOfAtoms, RealOpenMM** atomCoordinates,
+                                                              RealOpenMM** atomParameters, int** exclusions,
+                                                              RealOpenMM* fixedParameters, RealOpenMM** forces,
+                                                              RealOpenMM* energyByAtom, RealOpenMM* totalEnergy) const {
+
+
+
+    RealOpenMM SQRT_PI = sqrt(PI);
+    static const RealOpenMM one         =  1.0;
+
+    // energy accumulators; all three stay local to this function
+    RealOpenMM selfEwaldEnergy = 0.0;
+    RealOpenMM realSpaceEwaldEnergy = 0.0;
+    RealOpenMM recipEnergy = 0.0;
+
+
+// **************************************************************************************
+// SELF ENERGY
+// **************************************************************************************
+
+    // sum of squared charges ...
+    for( int atomID = 0; atomID < numberOfAtoms; atomID++ ){
+        selfEwaldEnergy = selfEwaldEnergy + atomParameters[atomID][QIndex]*atomParameters[atomID][QIndex];
+    }
+       // ... scaled by alphaEwald/sqrt(PI)
+       selfEwaldEnergy = selfEwaldEnergy * alphaEwald/SQRT_PI ;
+
+// **************************************************************************************
+// RECIPROCAL SPACE EWALD ENERGY AND FORCES
+// **************************************************************************************
+
+	pme_t          pmedata; /* abstract handle for PME data */
+	int            ngrid[3];
+	// filled by pme_exec(); not read afterwards in this function
+	RealOpenMM virial[3][3];
+	
+	/* PME grid dimensions.
+	 * We typically want to set this as the spacing rather than absolute dimensions, but
+	 * to be able to reproduce results from other programs (e.g. Gromacs) we need to be
+	 * able to set exact grid dimensions occasionally.
+	 */
+	ngrid[0] = 16;
+	ngrid[1] = 16;
+	ngrid[2] = 16;
+	
+	// NOTE(review): the meaning of the trailing arguments (4, 1) is not visible here --
+	// presumably the interpolation order and a flag; confirm against the pme_init declaration.
+	// NOTE(review): no matching teardown call for pmedata appears in this function; verify
+	// whether the PME API requires one, since this handle is re-initialized on every call.
+	pme_init(&pmedata,alphaEwald,numberOfAtoms,ngrid,4,1);
+	
+	pme_exec(pmedata,atomCoordinates,forces,atomParameters,periodicBoxSize,&recipEnergy,virial);
+	
+// **************************************************************************************
+// SHORT-RANGE ENERGY AND FORCES
+// **************************************************************************************
+
+       // only row 0 of deltaR is used below
+       RealOpenMM deltaR[2][ReferenceForce::LastDeltaRIndex];
+
+       // first pass: q_i*q_j*erfc(alphaEwald*r)/r summed over every pair i<j; exclusions are
+       // not consulted here and no forces are applied
+       for( int atomID1 = 0; atomID1 < numberOfAtoms; atomID1++ ){
+        for( int atomID2 = atomID1 + 1; atomID2 < numberOfAtoms; atomID2++ ){
+
+          ReferenceForce::getDeltaRPeriodic( atomCoordinates[atomID2], atomCoordinates[atomID1], periodicBoxSize, deltaR[0] );  
+          RealOpenMM r         = deltaR[0][ReferenceForce::RIndex];
+          RealOpenMM r2        = deltaR[0][ReferenceForce::R2Index];
+          RealOpenMM inverseR  = one/(deltaR[0][ReferenceForce::RIndex]);
+
+          realSpaceEwaldEnergy = 
+              (RealOpenMM)(realSpaceEwaldEnergy + atomParameters[atomID1][QIndex]*atomParameters[atomID2][QIndex]*inverseR*erfc(alphaEwald*r)); 
+        }
+       }
+
+       // allocate and initialize exclusion array
+       // exclusionIndices[k] == ii marks atom k as excluded from atom ii for the current ii
+
+       vector<int> exclusionIndices(numberOfAtoms);
+       for( int ii = 0; ii < numberOfAtoms; ii++ ){
+          exclusionIndices[ii] = -1;
+       }
+
+       // second pass: non-excluded pairs only; applies forces and adds the same erfc energy
+       // term to realSpaceEwaldEnergy again.
+       // NOTE(review): non-excluded pairs are therefore counted in both passes -- confirm
+       // this is intended before the accumulated energy is ever reported.
+       for( int ii = 0; ii < numberOfAtoms; ii++ ){
+
+          // set exclusions
+
+          for( int jj = 1; jj <= exclusions[ii][0]; jj++ ){
+             exclusionIndices[exclusions[ii][jj]] = ii;
+          }
+
+          // loop over atom pairs
+
+          for( int jj = ii+1; jj < numberOfAtoms; jj++ ){
+
+             if( exclusionIndices[jj] != ii ){
+
+       ReferenceForce::getDeltaRPeriodic( atomCoordinates[jj], atomCoordinates[ii], periodicBoxSize, deltaR[0] );  
+       RealOpenMM r         = deltaR[0][ReferenceForce::RIndex];
+       RealOpenMM r2        = deltaR[0][ReferenceForce::R2Index];
+       RealOpenMM inverseR  = one/(deltaR[0][ReferenceForce::RIndex]);
+       RealOpenMM alphaR    = alphaEwald * r;
+ 
+       realSpaceEwaldEnergy = 
+           (RealOpenMM)(realSpaceEwaldEnergy + atomParameters[ii][QIndex]*atomParameters[jj][QIndex]*inverseR*erfc(alphaR)); 
+       // force factor: q_i*q_j/r^3 * ( erfc(alphaR) + 2*alphaR*exp(-alphaR^2)/sqrt(PI) )
+       RealOpenMM dEdR = atomParameters[ii][QIndex] * atomParameters[jj][QIndex] * inverseR * inverseR * inverseR;
+       dEdR = (RealOpenMM)(dEdR * (erfc(alphaR) + 2 * alphaR * exp ( - alphaR * alphaR) / SQRT_PI ));
+ 
+                // equal and opposite contributions on the two atoms
+                for( int kk = 0; kk < 3; kk++ ){
+                   RealOpenMM force  = dEdR*deltaR[0][kk];
+                   forces[ii][kk]   += force;
+                   forces[jj][kk]   -= force;
+                } 
+             }
+          }
+       }
+
+// ***********************************************************************
+
+
+   return ReferenceForce::DefaultReturn;
+}
+
+
+/**---------------------------------------------------------------------------------------
+
+   Calculate LJ Coulomb pair ixn
+
+   Dispatches on the configured method:
+     - ewald or pme : everything is forwarded to calculateEwaldIxn()
+     - cutoff       : one calculateOneIxn() call per entry of the neighbor list
+                      (the exclusions argument is not consulted on this path)
+     - otherwise    : one calculateOneIxn() call per pair ii < jj that is not excluded
+
+   @param numberOfAtoms    number of atoms
+   @param atomCoordinates  atom coordinates
+   @param atomParameters   atom parameters                             atomParameters[atomIndex][parameterIndex]
+   @param exclusions       atom exclusion indices                      exclusions[atomIndex][atomToExcludeIndex]
+                           exclusions[atomIndex][0] = number of exclusions
+                           exclusions[atomIndex][1..n] = indices of the atoms excluded from
+                           interacting with atom atomIndex
+   @param fixedParameters  non atom parameters (only forwarded to calculateEwaldIxn)
+   @param forces           force array (forces added)
+   @param energyByAtom     atom energy
+   @param totalEnergy      total energy
+
+   @return ReferenceForce::DefaultReturn, or the value returned by calculateEwaldIxn()
+
+   --------------------------------------------------------------------------------------- */
+
+int ReferenceFreeEnergyLJCoulombSoftcoreIxn::calculatePairIxn( int numberOfAtoms, RealOpenMM** atomCoordinates,
+                                             RealOpenMM** atomParameters, int** exclusions,
+                                             RealOpenMM* fixedParameters, RealOpenMM** forces,
+                                             RealOpenMM* energyByAtom, RealOpenMM* totalEnergy ) const {
+
+   // Ewald and PME share a single entry point
+
+   if (ewald || pme)
+        return calculateEwaldIxn(numberOfAtoms, atomCoordinates, atomParameters, exclusions, fixedParameters, forces, energyByAtom, totalEnergy);
+
+   if (cutoff) {
+
+       // the neighbor list supplies the pairs directly
+
+       for (int i = 0; i < (int) neighborList->size(); i++) {
+           OpenMM::AtomPair pair = (*neighborList)[i];
+           calculateOneIxn(pair.first, pair.second, atomCoordinates, atomParameters, forces, energyByAtom, totalEnergy);
+       }
+   }
+   else {
+
+       // Exclusion scratch array: exclusionIndices[k] == ii marks atom k as excluded from
+       // atom ii while row ii is being processed. A std::vector owns the storage so it is
+       // released even if calculateOneIxn() throws (the former new[]/delete[] pair leaked
+       // in that case); -1 can never match a valid atom index.
+
+       std::vector<int> exclusionIndices( numberOfAtoms, -1 );
+
+       for( int ii = 0; ii < numberOfAtoms; ii++ ){
+
+          // set exclusions
+
+          for( int jj = 1; jj <= exclusions[ii][0]; jj++ ){
+             exclusionIndices[exclusions[ii][jj]] = ii;
+          }
+
+          // loop over atom pairs
+
+          for( int jj = ii+1; jj < numberOfAtoms; jj++ ){
+
+             if( exclusionIndices[jj] != ii ){
+                 calculateOneIxn(ii, jj, atomCoordinates, atomParameters, forces, energyByAtom, totalEnergy);
+             }
+          }
+       }
+   }
+
+   return ReferenceForce::DefaultReturn;
+}
+
+  /**---------------------------------------------------------------------------------------
+
+     Calculate LJ Coulomb pair ixn between two atoms
+
+     The LJ part is evaluated with the soft-core form when the smaller of the two atoms'
+     soft-core lambdas is below 1, and with the plain LJ form otherwise. The Coulomb part
+     uses the krf/crf expression when a cutoff is active and the bare q_i*q_j/r otherwise.
+
+     @param ii               the index of the first atom
+     @param jj               the index of the second atom
+     @param atomCoordinates  atom coordinates
+     @param atomParameters   atom parameters (charges, c6, c12, ...)     atomParameters[atomIndex][parameterIndex]
+     @param forces           force array (forces added)
+     @param energyByAtom     atom energy (may be null)
+     @param totalEnergy      total energy (may be null)
+
+     @return ReferenceForce::DefaultReturn
+
+     --------------------------------------------------------------------------------------- */
+
+int ReferenceFreeEnergyLJCoulombSoftcoreIxn::calculateOneIxn( int ii, int jj, RealOpenMM** atomCoordinates,
+                        RealOpenMM** atomParameters, RealOpenMM** forces,
+                        RealOpenMM* energyByAtom, RealOpenMM* totalEnergy ) const {
+
+    static const std::string methodName = "\nReferenceFreeEnergyLJCoulombSoftcoreIxn::calculateOneIxn";
+
+    // typed constants -- reduce Visual Studio warnings regarding conversions between float & double
+
+    static const RealOpenMM zero        =  0.0;
+    static const RealOpenMM one         =  1.0;
+    static const int threeI             = 3;
+
+    // debug output is produced only for the atom whose index equals 'debug'
+
+    static const int debug              = -1;
+    static const int LastAtomIndex      = 2;
+
+    // separation vector plus R2 and R for the pair (only row 0 is filled)
+
+    RealOpenMM deltaR[2][ReferenceForce::LastDeltaRIndex];
+    if( periodic ){
+        ReferenceForce::getDeltaRPeriodic( atomCoordinates[jj], atomCoordinates[ii], periodicBoxSize, deltaR[0] );
+    } else {
+        ReferenceForce::getDeltaR( atomCoordinates[jj], atomCoordinates[ii], deltaR[0] );
+    }
+
+    const RealOpenMM distance         = deltaR[0][ReferenceForce::RIndex];
+    const RealOpenMM r2               = deltaR[0][ReferenceForce::R2Index];
+    const RealOpenMM inverseR         = one/distance;
+
+    // combined pair parameters
+
+    const RealOpenMM sig              = atomParameters[ii][SigIndex] +  atomParameters[jj][SigIndex];
+    const RealOpenMM eps              = atomParameters[ii][EpsIndex]*atomParameters[jj][EpsIndex];
+    const RealOpenMM chargeProduct    = atomParameters[ii][QIndex]*atomParameters[jj][QIndex];
+
+    // the smaller of the two per-atom lambdas governs the pair
+
+    RealOpenMM pairLambda             = atomParameters[jj][SoftCoreLJLambdaIndex];
+    if( atomParameters[ii][SoftCoreLJLambdaIndex] < atomParameters[jj][SoftCoreLJLambdaIndex] ){
+       pairLambda                     = atomParameters[ii][SoftCoreLJLambdaIndex];
+    }
+
+    // LJ: use soft core LJ if lambda < 1
+
+    RealOpenMM energy                 = zero;
+    RealOpenMM dEdR;
+    if( pairLambda < one ){
+       calculateOneSoftCoreLJIxn( distance, sig, eps, pairLambda, &dEdR, &energy );
+    } else {
+       calculateOneLJIxn( inverseR, sig, eps, &dEdR, &energy );
+    }
+
+    // Coulomb contribution to the force factor, then the common 1/r^2 scaling
+
+    if( cutoff ){
+       dEdR += chargeProduct*(inverseR-2.0f*krf*r2);
+    } else {
+       dEdR += chargeProduct*inverseR;
+    }
+    dEdR *= inverseR*inverseR;
+
+    // accumulate forces: equal and opposite on the two atoms
+
+    for( int axis = 0; axis < 3; axis++ ){
+       const RealOpenMM component = dEdR*deltaR[0][axis];
+       forces[ii][axis]          += component;
+       forces[jj][axis]          -= component;
+    }
+
+    // accumulate energies (skipped entirely when no energy output was requested)
+
+    if( totalEnergy || energyByAtom ){
+
+        if( cutoff ){
+            energy += chargeProduct*(inverseR+krf*r2-crf);
+        } else {
+            energy += chargeProduct*inverseR;
+        }
+
+        if( totalEnergy ){
+           *totalEnergy += energy;
+        }
+
+        // the full pair energy is added to both atoms
+
+        if( energyByAtom ){
+           energyByAtom[ii] += energy;
+           energyByAtom[jj] += energy;
+        }
+    }
+
+    // debug
+
+    if( debug == ii ){
+
+       static bool printHeader = false;
+       const int pairArray[2]  = { ii, jj };
+
+       std::stringstream message;
+       message << methodName << std::endl;
+
+       // column header is emitted once only
+
+       if( !printHeader ){
+          printHeader = true;
+          message << std::endl << methodName.c_str() << " a0 k [c q p s] r1 r2  angle dt rp p[] dot cosine angle dEdR*r F[]" << std::endl;
+       }
+       message << std::endl;
+
+       // coordinates of both atoms
+
+       for( int atom = 0; atom < 2; atom++ ){
+          const RealOpenMM* xyz = atomCoordinates[pairArray[atom]];
+          message << " Atm " << pairArray[atom] << " [" << xyz[0] << " " << xyz[1] << " " << xyz[2] << "] ";
+       }
+
+       // raw deltaR rows
+
+       message << std::endl << " Delta:";
+       for( int row = 0; row < (LastAtomIndex - 1); row++ ){
+          message << " [";
+          for( int column = 0; column < ReferenceForce::LastDeltaRIndex; column++ ){
+             message << deltaR[row][column] << " ";
+          }
+          message << "]";
+       }
+       message << std::endl;
+
+       // first three parameters of both atoms
+
+       for( int atom = 0; atom < 2; atom++ ){
+          const RealOpenMM* parameters = atomParameters[pairArray[atom]];
+          message << " p" << pairArray[atom] << " [" << parameters[0] << " " << parameters[1] << " " << parameters[2] << "]";
+       }
+       message << std::endl;
+
+       message << " dEdR=" << dEdR << " E=" << energy << " force factors: " << "F=compute force; f=cumulative force";
+
+       message << std::endl << "  " << " f" << ii << "[";
+       SimTKOpenMMUtilities::formatRealStringStream( message, deltaR[0], threeI, dEdR );
+       message << "]";
+
+       for( int atom = 0; atom < 2; atom++ ){
+          message << " F" <<  pairArray[atom] << " [";
+          SimTKOpenMMUtilities::formatRealStringStream( message, forces[pairArray[atom]], threeI );
+          message << "]";
+       }
+
+       SimTKOpenMMLog::printMessage( message );
+    }
+
+    return ReferenceForce::DefaultReturn;
+}
+
+  /**---------------------------------------------------------------------------------------
+
+     Calculate LJ pair ixn between two atoms
+
+     With x = (sig/r)**6 the routine produces
+        dEdR   = eps*(12*x - 6)*x      (overwrites *dEdR)
+        energy = eps*(x - 1)*x         (added to *energy)
+
+     @param inverseR         1/r
+     @param sig              sigma
+     @param eps              epsilon
+     @param dEdR             output force factor (assigned)
+     @param energy           LJ energy (accumulated into the incoming value)
+
+     @return ReferenceForce::DefaultReturn
+
+     --------------------------------------------------------------------------------------- */
+
+int ReferenceFreeEnergyLJCoulombSoftcoreIxn::calculateOneLJIxn( RealOpenMM inverseR, RealOpenMM sig, RealOpenMM eps,
+                        RealOpenMM* dEdR, RealOpenMM* energy ) const {
+
+    // typed constants -- reduce Visual Studio warnings regarding conversions between float & double
+
+    static const RealOpenMM one         =  1.0;
+    static const RealOpenMM six         =  6.0;
+    static const RealOpenMM twelve      = 12.0;
+
+    // (sig/r)**2 and (sig/r)**6
+
+    const RealOpenMM sigOverR           = inverseR*sig;
+    const RealOpenMM sigOverR2          = sigOverR*sigOverR;
+    const RealOpenMM sigOverR6          = sigOverR2*sigOverR2*sigOverR2;
+
+    // the force factor is assigned, the energy is accumulated
+
+    *dEdR                               = eps*( twelve*sigOverR6 - six )*sigOverR6;
+    *energy                            += eps*( sigOverR6 - one )*sigOverR6;
+
+    return ReferenceForce::DefaultReturn;
+}
+
+  /**---------------------------------------------------------------------------------------
+
+     Calculate softcore LJ pair ixn between two atoms
+
+     With x = (r/sig)**6 and s = alphaLJ*(1 - lambda) + x, where alphaLJ = 0.5:
+        dEdR   = lambda*eps*(12/s - 6)*x/s**2     (overwrites *dEdR)
+        energy = lambda*eps*(1/s - 1)/s           (added to *energy)
+
+     @param r                r
+     @param sig              sigma
+     @param eps              epsilon
+     @param lambda           lambda
+     @param dEdR             output force factor (assigned)
+     @param energy           LJ energy (accumulated into the incoming value)
+
+     @return ReferenceForce::DefaultReturn
+
+     --------------------------------------------------------------------------------------- */
+
+int ReferenceFreeEnergyLJCoulombSoftcoreIxn::calculateOneSoftCoreLJIxn( RealOpenMM r, RealOpenMM sig, RealOpenMM eps,
+                                                                        RealOpenMM lambda,
+                                                                        RealOpenMM* dEdR, RealOpenMM* energy ) const {
+
+    // typed constants -- reduce Visual Studio warnings regarding conversions between float & double
+
+    static const RealOpenMM one         =  1.0;
+    static const RealOpenMM six         =  6.0;
+    static const RealOpenMM twelve      = 12.0;
+    static const RealOpenMM alphaLJ     = 0.5;
+
+#if 0
+RealOpenMM dEdROrig = 0.0;
+RealOpenMM E_Orig   = 0.0;
+static int maxPrint = 0;
+calculateOneLJIxn( one/r, sig, eps, &dEdROrig, &E_Orig );
+#endif
+
+    // soft-core LJ energy = lambda*4*eps*[ 1/{alphaLJ*(1-lambda) + (r/sig)**6}**2 - 1/{alphaLJ*(1-lambda) + (r/sig)**6} ]
+
+    // epsilon scaled by lambda
+
+    const RealOpenMM scaledEps          = eps*lambda;
+
+    // (r/sig)**2 and (r/sig)**6
+
+    const RealOpenMM rOverSig           = r/sig;
+    const RealOpenMM rOverSig2          = rOverSig*rOverSig;
+    const RealOpenMM rOverSig6          = rOverSig2*rOverSig2*rOverSig2;
+
+    // softened denominator and its inverse powers
+
+    const RealOpenMM softDenominator    = alphaLJ*(one -  lambda) + rOverSig6;
+    const RealOpenMM softInverse        = one/softDenominator;
+    const RealOpenMM softInverse2       = softInverse*softInverse;
+
+    // the force factor is assigned, the energy is accumulated
+
+    *dEdR                               = scaledEps*softInverse2*( twelve*softInverse - six )*rOverSig6;
+    *energy                            += scaledEps*softInverse*( softInverse - one );
+
+#if 0
+if( maxPrint++ < 5 ){
+   printf( "r=%14.6e sig=%14.6e eps=%14.6e lambda=%14.6e de[%14.6e %14.6e] e[%14.6e %14.6e] %14.6e %14.6e\n",
+           r, sig, scaledEps/lambda, lambda, dEdROrig, *dEdR, E_Orig, *energy, softInverse, rOverSig6 ); 
+}
+#endif
+    return ReferenceForce::DefaultReturn;
+}
--- a/plugins/freeEnergy/platforms/reference/src/SimTKReference/ReferenceFreeEnergyLJCoulombSoftcoreIxn.h
+++ b/plugins/freeEnergy/platforms/reference/src/SimTKReference/ReferenceFreeEnergyLJCoulombSoftcoreIxn.h
+#ifndef __ReferenceFreeEnergyLJCoulombSoftcoreIxn_H__
+#define __ReferenceFreeEnergyLJCoulombSoftcoreIxn_H__
+
+/* Portions copyright (c) 2006 Stanford University and Simbios.
+ * Contributors: Pande Group
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject
+ * to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "SimTKReference/ReferencePairIxn.h"
+#include "SimTKReference/ReferenceNeighborList.h"
+
+// ---------------------------------------------------------------------------------------
+
+class ReferenceFreeEnergyLJCoulombSoftcoreIxn : public ReferencePairIxn {
+
+   private:
+       
+      bool cutoff;
+      bool periodic;
+      bool ewald;
+      bool pme;
+      const OpenMM::NeighborList* neighborList;
+      RealOpenMM periodicBoxSize[3];
+      RealOpenMM cutoffDistance;
+      RealOpenMM krf, crf;
+      RealOpenMM softCoreLJLambda;
+      int numRx, numRy, numRz;
+      RealOpenMM alphaEwald;
+
+      // parameter indices
+
+      static const int SigIndex                   = 0;
+      static const int EpsIndex                   = 1;
+      static const int   QIndex                   = 2;
+      static const int SoftCoreLJLambdaIndex      = 3;
+            
+      /**---------------------------------------------------------------------------------------
+      
+         Calculate LJ Coulomb pair ixn between two atoms
+      
+         @param atom1            the index of the first atom
+         @param atom2            the index of the second atom
+         @param atomCoordinates  atom coordinates
+         @param atomParameters   atom parameters (charges, c6, c12, ...)     atomParameters[atomIndex][parameterIndex]
+         @param forces           force array (forces added)
+         @param energyByAtom     atom energy
+         @param totalEnergy      total energy
+      
+         @return ReferenceForce::DefaultReturn
+            
+         --------------------------------------------------------------------------------------- */
+          
+      int calculateOneIxn( int atom1, int atom2, RealOpenMM** atomCoordinates,
+                            RealOpenMM** atomParameters, RealOpenMM** forces,
+                            RealOpenMM* energyByAtom, RealOpenMM* totalEnergy ) const;
+
+
+   public:
+
+      /**---------------------------------------------------------------------------------------
+      
+         Constructor
+      
+         --------------------------------------------------------------------------------------- */
+
+       ReferenceFreeEnergyLJCoulombSoftcoreIxn( );
+
+      /**---------------------------------------------------------------------------------------
+      
+         Destructor
+      
+         --------------------------------------------------------------------------------------- */
+
+       ~ReferenceFreeEnergyLJCoulombSoftcoreIxn( );
+
+      /**---------------------------------------------------------------------------------------
+      
+         Set the force to use a cutoff.
+      
+         @param distance            the cutoff distance
+         @param neighbors           the neighbor list to use
+         @param solventDielectric   the dielectric constant of the bulk solvent
+      
+         @return ReferenceForce::DefaultReturn
+      
+         --------------------------------------------------------------------------------------- */
+      
+      int setUseCutoff( RealOpenMM distance, const OpenMM::NeighborList& neighbors, RealOpenMM solventDielectric );
+      
+      /**---------------------------------------------------------------------------------------
+      
+         Set the force to use periodic boundary conditions.  This requires that a cutoff has
+         already been set, and the smallest side of the periodic box is at least twice the cutoff
+         distance.
+      
+         @param boxSize             the X, Y, and Z widths of the periodic box
+      
+         @return ReferenceForce::DefaultReturn
+      
+         --------------------------------------------------------------------------------------- */
+      
+      int setPeriodic( RealOpenMM* boxSize );      
+       
+      /**---------------------------------------------------------------------------------------
+      
+         Set the force to use Ewald summation.
+      
+         @param alpha  the Ewald separation parameter
+         @param kmaxx  the largest wave vector in the x direction
+         @param kmaxy  the largest wave vector in the y direction
+         @param kmaxz  the largest wave vector in the z direction
+      
+         --------------------------------------------------------------------------------------- */
+      
+      void setUseEwald(RealOpenMM alpha, int kmaxx, int kmaxy, int kmaxz);
+
+     
+      /**---------------------------------------------------------------------------------------
+      
+         Set the force to use Particle-Mesh Ewald (PME) summation.
+      
+         @param alpha  the Ewald separation parameter
+      
+         --------------------------------------------------------------------------------------- */
+      
+      void setUsePME(RealOpenMM alpha);
+
+      /**---------------------------------------------------------------------------------------
+      
+         Set the soft core LJ lambda
+      
+         @param lambda the soft core LJ lambda
+      
+         --------------------------------------------------------------------------------------- */
+      
+      void setSoftCoreLJLambda(RealOpenMM lambda);
+
+      /**---------------------------------------------------------------------------------------
+      
+         Calculate parameters for LJ 1-4 ixn
+      
+         @param c6               c6
+         @param c12              c12
+         @param q1               q1 charge atom
+         @param epsfacSqrt       epsfacSqrt (what is this?)
+         @param parameters       output parameters:
+                                    parameter[SigIndex]  = sqrt(c6*c6/c12)
+                                    parameter[EpsIndex]  = 0.5*( (c12/c6)**1/6 )
+                                    parameter[QIndex]    = epsfactorSqrt*q1
+      
+         @return ReferenceForce::DefaultReturn
+      
+         --------------------------------------------------------------------------------------- */
+      
+      int getDerivedParameters( RealOpenMM c6, RealOpenMM c12, RealOpenMM q1, 
+                                RealOpenMM epsfacSqrt,
+                                RealOpenMM* parameters ) const;
+      
+      /**---------------------------------------------------------------------------------------
+      
+         Calculate LJ Coulomb pair ixn
+      
+         @param numberOfAtoms    number of atoms
+         @param atomCoordinates  atom coordinates
+         @param atomParameters   atom parameters (charges, c6, c12, ...)     atomParameters[atomIndex][parameterIndex]
+         @param exclusions       atom exclusion indices                      exclusions[atomIndex][atomToExcludeIndex]
+                                 exclusions[atomIndex][0] = number of exclusions
+                                 exclusions[atomIndex][1..n] = indices of the atoms excluded from
+                                 interacting with atom atomIndex
+         @param fixedParameters  non atom parameters (not currently used)
+         @param forces           force array (forces added)
+         @param energyByAtom     atom energy
+         @param totalEnergy      total energy
+      
+         @return ReferenceForce::DefaultReturn
+            
+         --------------------------------------------------------------------------------------- */
+          
+      int calculatePairIxn( int numberOfAtoms, RealOpenMM** atomCoordinates,
+                            RealOpenMM** atomParameters, int** exclusions,
+                            RealOpenMM* fixedParameters, RealOpenMM** forces,
+                            RealOpenMM* energyByAtom, RealOpenMM* totalEnergy ) const;
+
+private:
+      /**---------------------------------------------------------------------------------------
+      
+         Calculate Ewald ixn (ixn = interaction)
+
+         Private member; takes the same parameter list as calculatePairIxn.
+         NOTE(review): presumably invoked from calculatePairIxn when the Ewald method is
+         selected -- confirm against the implementation
+      
+         @param numberOfAtoms    number of atoms
+         @param atomCoordinates  atom coordinates
+         @param atomParameters   atom parameters (charges, c6, c12, ...)     atomParameters[atomIndex][parameterIndex]
+         @param exclusions       atom exclusion indices                      exclusions[atomIndex][atomToExcludeIndex]
+                                 exclusions[atomIndex][0] = number of exclusions
+                                 exclusions[atomIndex][1..number of exclusions] = indices of the atoms excluded
+                                 from interacting w/ atom atomIndex
+         @param fixedParameters  non atom parameters (not currently used)
+         @param forces           force array (forces are added to the existing contents)
+         @param energyByAtom     atom energy
+         @param totalEnergy      total energy
+
+         @return ReferenceForce::DefaultReturn
+            
+         --------------------------------------------------------------------------------------- */
+          
+      int calculateEwaldIxn( int numberOfAtoms, RealOpenMM** atomCoordinates,
+                            RealOpenMM** atomParameters, int** exclusions,
+                            RealOpenMM* fixedParameters, RealOpenMM** forces,
+                            RealOpenMM* energyByAtom, RealOpenMM* totalEnergy ) const;
+      
+      /**---------------------------------------------------------------------------------------
+      
+         Calculate PME ixn (ixn = interaction)
+
+         Private member; takes the same parameter list as calculatePairIxn.
+         NOTE(review): presumably invoked from calculatePairIxn when the PME method is
+         selected -- confirm against the implementation
+      
+         @param numberOfAtoms    number of atoms
+         @param atomCoordinates  atom coordinates
+         @param atomParameters   atom parameters (charges, c6, c12, ...)     atomParameters[atomIndex][parameterIndex]
+         @param exclusions       atom exclusion indices                      exclusions[atomIndex][atomToExcludeIndex]
+                                 exclusions[atomIndex][0] = number of exclusions
+                                 exclusions[atomIndex][1..number of exclusions] = indices of the atoms excluded
+                                 from interacting w/ atom atomIndex
+         @param fixedParameters  non atom parameters (not currently used)
+         @param forces           force array (forces are added to the existing contents)
+         @param energyByAtom     atom energy
+         @param totalEnergy      total energy
+
+         @return ReferenceForce::DefaultReturn
+            
+         --------------------------------------------------------------------------------------- */
+          
+      int calculatePMEIxn( int numberOfAtoms, RealOpenMM** atomCoordinates,
+                            RealOpenMM** atomParameters, int** exclusions,
+                            RealOpenMM* fixedParameters, RealOpenMM** forces,
+                            RealOpenMM* energyByAtom, RealOpenMM* totalEnergy ) const;
+
+        /**---------------------------------------------------------------------------------------
+      
+           Calculate LJ pair ixn (interaction) between two atoms
+
+           Private member. Results are returned through the dEdR and energy pointers.
+      
+           @param inverseR         1/r (the inverse of the interatomic distance is passed, not r itself)
+           @param sig              sigma
+           @param eps              epsilon
+           @param dEdR             output force factor
+           @param energy           LJ energy (pointer argument)
+                                   NOTE(review): whether the value is overwritten or accumulated is not
+                                   visible from this header -- confirm against the implementation
+      
+           @return ReferenceForce::DefaultReturn
+      
+           --------------------------------------------------------------------------------------- */
+      
+      int calculateOneLJIxn( RealOpenMM inverseR, RealOpenMM sig, RealOpenMM eps,
+                             RealOpenMM* dEdR, RealOpenMM* energy ) const;
+      
+        /**---------------------------------------------------------------------------------------
+      
+           Calculate softcore LJ pair ixn (interaction) between two atoms
+
+           Private member. Unlike calculateOneLJIxn, which takes 1/r, this method takes the
+           distance r itself and an additional lambda argument.
+      
+           @param r                r
+           @param sig              sigma
+           @param eps              epsilon
+           @param lambda           lambda
+                                   NOTE(review): presumably the softcore coupling parameter; its valid
+                                   range is not visible from this header -- confirm against the implementation
+           @param dEdR             output force factor
+           @param energy           LJ energy (pointer argument)
+      
+           @return ReferenceForce::DefaultReturn
+      
+           --------------------------------------------------------------------------------------- */
+      
+      int calculateOneSoftCoreLJIxn( RealOpenMM r, RealOpenMM sig, RealOpenMM eps,
+                                     RealOpenMM lambda, RealOpenMM* dEdR, RealOpenMM* energy ) const;
+      
+      
+};
+
+// ---------------------------------------------------------------------------------------
+
+#endif // __ReferenceFreeEnergyLJCoulombSoftcoreIxn_H__