Deleted free energy plugin

873552ba · Peter Eastman · c775bd19 · c775bd19 · c775bd19 · c775bd19
Commit 873552ba authored Nov 16, 2012 by Peter Eastman
20 changed files
--- a/plugins/freeEnergy/platforms/cuda/src/CudaFreeEnergyKernelFactory.cpp
+++ b/plugins/freeEnergy/platforms/cuda/src/CudaFreeEnergyKernelFactory.cpp
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2008 Stanford University and the Authors.           *
- * Authors: Peter Eastman                                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-
-#include "CudaFreeEnergyKernelFactory.h"
-#include "CudaFreeEnergyKernels.h"
-#include "openmm/freeEnergyKernels.h"
-#include "FreeEnergyCudaData.h"
-#include "openmm/internal/ContextImpl.h"
-#include "openmm/OpenMMException.h"
-#include "kernels/GpuFreeEnergyCudaKernels.h"
-
-using namespace OpenMM;
-
-extern "C" void registerPlatforms() {
-}
-
-extern "C" void registerKernelFactories() {
-    // (void) fprintf( stderr, "initOpenMMCudaFreeEnergyPlugin called\n");
-    if ( gpuIsAvailableSoftcore() ){
-        for( int ii = 0; ii < Platform::getNumPlatforms(); ii++ ){
-            Platform& platform = Platform::getPlatform(ii);
-            if( platform.getName().compare( "Cuda" ) == 0 ){
-                 CudaFreeEnergyKernelFactory* factory = new CudaFreeEnergyKernelFactory();
-                 platform.registerKernelFactory(CalcNonbondedSoftcoreForceKernel::Name(), factory);
-                 platform.registerKernelFactory(CalcGBSAOBCSoftcoreForceKernel::Name(), factory);
-                 platform.registerKernelFactory(CalcGBVISoftcoreForceKernel::Name(), factory);
-            }
-        }
-    }   
-}
-
-extern "C" OPENMMCUDA_EXPORT void registerFreeEnergyCudaKernelFactories( void ) { 
-    int hasCudaPlatform = 0;
-    for( int ii = 0; ii < Platform::getNumPlatforms() && hasCudaPlatform == 0; ii++ ){
-        Platform& platform = Platform::getPlatform(ii);
-        if( platform.getName() == "Cuda" ){
-            hasCudaPlatform = 1;
-        }   
-    }   
-    if( hasCudaPlatform == 0 ){
-        if (gpuIsAvailable() ){
-            Platform::registerPlatform(new CudaPlatform());
-        }   
-    }   
-    registerKernelFactories();
-}
-
-static std::map<ContextImpl*, FreeEnergyCudaData*> contextToFreeEnergyDataMap;
-
-// look up FreeEnergyCudaData for input contextImpl in contextToFreeEnergyDataMap
-
-extern "C" void* getFreeEnergyCudaData( ContextImpl& context ) { 
-    std::map<ContextImpl*, FreeEnergyCudaData*>::const_iterator mapIterator  = contextToFreeEnergyDataMap.find(&context);
-    if( mapIterator == contextToFreeEnergyDataMap.end() ){
-        return NULL;
-    } else {
-        return static_cast<void*>(mapIterator->second);
-    }   
-}
-
-// remove FreeEnergyCudaData from contextToFreeEnergyDataMap
-
-extern "C" void removeFreeEnergyCudaDataFromContextMap( void* inputContext ) { 
-    ContextImpl* context = static_cast<ContextImpl*>(inputContext);
-    contextToFreeEnergyDataMap.erase( context );
-    return;
-}
-
-KernelImpl* CudaFreeEnergyKernelFactory::createKernelImpl(std::string name, const Platform& platform, ContextImpl& context) const {
-
-    // create FreeEnergyCudaData object if contextToFreeEnergyDataMap does not contain
-    // key equal to current context
-
-    FreeEnergyCudaData* freeEnergyCudaData;
-    std::map<ContextImpl*, FreeEnergyCudaData*>::const_iterator mapIterator  = contextToFreeEnergyDataMap.find(&context);
-
-    if( mapIterator == contextToFreeEnergyDataMap.end() ){
-        CudaPlatform::PlatformData& cudaPlatformData = *static_cast<CudaPlatform::PlatformData*>(context.getPlatformData());
-        freeEnergyCudaData                           = new FreeEnergyCudaData( cudaPlatformData );
-        contextToFreeEnergyDataMap[&context]         = freeEnergyCudaData;
-        //freeEnergyCudaData->setLog( stderr );
-        freeEnergyCudaData->setContextImpl( static_cast<void*>(&context) );
-    } else {
-        freeEnergyCudaData                           = mapIterator->second;
-    }
-
-    if (name == CalcNonbondedSoftcoreForceKernel::Name())
-        return new CudaFreeEnergyCalcNonbondedSoftcoreForceKernel(name, platform, *freeEnergyCudaData, context.getSystem());
-
-    if (name == CalcGBSAOBCSoftcoreForceKernel::Name())
-        return new CudaFreeEnergyCalcGBSAOBCSoftcoreForceKernel(name, platform, *freeEnergyCudaData);
-
-    if (name == CalcGBVISoftcoreForceKernel::Name())
-        return new CudaFreeEnergyCalcGBVISoftcoreForceKernel(name, platform, *freeEnergyCudaData);
-
-    throw OpenMMException( (std::string("Tried to create kernel with illegal kernel name '") + name + "'").c_str() );
-}
--- a/plugins/freeEnergy/platforms/cuda/src/CudaFreeEnergyKernels.cpp
+++ b/plugins/freeEnergy/platforms/cuda/src/CudaFreeEnergyKernels.cpp
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2008-2009 Stanford University and the Authors.      *
- * Authors: Peter Eastman                                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-
-#include "CudaFreeEnergyKernels.h"
-#include "CudaForceInfo.h"
-
-#include "openmm/Context.h"
-#include "openmm/OpenMMException.h"
-#include "openmm/internal/ContextImpl.h"
-#include "kernels/gputypes.h"
-#include "kernels/cudaKernels.h"
-#include "kernels/GpuFreeEnergyCudaKernels.h" 
-
-#include <cmath>
-#include <map>
-#include <cstring>
-#include <cstdlib>
-#include <typeinfo>
-
-using namespace OpenMM;
-
-typedef std::map< std::string, int > MapStringInt;
-typedef MapStringInt::iterator MapStringIntI;
-typedef MapStringInt::const_iterator MapStringIntCI;
-
-// force names
-
-const std::string HARMONIC_BOND_FORCE             = "HarmonicBond";
-const std::string HARMONIC_ANGLE_FORCE            = "HarmonicBond";
-const std::string PERIODIC_TORSION_FORCE          = "PeriodicTorsion";
-const std::string RB_TORSION_FORCE                = "RbTorsion";
-
-const std::string NB_FORCE                        = "Nb";
-const std::string NB_SOFTCORE_FORCE               = "NbSoftcore";
-
-const std::string NB_EXCEPTION_FORCE              = "NbException";
-const std::string NB_EXCEPTION_SOFTCORE_FORCE     = "NbSoftcoreException";
-
-const std::string GBSA_OBC_FORCE                  = "Obc";
-const std::string GBSA_OBC_SOFTCORE_FORCE         = "ObcSoftcore";
-
-const std::string GBVI_FORCE                      = "GBVI";
-const std::string GBVI_SOFTCORE_FORCE             = "GBVISoftcore";
-
-static void getForceMap(const System& system, MapStringInt& forceMap, FILE* log) { 
-
-    // check forces and relevant parameters
-
-    for(int i = 0; i < system.getNumForces(); ++i) {
-
-        int hit                 = 0;
-        const Force& force      = system.getForce(i);
-         std::string forceName  = "NA";
-
-        // bond
-
-        if( !hit ){
-
-            try {
-               const HarmonicBondForce& harmonicBondForce = dynamic_cast<const HarmonicBondForce&>(force);
-               forceMap[HARMONIC_BOND_FORCE]              = 1;
-               forceName                                  = HARMONIC_BOND_FORCE;
-               hit++;
-            } catch( std::bad_cast ){
-            }
-        }
-
-        // angle
-
-        if( !hit ){
-
-            try {
-               const HarmonicAngleForce& harmonicAngleForce = dynamic_cast<const HarmonicAngleForce&>(force);
-               forceMap[HARMONIC_ANGLE_FORCE]               = 1;
-               forceName                                    = HARMONIC_ANGLE_FORCE;
-               hit++;
-            } catch( std::bad_cast ){
-            }
-        }
-
-        // PeriodicTorsionForce
-
-        if( !hit ){
-
-            try {
-               const PeriodicTorsionForce & periodicTorsionForce = dynamic_cast<const PeriodicTorsionForce&>(force);
-               forceMap[PERIODIC_TORSION_FORCE]                  = 1;
-               forceName                                         = PERIODIC_TORSION_FORCE;
-               hit++;
-            } catch( std::bad_cast ){
-            }
-        }
-
-        // RBTorsionForce
-
-        if( !hit ){
-            try {
-               const RBTorsionForce& rBTorsionForce = dynamic_cast<const RBTorsionForce&>(force);
-               forceMap[RB_TORSION_FORCE]           = 1;
-               forceName                            = RB_TORSION_FORCE;
-               hit++;
-            } catch( std::bad_cast ){
-            }
-        }
-
-        // nonbonded
-
-        if( !hit ){
-            try {
-               const NonbondedForce& nbForce = dynamic_cast<const NonbondedForce&>(force);
-               forceMap[NB_FORCE]            = 1;
-               forceName                     = NB_FORCE;
-            } catch( std::bad_cast ){
-            }
-        }
-
-        // nonbonded softcore
-
-        if( !hit ){
-            try {
-               const NonbondedSoftcoreForce& nbForce = dynamic_cast<const NonbondedSoftcoreForce&>(force);
-               forceMap[NB_SOFTCORE_FORCE]           = 1;
-               forceName                             = NB_SOFTCORE_FORCE;
-            } catch( std::bad_cast ){
-            }
-        }
-
-        // GBSA OBC
-
-        if( !hit ){
-            try {
-               const GBSAOBCForce& obcForce       = dynamic_cast<const GBSAOBCForce&>(force);
-               forceMap[GBSA_OBC_FORCE]           = 1;
-               forceName                          = GBSA_OBC_FORCE;
-               hit++;
-            } catch( std::bad_cast ){
-            }
-        }
-
-        // GBSA OBC softcore
-
-        if( !hit ){
-            try {
-               const GBSAOBCSoftcoreForce& obcForce = dynamic_cast<const GBSAOBCSoftcoreForce&>(force);
-               forceMap[GBSA_OBC_SOFTCORE_FORCE]    = 1;
-               forceName                            = GBSA_OBC_SOFTCORE_FORCE;
-               hit++;
-            } catch( std::bad_cast ){
-            }
-        }
-
-        // GB/VI
-
-        if( !hit ){
-            try {
-               const GBVIForce& obcForce  = dynamic_cast<const GBVIForce&>(force);
-               forceMap[GBVI_FORCE]       = 1;
-               forceName                  = GBVI_FORCE;
-               hit++;
-            } catch( std::bad_cast ){
-            }
-        }
-
-        // GB/VI softcore
-
-        if( !hit ){
-            try {
-               const GBVISoftcoreForce& gbviForce = dynamic_cast<const GBVISoftcoreForce&>(force);
-               forceMap[GBVI_SOFTCORE_FORCE]      = 1;
-               forceName                          = GBVI_SOFTCORE_FORCE;
-               hit++;
-            } catch( std::bad_cast ){
-            }
-        }
-
-        if( log ){
-            (void) fprintf( log, "Map: Force %d %s\n", i, forceName.c_str() );
-        }
-     }
-}
-
-class CudaFreeEnergyCalcNonbondedSoftcoreForceKernel::ForceInfo : public CudaForceInfo {
-public:
-    ForceInfo(const NonbondedSoftcoreForce& force) : force(force) {
-    }    
-    bool areParticlesIdentical(int particle1, int particle2) {
-        double charge1, charge2, sigma1, sigma2, epsilon1, epsilon2, softcoreLJLambda1, softcoreLJLambda2;
-        force.getParticleParameters(particle1, charge1, sigma1, epsilon1, softcoreLJLambda1);
-        force.getParticleParameters(particle2, charge2, sigma2, epsilon2, softcoreLJLambda2);
-        return (charge1 == charge2 && sigma1 == sigma2 && epsilon1 == epsilon2 && softcoreLJLambda1 == softcoreLJLambda2);
-    }    
-    int getNumParticleGroups() {
-        return force.getNumExceptions();
-    }    
-    void getParticlesInGroup(int index, std::vector<int>& particles) {
-        int particle1, particle2;
-        double chargeProd, sigma, epsilon, softcoreLJLambda;
-        force.getExceptionParameters(index, particle1, particle2, chargeProd, sigma, epsilon, softcoreLJLambda);
-        particles.resize(2);
-        particles[0] = particle1;
-        particles[1] = particle2;
-    }    
-    bool areGroupsIdentical(int group1, int group2) {
-        int particle1, particle2;
-        double chargeProd1, chargeProd2, sigma1, sigma2, epsilon1, epsilon2, softcoreLJLambda1, softcoreLJLambda2;
-        force.getExceptionParameters(group1, particle1, particle2, chargeProd1, sigma1, epsilon1, softcoreLJLambda1);
-        force.getExceptionParameters(group2, particle1, particle2, chargeProd2, sigma2, epsilon2, softcoreLJLambda2);
-        return (chargeProd1 == chargeProd2 && sigma1 == sigma2 && epsilon1 == epsilon2 && softcoreLJLambda1 == softcoreLJLambda2);
-    }    
-private:
-    const NonbondedSoftcoreForce& force;
-};
-
-CudaFreeEnergyCalcNonbondedSoftcoreForceKernel::~CudaFreeEnergyCalcNonbondedSoftcoreForceKernel() {
-    if( 0 && data.getLog() ){
-        (void) fprintf( data.getLog(), "~CudaFreeEnergyCalcNonbondedSoftcoreForceKernel called.\n" );
-        (void) fflush( data.getLog() );
-    }
-    data.decrementKernelCount();
-}
-
-void CudaFreeEnergyCalcNonbondedSoftcoreForceKernel::initialize(const System& system, const NonbondedSoftcoreForce& force) {
-
-// ---------------------------------------------------------------------------------------
-
-   static const std::string methodName      = "CudaFreeEnergyCalcNonbondedSoftcoreForceKernel::initialize";
-
-// ---------------------------------------------------------------------------------------
-
-    if( data.getLog() ){
-        (void) fprintf( data.getLog(), "%s called.\n", methodName.c_str() );
-        (void) fflush( data.getLog() );
-    }
-
-    // check forces and relevant parameters
-
-    MapStringInt forceMap;
-    getForceMap( system, forceMap, data.getLog() );
-
-    int softcore        = 0;
-    if( forceMap.find( GBSA_OBC_FORCE ) != forceMap.end() ){
-       setIncludeGBSA( true );
-    }
-
-    if( forceMap.find( GBSA_OBC_SOFTCORE_FORCE ) != forceMap.end() ){
-       setIncludeGBSA( true );
-       softcore++;
-    }
-
-    if( forceMap.find( GBVI_FORCE ) != forceMap.end() ){
-       setIncludeGBVI( true );
-    }
-    if( forceMap.find( GBVI_SOFTCORE_FORCE ) != forceMap.end() ){
-       setIncludeGBVI( true );
-       softcore++;
-    }
-
-    if( forceMap.find( NB_SOFTCORE_FORCE ) != forceMap.end() ){
-       softcore++;
-    }
-
-    setIncludeSoftcore( softcore );
-
-    numParticles      = force.getNumParticles();
-
-    // Identify which exceptions are 1-4 interactions.
-
-    std::vector<pair<int, int> > exclusions;
-    std::vector<int> exceptions;
-    for (int i = 0; i < force.getNumExceptions(); i++) {
-        int particle1, particle2;
-        double chargeProd, sigma, epsilon, softcoreLJLambda;
-        force.getExceptionParameters(i, particle1, particle2, chargeProd, sigma, epsilon, softcoreLJLambda);
-        exclusions.push_back(pair<int, int>(particle1, particle2));
-        if (chargeProd != 0.0 || epsilon != 0.0)
-            exceptions.push_back(i);
-    }
-
-    // Initialize nonbonded interactions.
-    
-    if( numParticles > 0 ){
-        std::vector<int> particle(numParticles);
-        std::vector<float> c6(numParticles);
-        std::vector<float> c12(numParticles);
-        std::vector<float> q(numParticles);
-        std::vector<float> softcoreLJLambdaArray(numParticles);
-        std::vector<char> symbol;
-        std::vector<std::vector<int> > exclusionList(numParticles);
-        for (int i = 0; i < numParticles; i++) {
-            double charge, radius, depth, softcoreLJLambda;
-            force.getParticleParameters(i, charge, radius, depth, softcoreLJLambda);
-            particle[i]              = i;
-            q[i]                     = static_cast<float>( charge );
-            c6[i]                    = static_cast<float>( (4*depth*pow(radius, 6.0)) );
-            c12[i]                   = static_cast<float>( (4*depth*pow(radius, 12.0)) );
-            softcoreLJLambdaArray[i] = static_cast<float>( softcoreLJLambda );
-            exclusionList[i].push_back(i);
-        }
-
-        for (int i = 0; i < (int)exclusions.size(); i++) {
-            exclusionList[exclusions[i].first].push_back(exclusions[i].second);
-            exclusionList[exclusions[i].second].push_back(exclusions[i].first);
-        }
-        Vec3 boxVectors[3];
-        system.getDefaultPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
-        freeEnergyGpuSetPeriodicBoxSize( data.getFreeEnergyGpu(), static_cast<float>(boxVectors[0][0] ), static_cast<float>(boxVectors[1][1] ), static_cast<float>(boxVectors[2][2] ));
-
-        CudaFreeEnergyNonbondedMethod method = FREE_ENERGY_NO_CUTOFF;
-        if (force.getNonbondedMethod() != NonbondedSoftcoreForce::NoCutoff) {
-            method = FREE_ENERGY_CUTOFF;
-        }
-        if (force.getNonbondedMethod() == NonbondedSoftcoreForce::CutoffPeriodic) {
-            method = FREE_ENERGY_PERIODIC;
-        }
-
-        // setup parameters
-
-        gpuSetNonbondedSoftcoreParameters( data.getFreeEnergyGpu(), 138.935485f, particle, c6, c12, q,
-                                           softcoreLJLambdaArray, symbol, exclusionList, method,
-                                           static_cast<float>(force.getCutoffDistance() ), static_cast<float>(force.getReactionFieldDielectric()));
-    }
-
-    // Initialize 1-4 nonbonded interactions.
-
-    numExceptions = exceptions.size();
-    if( numExceptions > 0 ){
-        std::vector<int> particle1(numExceptions);
-        std::vector<int> particle2(numExceptions);
-        std::vector<float> c6(numExceptions);
-        std::vector<float> c12(numExceptions);
-        std::vector<float> qProd(numExceptions);
-        std::vector<float> softcoreLJLambdaArray(numExceptions);
-        for (int i = 0; i < numExceptions; i++) {
-            double charge, sig, eps, softcoreLJLambda;
-            force.getExceptionParameters(exceptions[i], particle1[i], particle2[i], charge, sig, eps, softcoreLJLambda);
-            c6[i]                    = static_cast<float>( (4.0*eps*pow(sig, 6.0)) );
-            c12[i]                   = static_cast<float>( (4.0*eps*pow(sig, 12.0)) );
-            qProd[i]                 = static_cast<float>( charge );
-            softcoreLJLambdaArray[i] = static_cast<float>( softcoreLJLambda );
-        }
-        gpuSetLJ14SoftcoreParameters( data.getFreeEnergyGpu(), 138.935485f, particle1, particle2, c6, c12, qProd, softcoreLJLambdaArray);
-    } else if( data.getLog() ){
-        (void) fprintf( data.getLog(), "Mo nonbonded softcore exceptions.\n" );
-        (void) fflush( data.getLog() );
-    }
-
-    data.getFreeEnergyGpu()->gpuContext->forces.push_back(new ForceInfo(force));
-}
-
-double CudaFreeEnergyCalcNonbondedSoftcoreForceKernel::execute( ContextImpl& context, bool includeForces, bool includeEnergy ){
-
-// ---------------------------------------------------------------------------------------
-
-   static const std::string methodName      = "CudaFreeEnergyCalcNonbondedSoftcoreForceKernel::executeForces";
-
-// ---------------------------------------------------------------------------------------
-
-    freeEnergyGpuContext gpu = data.getFreeEnergyGpu();
-
-    data.initializeGpu( );
- 
-    // calculate nonbonded ixns here, only if implicit solvent is inactive
-
-    if ( !getIncludeGBSA() && !getIncludeGBVI() ) {
-        kCalculateCDLJSoftcoreForces(gpu);
-    }
-
-    // local LJ-14 forces
-
-    if( getNumExceptions() > 0 ){
-        kCalculateLocalSoftcoreForces(gpu);
-    }
-
-    return 0.0;
-}
-
-bool CudaFreeEnergyCalcNonbondedSoftcoreForceKernel::getIncludeGBSA( void ) const {
-    return bIncludeGBSA;
-}
-
-void CudaFreeEnergyCalcNonbondedSoftcoreForceKernel::setIncludeGBSA( bool inputIncludeGBSA ){
-    bIncludeGBSA = inputIncludeGBSA;
-}
-
-bool CudaFreeEnergyCalcNonbondedSoftcoreForceKernel::getIncludeGBVI( void ) const {
-    return bIncludeGBVI;
-}
-
-void CudaFreeEnergyCalcNonbondedSoftcoreForceKernel::setIncludeGBVI( bool inputIncludeGBVI ){
-    bIncludeGBVI = inputIncludeGBVI;
-}
-
-int CudaFreeEnergyCalcNonbondedSoftcoreForceKernel::getIncludeSoftcore( void ) const {
-    return includeSoftcore;
-}
-
-int CudaFreeEnergyCalcNonbondedSoftcoreForceKernel::getNumExceptions( void ) const {
-    return numExceptions;
-}
-
-void CudaFreeEnergyCalcNonbondedSoftcoreForceKernel::setIncludeSoftcore( int inputIncludeSoftcore ){
-    includeSoftcore = inputIncludeSoftcore;
-}
-
-class CudaFreeEnergyCalcGBSAOBCSoftcoreForceKernel::ForceInfo : public CudaForceInfo {
-public:
-    ForceInfo(const GBSAOBCSoftcoreForce& force) : force(force) {
-    }
-    bool areParticlesIdentical(int particle1, int particle2) {
-        double charge1, charge2, radius1, radius2, scale1, scale2, particleNonPolarScalingFactor1, particleNonPolarScalingFactor2;
-        force.getParticleParameters(particle1, charge1, radius1, scale1, particleNonPolarScalingFactor1);
-        force.getParticleParameters(particle2, charge2, radius2, scale2, particleNonPolarScalingFactor2);
-        return (charge1 == charge2 && radius1 == radius2 && scale1 == scale2 && particleNonPolarScalingFactor1 == particleNonPolarScalingFactor2);
-    }
-private:
-    const GBSAOBCSoftcoreForce& force;
-};
-
-CudaFreeEnergyCalcGBSAOBCSoftcoreForceKernel::~CudaFreeEnergyCalcGBSAOBCSoftcoreForceKernel() {
-    if( 0 && data.getLog() ){
-        (void) fprintf( data.getLog(), "~CudaFreeEnergyCalcGBSAOBCSoftcoreForceKernel called.\n" );
-        (void) fflush( data.getLog() );
-    }
-    data.decrementKernelCount();
-}
-
-void CudaFreeEnergyCalcGBSAOBCSoftcoreForceKernel::initialize(const System& system, const GBSAOBCSoftcoreForce& force) {
-
-// ---------------------------------------------------------------------------------------
-
-    freeEnergyGpuContext gpu  = data.getFreeEnergyGpu();
-
-    MapStringInt forceMap;
-    getForceMap( system, forceMap, log);
-
-    // check that nonbonded (non-softcore is not active)
-
-    if( forceMap.find( NB_FORCE ) != forceMap.end() ){ 
-        throw OpenMMException( "Mixing NonbondedForce and GBSAOBCSoftoreForce is not allowed -- use NonbondedSoftcoreForce " );
-    }
-    if( forceMap.find( NB_SOFTCORE_FORCE ) == forceMap.end() ){ 
-        throw OpenMMException( "NonbondedSoftcore force must be included w/ GBSAOBCSoftcore force." );
-    }
-
-    int numParticles = system.getNumParticles();
-
-    std::vector<float> radius(numParticles);
-    std::vector<float> scale(numParticles);
-    std::vector<float> charge(numParticles);
-    std::vector<float> nonPolarScalingFactors(numParticles);
-
-    for( int ii = 0; ii < numParticles; ii++ ){
-        double particleCharge, particleRadius, scalingFactor, particleNonPolarScalingFactor;
-        force.getParticleParameters( ii, particleCharge, particleRadius, scalingFactor, particleNonPolarScalingFactor);
-        radius[ii]                 = static_cast<float>( particleRadius);
-        scale[ii]                  = static_cast<float>( scalingFactor);
-        charge[ii]                 = static_cast<float>( particleCharge);
-        nonPolarScalingFactors[ii] = static_cast<float>( particleNonPolarScalingFactor);
-    }
-
-    gpuSetObcSoftcoreParameters( gpu, static_cast<float>( force.getSoluteDielectric()),
-                                 static_cast<float>( force.getSolventDielectric()),
-                                 static_cast<float>( force.getNonPolarPrefactor()),
-                                 radius, scale, charge, nonPolarScalingFactors );
-
-    data.getFreeEnergyGpu()->gpuContext->forces.push_back(new ForceInfo(force));
-    return;
-
-}
-
-double CudaFreeEnergyCalcGBSAOBCSoftcoreForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
-
-// ---------------------------------------------------------------------------------------
-
-    freeEnergyGpuContext freeEnergyGpu = data.getFreeEnergyGpu();
-    gpuContext gpu                     = freeEnergyGpu->gpuContext;
-    int call = 0;
-    
-    // send address's of arrays, ... to device on first call
-    // required since force/energy buffers not set when CudaFreeEnergyCalcGBVISoftcoreForceKernel::initialize() was called
-
-    data.initializeGpu( );
-
-    // (1) clear Born force array
-    // (2) calculate Born radii and sum
-    // (3) loop 1
-    // (4) sum/calculate Born forces
-    // (5) loop 2
-
-    kClearSoftcoreBornForces(gpu);
-    kCalculateObcGbsaSoftcoreBornSum( freeEnergyGpu );
-    kReduceObcGbsaSoftcoreBornSum(gpu);
-    kCalculateCDLJObcGbsaSoftcoreForces1( freeEnergyGpu );
-
-    // sum Born forces and  execute second OBC loop
-
-    kReduceObcGbsaSoftcoreBornForces(gpu);
-    kCalculateObcGbsaSoftcoreForces2( freeEnergyGpu );
-
-    if( data.getLog() ){
-        kPrintObcGbsaSoftcore( freeEnergyGpu, "Post kCalculateObcGbsaSoftcoreForces2", call, data.getLog() );
-    }
-
-    return 0.0;
-}
-
-class CudaFreeEnergyCalcGBVISoftcoreForceKernel::ForceInfo : public CudaForceInfo {
-public:
-    ForceInfo(const GBVISoftcoreForce& force) : force(force) {
-    }    
-    bool areParticlesIdentical(int particle1, int particle2) {
-        double charge1, charge2, radius1, radius2, gamma1, gamma2, bornRadiusScaleFactor1, bornRadiusScaleFactor2;
-        force.getParticleParameters(particle1, charge1, radius1, gamma1, bornRadiusScaleFactor1);
-        force.getParticleParameters(particle2, charge2, radius2, gamma2, bornRadiusScaleFactor2);
-        return (charge1 == charge2 && radius1 == radius2 && gamma1 == gamma2 && bornRadiusScaleFactor1 == bornRadiusScaleFactor2);
-    }    
-private:
-    const GBVISoftcoreForce& force;
-};
-
-CudaFreeEnergyCalcGBVISoftcoreForceKernel::~CudaFreeEnergyCalcGBVISoftcoreForceKernel() {
-    if( 0 && data.getLog() ){
-        (void) fprintf( data.getLog(), "~CudaFreeEnergyCalcGBVISoftcoreForceKernel called.\n" );
-        (void) fflush( data.getLog() );
-    }
-    data.decrementKernelCount();
-}
-
-void CudaFreeEnergyCalcGBVISoftcoreForceKernel::initialize(const System& system, const GBVISoftcoreForce& force, const std::vector<double> & inputScaledRadii) {
-
-// ---------------------------------------------------------------------------------------
-
-    int numParticles          = system.getNumParticles();
-    freeEnergyGpuContext gpu  = data.getFreeEnergyGpu();
-
-    // check forces and relevant parameters
-
-    MapStringInt forceMap;
-    getForceMap( system, forceMap, log);
-
-    // check that nonbonded (non-softcore is not active)
-
-    if( forceMap.find( NB_FORCE ) != forceMap.end() ){ 
-        throw OpenMMException( "Mixing NonbondedForce and GBVISoftoreForce not allowed -- use NonbondedSoftcoreForce " );
-    }
-
-    std::vector<int>   particle(numParticles);
-    std::vector<float> radius(numParticles);
-    std::vector<float> scaledRadii(numParticles);
-    std::vector<float> gammas(numParticles);
-    std::vector<float> bornRadiusScaleFactors(numParticles);
-
-    for( int i = 0; i < numParticles; i++ ){
-        double charge, particleRadius, gamma, bornRadiusScaleFactor;
-        force.getParticleParameters(i, charge, particleRadius, gamma, bornRadiusScaleFactor);
-        particle[i]                  = i;
-        radius[i]                    = static_cast<float>( particleRadius );
-        gammas[i]                    = static_cast<float>( gamma );
-        scaledRadii[i]               = static_cast<float>( inputScaledRadii[i] );
-        bornRadiusScaleFactors[i]    = static_cast<float>( bornRadiusScaleFactor );
-    }
-
-    std::vector<float> quinticSplineParameters;
-    if( force.getBornRadiusScalingMethod() == GBVISoftcoreForce::QuinticSpline ){
-
-        // quintic spline
-
-        quinticSplineParameters.resize(2);
-        quinticSplineParameters[0] = static_cast<float>(force.getQuinticLowerLimitFactor());
-        quinticSplineParameters[1] = static_cast<float>(force.getQuinticUpperBornRadiusLimit());
-        quinticSplineParameters[1] = powf( quinticSplineParameters[1], -3.0f ); 
-        quinticScaling =  1;
-    }
-
-    // load parameters onto board
-    // defined in kCalculateGBVISoftcore.cu
-
-    gpuSetGBVISoftcoreParameters( gpu, static_cast<float>( force.getSoluteDielectric() ), static_cast<float>( force.getSolventDielectric() ),
-                                  particle, radius, gammas, scaledRadii, bornRadiusScaleFactors, quinticSplineParameters);
-
-    data.getFreeEnergyGpu()->gpuContext->forces.push_back(new ForceInfo(force));
-
-    return;
-}
-
-double CudaFreeEnergyCalcGBVISoftcoreForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
-
-    freeEnergyGpuContext freeEnergyGpu = data.getFreeEnergyGpu();
-    gpuContext gpu                     = freeEnergyGpu->gpuContext;
-    
-    // send address's of arrays, ... to device on first call
-    // required since force/energy buffers not set when CudaFreeEnergyCalcGBVISoftcoreForceKernel::initialize() was called
-
-    data.initializeGpu( );
-
-    // (1) clear Born force array
-    // (2) calculate Born radii and sum
-    // (3) loop 1
-    // (4) sum/calculate Born forces
-    // (5) loop 2
-
-    // calculate Born radii and first loop of GB/VI forces
-
-    kClearSoftcoreBornForces(gpu);
-
-    kCalculateGBVISoftcoreBornSum( freeEnergyGpu );
-
-    if( quinticScaling ){
-        kReduceGBVIBornSumQuinticScaling( freeEnergyGpu );
-    } else {
-        kReduceGBVISoftcoreBornSum( freeEnergyGpu );
-    }
-
-    kCalculateCDLJObcGbsaSoftcoreForces1( freeEnergyGpu );
-
-    if( quinticScaling ){
-        kReduceGBVIBornForcesQuinticScaling(freeEnergyGpu);
-    } else {
-        kReduceGBVISoftcoreBornForces( freeEnergyGpu );
-    }
-
-    // second loop of GB/VI forces
-
-    kCalculateGBVISoftcoreForces2( freeEnergyGpu );
-    if( data.getLog() ){
-        kPrintGBVISoftcore( freeEnergyGpu, "Post kCalculateGBVISoftcoreForces2", 0, data.getLog() );
-    }
-
-    return 0.0;
-}
--- a/plugins/freeEnergy/platforms/cuda/src/CudaFreeEnergyKernels.h
+++ b/plugins/freeEnergy/platforms/cuda/src/CudaFreeEnergyKernels.h
-#ifndef OPENMM_FREE_ENERGY_CUDA_KERNELS_H_
-#define OPENMM_FREE_ENERGY_CUDA_KERNELS_H_
-
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2008 Stanford University and the Authors.           *
- * Authors: Peter Eastman                                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-
-#include "CudaPlatform.h"
-#include "openmm/kernels.h"
-#include "kernels/gputypes.h"
-#include "openmm/System.h"
-#include "OpenMMFreeEnergy.h"
-#include "openmm/freeEnergyKernels.h"
-#include "FreeEnergyCudaData.h"
-
-namespace OpenMM {
-
-/**
- * This kernel is invoked by NonbondedSoftcoreForce to calculate the forces acting on the system.
- */
-class CudaFreeEnergyCalcNonbondedSoftcoreForceKernel : public CalcNonbondedSoftcoreForceKernel {
-
-public:
-    CudaFreeEnergyCalcNonbondedSoftcoreForceKernel(std::string name, const Platform& platform, FreeEnergyCudaData& data, System& system) :
-             CalcNonbondedSoftcoreForceKernel(name, platform), data(data), system(system) {
-
-        numExceptions        = 0;
-        numParticles         = 0;
-        bIncludeGBSA         = false;
-        bIncludeGBVI         = false;
-        includeSoftcore      = false;
-        log                  = NULL;
-        data.incrementKernelCount();
-    }
-
-    ~CudaFreeEnergyCalcNonbondedSoftcoreForceKernel();
-
-    /**
-     * Initialize the kernel.
-     * 
-     * @param system     the System this kernel will be applied to
-     * @param force      the NonbondedForce this kernel will be used for
-     */
-    void initialize(const System& system, const NonbondedSoftcoreForce& force);
-    /**
-     * Execute the kernel to calculate the forces and/or energy.
-     *
-     * @param context        the context in which to execute this kernel
-     * @param includeForces  true if forces should be calculated
-     * @param includeEnergy  true if the energy should be calculated
-     * @return the potential energy due to the force
-     */
-    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
-    /**
-     * Get flag signalling whether GBSA/OBC force is included
-     *
-     * @return flag
-     */
-    bool getIncludeGBSA( void ) const;
-    /**
-     * Set flag signalling whether GBSA/OBC force is included
-     *
-     * @param inputIncludeGBSA input flag value
-     */
-    void setIncludeGBSA( bool inputIncludeGBSA );
-    /**
-     * Get flag signalling whether GB/VI force is included
-     *
-     * @return flag
-     */
-    bool getIncludeGBVI( void ) const;
-    /**
-     * Set flag signalling whether GB/VI force is included
-     *
-     * @param inputIncludeGBVI input flag value
-     */
-    void setIncludeGBVI( bool inputIncludeGBVI );
-    /**
-     * Get flag signalling whether softcore force is included
-     *
-     * @return flag
-     */
-    int getIncludeSoftcore( void ) const;
-    /**
-     * Set flag signalling whether GB/VI force is included
-     *
-     * @param inputIncludeGBVI input flag value
-     */
-    void setIncludeSoftcore( int inputSoftcore);
-    /**
-     * Get number of exceptions
-     *
-     * @return number of exceptions
-     */
-    int getNumExceptions( void ) const;
-private:
-    FreeEnergyCudaData& data;
-    class ForceInfo;
-    int numParticles;
-    System& system;
-    bool bIncludeGBSA;
-    bool bIncludeGBVI;
-    int includeSoftcore;
-    int numExceptions;
-    FILE* log;
-};
-
-/**
- * This kernel is invoked by GBSAOBCForce to calculate the forces acting on the system.
- */
-class CudaFreeEnergyCalcGBSAOBCSoftcoreForceKernel : public CalcGBSAOBCSoftcoreForceKernel {
-public:
-    CudaFreeEnergyCalcGBSAOBCSoftcoreForceKernel(std::string name, const Platform& platform, FreeEnergyCudaData& data) :
-       CalcGBSAOBCSoftcoreForceKernel(name, platform), data(data) {
-        log                  = NULL;
-        data.incrementKernelCount();
-    }
-    ~CudaFreeEnergyCalcGBSAOBCSoftcoreForceKernel();
-    /**
-     * Initialize the kernel.
-     * 
-     * @param system     the System this kernel will be applied to
-     * @param force      the GBSAOBCForce this kernel will be used for
-     */
-    void initialize(const System& system, const GBSAOBCSoftcoreForce& force);
-    /**
-     * Execute the kernel to calculate the forces and/or energy.
-     *
-     * @param context        the context in which to execute this kernel
-     * @param includeForces  true if forces should be calculated
-     * @param includeEnergy  true if the energy should be calculated
-     * @return the potential energy due to the force
-     */
-    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
-private:
-    FreeEnergyCudaData& data;
-    class ForceInfo;
-    FILE* log;
-};
-
-/**
- * This kernel is invoked by GBVIForce to calculate the forces acting on the system.
- */
-class CudaFreeEnergyCalcGBVISoftcoreForceKernel : public CalcGBVISoftcoreForceKernel {
-public:
-    CudaFreeEnergyCalcGBVISoftcoreForceKernel(std::string name, const Platform& platform, FreeEnergyCudaData& data) :
-         CalcGBVISoftcoreForceKernel(name, platform), data(data) {
-
-        log                  = NULL;
-        quinticScaling       = 0;
-        data.incrementKernelCount();
-    }
-
-    ~CudaFreeEnergyCalcGBVISoftcoreForceKernel();
-    /**
-     * Initialize the kernel.
-     * 
-     * @param system      the System this kernel will be applied to
-     * @param force       the GBVIForce this kernel will be used for
-     * @param scaledRadii the scaled radii (Eq. 5 of Labute paper)
-     */
-    void initialize(const System& system, const GBVISoftcoreForce& force, const std::vector<double> & scaledRadii);
-    /**
-     * Execute the kernel to calculate the forces and/or energy.
-     *
-     * @param context        the context in which to execute this kernel
-     * @param includeForces  true if forces should be calculated
-     * @param includeEnergy  true if the energy should be calculated
-     * @return the potential energy due to the force
-     */
-    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
-
-private:
-    FreeEnergyCudaData& data;
-    class ForceInfo;
-    FILE* log;
-    int quinticScaling;
-};
-
-} // namespace OpenMM
-
-#endif /*OPENMM_FREE_ENERGY_CUDA_KERNELS_H_*/
--- a/plugins/freeEnergy/platforms/cuda/src/FreeEnergyCudaData.cpp
+++ b/plugins/freeEnergy/platforms/cuda/src/FreeEnergyCudaData.cpp
-/* -------------------------------------------------------------------------- *
- *                               OpenMMFreeEnergy                                 *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2008-2009 Stanford University and the Authors.      *
- * Authors:                                                                   *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-
-#include "FreeEnergyCudaData.h"
-#include "openmm/OpenMMException.h"
-#include <sstream>
-
-extern "C" void removeFreeEnergyCudaDataFromContextMap( void* context ); 
-
-namespace OpenMM {
-
-FreeEnergyCudaData::FreeEnergyCudaData( CudaPlatform::PlatformData& data ) : cudaPlatformData(data) {
-        
-    kernelCount                   = 0;
-    freeEnergyGpu                 = freeEnergyGpuInit( cudaPlatformData.gpu );
-
-    log                           = NULL;
-    contextImpl                   = NULL;
-    gpuInitialized                = false;
-
-    boxDimensions[0]              = 0.0;
-    boxDimensions[1]              = 0.0;
-    boxDimensions[2]              = 0.0;
-}   
-
-FreeEnergyCudaData::~FreeEnergyCudaData() {
-    if( getLog() ){
-        (void) fprintf( getLog(), "~FreeEnergyCudaData called kernelCount=%d\n", kernelCount );
-        (void) fflush( getLog() );
-    }   
-    freeEnergyGpuShutDown( freeEnergyGpu );
-}
-
-void FreeEnergyCudaData::decrementKernelCount( void ) {
-
-    kernelCount--;
-    if( getLog() ){
-        (void) fprintf( getLog(), "~reeEnergyCudaData decrementKernelCount called. %d\n", kernelCount );
-        (void) fflush( getLog() );
-    }   
-    if( kernelCount == 0 && contextImpl != NULL ){
-        removeFreeEnergyCudaDataFromContextMap( contextImpl );
-        freeEnergyGpuShutDown( freeEnergyGpu );
-    }
-}
-
-void FreeEnergyCudaData::incrementKernelCount( void ) {
-    kernelCount++;
-}
-
-freeEnergyGpuContext FreeEnergyCudaData::getFreeEnergyGpu( void ) const {
-    return freeEnergyGpu;
-}
-
-void FreeEnergyCudaData::setLog( FILE* inputLog ) {
-    log            = inputLog;
-    freeEnergyGpu->log = inputLog;
-}
-
-FILE* FreeEnergyCudaData::getLog( void ) const {
-    return log;
-}
-
-void FreeEnergyCudaData::setContextImpl( void* inputContextImpl ) {
-    contextImpl = inputContextImpl;
-}
-
-void FreeEnergyCudaData::initializeGpu( void ) {
-
-    if( !gpuInitialized ){
-
-        gpuContext gpu = freeEnergyGpu->gpuContext;
-
-        if( freeEnergyGpu->freeEnergySim.nonbondedCutoff != gpu->sim.nonbondedCutoff ){
-            std::stringstream msg;
-            msg << "The softcore non-bonded cutoff=" << freeEnergyGpu->freeEnergySim.nonbondedCutoff;
-            msg << "does not agree with the non-softcore cutoff= " << gpu->sim.nonbondedCutoff;
-            throw OpenMM::OpenMMException( msg.str() );
-        }
-/*
-        freeEnergyGpuBuildOutputBuffers( freeEnergyGpu, getHasFreeEnergyGeneralizedKirkwood() );
-        freeEnergyGpuBuildThreadBlockWorkList( freeEnergyGpu );
-
-        boxDimensions[0] = freeEnergyGpu->gpuContext->sim.periodicBoxSizeX;
-        boxDimensions[1] = freeEnergyGpu->gpuContext->sim.periodicBoxSizeY;
-        boxDimensions[2] = freeEnergyGpu->gpuContext->sim.periodicBoxSizeZ;
-*/
-        gpuBuildExclusionList( gpu );
-        gpuSetConstants( gpu );
-        freeEnergyGpuSetConstants( freeEnergyGpu );
-        gpuInitialized   = true;
-
-        if( log ){
-            //gpuPrintCudaFreeEnergyGmxSimulation( freeEnergyGpu, getLog() );
-            (void) fprintf( log, "FreeEnergyCudaGpu initialized kernelCount=%d\n", kernelCount );
-            (void) fflush( log );
-        }
-
-    } else {
-/*
-        if( boxDimensions[0] != freeEnergyGpu->gpuContext->sim.periodicBoxSizeX ||
-            boxDimensions[1] != freeEnergyGpu->gpuContext->sim.periodicBoxSizeY ||
-            boxDimensions[2] != freeEnergyGpu->gpuContext->sim.periodicBoxSizeZ ){
-            freeEnergyGpuSetConstants( freeEnergyGpu, 1 );
-            
-            boxDimensions[0] = freeEnergyGpu->gpuContext->sim.periodicBoxSizeX;
-            boxDimensions[1] = freeEnergyGpu->gpuContext->sim.periodicBoxSizeY;
-            boxDimensions[2] = freeEnergyGpu->gpuContext->sim.periodicBoxSizeZ;
-        }
-*/
-
-    }
-
-    return;
-}
-
-}
-
--- a/plugins/freeEnergy/platforms/cuda/src/FreeEnergyCudaData.h
+++ b/plugins/freeEnergy/platforms/cuda/src/FreeEnergyCudaData.h
-#ifndef FREE_ENERGY_CUDA_DATA_H_
-#define FREE_ENERGY_CUDA_DATA_H_
-
-/* -------------------------------------------------------------------------- *
- *                              OpenMMFreeEnergy                                  *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2008 Stanford University and the Authors.           *
- * Authors:                                                                   *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-
-#include "CudaPlatform.h"
-#include "kernels/freeEnergyGpuTypes.h"
-#include "kernels/cudaKernels.h"
-#include "openmm/KernelImpl.h"
-
-namespace OpenMM {
-
-/**
- * Free energy Cuda data
- */
-class FreeEnergyCudaData {
-
-public:
-
-    FreeEnergyCudaData( CudaPlatform::PlatformData& data );
-    ~FreeEnergyCudaData();
-
-    /**
-     * Increment kernel count
-     * 
-     */
-    void incrementKernelCount( void );
-
-    /**
-     * Decrement kernel count
-     * 
-     */
-    void decrementKernelCount( void );
-
-    /**
-     * Return freeEnergyGpuContext context
-     * 
-     * @return freeEnergyGpuContext
-     */
-    freeEnergyGpuContext OPENMMCUDA_EXPORT getFreeEnergyGpu( void ) const;
-
-    /**
-     * Set log file reference
-     * 
-     * @param log file reference; if not set, then no logging
-     */
-    void setLog( FILE* inputLog );
-
-    /**
-     * Get log file reference
-     * 
-     * @return log file reference
-     */
-    FILE* getLog( void ) const;
-
-    /**
-     * if gpuInitialized is false, write data to board
-     * 
-     * @param log file reference; if not set, then no logging
-     */
-    void initializeGpu( void ); 
-
-    /**
-     * Set contextImpl
-     * 
-     * @param contextImpl reference
-     */
-    void setContextImpl( void* contextImpl ); 
-
-    CudaPlatform::PlatformData& cudaPlatformData;
-
-private:
-
-    freeEnergyGpuContext freeEnergyGpu;
-    unsigned int kernelCount;
-    void* contextImpl;
-    FILE* log;
-    bool gpuInitialized;
-    double boxDimensions[3];
-};
-
-} // namespace OpenMM
-
-#endif /*FREE_ENERGY_CUDA_DATA_H_*/
--- a/plugins/freeEnergy/platforms/cuda/src/kernels/GpuFreeEnergyCudaKernels.h
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/GpuFreeEnergyCudaKernels.h
-#ifndef __GPU_FREE_ENERGY_KERNELS_H__
-#define __GPU_FREE_ENERGY_KERNELS_H__
-
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-
-#include "gputypes.h"
-#include "freeEnergyGpuTypes.h"
-#include "cudatypes.h"
-#include <vector>
-#include <cuda.h>
-
-// Function prototypes
-
-// CDLJ softcore
-
-// setup methods called from CudaFreeEnergyKernels
-// nonbonded and 1-4 ixns
-
-extern "C" bool gpuIsAvailableSoftcore();
-
-extern "C" void freeEnergyGpuSetPeriodicBoxSize( freeEnergyGpuContext gpu, float xsize, float ysize, float zsize);
-
-extern "C" void gpuSetNonbondedSoftcoreParameters( freeEnergyGpuContext gpu, float epsfac, const std::vector<int>& atom, const std::vector<float>& c6,
-                                                   const std::vector<float>& c12, const std::vector<float>& q,
-                                                   const std::vector<float>& softcoreLJLambdaArray, const std::vector<char>& symbol,
-                                                   const std::vector<std::vector<int> >& exclusions, CudaFreeEnergyNonbondedMethod method,
-                                                   float cutoffDistance, float reactionFieldDielectric);
-
-extern "C" void gpuSetLJ14SoftcoreParameters( freeEnergyGpuContext gpu, float epsfac, const std::vector<int>& atom1,
-                                              const std::vector<int>& atom2, const std::vector<float>& c6, const std::vector<float>& c12,
-                                              const std::vector<float>& qProd, const std::vector<float>& softcoreLJLambdaArray);
-
-// write address's to device
-
-extern "C" void SetCalculateCDLJSoftcoreGpuSim( freeEnergyGpuContext gpu );
-
-extern "C" void SetCalculateLocalSoftcoreGpuSim( freeEnergyGpuContext gpu );
-
-// kernel calls to device
-
-extern "C" void kCalculateCDLJSoftcoreForces( freeEnergyGpuContext gpu );
-
-extern "C" void kCalculateLocalSoftcoreForces( freeEnergyGpuContext gpu );
-
-// GB/VI softcore
-
-// setup method called from CudaFreeEnergyKernels
-
-extern "C" void gpuSetGBVISoftcoreParameters( freeEnergyGpuContext gpu, float innerDielectric, float solventDielectric, const std::vector<int>& atom, const std::vector<float>& radius, 
-                                              const std::vector<float>& gamma, const std::vector<float>& scaledRadii,
-                                              const std::vector<float>& bornRadiusScaleFactors, const std::vector<float>& quinticSplineParameters);
-
-// write address's to device
-
-extern "C" void SetCalculateGBVISoftcoreForcesSim( gpuContext gpu, float* softCoreLJLambda);
-extern "C" void SetCalculateGBVISoftcoreBornSumGpuSim( freeEnergyGpuContext gpu );
-extern "C" void SetCalculateGBVISoftcoreForces2Sim( freeEnergyGpuContext gpu);
-
-// kernel calls to device
-
-extern void kReduceGBVIBornSumQuinticScaling( freeEnergyGpuContext gpu );
-extern void kCalculateGBVISoftcoreBornSum( freeEnergyGpuContext gpu );
-extern void kReduceGBVIBornForcesQuinticScaling( freeEnergyGpuContext gpu );
-extern void kCalculateGBVISoftcoreForces2( freeEnergyGpuContext gpu );
-extern void kReduceGBVISoftcoreBornForces( freeEnergyGpuContext gpu);
-extern void kReduceGBVISoftcoreBornSum( freeEnergyGpuContext gpu);
-extern void kPrintGBVISoftcore( freeEnergyGpuContext gpu, std::string callId, int call, FILE* log);
-
-extern void kClearSoftcoreBornForces(gpuContext gpu);
-
-// Obc softcore
-
-// setup method called from CudaFreeEnergyKernels
-
-/** 
- * Initialize parameters for Cuda Obc softcore
- * 
- * @param gpu                  gpu context
- * @param innerDielectric      solute dielectric
- * @param solventDielectric    solvent dielectric
- * @param radius               intrinsic Born radii
- * @param scale                Obc scaling factors
- * @param charge               atomic charges (possibly overwritten by other methods?)
- * @param nonPolarScalingFactors non-polar scaling factors
- *
- */
-
-extern "C" void gpuSetObcSoftcoreParameters( freeEnergyGpuContext gpu, float innerDielectric, float solventDielectric, float nonPolarPrefactor,
-                                             const std::vector<float>& radius, const std::vector<float>& scale,
-                                             const std::vector<float>& charge, const std::vector<float>& nonPolarScalingFactors);
-
-// write address's to device
-
-extern "C" void SetCalculateObcGbsaSoftcoreBornSumSim( freeEnergyGpuContext gpu );
-
-// this method and kCalculateObcGbsaSoftcoreForces2() are being
-// used until changes in OpenMM version are made 
-extern "C" void SetCalculateObcGbsaSoftcoreForces2Sim( freeEnergyGpuContext gpu );
-
-// kernel calls to device
-
-extern void kClearObcGbsaSoftcoreBornSum( gpuContext gpu );
-extern void kReduceObcGbsaSoftcoreBornForces( gpuContext gpu );
-extern void kCalculateObcGbsaSoftcoreBornSum( freeEnergyGpuContext gpu );
-extern void kReduceObcGbsaSoftcoreBornSum( gpuContext gpu );
-
-// this method is not needed; the OpenMM version can be used
-extern void kCalculateObcGbsaSoftcoreForces2( freeEnergyGpuContext gpu );
-extern void kPrintObcGbsaSoftcore( freeEnergyGpuContext gpu, std::string callId, int call, FILE* log);
-
-// shared
-
-extern "C" void SetCalculateCDLJObcGbsaSoftcoreGpu1Sim( freeEnergyGpuContext gpu );
-extern void kCalculateCDLJObcGbsaSoftcoreForces1( freeEnergyGpuContext gpu );
-
-extern "C" void showWorkUnitsFreeEnergy( freeEnergyGpuContext freeEnergyGpu, int interactingWorkUnit );
-
-#endif //__GPU_FREE_ENERGY_KERNELS_H__
--- a/plugins/freeEnergy/platforms/cuda/src/kernels/freeEnergyCudaGpu.cpp
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/freeEnergyCudaGpu.cpp
-/* -------------------------------------------------------------------------- *
- *                               OpenMMFreeEnergy                             *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman, Mark Friedrichs                    *
- * Contributors:                                                              *
- *                                                                            *
- * Permission is hereby granted, free of charge, to any person obtaining a    *
- * copy of this software and associated documentation files (the "Software"), *
- * to deal in the Software without restriction, including without limitation  *
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
- * and/or sell copies of the Software, and to permit persons to whom the      *
- * Software is furnished to do so, subject to the following conditions:       *
- *                                                                            *
- * The above copyright notice and this permission notice shall be included in *
- * all copies or substantial portions of the Software.                        *
- *                                                                            *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
- * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
- * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
- * -------------------------------------------------------------------------- */
-
-#ifdef WIN32
-  #define _USE_MATH_DEFINES /* M_PI */
-#endif
-
-#define PARAMETER_PRINT 1
-#define MAX_PARAMETER_PRINT 10
-
-#include "openmm/OpenMMException.h"
-#include "cudaKernels.h"
-#include "GpuFreeEnergyCudaKernels.h"
-#include "freeEnergyGpuTypes.h"
-
-// for some reason, these are not being included w/ cudaKernels.h on Windows
-//extern void OPENMMCUDA_EXPORT SetCalculateObcGbsaForces2Sim(gpuContext gpu);
-extern void OPENMMCUDA_EXPORT SetForcesSim(gpuContext gpu);
-
-#include <cmath>
-#include <iostream>
-#include <sstream>
-#include <limits>
-#include <cstring>
-#include <vector>
-#include <stdio.h>
-
-#ifdef WIN32
-#include <windows.h>
-#else
-#include <sys/time.h>
-#endif
-
-using std::vector;
-
-extern "C"
-freeEnergyGpuContext freeEnergyGpuInit( _gpuContext* gpu ){
-
-    // allocate and zero block
-
-    freeEnergyGpuContext freeEnergyGpu             = new _freeEnergyGpuContext;
-    memset( freeEnergyGpu, 0, sizeof( struct _freeEnergyGpuContext ) );
-
-    freeEnergyGpu->gpuContext                      = gpu; 
-
-    return freeEnergyGpu;
-}
-
-extern "C"
-void gpuPrintCudaStream( std::string name,
-                         unsigned int length, unsigned int subStreams, unsigned int stride,
-                         unsigned int memoryFootprint,
-                         void*  pSysStream, void* pDevStream,
-                         void*  pSysData,   void* pDevData, FILE* log)
-{
-   
-    (void) fprintf( log, "     %-35s [%8u %5u %8u %8u] Stream[%p %p] Data[%16p %16p]\n",
-                    name.c_str(), length, subStreams,
-                    stride, memoryFootprint, pSysStream, pDevStream, pSysData, pDevData );
-}
-
-extern "C"
-int gpuPrintCudaStreamFloat( CUDAStream<float>* cUDAStream, FILE* log )
-{
-   
-    if( cUDAStream == NULL )return 0;
-    gpuPrintCudaStream( cUDAStream->_name.c_str(),
-                        cUDAStream->_length, cUDAStream->_subStreams, cUDAStream->_stride,
-                        cUDAStream->_length*cUDAStream->_subStreams*sizeof( float ),
-                        static_cast<void*>(cUDAStream->_pSysStream), static_cast<void*>(cUDAStream->_pDevStream),
-                        static_cast<void*>(cUDAStream->_pSysData), static_cast<void*>(cUDAStream->_pDevData), log );
-    return cUDAStream->_length*cUDAStream->_subStreams*sizeof( float );
-}
-
-extern "C"
-int gpuPrintCudaStreamFloat2( CUDAStream<float2>* cUDAStream, FILE* log )
-{
-   
-    if( cUDAStream == NULL )return 0;
-    gpuPrintCudaStream( cUDAStream->_name.c_str(),
-                        cUDAStream->_length, cUDAStream->_subStreams, cUDAStream->_stride,
-                        cUDAStream->_length*cUDAStream->_subStreams*sizeof( float2 ),
-                        static_cast<void*>(cUDAStream->_pSysStream), static_cast<void*>(cUDAStream->_pDevStream),
-                        static_cast<void*>(cUDAStream->_pSysData), static_cast<void*>(cUDAStream->_pDevData), log );
-    return cUDAStream->_length*cUDAStream->_subStreams*2*sizeof( float );
-}
-
-extern "C"
-int gpuPrintCudaStreamFloat4( CUDAStream<float4>* cUDAStream, FILE* log )
-{
-   
-    if( cUDAStream == NULL )return 0;
-    gpuPrintCudaStream( cUDAStream->_name.c_str(),
-                        cUDAStream->_length, cUDAStream->_subStreams, cUDAStream->_stride,
-                        cUDAStream->_length*cUDAStream->_subStreams*sizeof( float4 ),
-                        static_cast<void*>(cUDAStream->_pSysStream), static_cast<void*>(cUDAStream->_pDevStream),
-                        static_cast<void*>(cUDAStream->_pSysData), static_cast<void*>(cUDAStream->_pDevData), log );
-    return cUDAStream->_length*cUDAStream->_subStreams*4*sizeof( float );
-}
-
-extern "C"
-int gpuPrintCudaStreamUnsignedInt( CUDAStream<unsigned int>* cUDAStream, FILE* log )
-{
-   
-    if( cUDAStream == NULL )return 0;
-    gpuPrintCudaStream( cUDAStream->_name.c_str(),
-                        cUDAStream->_length, cUDAStream->_subStreams, cUDAStream->_stride,
-                        cUDAStream->_length*cUDAStream->_subStreams*sizeof( unsigned int ),
-                        static_cast<void*>(cUDAStream->_pSysStream), static_cast<void*>(cUDAStream->_pDevStream),
-                        static_cast<void*>(cUDAStream->_pSysData), static_cast<void*>(cUDAStream->_pDevData), log );
-    return cUDAStream->_length*cUDAStream->_subStreams*sizeof( unsigned int );
-}
-
-extern "C"
-int gpuPrintCudaStreamInt( CUDAStream<int>* cUDAStream, FILE* log )
-{
-   
-    if( cUDAStream == NULL )return 0;
-    gpuPrintCudaStream( cUDAStream->_name.c_str(),
-                        cUDAStream->_length, cUDAStream->_subStreams, cUDAStream->_stride,
-                        cUDAStream->_length*cUDAStream->_subStreams*sizeof( int ),
-                        static_cast<void*>(cUDAStream->_pSysStream), static_cast<void*>(cUDAStream->_pDevStream),
-                        static_cast<void*>(cUDAStream->_pSysData), static_cast<void*>(cUDAStream->_pDevData), log );
-    return cUDAStream->_length*cUDAStream->_subStreams*sizeof( int );
-}
-
-extern "C"
-int gpuPrintCudaStreamInt2( CUDAStream<int2>* cUDAStream, FILE* log )
-{
-   
-    if( cUDAStream == NULL )return 0;
-    gpuPrintCudaStream( cUDAStream->_name.c_str(),
-                        cUDAStream->_length, cUDAStream->_subStreams, cUDAStream->_stride,
-                        cUDAStream->_length*cUDAStream->_subStreams*sizeof( int2 ),
-                        static_cast<void*>(cUDAStream->_pSysStream), static_cast<void*>(cUDAStream->_pDevStream),
-                        static_cast<void*>(cUDAStream->_pSysData), static_cast<void*>(cUDAStream->_pDevData), log );
-    return cUDAStream->_length*cUDAStream->_subStreams*2*sizeof( int );
-}
-
-extern "C"
-int gpuPrintCudaStreamInt4( CUDAStream<int4>* cUDAStream, FILE* log )
-{
-   
-    if( cUDAStream == NULL )return 0;
-    gpuPrintCudaStream( cUDAStream->_name.c_str(),
-                        cUDAStream->_length, cUDAStream->_subStreams, cUDAStream->_stride,
-                        cUDAStream->_length*cUDAStream->_subStreams*sizeof( int4 ),
-                        static_cast<void*>(cUDAStream->_pSysStream), static_cast<void*>(cUDAStream->_pDevStream),
-                        static_cast<void*>(cUDAStream->_pSysData), static_cast<void*>(cUDAStream->_pDevData), log );
-    return cUDAStream->_length*cUDAStream->_subStreams*4*sizeof( int );
-}
-
-extern "C"
-void gpuPrintCudaFreeEnergyGmxSimulation(freeEnergyGpuContext freeEnergyGpu, FILE* log )
-{
-
-    if( log == NULL )return;
-
-    _gpuContext* gpu                            = freeEnergyGpu->gpuContext;
-    int totalMemory                             = 0;
-
-    (void) fprintf( log, "cudaFreeEnergyGmxSimulation:\n\n" );
-
-    (void) fprintf( log, "\n" );
-    (void) fprintf( log, "     numberOfAtoms                      %u\n",      gpu->natoms );
-    (void) fprintf( log, "     paddedNumberOfAtoms                %u\n",      gpu->sim.paddedNumberOfAtoms );
-
-
-    (void) fprintf( log, "\n\n" );
-    (void) fprintf( log, "     gpuContext                         %p\n",      freeEnergyGpu->gpuContext );
-    (void) fprintf( log, "     log                                %p %s\n",   freeEnergyGpu->log, freeEnergyGpu->log == stderr ? "is stderr" : "is not stderr");
-    (void) fprintf( log, "     sm_version                         %u\n",      gpu->sm_version );
-    (void) fprintf( log, "     device                             %u\n",      gpu->device );
-    (void) fprintf( log, "     sharedMemoryPerBlock               %u\n",      gpu->sharedMemoryPerBlock );
-    (void) fprintf( log, "     bOutputBufferPerWarp               %d\n",      gpu->bOutputBufferPerWarp );
-    (void) fprintf( log, "     blocks                             %u\n",      gpu->sim.blocks );
-    (void) fprintf( log, "     threads_per_block                  %u\n",      gpu->sim.threads_per_block);
-    (void) fprintf( log, "     update_threads_per_block           %u\n",      gpu->sim.update_threads_per_block);
-    (void) fprintf( log, "     nonbondBlocks                      %u\n",      gpu->sim.nonbond_blocks );
-    (void) fprintf( log, "     nonbondThreadsPerBlock             %u\n",      gpu->sim.nonbond_threads_per_block);
-    (void) fprintf( log, "     bsf_reduce_threads_per_block       %u\n",      gpu->sim.bsf_reduce_threads_per_block);
-    (void) fprintf( log, "     nonbondOutputBuffers               %u\n",      gpu->sim.nonbondOutputBuffers );
-    (void) fprintf( log, "     outputBuffers                      %u\n",      gpu->sim.outputBuffers );
-
-    totalMemory += gpuPrintCudaStreamFloat(  freeEnergyGpu->gpuContext->psEnergy,    log );
-    totalMemory += gpuPrintCudaStreamFloat4( freeEnergyGpu->gpuContext->psForce4,    log );
-
-    (void) fflush( log );
-}
-
-extern "C"
-void freeEnergyGpuShutDown( freeEnergyGpuContext freeEnergyGpu ){
-
-    if( freeEnergyGpu->log ){
-        (void) fprintf( freeEnergyGpu->log, "freeEnergyGpuShutDown called.\n" );
-        (void) fflush( freeEnergyGpu->log );
-    }
-
-    // free free energy Cuda arrays
-   
-    delete freeEnergyGpu->psLJ14ID;
-    delete freeEnergyGpu->psLJ14Parameter;
-
-    delete freeEnergyGpu->psSigEps4;
-    delete freeEnergyGpu->psSwitchDerivative;
-    delete freeEnergyGpu->psNonPolarScalingFactors;
-
-    delete freeEnergyGpu;
-
-    return;
-}
-
-extern "C"
-void freeEnergyGpuSetConstants( freeEnergyGpuContext freeEnergyGpu ){
-
-    if( freeEnergyGpu->log ){
-        (void) fprintf( freeEnergyGpu->log, "FreeEnergyGpuSetConstants called\n" );
-        (void) fflush( freeEnergyGpu->log );
-    }
-
-    SetCalculateLocalSoftcoreGpuSim( freeEnergyGpu );
-    SetCalculateCDLJSoftcoreGpuSim( freeEnergyGpu );
-    SetCalculateGBVISoftcoreBornSumGpuSim( freeEnergyGpu );
-    SetCalculateCDLJObcGbsaSoftcoreGpu1Sim( freeEnergyGpu );
-    SetCalculateGBVISoftcoreForces2Sim( freeEnergyGpu );
-    SetCalculateObcGbsaSoftcoreBornSumSim( freeEnergyGpu );
-    SetCalculateObcGbsaSoftcoreForces2Sim( freeEnergyGpu );
-
-}
-
-extern "C"
-void freeEnergyGpuSetPeriodicBoxSize( freeEnergyGpuContext freeEnergyGpu, float xsize, float ysize, float zsize)
-{
-    freeEnergyGpu->freeEnergySim.periodicBoxSizeX    = xsize;
-    freeEnergyGpu->freeEnergySim.periodicBoxSizeY    = ysize;
-    freeEnergyGpu->freeEnergySim.periodicBoxSizeZ    = zsize;
-
-    freeEnergyGpu->freeEnergySim.invPeriodicBoxSizeX = 1.0f/xsize;
-    freeEnergyGpu->freeEnergySim.invPeriodicBoxSizeY = 1.0f/ysize;
-    freeEnergyGpu->freeEnergySim.invPeriodicBoxSizeZ = 1.0f/zsize;
-
-    freeEnergyGpu->freeEnergySim.recipBoxSizeX       = 2.0f*PI/freeEnergyGpu->freeEnergySim.periodicBoxSizeX;
-    freeEnergyGpu->freeEnergySim.recipBoxSizeY       = 2.0f*PI/freeEnergyGpu->freeEnergySim.periodicBoxSizeY;
-    freeEnergyGpu->freeEnergySim.recipBoxSizeZ       = 2.0f*PI/freeEnergyGpu->freeEnergySim.periodicBoxSizeZ;
-
-    freeEnergyGpu->freeEnergySim.cellVolume          = freeEnergyGpu->freeEnergySim.periodicBoxSizeX*freeEnergyGpu->freeEnergySim.periodicBoxSizeY*freeEnergyGpu->freeEnergySim.periodicBoxSizeZ;
-
-    gpuSetPeriodicBoxSize( freeEnergyGpu->gpuContext, xsize, ysize, zsize );
-}
-
-#if defined(_MSC_VER)
-#pragma warning(push)
-#pragma warning(disable: 4297)
-#endif
-
-extern "C"
-void gpuSetNonbondedSoftcoreParameters( freeEnergyGpuContext freeEnergyGpu, float epsfac, const std::vector<int>& atom, const std::vector<float>& c6,
-                                        const std::vector<float>& c12, const std::vector<float>& q,
-                                        const std::vector<float>& softcoreLJLambdaArray, const std::vector<char>& symbol,
-                                        const std::vector<std::vector<int> >& exclusions, CudaFreeEnergyNonbondedMethod method,
-                                        float cutoffDistance, float solventDielectric ) {
-
-    unsigned int numberOfParticles                         = c6.size();
-    gpuContext gpu                                         = freeEnergyGpu->gpuContext;
-    int paddedNumberOfAtoms                                = gpu->sim.paddedNumberOfAtoms;
-
-    // sanity checks
-
-    if( paddedNumberOfAtoms < 1 ){
-        std::stringstream msg;
-        msg << "gpuSetNonbondedSoftcoreParameters: number of padded atoms=" <<  gpu->sim.paddedNumberOfAtoms << " is less than 1.";
-        throw OpenMM::OpenMMException( msg.str() );
-    }
-
-    if( freeEnergyGpu->gpuContext->sim.atoms != numberOfParticles  ){
-        std::stringstream msg;
-        msg << "gpuSetNonbondedSoftcoreParameters: number of atoms in gpuContext does not match input count: " << freeEnergyGpu->gpuContext->sim.atoms << " " << numberOfParticles << ".";
-        throw OpenMM::OpenMMException( msg.str() );
-    }
-
-#if defined(_MSC_VER)
-#pragma warning(pop)
-#endif
-
-    freeEnergyGpu->freeEnergySim.epsfac                    = epsfac;
-    freeEnergyGpu->freeEnergySim.nonbondedMethod           = method;
-
-    freeEnergyGpu->freeEnergySim.nonbondedCutoff           = cutoffDistance;
-    freeEnergyGpu->freeEnergySim.nonbondedCutoffSqr        = cutoffDistance*cutoffDistance;
-
-    gpu->sim.nonbondedCutoff                               = cutoffDistance;
-    gpu->sim.nonbondedCutoffSqr                            = cutoffDistance*cutoffDistance;
-
-    if( cutoffDistance > 0.0f ){
-        freeEnergyGpu->freeEnergySim.reactionFieldK        = pow(cutoffDistance, -3.0f)*(solventDielectric-1.0f)/(2.0f*solventDielectric+1.0f);
-        freeEnergyGpu->freeEnergySim.reactionFieldC        = (1.0f / cutoffDistance)*(3.0f*solventDielectric)/(2.0f*solventDielectric+1.0f);
-        gpu->sim.reactionFieldK                            = freeEnergyGpu->freeEnergySim.reactionFieldK;
-        gpu->sim.reactionFieldC                            = freeEnergyGpu->freeEnergySim.reactionFieldC;
-    } else {
-        freeEnergyGpu->freeEnergySim.reactionFieldK        = 0.0f;
-        freeEnergyGpu->freeEnergySim.reactionFieldC        = 0.0f;
-    }
-
-    setExclusions( gpu, exclusions );
- 
-    // parameters
- 
-    freeEnergyGpu->psSigEps4                               = new CUDAStream<float4>( paddedNumberOfAtoms, 1, "freeEnergyGpuSigEps4");
-    freeEnergyGpu->freeEnergySim.pSigEps4                  = freeEnergyGpu->psSigEps4->_pDevData;
-
-    for( unsigned int ii = 0; ii < numberOfParticles; ii++ ){
-
-        float p1 = 0.5f;
-        float p2 = 0.0f;               
-
-        if( (c6[ii] > 0.0f) && (c12[ii] > 0.0f) ){
-            p1 = 0.5f * powf(c12[ii] / c6[ii], 1.0f / 6.0f);
-            p2 = c6[ii] * sqrtf(1.0f / c12[ii]);
-        }
-/*
-            if (symbol.size() > 0)
-                freeEnergyGpu->pAtomSymbol[ii] = symbol[ii];
-*/
-
-        (*freeEnergyGpu->psSigEps4)[ii].x        = p1;
-        (*freeEnergyGpu->psSigEps4)[ii].y        = p2;
-        (*freeEnergyGpu->psSigEps4)[ii].z        = softcoreLJLambdaArray[ii];
-        (*freeEnergyGpu->psSigEps4)[ii].w        = q[ii];
-    }
-
-    // Dummy out extra atom data
-
-    for( int ii = numberOfParticles; ii < paddedNumberOfAtoms; ii++ ){
-
-        (*freeEnergyGpu->psSigEps4)[ii].x              = 1.0f;
-        (*freeEnergyGpu->psSigEps4)[ii].y              = 0.0f;
-        (*freeEnergyGpu->psSigEps4)[ii].z              = 0.0f;
-        (*freeEnergyGpu->psSigEps4)[ii].w              = 0.0f;
-
-        (*gpu->psPosq4)[ii].x                          = 100000.0f + ii * 10.0f;
-        (*gpu->psPosq4)[ii].y                          = 100000.0f + ii * 10.0f;
-        (*gpu->psPosq4)[ii].z                          = 100000.0f + ii * 10.0f;
-        (*gpu->psPosq4)[ii].w                          = 0.0f;
-
-    }
-
-    if( freeEnergyGpu->log ){
-        (void) fprintf( freeEnergyGpu->log,"freeEnergyGpuSetNonbondedSoftcoreParameters: %5u padded=%u epsfac=%14.7e method=%d cutoffDistance=%9.2f solventDielectric=%9.2f\n",
-                        numberOfParticles, freeEnergyGpu->gpuContext->sim.paddedNumberOfAtoms, epsfac, method, cutoffDistance, solventDielectric );
-#ifdef PARAMETER_PRINT
-        int maxPrint = MAX_PARAMETER_PRINT;
-        for (unsigned int ii = 0; ii < numberOfParticles; ii++){
-            (void) fprintf( freeEnergyGpu->log,"%6u sig[%14.7e %14.7e] lambda=%10.3f q=%10.3f\n",
-                            ii, 
-                            (*freeEnergyGpu->psSigEps4)[ii].x, (*freeEnergyGpu->psSigEps4)[ii].y, (*freeEnergyGpu->psSigEps4)[ii].z, (*freeEnergyGpu->psSigEps4)[ii].w );
-            if( ii == maxPrint && ii < freeEnergyGpu->gpuContext->sim.paddedNumberOfAtoms - maxPrint ){
-               ii = numberOfParticles - maxPrint;
-            }
-        }
-        unsigned int offset = paddedNumberOfAtoms - maxPrint;
-        if( offset > 0 ){
-            if( offset > numberOfParticles ){
-                (void) fprintf( freeEnergyGpu->log,"Dummy padded entries\n" );
-                for (int ii = offset; ii < paddedNumberOfAtoms; ii++){
-                    (void) fprintf( freeEnergyGpu->log,"%6u sig[%14.7e %14.7e] lambda=%10.3f q=%10.3f\n",
-                                    ii, 
-                                    (*freeEnergyGpu->psSigEps4)[ii].x, (*freeEnergyGpu->psSigEps4)[ii].y, (*freeEnergyGpu->psSigEps4)[ii].z, (*freeEnergyGpu->psSigEps4)[ii].w );
-                }
-            }
-        }
-#endif
-        (void) fflush( freeEnergyGpu->log );
-    }
- 
-    // upload data to board
-
-    freeEnergyGpu->psSigEps4->Upload();
-    gpu->psPosq4->Upload();
-
-    return;
-}
-
-/**---------------------------------------------------------------------------------------
-
-   Get threads/block
-
-   @param amoebaGpu              amoebaGpuContext
-   @param sharedMemoryPerThread  shared memory/thread
-   @param sharedMemoryPerBlock   shared memory/block
-
-   @return threadsPerBlock
-
-   --------------------------------------------------------------------------------------- */
-
-extern "C"
-unsigned int getThreadsPerBlockFEP( freeEnergyGpuContext freeEnergyGpu, unsigned int sharedMemoryPerThread, unsigned int sharedMemoryPerBlock )
-{
-    unsigned int grid               = freeEnergyGpu->gpuContext->grid;
-    unsigned int threadsPerBlock    = (sharedMemoryPerBlock + grid -1)/(grid*sharedMemoryPerThread);
-    threadsPerBlock                 = threadsPerBlock < 1 ? 1 : threadsPerBlock;
-    threadsPerBlock                *= grid;
-
-   return threadsPerBlock;
-}
-
-static int decodeCell( int cellCode, unsigned int* x, unsigned int* y, unsigned int* exclusion ){
-    *x         =  cellCode >> 17; 
-    *y         = (cellCode >> 2 ) & 0x7FFF;
-    *exclusion = (cellCode & 1) ? 1 : 0;  
-   return 0;
-}
-
-void showWorkUnitsFreeEnergy( freeEnergyGpuContext freeEnergyGpu, int interactingWorkUnit ){
-
-    gpuContext gpu = freeEnergyGpu->gpuContext;
-
-    gpu->psWorkUnit->Download();
-    gpu->psInteractingWorkUnit->Download();
-    gpu->psInteractionFlag->Download();
-
-    unsigned int totalWarps      = (gpu->sim.nonbond_blocks*gpu->sim.nonbond_threads_per_block)/GRID;
-    //unsigned int warp            = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
-    //unsigned int numWorkUnits    = cSim.pInteractionCount[0];
-    unsigned int numWorkUnits    = gpu->psInteractionCount->_pSysData[0];
-
-    (void) fprintf( stderr, "Total warps=%u blocks=%u threads=%u GRID=%u wus=%u warpOutput=%d\n",
-                    totalWarps, gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, GRID, 
-                    numWorkUnits, gpu->bOutputBufferPerWarp );
-
-    unsigned int maxPrint = 3;
-    std::stringstream message;
-    char buffer[2048];
-    unsigned int targetAtom = 18;
-    for( unsigned int ii = 0; ii < gpu->sim.nonbond_blocks; ii++ )
-    {
-        unsigned int blockId = ii;
-        for( unsigned int jj = 0; jj < gpu->sim.nonbond_threads_per_block; jj++ )
-        {
-            unsigned int warp    = (ii*gpu->sim.nonbond_threads_per_block+jj)/GRID;
-            unsigned int pos     = warp*numWorkUnits/totalWarps;
-            unsigned int end     = (warp+1)*numWorkUnits/totalWarps;
-            unsigned int print   = 0;
-            while( pos < end ){
-                unsigned int x, y, exclusion, flags;
-                int flagInt;
-                if( interactingWorkUnit ){
-                    decodeCell( gpu->psInteractingWorkUnit->_pSysData[pos], &x, &y, &exclusion );
-                    flags = gpu->psInteractionFlag->_pSysData[pos];
-                    if( flags == 0xFFFFFFFF ){
-                        flagInt = -2;
-                    } else {
-                        flagInt = flags;
-                    }
-                } else {
-                    decodeCell( gpu->psWorkUnit->_pSysData[pos], &x, &y, &exclusion );
-                    flagInt = -1;
-                }
-
-                x                   *= GRID;
-                y                   *= GRID;
-                unsigned int bufferX = gpu->bOutputBufferPerWarp ? warp*gpu->sim.stride : (x >> GRIDBITS) * gpu->sim.stride;
-                unsigned int bufferY = gpu->bOutputBufferPerWarp ? warp*gpu->sim.stride : (y >> GRIDBITS) * gpu->sim.stride;
-                bufferX             /= gpu->sim.paddedNumberOfAtoms;
-                bufferY             /= gpu->sim.paddedNumberOfAtoms;
-                if( jj == 1 ){
-                //if( bufferX >= 62 || bufferY >= 62 ){
-                     (void) sprintf( buffer, "Block %4u thread %4u warp=%4u pos[%4u %4u] bufXY[%4u %4u]", ii, jj, warp, pos, end, bufferX, bufferY );
-                     message << buffer;
-                     (void) sprintf( buffer, "    x[%4u %4u] y[%4u %4u] excl=%u", x, x+32, y, y+32, exclusion );
-                     message << buffer;
-                     if( interactingWorkUnit ){
-                         (void) sprintf( buffer, " Flg=%d (-2=all) %u", flagInt, flags );
-                     }
-                     message << buffer;
-                     message << std::endl;
-      
-                }
-                pos++;
-            }
-        }
-    }
-    (void) fprintf( stderr, "%s\n\n", message.str().c_str() );
-
-}
--- a/plugins/freeEnergy/platforms/cuda/src/kernels/freeEnergyCudaTypes.h
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/freeEnergyCudaTypes.h
-#ifndef FREE_ENERGY_CUDA_TYPES_H
-#define FREE_ENERGY_CUDA_TYPES_H
-
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-
-#include <kernels/cudatypes.h>
-
-#include <stdarg.h>
-#include <limits>
-#include <iostream>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string>
-#include <cuda.h>
-#include <cuda_runtime_api.h>
-#include <cufft.h>
-#include <builtin_types.h>
-#include <vector_functions.h>
-
-enum CudaFreeEnergyNonbondedMethod
-{
-    FREE_ENERGY_NO_CUTOFF,
-    FREE_ENERGY_CUTOFF,
-    FREE_ENERGY_PERIODIC
-};
-
-struct cudaFreeEnergyGmxSimulation {
-
-    // Constants
-
-    unsigned int   LJ14_count;                      // LJ count
-    int4*          pLJ14ID;                         // LJ 14 particles ids
-    float4*        pLJ14Parameter;                  // LJ 14 parameters 
-
-    float           epsfac;                         // Epsilon factor for CDLJ calculations
-    CudaFreeEnergyNonbondedMethod nonbondedMethod;  // How to handle nonbonded interactions
-    float           nonbondedCutoff;                // Cutoff distance for nonbonded interactions
-    float           nonbondedCutoffSqr;             // Square of the cutoff distance for nonbonded interactions
-
-    float           periodicBoxSizeX;               // The X dimension of the periodic box
-    float           periodicBoxSizeY;               // The Y dimension of the periodic box
-    float           periodicBoxSizeZ;               // The Z dimension of the periodic box
-
-    float           invPeriodicBoxSizeX;            // The 1 over the X dimension of the periodic box
-    float           invPeriodicBoxSizeY;            // The 1 over the Y dimension of the periodic box
-    float           invPeriodicBoxSizeZ;            // The 1 over the Z dimension of the periodic box
-
-    float           recipBoxSizeX;                  // The X dimension of the reciprocal box for Ewald summation
-    float           recipBoxSizeY;                  // The Y dimension of the reciprocal box for Ewald summation
-    float           recipBoxSizeZ;                  // The Z dimension of the reciprocal box for Ewald summation
-
-    float           cellVolume;                     // Ewald parameter alpha (a.k.a. kappa)
-
-    float           reactionFieldK;                 // Constant for reaction field correction
-    float           reactionFieldC;                 // Constant for reaction field correction
-    float4*         pSigEps4;                       // sigma, eps, lambda. charge
-
-    int             bornRadiiScalingMethod;         // flag for method to use scaling radii (0=none,1=quintic spline)
-    float           quinticLowerLimitFactor;        // lower limit factor for quintic spline
-    float           quinticUpperLimit;              // upper limit for quintic spline
-    float*          pSwitchDerivative;              // switch deriviatives for quintic spline
-
-    float*          pNonPolarScalingFactors;        // non-polar scaling factors
-
-};
-
-#endif // FREE_ENERGY_CUDA_TYPES_H
--- a/plugins/freeEnergy/platforms/cuda/src/kernels/freeEnergyGpuTypes.h
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/freeEnergyGpuTypes.h
-#ifndef __FREE_ENERGY_GPUTYPES_H__
-#define __FREE_ENERGY_GPUTYPES_H__
-
-/* -------------------------------------------------------------------------- *
- *                          OpenMMFreeEnergy                                      *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-
-#include "kernels/gputypes.h"
-#include "freeEnergyCudaTypes.h"
-
-#include <map>
-typedef std::map<int,float> MapIntFloat;
-typedef MapIntFloat::const_iterator MapIntFloatCI;
-
-struct _freeEnergyGpuContext {
-    
-    _gpuContext* gpuContext;
-    cudaFreeEnergyGmxSimulation freeEnergySim;
-    std::vector<std::vector<int> > exclusions;
-    
-    CUDAStream<float4>* psSigEps4;
-    CUDAStream<int4>*   psLJ14ID;
-    CUDAStream<float4>* psLJ14Parameter;
-    CUDAStream<float>*  psSwitchDerivative;
-    CUDAStream<float>*  psNonPolarScalingFactors;
- 
-    FILE* log;
-};
-
-typedef struct _freeEnergyGpuContext *freeEnergyGpuContext;
-
-// Function prototypes
-
-extern "C" freeEnergyGpuContext freeEnergyGpuInit( _gpuContext* gpu );
-extern "C" void freeEnergyGpuShutDown(freeEnergyGpuContext gpu);
-extern "C" void freeEnergyGpuSetConstants(freeEnergyGpuContext gpu);
-extern "C" unsigned int getThreadsPerBlockFEP( freeEnergyGpuContext freeEnergyGpu, unsigned int sharedMemoryPerThread, unsigned int sharedMemoryPerBlock );
-
-#endif // __FREE_ENERGY_GPUTYPES_H__
--- a/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateCDLJObcGbsaSoftcoreForces1.cu
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateCDLJObcGbsaSoftcoreForces1.cu
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-
-#include "kernels/gputypes.h"
-#include "kernels/cudatypes.h"
-#include "kernels/cudaKernels.h"
-#include "GpuFreeEnergyCudaKernels.h"
-#include "openmm/OpenMMException.h"
-
-#include <stdio.h>
-#include <cuda.h>
-#include <vector_functions.h>
-#include <cstdlib>
-#include <iostream>
-#include <sstream>
-
-#define USE_SOFTCORE_LJ
-
-struct Atom {
-    float x;
-    float y;
-    float z;
-    float q;
-    float sig;
-    float eps;
-    float br;
-    float softCoreLJLambda;
-    float fx;
-    float fy;
-    float fz;
-    float fb;
-};
-
-static __constant__ cudaGmxSimulation cSim;
-static __constant__ cudaFreeEnergyGmxSimulation feSimDev;
-
-void SetCalculateCDLJObcGbsaSoftcoreGpu1Sim( freeEnergyGpuContext freeEnergyGpu ){
-
-    cudaError_t status;
-    status = cudaMemcpyToSymbol(cSim, &freeEnergyGpu->gpuContext->sim, sizeof(cudaGmxSimulation));
-    RTERROR(status, "cudaMemcpyToSymbol: SetCalculateCDLJObcGbsaSoftcoreGpu1Sim copy to cSim failed");
-
-    status = cudaMemcpyToSymbol( feSimDev, &freeEnergyGpu->freeEnergySim, sizeof(cudaFreeEnergyGmxSimulation));
-    RTERROR(status, "cudaMemcpyToSymbol: SetCalculateCDLJObcGbsaSoftcoreGpu1Sim copy to feSimDev failed");
-}
-
-// Include versions of the kernel for N^2 calculations with softcore LJ.
-
-#undef METHOD_NAME
-#define METHOD_NAME(a, b) a##N2##b
-#undef USE_OUTPUT_BUFFER_PER_WARP
-#define USE_SOFTCORE_LJ
-#include "kCalculateCDLJObcGbsaSoftcoreForces1.h"
-
-#define USE_OUTPUT_BUFFER_PER_WARP
-#undef METHOD_NAME
-#define METHOD_NAME(a, b) a##N2ByWarp##b
-#include "kCalculateCDLJObcGbsaSoftcoreForces1.h"
-#undef USE_SOFTCORE_LJ
-#undef USE_OUTPUT_BUFFER_PER_WARP
-
-// Include versions of the kernel with cutoffs.
-
-#undef METHOD_NAME
-#undef USE_OUTPUT_BUFFER_PER_WARP
-#define USE_CUTOFF
-#define METHOD_NAME(a, b) a##Cutoff##b
-#include "kCalculateCDLJObcGbsaSoftcoreForces1.h"
-#define USE_OUTPUT_BUFFER_PER_WARP
-#undef METHOD_NAME
-#define METHOD_NAME(a, b) a##CutoffByWarp##b
-#include "kCalculateCDLJObcGbsaSoftcoreForces1.h"
-
-// Include versions of the kernel with periodic boundary conditions.
-
-#undef METHOD_NAME
-#undef USE_OUTPUT_BUFFER_PER_WARP
-#define USE_PERIODIC
-#define METHOD_NAME(a, b) a##Periodic##b
-#include "kCalculateCDLJObcGbsaSoftcoreForces1.h"
-#define USE_OUTPUT_BUFFER_PER_WARP
-#undef METHOD_NAME
-#define METHOD_NAME(a, b) a##PeriodicByWarp##b
-#include "kCalculateCDLJObcGbsaSoftcoreForces1.h"
-
-
-/**
- * 
- * Calculate Born radii and first GBSA loop forces/energy
- *
- * @param gpu     gpu context
- *
- */
-void kCalculateCDLJObcGbsaSoftcoreForces1( freeEnergyGpuContext freeEnergyGpu )
-{
-    unsigned int threadsPerBlock;
-    static unsigned int threadsPerBlockPerMethod[3] = { 0, 0, 0 };
-    static unsigned int natoms[3]                   = { 0, 0, 0 };
-    gpuContext gpu                                  = freeEnergyGpu->gpuContext;
-    unsigned int methodIndex                        = static_cast<unsigned int>(freeEnergyGpu->freeEnergySim.nonbondedMethod);
-
-    if( methodIndex > 2 ){
-        throw OpenMM::OpenMMException( "kCalculateCDLJObcGbsaSoftcoreForces1 method index invalid." );
-    }   
-
-    if( natoms[methodIndex] != gpu->natoms ){
-        unsigned int extra                    = methodIndex == 0 ? 0 : sizeof(float);
-        threadsPerBlockPerMethod[methodIndex] = std::min(getThreadsPerBlockFEP( freeEnergyGpu, (sizeof(Atom) + extra), gpu->sharedMemoryPerBlock ), gpu->sim.nonbond_threads_per_block );
-        natoms[methodIndex]                   = gpu->natoms;
-    }   
-    threadsPerBlock = threadsPerBlockPerMethod[methodIndex];
-
-    switch( freeEnergyGpu->freeEnergySim.nonbondedMethod )
-    {
-        case FREE_ENERGY_NO_CUTOFF:
-
-            // use softcore LJ potential
-            if (gpu->bOutputBufferPerWarp)
-                   kCalculateCDLJObcGbsaSoftcoreN2ByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
-                           sizeof(Atom)*threadsPerBlock>>>(gpu->sim.pWorkUnit );
-            else
-                   kCalculateCDLJObcGbsaSoftcoreN2Forces1_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
-                           sizeof(Atom)*threadsPerBlock>>>(gpu->sim.pWorkUnit );
-   
-            LAUNCHERROR("kCalculateCDLJObcGbsaSoftcoreForces1");
-            break;
-
-        case FREE_ENERGY_CUTOFF:
-
-            if (gpu->bOutputBufferPerWarp)
-                kCalculateCDLJObcGbsaSoftcoreCutoffByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
-                        (sizeof(Atom)+sizeof(float))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit );
-            else
-                kCalculateCDLJObcGbsaSoftcoreCutoffForces1_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
-                        (sizeof(Atom)+sizeof(float))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit );
-
-            LAUNCHERROR("kCalculateCDLJObcGbsaSoftcoreCutoffForces1");
-
-            break;
-
-        case FREE_ENERGY_PERIODIC:
-
-            if (gpu->bOutputBufferPerWarp)
-                kCalculateCDLJObcGbsaSoftcorePeriodicByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
-                        (sizeof(Atom)+sizeof(float))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit);
-            else
-                kCalculateCDLJObcGbsaSoftcorePeriodicForces1_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
-                        (sizeof(Atom)+sizeof(float))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit);
-
-            LAUNCHERROR("kCalculateCDLJObcGbsaSoftcorePeriodicForces1");
-            break;
-    }
-
-}
--- a/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateCDLJObcGbsaSoftcoreForces1.h
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateCDLJObcGbsaSoftcoreForces1.h
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-
-/**
- * This file contains the kernel for evalauating nonbonded forces and the first stage of GBSA.
- * It is included several times in kCalculateCDLJObcGbsaForces1.cu with different #defines to generate
- * different versions of the kernels.
- */
-
-#define USE_SOFTCORE_LJ
-
-#ifdef USE_SOFTCORE_LJ
-#include "kSoftcoreLJ.h"
-#endif
-
-__global__ 
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(GF1XX_NONBOND_THREADS_PER_BLOCK, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1)
-#else
-__launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
-#endif
-#ifdef DEBUG
-void METHOD_NAME(kCalculateCDLJObcGbsaSoftcore, Forces1_kernel)(unsigned int* workUnit, float4* pdE1, float4* pdE2 )
-#else
-void METHOD_NAME(kCalculateCDLJObcGbsaSoftcore, Forces1_kernel)(unsigned int* workUnit )
-#endif
-{
-    extern __shared__ Atom sA[];
-    unsigned int totalWarps        = gridDim.x*blockDim.x/GRID;
-    unsigned int warp              = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
-
-    unsigned int numWorkUnits      = cSim.pInteractionCount[0];
-    unsigned int pos               = warp*numWorkUnits/totalWarps;
-    unsigned int end               = (warp+1)*numWorkUnits/totalWarps;
-    float CDLJObcGbsa_energy;
-    float energy                   = 0.0f;
-#ifdef USE_CUTOFF
-    float* tempBuffer              = (float*) &sA[blockDim.x];
-#endif
-
-    unsigned int lasty             = -0xFFFFFFFF;
-    while (pos < end)
-    {
-
-        // Extract cell coordinates from appropriate work unit
-        unsigned int x                      = workUnit[pos];
-        unsigned int y                      = ((x >> 2) & 0x7fff) << GRIDBITS;
-        bool bExclusionFlag                 = (x & 0x1);
-        x                                   = (x >> 17) << GRIDBITS;
-        unsigned int tgx                    = threadIdx.x & (GRID - 1);
-        unsigned int i                      = x + tgx;
-        float4 apos                         = cSim.pPosq[i];
-        float4 a                            = feSimDev.pSigEps4[i];
-        float  softCoreLJLambda             = a.z;
-        float br                            = cSim.pBornRadii[i];
-        unsigned int tbx                    = threadIdx.x - tgx;
-        unsigned int tj                     = tgx;
-        Atom* psA                           = &sA[tbx];
-
-        float4 af;
-        af.x                                = 0.0f;
-        af.y                                = 0.0f;
-        af.z                                = 0.0f;
-        af.w                                = 0.0f;
-
-        if (x == y) // Handle diagonals uniquely at 50% efficiency
-        {
-            // Read fixed atom data into registers and GRF
-
-            sA[threadIdx.x].x                    = apos.x;
-            sA[threadIdx.x].y                    = apos.y;
-            sA[threadIdx.x].z                    = apos.z;
-
-            sA[threadIdx.x].q                    = a.w;
-            sA[threadIdx.x].sig                  = a.x;
-            sA[threadIdx.x].eps                  = a.y;
-            sA[threadIdx.x].br                   = br;
-            sA[threadIdx.x].softCoreLJLambda     = softCoreLJLambda;
-
-            float q2                             = cSim.preFactor*a.w;
-            a.w                                 *= cSim.epsfac;
-
-            if (!bExclusionFlag)
-            {
-                for (unsigned int j = 0; j < GRID; j++)
-                {
-                    float dx                = psA[j].x - apos.x;
-                    float dy                = psA[j].y - apos.y;
-                    float dz                = psA[j].z - apos.z;
-#ifdef USE_PERIODIC
-                    dx                     -= floorf(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
-                    dy                     -= floorf(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
-                    dz                     -= floorf(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
-#endif
-                    float r2                = dx * dx + dy * dy + dz * dz;
-                    float invR              = 1.0f / sqrtf(r2);
-                    float sig               = a.x + psA[j].sig;
-                    float eps               = a.y * psA[j].eps;
-#ifdef USE_SOFTCORE_LJ
-                    float dEdR              = getSoftCoreLJ( r2, sig, eps, softCoreLJLambda, psA[j].softCoreLJLambda, &CDLJObcGbsa_energy );
-#else
-
-                    // CDLJ part
-                    float sig2              = invR * sig;
-                    sig2                   *= sig2;
-                    float sig6              = sig2 * sig2 * sig2;
-                    float dEdR              = eps * (12.0f * sig6 - 6.0f) * sig6;
-		              CDLJObcGbsa_energy      = eps * (sig6 - 1.0f) * sig6;
-#endif
-#ifdef USE_CUTOFF
-                    dEdR                   += a.w * psA[j].q * (invR - 2.0f * feSimDev.reactionFieldK * r2);
-                    CDLJObcGbsa_energy     += a.w * psA[j].q * (invR + feSimDev.reactionFieldK * r2 - feSimDev.reactionFieldC);
-#else
-
-                    float factorX           = a.w * psA[j].q * invR;
-                    dEdR                   += factorX;
-                    CDLJObcGbsa_energy     += factorX;
-
-#endif
-                    dEdR                   *= invR * invR;
-
-                    // ObcGbsaForce1 part
-                    float alpha2_ij         = br * psA[j].br;
-                    float D_ij              = r2 / (4.0f * alpha2_ij);
-                    float expTerm           = expf(-D_ij);
-                    float denominator2      = r2 + alpha2_ij * expTerm;
-                    float denominator       = sqrtf(denominator2);
-                    float Gpol              = (q2 * psA[j].q) / (denominator * denominator2);
-                    float dGpol_dalpha2_ij  = -0.5f * Gpol * expTerm * (1.0f + D_ij);
-                    dEdR                   += Gpol * (1.0f - 0.25f * expTerm);
-		              CDLJObcGbsa_energy     += (q2 * psA[j].q) / denominator;
-#ifdef USE_CUTOFF
-                    if ( i >= cSim.atoms || (x+j) >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
-#else
-                    if ( i >= cSim.atoms || (x+j) >= cSim.atoms)
-#endif
-                    {
-                        dEdR                = 0.0f;
-			               CDLJObcGbsa_energy  = 0.0f;
-                        dGpol_dalpha2_ij    = 0.0f;
-                    }
-                    af.w                   += dGpol_dalpha2_ij * psA[j].br;
-                    energy                 += 0.5f*CDLJObcGbsa_energy;
-
-                    // Add Forces
-
-                    dx                     *= dEdR;
-                    dy                     *= dEdR;
-                    dz                     *= dEdR;
-
-                    af.x                   -= dx;
-                    af.y                   -= dy;
-                    af.z                   -= dz;
-                }
-
-            } else {
-
-                unsigned int xi   = x>>GRIDBITS;
-                unsigned int cell = xi+xi*cSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
-                unsigned int excl = cSim.pExclusion[cSim.pExclusionIndex[cell]+tgx];
-
-                for (unsigned int j = 0; j < GRID; j++)
-                {
-                    float dx                = psA[j].x - apos.x;
-                    float dy                = psA[j].y - apos.y;
-                    float dz                = psA[j].z - apos.z;
-#ifdef USE_PERIODIC
-                    dx                     -= floorf(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
-                    dy                     -= floorf(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
-                    dz                     -= floorf(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
-#endif
-                    float r2                = dx * dx + dy * dy + dz * dz;
-                    float invR              = 1.0f / sqrtf(r2);
-                    float sig               = a.x + psA[j].sig;
-                    float eps               = a.y * psA[j].eps;
-
-#ifdef USE_SOFTCORE_LJ
-                    float dEdR              = getSoftCoreLJ( r2, sig, eps, softCoreLJLambda, psA[j].softCoreLJLambda, &CDLJObcGbsa_energy );
-                    //float dEdR              = getSoftCoreLJMod( (invR*sig), eps, softCoreLJLambda, psA[j].softCoreLJLambda, &CDLJObcGbsa_energy );
-#else
-
-                    // CDLJ part
-
-                    float sig2              = invR * sig;
-                    sig2                   *= sig2;
-                    float sig6              = sig2 * sig2 * sig2;
-                    float dEdR              = eps * (12.0f * sig6 - 6.0f) * sig6;
-		              CDLJObcGbsa_energy      = eps * (sig6 - 1.0f) * sig6;
-#endif
-
-#ifdef USE_CUTOFF
-                    dEdR                   += a.w * psA[j].q * (invR - 2.0f * feSimDev.reactionFieldK * r2);
-                    CDLJObcGbsa_energy     += a.w * psA[j].q * (invR + feSimDev.reactionFieldK * r2 - feSimDev.reactionFieldC);
-#else
-
-                    float factorX           = a.w * psA[j].q * invR;
-                    dEdR                   += factorX;
-                    CDLJObcGbsa_energy     += factorX;
-
-#endif
-                    dEdR                   *= invR * invR;
-
-                    if (!(excl & 0x1))
-                    {
-                        dEdR                = 0.0f;
-                        CDLJObcGbsa_energy  = 0.0f;
-                    }
-
-                    // ObcGbsaForce1 part
-
-                    float alpha2_ij         = br * psA[j].br;
-                    float D_ij              = r2 / (4.0f * alpha2_ij);
-                    float expTerm           = expf(-D_ij);
-                    float denominator2      = r2 + alpha2_ij * expTerm;
-                    float denominator       = sqrtf(denominator2);
-                    float Gpol              = (q2 * psA[j].q) / (denominator * denominator2);
-                    float dGpol_dalpha2_ij  = -0.5f * Gpol * expTerm * (1.0f + D_ij);
-                    dEdR                   += Gpol * (1.0f - 0.25f * expTerm);
-                    CDLJObcGbsa_energy     += (q2 * psA[j].q) / denominator;
-
-#if defined USE_CUTOFF
-                    if (i >= cSim.atoms || x+j >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
-#else
-                    if (i >= cSim.atoms || x+j >= cSim.atoms )
-#endif
-                    {
-                        dEdR               = 0.0f;
-		                  CDLJObcGbsa_energy = 0.0f;
-                        dGpol_dalpha2_ij   = 0.0f;
-                    }
-
-                    af.w                  += dGpol_dalpha2_ij * psA[j].br;
-                    energy                += 0.5f*CDLJObcGbsa_energy;
-                     
-                    // Add Forces
-
-                    dx                     *= dEdR;
-                    dy                     *= dEdR;
-                    dz                     *= dEdR;
-
-                    af.x                   -= dx;
-                    af.y                   -= dy;
-                    af.z                   -= dz;
-
-                    excl                  >>= 1;
-                }
-            }
-
-            // Write results
-#ifdef USE_OUTPUT_BUFFER_PER_WARP
-            unsigned int offset         = x + tgx + warp*cSim.stride;
-#else
-            unsigned int offset         = x + tgx + (x >> GRIDBITS) * cSim.stride;
-#endif
-
-            float4 of                   = cSim.pForce4[offset];
-            float  bf                   = cSim.pBornForce[offset];
-            of.x                       += af.x;
-            of.y                       += af.y;
-            of.z                       += af.z;
-            bf                         += af.w;
-            cSim.pForce4[offset]        = of;
-            cSim.pBornForce[offset]     = bf;
-
-        } else { 
-
-            // Read fixed atom data into registers and GRF
-
-            if (lasty != y)
-            {
-                unsigned int j                       = y + tgx;
-                float4 temp                          = cSim.pPosq[j];
-                float4 temp1                         = feSimDev.pSigEps4[j];
-                sA[threadIdx.x].br                   = cSim.pBornRadii[j];
-                sA[threadIdx.x].x                    = temp.x;
-                sA[threadIdx.x].y                    = temp.y;
-                sA[threadIdx.x].z                    = temp.z;
-                sA[threadIdx.x].q                    = temp1.w;
-                sA[threadIdx.x].sig                  = temp1.x;
-                sA[threadIdx.x].eps                  = temp1.y;
-                sA[threadIdx.x].softCoreLJLambda     = temp1.z;
-            }
-
-            sA[threadIdx.x].fx          = 0.0f;
-            sA[threadIdx.x].fy          = 0.0f;
-            sA[threadIdx.x].fz          = 0.0f;
-            sA[threadIdx.x].fb          = 0.0f;
-
-            float q2                    = a.w * cSim.preFactor;
-            a.w                        *= cSim.epsfac;
-            if (!bExclusionFlag)
-            {
-#ifdef USE_CUTOFF
-                unsigned int flags = cSim.pInteractionFlag[pos];
-                if (flags == 0)
-                {
-                    // No interactions in this block.
-                }
-                //else if (flags == 0xFFFFFFFF)
-                else if (flags)
-#endif
-                {
-                    // Compute all interactions within this block.
-
-                    for (unsigned int j = 0; j < GRID; j++)
-                    {
-                        float dx                = psA[tj].x - apos.x;
-                        float dy                = psA[tj].y - apos.y;
-                        float dz                = psA[tj].z - apos.z;
-#ifdef USE_PERIODIC
-                        dx                     -= floorf(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
-                        dy                     -= floorf(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
-                        dz                     -= floorf(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
-#endif
-                        float r2                = dx * dx + dy * dy + dz * dz;
-                        float invR              = 1.0f / sqrtf(r2);
-
-                        float sig               = a.x + psA[tj].sig;
-                        float eps               = a.y * psA[tj].eps;
-#ifdef USE_SOFTCORE_LJ
-                        float dEdR              = getSoftCoreLJ( r2, sig, eps, softCoreLJLambda, psA[tj].softCoreLJLambda, &CDLJObcGbsa_energy );
-#else
-                        // CDLJ part
-                        float sig2              = invR * sig;
-                        sig2                   *= sig2;
-                        float sig6              = sig2 * sig2 * sig2;
-                        float dEdR              = eps * (12.0f * sig6 - 6.0f) * sig6;
-                        CDLJObcGbsa_energy      = eps * (sig6 - 1.0f) * sig6;
-#endif
-#ifdef USE_CUTOFF
-                        dEdR                   += a.w * psA[tj].q * (invR - 2.0f * feSimDev.reactionFieldK * r2);
-                        CDLJObcGbsa_energy     += a.w * psA[tj].q * (invR + feSimDev.reactionFieldK * r2 - feSimDev.reactionFieldC);
-#else
-
-                        float factorX           = a.w * psA[tj].q * invR;
-                        dEdR                   += factorX;
-                        CDLJObcGbsa_energy     += factorX;
-#endif
-                        dEdR                   *= invR * invR;
-
-                        // ObcGbsaForce1 part
-
-                        float alpha2_ij         = br * psA[tj].br;
-                        float D_ij              = r2 / (4.0f * alpha2_ij);
-                        float expTerm           = expf(-D_ij);
-                        float denominator2      = r2 + alpha2_ij * expTerm;
-                        float denominator       = sqrtf(denominator2);
-                        float Gpol              = (q2 * psA[tj].q) / (denominator * denominator2);
-                        float dGpol_dalpha2_ij  = -0.5f * Gpol * expTerm * (1.0f + D_ij);
-                        dEdR                   += Gpol * (1.0f - 0.25f * expTerm);
-                        CDLJObcGbsa_energy     += (q2 * psA[tj].q) / denominator;
-#ifdef USE_CUTOFF
-                        if ( i >= cSim.atoms || (y+tj) >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
-#else
-                        if ( i >= cSim.atoms || (y+tj) >= cSim.atoms)
-#endif
-                        {
-                            dEdR               = 0.0f;
-       			             CDLJObcGbsa_energy = 0.0f;
-                            dGpol_dalpha2_ij   = 0.0f;
-                        }
-                        psA[tj].fb             += dGpol_dalpha2_ij * br;
-                        af.w                   += dGpol_dalpha2_ij * psA[tj].br;
-                        energy                 += CDLJObcGbsa_energy;
-
-                        // Add forces
-
-                        dx                     *= dEdR;
-                        dy                     *= dEdR;
-                        dz                     *= dEdR;
-
-                        af.x                   -= dx;
-                        af.y                   -= dy;
-                        af.z                   -= dz;
-
-                        psA[tj].fx             += dx;
-                        psA[tj].fy             += dy;
-                        psA[tj].fz             += dz;
-
-                        tj                      = (tj + 1) & (GRID - 1);
-                    }
-                }
-#ifdef USE_CUTOFF
-                else {
-
-                    // Compute only a subset of the interactions in this block.
-
-                    for (unsigned int j = 0; j < GRID; j++)
-                    {
-                        if ((flags&(1<<j)) != 0)
-                        {
-                            float dx                = psA[j].x - apos.x;
-                            float dy                = psA[j].y - apos.y;
-                            float dz                = psA[j].z - apos.z;
-#ifdef USE_PERIODIC
-                            dx                     -= floorf(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
-                            dy                     -= floorf(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
-                            dz                     -= floorf(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
-#endif
-                            float r2                = dx * dx + dy * dy + dz * dz;
-                            float invR              = 1.0f / sqrtf(r2);
-                            float sig               = a.x + psA[j].sig;
-                            float eps               = a.y * psA[j].eps;
-#ifdef USE_SOFTCORE_LJ
-                            float dEdR              = getSoftCoreLJ( r2, sig, eps, softCoreLJLambda, psA[j].softCoreLJLambda, &CDLJObcGbsa_energy );
-#else
-
-                            // CDLJ part
-                            float sig2              = invR * sig;
-                            sig2                   *= sig2;
-                            float sig6              = sig2 * sig2 * sig2;
-                            float dEdR              = eps * (12.0f * sig6 - 6.0f) * sig6;
-                            CDLJObcGbsa_energy      = eps * (sig6 - 1.0f) * sig6;
-#endif
-#ifdef USE_CUTOFF
-                            dEdR                   += a.w * psA[j].q * (invR - 2.0f * feSimDev.reactionFieldK * r2);
-                            CDLJObcGbsa_energy     += a.w * psA[j].q * (invR + feSimDev.reactionFieldK * r2 - feSimDev.reactionFieldC);
-#else
-
-                            float factorX           = a.w * psA[j].q * invR;
-                            dEdR                   += factorX;
-                            CDLJObcGbsa_energy     += factorX;
-#endif
-                            dEdR                   *= invR * invR;
-
-                            // ObcGbsaForce1 part
-                            float alpha2_ij         = br * psA[j].br;
-                            float D_ij              = r2 / (4.0f * alpha2_ij);
-                            float expTerm           = expf(-D_ij);
-                            float denominator2      = r2 + alpha2_ij * expTerm;
-                            float denominator       = sqrtf(denominator2);
-                            float Gpol              = (q2 * psA[j].q) / (denominator * denominator2);
-                            float dGpol_dalpha2_ij  = -0.5f * Gpol * expTerm * (1.0f + D_ij);
-                            dEdR                   += Gpol * (1.0f - 0.25f * expTerm);
-                            CDLJObcGbsa_energy     += (q2 * psA[j].q) / denominator;
-
-#ifdef USE_CUTOFF
-                            if ( i >= cSim.atoms || (y+j) >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
-#else
-                            if ( i >= cSim.atoms || (y+j) >= cSim.atoms)
-#endif
-                            {
-                                dEdR                = 0.0f;
-				                    CDLJObcGbsa_energy  = 0.0f;
-                                dGpol_dalpha2_ij    = 0.0f;
-                            }
-                            af.w                   += dGpol_dalpha2_ij * psA[j].br;
-
-                            // Sum the Born forces.
-
-                            tempBuffer[threadIdx.x] = dGpol_dalpha2_ij * br;
-                            if (tgx % 2 == 0)
-                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+1];
-                            if (tgx % 4 == 0)
-                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+2];
-                            if (tgx % 8 == 0)
-                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+4];
-                            if (tgx % 16 == 0)
-                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+8];
-                            if (tgx == 0)
-                                psA[j].fb += tempBuffer[threadIdx.x] + tempBuffer[threadIdx.x+16];
-
-                            energy                 += CDLJObcGbsa_energy;
-
-                            // Add forces
-                            dx                     *= dEdR;
-                            dy                     *= dEdR;
-                            dz                     *= dEdR;
-                            af.x                   -= dx;
-                            af.y                   -= dy;
-                            af.z                   -= dz;
-                            tempBuffer[threadIdx.x] = dx;
-                            if (tgx % 2 == 0)
-                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+1];
-                            if (tgx % 4 == 0)
-                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+2];
-                            if (tgx % 8 == 0)
-                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+4];
-                            if (tgx % 16 == 0)
-                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+8];
-                            if (tgx == 0)
-                                psA[j].fx += tempBuffer[threadIdx.x] + tempBuffer[threadIdx.x+16];
-                            tempBuffer[threadIdx.x] = dy;
-                            if (tgx % 2 == 0)
-                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+1];
-                            if (tgx % 4 == 0)
-                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+2];
-                            if (tgx % 8 == 0)
-                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+4];
-                            if (tgx % 16 == 0)
-                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+8];
-                            if (tgx == 0)
-                                psA[j].fy += tempBuffer[threadIdx.x] + tempBuffer[threadIdx.x+16];
-                            tempBuffer[threadIdx.x] = dz;
-                            if (tgx % 2 == 0)
-                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+1];
-                            if (tgx % 4 == 0)
-                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+2];
-                            if (tgx % 8 == 0)
-                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+4];
-                            if (tgx % 16 == 0)
-                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+8];
-                            if (tgx == 0)
-                                psA[j].fz += tempBuffer[threadIdx.x] + tempBuffer[threadIdx.x+16];
-                        }
-                    }
-                }
-#endif
-            } else {
-
-                unsigned int xi   = x>>GRIDBITS;
-                unsigned int yi   = y>>GRIDBITS;
-                unsigned int cell = xi+yi*cSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
-                unsigned int excl = cSim.pExclusion[cSim.pExclusionIndex[cell]+tgx];
-                excl              = (excl >> tgx) | (excl << (GRID - tgx));
-                for (unsigned int j = 0; j < GRID; j++)
-                {
-                    float dx                = psA[tj].x - apos.x;
-                    float dy                = psA[tj].y - apos.y;
-                    float dz                = psA[tj].z - apos.z;
-#ifdef USE_PERIODIC
-                    dx                     -= floorf(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
-                    dy                     -= floorf(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
-                    dz                     -= floorf(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
-#endif
-                    float r2                = dx * dx + dy * dy + dz * dz;
-                    float invR              = 1.0f / sqrtf(r2);
-                    float sig               = a.x + psA[tj].sig;
-                    float eps               = a.y * psA[tj].eps;
-#ifdef USE_SOFTCORE_LJ
-                    float dEdR              = getSoftCoreLJ( r2, sig, eps, softCoreLJLambda, psA[tj].softCoreLJLambda, &CDLJObcGbsa_energy );
-#else
-
-                    // CDLJ part
-                    float sig2              = invR * sig;
-                    sig2                   *= sig2;
-                    float sig6              = sig2 * sig2 * sig2;
-                    float dEdR              = eps * (12.0f * sig6 - 6.0f) * sig6;
-		              CDLJObcGbsa_energy      = eps * (sig6 - 1.0f) * sig6;
-#endif
-#ifdef USE_CUTOFF
-                    dEdR                   += a.w * psA[tj].q * (invR - 2.0f * feSimDev.reactionFieldK * r2);
-                    CDLJObcGbsa_energy     += a.w * psA[tj].q * (invR + feSimDev.reactionFieldK * r2 - feSimDev.reactionFieldC);
-#else
-                    float factorX           = a.w * psA[tj].q * invR;
-                    dEdR                   += factorX;
-                    CDLJObcGbsa_energy     += factorX;
-#endif
-                    dEdR                   *= invR * invR;
-                    if (!(excl & 0x1))
-                    {
-                        dEdR               = 0.0f;
-			               CDLJObcGbsa_energy = 0.0f;
-                    }
-
-                    // ObcGbsaForce1 part
-
-                    float alpha2_ij         = br * psA[tj].br;
-                    float D_ij              = r2 / (4.0f * alpha2_ij);
-                    float expTerm           = expf(-D_ij);
-                    float denominator2      = r2 + alpha2_ij * expTerm;
-                    float denominator       = sqrtf(denominator2);
-                    float Gpol              = (q2 * psA[tj].q) / (denominator * denominator2);
-                    float dGpol_dalpha2_ij  = -0.5f * Gpol * expTerm * (1.0f + D_ij);
-                    dEdR                   += Gpol * (1.0f - 0.25f * expTerm);
-		              CDLJObcGbsa_energy     += (q2 * psA[tj].q) / denominator;
-#if defined USE_CUTOFF
-                    if (i >= cSim.atoms || y+tj >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
-#else
-                    if (i >= cSim.atoms || y+tj >= cSim.atoms)
-#endif
-                    {
-                        dEdR               = 0.0f;
-			               CDLJObcGbsa_energy = 0.0f;
-                        dGpol_dalpha2_ij   = 0.0f;
-                    }
-
-                    af.w                   += dGpol_dalpha2_ij * psA[tj].br;
-                    psA[tj].fb             += dGpol_dalpha2_ij * br;
-                    energy                 += CDLJObcGbsa_energy;
-
-                    // Add forces
-
-                    dx                     *= dEdR;
-                    dy                     *= dEdR;
-                    dz                     *= dEdR;
-
-                    af.x                   -= dx;
-                    af.y                   -= dy;
-                    af.z                   -= dz;
-
-                    psA[tj].fx             += dx;
-                    psA[tj].fy             += dy;
-                    psA[tj].fz             += dz;
-
-                    excl                  >>= 1;
-                    tj                      = (tj + 1) & (GRID - 1);
-                }
-            }
-
-            // Write results
-#ifdef USE_OUTPUT_BUFFER_PER_WARP
-            unsigned int offset         = x + tgx + warp*cSim.stride;
-#else
-            unsigned int offset         = x + tgx + (y >> GRIDBITS) * cSim.stride;
-#endif
-            float4 of                   = cSim.pForce4[offset];
-            float  bf                   = cSim.pBornForce[offset];
-            of.x                       += af.x;
-            of.y                       += af.y;
-            of.z                       += af.z;
-            bf                         += af.w;
-            cSim.pForce4[offset]        = of;
-            cSim.pBornForce[offset]     = bf;
-
-#ifdef USE_OUTPUT_BUFFER_PER_WARP
-            offset                      = y + tgx + warp*cSim.stride;
-#else
-            offset                      = y + tgx + (x >> GRIDBITS) * cSim.stride;
-#endif
-            of                          = cSim.pForce4[offset];
-            bf                          = cSim.pBornForce[offset];
-            of.x                       += sA[threadIdx.x].fx;
-            of.y                       += sA[threadIdx.x].fy;
-            of.z                       += sA[threadIdx.x].fz;
-            bf                         += sA[threadIdx.x].fb;
-            cSim.pForce4[offset]        = of;
-            cSim.pBornForce[offset]     = bf;
-
-            lasty                       = y;
-        }
-        pos++;
-    }
-    cSim.pEnergy[blockIdx.x*blockDim.x+threadIdx.x] += energy;
-}
--- a/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateGBVISoftcoreBornSum.cu
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateGBVISoftcoreBornSum.cu
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * Permission is hereby granted, free of charge, to any person obtaining a    *
- * copy of this software and associated documentation files (the "Software"), *
- * to deal in the Software without restriction, including without limitation  *
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
- * and/or sell copies of the Software, and to permit persons to whom the      *
- * Software is furnished to do so, subject to the following conditions:       *
- *                                                                            *
- * The above copyright notice and this permission notice shall be included in *
- * all copies or substantial portions of the Software.                        *
- *                                                                            *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
- * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
- * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
- * -------------------------------------------------------------------------- */
-
-#include "GpuFreeEnergyCudaKernels.h"
-#include "freeEnergyGpuTypes.h"
-
-#include "openmm/OpenMMException.h"
-#include <cuda.h>
-#include <iostream>
-#include <sstream>
-
-#define PARAMETER_PRINT 0
-#define MAX_PARAMETER_PRINT 10
-
-static __constant__ cudaGmxSimulation cSim;
-static __constant__ cudaFreeEnergyGmxSimulation gbviSimDev;
-
-void SetCalculateGBVISoftcoreBornSumGpuSim( freeEnergyGpuContext freeEnergyGpu)
-{
-    cudaError_t status;
-    status = cudaMemcpyToSymbol( cSim, &freeEnergyGpu->gpuContext->sim, sizeof(cudaGmxSimulation));    
-    RTERROR(status, "cudaMemcpyToSymbol: SetCalculateGBVISoftcoreBornSumGpuSim copy to cSim failed");
-
-    status = cudaMemcpyToSymbol( gbviSimDev, &freeEnergyGpu->freeEnergySim, sizeof(cudaFreeEnergyGmxSimulation));    
-    RTERROR(status, "cudaMemcpyToSymbol: SetCalculateGBVISoftcoreBornSumGpuSim copy to feSim failed");
-}
-
-// create, initialize and enter BornRadiusScaleFactors values (used to scale contribution of atoms to Born sum of other atoms)
-
-extern "C"
-void gpuSetGBVISoftcoreParameters( freeEnergyGpuContext freeEnergyGpu, float innerDielectric, float solventDielectric, const std::vector<int>& atom,
-                                   const std::vector<float>& radius, const std::vector<float>& gamma,
-                                   const std::vector<float>& scaledRadii, const std::vector<float>& bornRadiusScaleFactors,
-                                   const std::vector<float>& quinticSplineParameters ){
-
-    unsigned int numberOfParticles                    = radius.size();
-
-    gpuContext gpu                                    = freeEnergyGpu->gpuContext;
-
-    static const float electricConstant               = -166.02691f;
-    double tau                                        = ((1.0f/innerDielectric)-(1.0f/solventDielectric)); 
-    freeEnergyGpu->psSwitchDerivative                 = new CUDAStream<float>( gpu->sim.paddedNumberOfAtoms, 1, "SwitchDerivative");
-    freeEnergyGpu->freeEnergySim.pSwitchDerivative    = freeEnergyGpu->psSwitchDerivative->_pDevData;
-
-    // create gpuGBVISoftcore, load parameters, and track minimum softcore value
-    // gpuGBVISoftcore is not really being used (it was in the initial implementation) -- 
-    // will be removed in future once confirmed not needed
-
-
-    // check if quintic scaling to be applied
-
-    if( quinticSplineParameters.size() == 2 ){
-       freeEnergyGpu->freeEnergySim.bornRadiiScalingMethod        = 1;
-       freeEnergyGpu->freeEnergySim.quinticLowerLimitFactor       = quinticSplineParameters[0];
-       freeEnergyGpu->freeEnergySim.quinticUpperLimit             = quinticSplineParameters[1];
-    } else {
-       freeEnergyGpu->freeEnergySim.bornRadiiScalingMethod        = 0;
-       freeEnergyGpu->freeEnergySim.quinticLowerLimitFactor       = 0.8f;
-       freeEnergyGpu->freeEnergySim.quinticUpperLimit             = 5.0f;
-    }
-
-    for( unsigned int ii = 0; ii < bornRadiusScaleFactors.size(); ii++ ){
-            (*gpu->psGBVIData)[ii].x                 = radius[ii];
-            (*gpu->psGBVIData)[ii].y                 = scaledRadii[ii];
-            (*gpu->psGBVIData)[ii].z                 = tau*gamma[ii];
-            (*gpu->psGBVIData)[ii].w                 = bornRadiusScaleFactors[ii];
-            (*gpu->psBornRadii)[ii]                  = 0.0f;
-            (*freeEnergyGpu->psSwitchDerivative)[ii] = 0.0f;
-    }
-
-    // Dummy out extra atom data
-
-    for( unsigned int ii = bornRadiusScaleFactors.size(); ii < gpu->sim.paddedNumberOfAtoms; ii++ ){
-        (*gpu->psGBVIData)[ii].x                 = 0.01f;
-        (*gpu->psGBVIData)[ii].y                 = 0.01f;
-        (*gpu->psGBVIData)[ii].z                 = 0.0f;
-        (*gpu->psGBVIData)[ii].w                 = 0.0f;
-        (*gpu->psBornRadii)[ii]                  = 0.0f;
-        (*freeEnergyGpu->psSwitchDerivative)[ii] = 0.0f;
-    }
-
-    gpu->sim.preFactor               = 2.0f*electricConstant*((1.0f/innerDielectric)-(1.0f/solventDielectric))*gpu->sim.forceConversionFactor;
-
-    // diagnostics
-
-    if( freeEnergyGpu->log ){
-        (void) fprintf( freeEnergyGpu->log,"GBVISoftcore: part.=%u padded=%u sclMeth=%d\n",
-                        static_cast<unsigned int>(bornRadiusScaleFactors.size()), static_cast<unsigned int>(gpu->sim.paddedNumberOfAtoms),
-                        freeEnergyGpu->freeEnergySim.bornRadiiScalingMethod );
-        if( quinticSplineParameters.size() == 2 ){
-            (void) fprintf( freeEnergyGpu->log,"QuinticScaling: LwFct=%8.3f UpLmt=[%12.5e (nm) %12.5e]\n",
-                        freeEnergyGpu->freeEnergySim.quinticLowerLimitFactor,
-                        powf( freeEnergyGpu->freeEnergySim.quinticUpperLimit, -0.3333333f ), freeEnergyGpu->freeEnergySim.quinticUpperLimit );
-        }
-        (void) fprintf( freeEnergyGpu->log, "gpuSetGBVISoftcoreParameters: preFactor=%14.6e elecCnstnt=%.4f frcCnvrsnFctr=%.4f tau=%.4f.\n",
-                        gpu->sim.preFactor, 2.0f*electricConstant, gpu->sim.forceConversionFactor, ((1.0f/innerDielectric)-(1.0f/solventDielectric)) );
-#ifdef PARAMETER_PRINT
-        int maxPrint = MAX_PARAMETER_PRINT;
-        (void) fprintf( freeEnergyGpu->log, "               radius  scaled radius      tau*gamma         lambda\n" );
-        for( unsigned int ii = 0; ii < bornRadiusScaleFactors.size(); ii++ ){
-    
-            (void) fprintf( freeEnergyGpu->log,"%6u %14.7e %14.7e %14.7e %14.7e\n",
-                            ii, (*gpu->psGBVIData)[ii].x, (*gpu->psGBVIData)[ii].y, (*gpu->psGBVIData)[ii].z, (*gpu->psGBVIData)[ii].w ); 
-            if( ii == maxPrint ){
-                ii = bornRadiusScaleFactors.size() - maxPrint;
-                if( ii < maxPrint )ii = maxPrint;
-            }
-        }
-        unsigned int offset = gpu->sim.paddedNumberOfAtoms - MAX_PARAMETER_PRINT;
-        if( offset > 0 && gpu->sim.paddedNumberOfAtoms > bornRadiusScaleFactors.size()  ){
-            for( unsigned int ii = offset; ii < gpu->sim.paddedNumberOfAtoms; ii++ ){
-                (void) fprintf( freeEnergyGpu->log,"%6u %14.7e %14.7e %14.7e %14.7e\n",
-                                ii, (*gpu->psGBVIData)[ii].x, (*gpu->psGBVIData)[ii].y, (*gpu->psGBVIData)[ii].z, (*gpu->psGBVIData)[ii].w ); 
-            }
-        }
-#endif
-    }
-
-    gpu->psGBVIData->Upload();
-    gpu->psBornRadii->Upload();
-    freeEnergyGpu->psSwitchDerivative->Upload();
-
-    return;
-}
-
-struct Atom {
-    float x;
-    float y;
-    float z;
-    float r;
-    float sr;
-    float sum;
-    float gamma;
-    float bornRadiusScaleFactor;
-};
-
-__global__ 
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
-#else
-__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
-#endif
-void kClearGBVISoftcoreBornSum_kernel()
-{
-    unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
-    while (pos < cSim.stride * cSim.nonbondOutputBuffers)
-    {   
-        ((float*)cSim.pBornSum)[pos]  = 0.0f;
-        pos                          += gridDim.x * blockDim.x;
-    }   
-}
-
-void kClearGBVISoftcoreBornSum(gpuContext gpu) {
-    kClearGBVISoftcoreBornSum_kernel<<<gpu->sim.blocks, gpu->sim.threads_per_block>>>();
-}
-
-__global__ 
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
-#else
-__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
-#endif
-void kReduceGBVISoftcoreBornForces_kernel()
-{
-    unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
-    float energy     = 0.0f;
-    while (pos < cSim.atoms)
-    {
-        float bornRadius  = cSim.pBornRadii[pos];
-        float4 gbviData   = cSim.pGBVIData[pos];
-        float totalForce  = 0.0f;
-        float* pFt        = cSim.pBornForce + pos;
-
-        int i = cSim.nonbondOutputBuffers;
-        while (i >= 4)
-        {
-            float f1    = *pFt;
-            pFt        += cSim.stride;
-            float f2    = *pFt;
-            pFt        += cSim.stride;
-            float f3    = *pFt;
-            pFt        += cSim.stride;
-            float f4    = *pFt;
-            pFt        += cSim.stride;
-            totalForce += f1 + f2 + f3 + f4;
-            i -= 4;
-        }
-        if (i >= 2)
-        {
-            float f1    = *pFt;
-            pFt        += cSim.stride;
-            float f2    = *pFt;
-            pFt        += cSim.stride;
-            totalForce += f1 + f2;
-            i -= 2;
-        }
-        if (i > 0)
-        {
-            totalForce += *pFt;
-        }
-
-        float ratio         = (gbviData.x/bornRadius);
-        float ratio3        = ratio*ratio*ratio;
-
-        energy             -= gbviData.z*ratio3;                   //  gbviData.z = gamma*tau
-
-        totalForce         += (3.0f*gbviData.z*ratio3)/bornRadius; // 'cavity' term
-        float br2           = bornRadius*bornRadius;
-        totalForce         *= (1.0f/3.0f)*br2*br2;
-
-        pFt                 = cSim.pBornForce + pos;
-        *pFt                = totalForce;
-        pos                += gridDim.x * blockDim.x;
-    }
-    cSim.pEnergy[blockIdx.x * blockDim.x + threadIdx.x] += energy;
-}
-
-void kReduceGBVISoftcoreBornForces( freeEnergyGpuContext freeEnergyGpu )
-{
-    gpuContext gpu = freeEnergyGpu->gpuContext;
-    kReduceGBVISoftcoreBornForces_kernel<<<gpu->sim.blocks, gpu->sim.bf_reduce_threads_per_block>>>();
-    LAUNCHERROR("kReduceGBVISoftcoreBornForces");
-
-}
-
-__global__ 
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
-#else
-__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
-#endif
-void kReduceGBVISoftcoreBornSum_kernel()
-{
-    unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
-    
-    while (pos < cSim.atoms)
-    {
-        float sum    = 0.0f;
-        float* pSt   = cSim.pBornSum + pos;
-        float4 atom  = cSim.pGBVIData[pos];
-        
-        // Get summed Born data
-        for (int i = 0; i < cSim.nonbondOutputBuffers; i++)
-        {
-            sum += *pSt;
-            pSt += cSim.stride;
-        }
-        
-        // Now calculate Born radius
-
-        float Rinv           = 1.0f/atom.x;
-        sum                  = Rinv*Rinv*Rinv - sum; 
-        cSim.pBornRadii[pos] = pow( sum, (-1.0f/3.0f) ); 
-        pos                 += gridDim.x * blockDim.x;
-    }   
-}
-
-void kReduceGBVISoftcoreBornSum( freeEnergyGpuContext freeEnergyGpu )
-{
-    gpuContext gpu = freeEnergyGpu->gpuContext;
-    kReduceGBVISoftcoreBornSum_kernel<<<gpu->sim.blocks, gpu->sim.threads_per_block>>>();
-    LAUNCHERROR("kReduceGBVISoftcoreBornSum");
-}
-
-// Include versions of the kernels for N^2 calculations.
-
-#define METHOD_NAME(a, b) a##N2##b
-#include "kCalculateGBVISoftcoreBornSum.h"
-#define USE_OUTPUT_BUFFER_PER_WARP
-#undef METHOD_NAME
-#define METHOD_NAME(a, b) a##N2ByWarp##b
-#include "kCalculateGBVISoftcoreBornSum.h"
-
-// Include versions of the kernels with cutoffs.
-
-#undef METHOD_NAME
-#undef USE_OUTPUT_BUFFER_PER_WARP
-#define USE_CUTOFF
-#define METHOD_NAME(a, b) a##Cutoff##b
-#include "kCalculateGBVISoftcoreBornSum.h"
-#define USE_OUTPUT_BUFFER_PER_WARP
-#undef METHOD_NAME
-#define METHOD_NAME(a, b) a##CutoffByWarp##b
-#include "kCalculateGBVISoftcoreBornSum.h"
-
-// Include versions of the kernels with periodic boundary conditions.
-
-#undef METHOD_NAME
-#undef USE_OUTPUT_BUFFER_PER_WARP
-#define USE_PERIODIC
-#define METHOD_NAME(a, b) a##Periodic##b
-#include "kCalculateGBVISoftcoreBornSum.h"
-#define USE_OUTPUT_BUFFER_PER_WARP
-#undef METHOD_NAME
-#define METHOD_NAME(a, b) a##PeriodicByWarp##b
-#include "kCalculateGBVISoftcoreBornSum.h"
-
- __device__ void quinticSpline( float  x, float rl, float ru, float* outValue, float* outDerivative )
-{
-   float numerator    = x  - rl;
-   float denominator  = ru - rl;
-   float ratio        = numerator/denominator;
-   float ratio2       = ratio*ratio;
-   float ratio3       = ratio2*ratio;
-
-   *outValue          =  1.0f + ratio3*(-10.f + 3.0f*ratio*(5.0f - 2.0f*ratio));
-   *outDerivative     = -30.0f*ratio2*( 1.0f + ratio*(ratio - 2.0f))/denominator;
-}
-
-__global__ 
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
-#else
-__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
-#endif
-void kReduceGBVIBornSumQuinticScaling_kernel()
-{
-    unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
-    
-    while (pos < cSim.atoms)
-    {
-        float sum   = 0.0f;
-        float* pSt  = cSim.pBornSum + pos;
-        float4 atom = cSim.pGBVIData[pos];
-        
-        // Get summed Born data
-        for (int i = 0; i < cSim.nonbondOutputBuffers; i++)
-        {
-            sum += *pSt;
-            pSt += cSim.stride;
-        }
-        
-        // Now calculate Born radius
-
-        float Rinv           = 1.0f/atom.x;
-        float r3             = Rinv*Rinv*Rinv;
-        float splineL        = gbviSimDev.quinticLowerLimitFactor*r3;
-        float switchDeriviative;
-        if( sum > splineL ){
-            if( sum < r3 ){
-                float splineValue, splineDerivative;
-                quinticSpline( sum, splineL, r3, &splineValue, &splineDerivative ); 
-                switchDeriviative  = splineValue - (r3 - sum)*splineDerivative;
-                sum                = (r3 - sum)*splineValue + gbviSimDev.quinticUpperLimit;
-            } else {
-                sum                = gbviSimDev.quinticUpperLimit;
-                switchDeriviative  = 0.0f;
-            }
-        } else {
-            sum               = r3 - sum;
-            switchDeriviative = 1.0f;
-        }
-
-        cSim.pBornRadii[pos]               = pow( sum, (-1.0f/3.0f) ); 
-        gbviSimDev.pSwitchDerivative[pos]  = switchDeriviative;
-        pos                               += gridDim.x * blockDim.x;
-    }   
-}
-
-void kReduceGBVIBornSumQuinticScaling( freeEnergyGpuContext freeEnergyGpu )
-{
-    gpuContext gpu = freeEnergyGpu->gpuContext;
-    kReduceGBVIBornSumQuinticScaling_kernel<<<gpu->sim.blocks, gpu->sim.threads_per_block>>>();
-    LAUNCHERROR("kReduceGBVIBornSumQuinticScaling_kernel");
-}
-
-__global__ 
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
-#else
-__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
-#endif
-void kReduceGBVIBornForcesQuinticScaling_kernel()
-{
-    unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
-    float energy     = 0.0f;
-    while (pos < cSim.atoms)
-    {
-        float bornRadius    = cSim.pBornRadii[pos];
-        float4 gbviData     = cSim.pGBVIData[pos];
-        float  switchDeriv  = gbviSimDev.pSwitchDerivative[pos];
-        float totalForce    = 0.0f;
-        float* pFt          = cSim.pBornForce + pos;
-
-        int i = cSim.nonbondOutputBuffers;
-        while (i >= 4)
-        {
-            float f1    = *pFt;
-            pFt        += cSim.stride;
-            float f2    = *pFt;
-            pFt        += cSim.stride;
-            float f3    = *pFt;
-            pFt        += cSim.stride;
-            float f4    = *pFt;
-            pFt        += cSim.stride;
-            totalForce += f1 + f2 + f3 + f4;
-            i -= 4;
-        }
-        if (i >= 2)
-        {
-            float f1    = *pFt;
-            pFt        += cSim.stride;
-            float f2    = *pFt;
-            pFt        += cSim.stride;
-            totalForce += f1 + f2;
-            i -= 2;
-        }
-        if (i > 0)
-        {
-            totalForce += *pFt;
-        }
-
-        float ratio         = (gbviData.x/bornRadius);
-        float ratio3        = ratio*ratio*ratio;
-        energy             -= gbviData.z*ratio3;
-        totalForce         += (3.0f*gbviData.z*ratio3)/bornRadius; // 'cavity' term
-        float br2           = bornRadius*bornRadius;
-        totalForce         *= (1.0f/3.0f)*br2*br2*switchDeriv;
-
-        pFt                 = cSim.pBornForce + pos;
-        *pFt                = totalForce;
-        pos                += gridDim.x * blockDim.x;
-    }
-    cSim.pEnergy[blockIdx.x * blockDim.x + threadIdx.x] += energy;
-}
-
-void kReduceGBVIBornForcesQuinticScaling( freeEnergyGpuContext freeEnergyGpu )
-{
-    gpuContext gpu = freeEnergyGpu->gpuContext;
-
-    //(void) fprintf( stderr, "kReduceObcGbsaBornForces %6d blks=%u bsf_reduce_threads_per_block=%5u %5u   %5u %5u %5u\n",
-    //                 gpu->natoms, gpu->sim.blocks, gpu->sim.bsf_reduce_threads_per_block, gpu->sim.bf_reduce_threads_per_block,
-    //                 GF1XX_THREADS_PER_BLOCK, GT2XX_THREADS_PER_BLOCK, G8X_THREADS_PER_BLOCK); fflush( stderr );
-    kReduceGBVIBornForcesQuinticScaling_kernel<<<gpu->sim.blocks, gpu->sim.bsf_reduce_threads_per_block>>>();
-    LAUNCHERROR("kReduceGBVIBornForcesQuinticScaling");
-}
-
-void kPrintGBVISoftcore( freeEnergyGpuContext freeEnergyGpu, std::string callId, int call, FILE* log)
-{
-    gpuContext gpu = freeEnergyGpu->gpuContext;
-    //int maxPrint   = gpu->natoms;
-
-    gpu->psGBVIData->Download();
-    gpu->psBornRadii->Download();
-    gpu->psBornForce->Download();
-    gpu->psPosq4->Download();
-
-    CUDAStream<float>* switchDeriviative = freeEnergyGpu->psSwitchDerivative;
-    CUDAStream<float4>* sigEps4          = freeEnergyGpu->psSigEps4;
-
-    switchDeriviative->Download();
-    sigEps4->Download();
-
-    (void) fprintf( log, "kPrintGBVISoftcore Cuda Softcore bR bF swd   prm    sigeps4\n" );
-    for( int ii = 0; ii < gpu->sim.paddedNumberOfAtoms; ii++ ){
-        (void) fprintf( log, "%6d %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e \n",
-                        ii, 
-                        gpu->psBornRadii->_pSysData[ii],
-                        gpu->psBornForce->_pSysData[ii],
-                        switchDeriviative->_pSysData[ii],
-
-                        gpu->psGBVIData->_pSysData[ii].x,
-                        gpu->psGBVIData->_pSysData[ii].y,
-                        gpu->psGBVIData->_pSysData[ii].z,
-                        gpu->psGBVIData->_pSysData[ii].w,
-
-                        sigEps4->_pSysData[ii].x,
-                        sigEps4->_pSysData[ii].y,
-                        sigEps4->_pSysData[ii].z,
-                        sigEps4->_pSysData[ii].w );
-
-    }
-
-}
-
-extern __global__ void kFindBlockBoundsCutoff_kernel();
-extern __global__ void kFindBlockBoundsPeriodic_kernel();
-
-extern __global__ void kFindBlocksWithInteractionsCutoff_kernel();
-extern __global__ void kFindBlocksWithInteractionsPeriodic_kernel();
-
-extern __global__ void kFindInteractionsWithinBlocksCutoff_kernel(unsigned int*);
-extern __global__ void kFindInteractionsWithinBlocksPeriodic_kernel(unsigned int*);
-
-void kCalculateGBVISoftcoreBornSum( freeEnergyGpuContext freeEnergyGpu )
-{
-
-    unsigned int threadsPerBlock;
-    static unsigned int threadsPerBlockPerMethod[3] = { 0, 0, 0 };
-    static unsigned int natoms[3]                   = { 0, 0, 0 };
-    gpuContext gpu                                  = freeEnergyGpu->gpuContext;
-    unsigned int methodIndex                        = static_cast<unsigned int>(freeEnergyGpu->freeEnergySim.nonbondedMethod);
-
-    if( methodIndex > 2 ){
-        throw OpenMM::OpenMMException( "kCalculateGBVISoftcoreBornSum method index invalid." );
-    }
-
-    if( natoms[methodIndex] != gpu->natoms ){
-        unsigned int extra                    = methodIndex == 0 ? 0 : sizeof(float);
-        threadsPerBlockPerMethod[methodIndex] = std::min(getThreadsPerBlockFEP( freeEnergyGpu, (sizeof(Atom) + extra), gpu->sharedMemoryPerBlock ), gpu->sim.nonbond_threads_per_block );
-        natoms[methodIndex]                   = gpu->natoms;
-    }   
-    threadsPerBlock = threadsPerBlockPerMethod[methodIndex];
-
-    kClearGBVISoftcoreBornSum( gpu );
-    LAUNCHERROR("kClearGBVIBornSum from kCalculateGBVISoftcoreBornSum");
-
-    switch (freeEnergyGpu->freeEnergySim.nonbondedMethod)
-    {   
-        case FREE_ENERGY_NO_CUTOFF:
-
-            if (gpu->bOutputBufferPerWarp){
-                kCalculateGBVISoftcoreN2ByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
-                        sizeof(Atom)*threadsPerBlock>>>(gpu->sim.pWorkUnit);
-            } else {
-                kCalculateGBVISoftcoreN2BornSum_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
-                        sizeof(Atom)*threadsPerBlock>>>(gpu->sim.pWorkUnit);
-            }
-            break;
-
-        case FREE_ENERGY_CUTOFF:
-
-            kFindBlockBoundsCutoff_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
-            LAUNCHERROR("kFindBlockBoundsCutoff");
-            kFindBlocksWithInteractionsCutoff_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
-            LAUNCHERROR("kFindBlocksWithInteractionsCutoff");
-            compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
-            kFindInteractionsWithinBlocksCutoff_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
-                    sizeof(unsigned int)*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit);
-
-            if (gpu->bOutputBufferPerWarp)
-                kCalculateGBVISoftcoreCutoffByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
-                        (sizeof(Atom)+sizeof(float))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit);
-            else
-                kCalculateGBVISoftcoreCutoffBornSum_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
-                        (sizeof(Atom)+sizeof(float))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit );
-
-            break;
-
-        case FREE_ENERGY_PERIODIC:
-
-            kFindBlockBoundsPeriodic_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
-            LAUNCHERROR("kFindBlockBoundsPeriodic");
-            kFindBlocksWithInteractionsPeriodic_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
-            LAUNCHERROR("kFindBlocksWithInteractionsPeriodic");
-            compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
-            kFindInteractionsWithinBlocksPeriodic_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
-                    sizeof(unsigned int)*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit);
-
-            if (gpu->bOutputBufferPerWarp)
-                kCalculateGBVISoftcorePeriodicByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
-                        (sizeof(Atom)+sizeof(float))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit );
-            else
-                kCalculateGBVISoftcorePeriodicBornSum_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
-                        (sizeof(Atom)+sizeof(float))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit );
-
-            break;
-
-        default:
-            throw OpenMM::OpenMMException( "Nonbonded softcore method not recognized." );
-
-    }
-    LAUNCHERROR("kCalculateGBVISoftcoreBornSum");
-
-}
--- a/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateGBVISoftcoreBornSum.h
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateGBVISoftcoreBornSum.h
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * Permission is hereby granted, free of charge, to any person obtaining a    *
- * copy of this software and associated documentation files (the "Software"), *
- * to deal in the Software without restriction, including without limitation  *
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
- * and/or sell copies of the Software, and to permit persons to whom the      *
- * Software is furnished to do so, subject to the following conditions:       *
- *                                                                            *
- * The above copyright notice and this permission notice shall be included in *
- * all copies or substantial portions of the Software.                        *
- *                                                                            *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
- * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
- * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
- * -------------------------------------------------------------------------- */
-
-/**
- * This file contains the kernel for calculating Born sums.  It is included
- * several times in kCalculateGBVIBornSum.cu with different #defines to generate
- * different versions of the kernels.
- */
-
-#include "kCalculateGBVIAux.h"
-
-__global__ 
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(GF1XX_NONBOND_THREADS_PER_BLOCK, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1)
-#else
-__launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
-#endif
-void METHOD_NAME(kCalculateGBVISoftcore, BornSum_kernel)(unsigned int* workUnit)
-{
-    extern __shared__ Atom sA[];
-
-    unsigned int totalWarps   = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
-    unsigned int warp         = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
-    unsigned int numWorkUnits = cSim.pInteractionCount[0];
-    unsigned int pos          = warp*numWorkUnits/totalWarps;
-    unsigned int end          = (warp+1)*numWorkUnits/totalWarps;
-
-#ifdef USE_CUTOFF
-    float* tempBuffer         = (float*) &sA[cSim.nonbond_threads_per_block];
-#endif
-
-    while ( pos < end )
-    {
-        // Extract cell coordinates from appropriate work unit
-        unsigned int x = workUnit[pos];
-        unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
-        x              = (x >> 17)           << GRIDBITS;
-
-        float       dx;
-        float       dy;
-        float       dz;
-        float       r2;
-        float       r;
-
-        // forces tgx into interval [0,31] 
-        // forces tbx 0 
-        unsigned int tgx = threadIdx.x & (GRID - 1);
-        unsigned int tbx = threadIdx.x - tgx;
-        unsigned int tj  = tgx;
-        Atom* psA        = &sA[tbx];
-
-        if (x == y) // Handle diagonals uniquely at 50% efficiency
-        {
-            // Read fixed atom data into registers and GRF
-            unsigned int i                          = x + tgx;
-            float4 apos                             = cSim.pPosq[i];    // Local atom x, y, z, sum
-            float4 ar                               = cSim.pGBVIData[i];  // Local atom vr, sr
-            sA[threadIdx.x].x                       = apos.x;
-            sA[threadIdx.x].y                       = apos.y;
-            sA[threadIdx.x].z                       = apos.z;
-            sA[threadIdx.x].r                       = ar.x;
-            sA[threadIdx.x].sr                      = ar.y;
-            sA[threadIdx.x].bornRadiusScaleFactor   = ar.w;
-            float bSum                              = 0.0f;
-
-            for (unsigned int j = 0; j < GRID; j++)
-            {
-                dx                                  = psA[j].x - apos.x;
-                dy                                  = psA[j].y - apos.y;
-                dz                                  = psA[j].z - apos.z;
-#ifdef USE_PERIODIC
-                dx                                 -= floorf(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
-                dy                                 -= floorf(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
-                dz                                 -= floorf(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
-#endif
-                r2                                  = dx * dx + dy * dy + dz * dz;
-#if defined USE_CUTOFF
-                if (i < cSim.atoms && x+j < cSim.atoms && r2 < cSim.nonbondedCutoffSqr && j != tgx)
-#else
-                if (i < cSim.atoms && x+j < cSim.atoms && j != tgx )
-#endif
-                {
-                    bSum  += psA[j].bornRadiusScaleFactor*getGBVI_Volume( sqrtf(r2), ar.x, psA[j].sr );
-
-                }
-            }
-
-            // Write results
-#ifdef USE_OUTPUT_BUFFER_PER_WARP
-            unsigned int offset = x + tgx + warp*cSim.stride;
-            cSim.pBornSum[offset] += bSum;
-#else
-            unsigned int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
-            cSim.pBornSum[offset] = bSum;
-#endif
-
-
-        } else {
-
-            // Read fixed atom data into registers and GRF
-            unsigned int j                              = y + tgx;
-            unsigned int i                              = x + tgx;
-
-            float4 temp                                 = cSim.pPosq[j];
-            float4 temp1                                = cSim.pGBVIData[j];
-
-            float4 apos                                 = cSim.pPosq[i];        // Local atom x, y, z, sum
-            float4 ar                                   = cSim.pGBVIData[i];    // Local atom vr, sr
-
-            sA[threadIdx.x].x                           = temp.x;
-            sA[threadIdx.x].y                           = temp.y;
-            sA[threadIdx.x].z                           = temp.z;
-
-            sA[threadIdx.x].r                           = temp1.x;
-            sA[threadIdx.x].sr                          = temp1.y;
-            sA[threadIdx.x].bornRadiusScaleFactor       = temp1.w;
-
-            sA[threadIdx.x].sum                         = 0.0f;
-            apos.w                                      = 0.0f;
-
-#ifdef USE_CUTOFF
-            unsigned int flags = cSim.pInteractionFlag[pos];
-            if (flags == 0)
-            {
-                // No interactions in this block.
-            }
-            else if (flags == 0xFFFFFFFF)
-            //else if (flags )
-#endif
-            {
-                // Compute all interactions within this block.
-
-                for (unsigned int j = 0; j < GRID; j++)
-                {
-                    dx                      = psA[tj].x - apos.x;
-                    dy                      = psA[tj].y - apos.y;
-                    dz                      = psA[tj].z - apos.z;
-#ifdef USE_PERIODIC
-                    dx                     -= floorf(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
-                    dy                     -= floorf(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
-                    dz                     -= floorf(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
-#endif
-                    r2                      = dx * dx + dy * dy + dz * dz;
-#ifdef USE_CUTOFF
-                    if (i < cSim.atoms && y+tj < cSim.atoms && r2 < cSim.nonbondedCutoffSqr)
-#else
-                    if (i < cSim.atoms && y+tj < cSim.atoms )
-#endif
-                    {
-                        r                       = sqrtf(r2);
-
-                        // psA[tj].sr = Sj
-                        // ar.x       = Ri
-
-                        apos.w                 += psA[tj].bornRadiusScaleFactor*getGBVI_Volume( r, ar.x, psA[tj].sr );
-                        psA[tj].sum            += ar.w*getGBVI_Volume( r, psA[tj].r, ar.y );
-
-                    }
-                    tj = (tj - 1) & (GRID - 1);
-                }
-
-            }
-#ifdef USE_CUTOFF
-            else
-            {
-                // Compute only a subset of the interactions in this block.
-
-                for (unsigned int j = 0; j < GRID; j++)
-                {
-                    if ((flags&(1<<j)) != 0)
-                    {
-                        tempBuffer[threadIdx.x] = 0.0f;
-                        dx                      = psA[j].x - apos.x;
-                        dy                      = psA[j].y - apos.y;
-                        dz                      = psA[j].z - apos.z;
-#ifdef USE_PERIODIC
-                        dx                     -= floorf(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
-                        dy                     -= floorf(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
-                        dz                     -= floorf(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
-#endif
-                        r2                      = dx * dx + dy * dy + dz * dz;
-#ifdef USE_CUTOFF
-                        if (i < cSim.atoms && y+j < cSim.atoms && r2 < cSim.nonbondedCutoffSqr)
-#else
-                        if (i < cSim.atoms && y+j < cSim.atoms)
-#endif
-                        {
-                            r                       = sqrtf(r2);
-                            tempBuffer[threadIdx.x] = ar.w*getGBVI_Volume( r, psA[j].r, ar.y );
-                            apos.w                 += psA[j].bornRadiusScaleFactor*getGBVI_Volume( r, ar.x, psA[j].sr );
-                        }
-
-                        // Sum the terms.
-
-                        if (tgx % 2 == 0)
-                            tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+1];
-                        if (tgx % 4 == 0)
-                            tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+2];
-                        if (tgx % 8 == 0)
-                            tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+4];
-                        if (tgx % 16 == 0)
-                            tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+8];
-                        if (tgx == 0)
-                            psA[j].sum += tempBuffer[threadIdx.x] + tempBuffer[threadIdx.x+16];
-                    }
-                }
-            }
-#endif
-
-            // Write results
-
-#ifdef USE_OUTPUT_BUFFER_PER_WARP
-            unsigned int offset = x + tgx + warp*cSim.stride;
-            cSim.pBornSum[offset] += apos.w;
-            offset = y + tgx + warp*cSim.stride;
-            cSim.pBornSum[offset] += sA[threadIdx.x].sum;
-#else
-            unsigned int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
-            cSim.pBornSum[offset] = apos.w;
-            offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
-            cSim.pBornSum[offset] = sA[threadIdx.x].sum;
-#endif
-
-        }
-
-        pos++;
-    }
-}
--- a/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateGBVISoftcoreForces2.cu
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateGBVISoftcoreForces2.cu
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * Permission is hereby granted, free of charge, to any person obtaining a    *
- * copy of this software and associated documentation files (the "Software"), *
- * to deal in the Software without restriction, including without limitation  *
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
- * and/or sell copies of the Software, and to permit persons to whom the      *
- * Software is furnished to do so, subject to the following conditions:       *
- *                                                                            *
- * The above copyright notice and this permission notice shall be included in *
- * all copies or substantial portions of the Software.                        *
- *                                                                            *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
- * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
- * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
- * -------------------------------------------------------------------------- */
-
-#include "gputypes.h"
-#include "GpuFreeEnergyCudaKernels.h"
-#include "openmm/OpenMMException.h"
-
-#include <stdio.h>
-#include <cuda.h>
-#include <vector_functions.h>
-#include <cstdlib>
-#include <string>
-#include <iostream>
-#include <fstream>
-
-using namespace std;
-
-struct Atom {
-    float x;
-    float y;
-    float z;
-    float r;
-    float sr;
-    float fx;
-    float fy;
-    float fz;
-    float fb;
-    float bornRadiusScaleFactor;
-};
-
-
-static __constant__ cudaGmxSimulation cSim;
-
-void SetCalculateGBVISoftcoreForces2Sim( freeEnergyGpuContext gpu)
-{
-    cudaError_t status;
-    status = cudaMemcpyToSymbol(cSim, &gpu->gpuContext->sim, sizeof(cudaGmxSimulation));     
-    RTERROR(status, "cudaMemcpyToSymbol: SetCalculateGBVISoftcoreForces2Sim copy to cSim failed");
-}
-
-#include "kCalculateGBVIAux.h"
-
-// Include versions of the kernels for N^2 calculations.
-
-#define METHOD_NAME(a, b) a##N2##b
-#include "kCalculateGBVISoftcoreForces2.h"
-#define USE_OUTPUT_BUFFER_PER_WARP
-#undef METHOD_NAME
-#define METHOD_NAME(a, b) a##N2ByWarp##b
-#include "kCalculateGBVISoftcoreForces2.h"
-
-// Include versions of the kernels with cutoffs.
-
-#undef METHOD_NAME
-#undef USE_OUTPUT_BUFFER_PER_WARP
-#define USE_CUTOFF
-#define METHOD_NAME(a, b) a##Cutoff##b
-#include "kCalculateGBVISoftcoreForces2.h"
-#define USE_OUTPUT_BUFFER_PER_WARP
-#undef METHOD_NAME
-#define METHOD_NAME(a, b) a##CutoffByWarp##b
-#include "kCalculateGBVISoftcoreForces2.h"
-
-// Include versions of the kernels with periodic boundary conditions.
-
-#undef METHOD_NAME
-#undef USE_OUTPUT_BUFFER_PER_WARP
-#define USE_PERIODIC
-#define METHOD_NAME(a, b) a##Periodic##b
-#include "kCalculateGBVISoftcoreForces2.h"
-#define USE_OUTPUT_BUFFER_PER_WARP
-#undef METHOD_NAME
-#define METHOD_NAME(a, b) a##PeriodicByWarp##b
-#include "kCalculateGBVISoftcoreForces2.h"
-
-void kCalculateGBVISoftcoreForces2( freeEnergyGpuContext freeEnergyGpu )
-{
-    unsigned int threadsPerBlock;
-    static unsigned int threadsPerBlockPerMethod[3] = { 0, 0, 0 };
-    static unsigned int natoms[3]                   = { 0, 0, 0 };
-    gpuContext gpu                                  = freeEnergyGpu->gpuContext;
-    unsigned int methodIndex                        = static_cast<unsigned int>(freeEnergyGpu->freeEnergySim.nonbondedMethod);
-
-    if( methodIndex > 2 ){
-        throw OpenMM::OpenMMException( "kCalculateGBVISoftcoreForces2 method index invalid." );
-    }   
-
-    if( natoms[methodIndex] != gpu->natoms ){
-        unsigned int extra                    = methodIndex == 0 ? 0 : sizeof(float3);
-        threadsPerBlockPerMethod[methodIndex] = std::min(getThreadsPerBlockFEP( freeEnergyGpu, (sizeof(Atom) + extra), gpu->sharedMemoryPerBlock ), gpu->sim.nonbond_threads_per_block );
-        natoms[methodIndex]                   = gpu->natoms;
-    }
-    threadsPerBlock = threadsPerBlockPerMethod[methodIndex];
-
-    switch (freeEnergyGpu->freeEnergySim.nonbondedMethod)
-    {
-        case FREE_ENERGY_NO_CUTOFF:
-
-
-            if (gpu->bOutputBufferPerWarp)
-                kCalculateGBVISoftcoreN2ByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, threadsPerBlock,
-                        sizeof(Atom)*threadsPerBlock>>>(gpu->sim.pWorkUnit);
-            else
-                kCalculateGBVISoftcoreN2Forces2_kernel<<<gpu->sim.bornForce2_blocks, threadsPerBlock,
-                        sizeof(Atom)*threadsPerBlock>>>(gpu->sim.pWorkUnit);
-            break;
-
-        case FREE_ENERGY_CUTOFF:
-
-            if (gpu->bOutputBufferPerWarp)
-                kCalculateGBVISoftcoreCutoffByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, threadsPerBlock,
-                        (sizeof(Atom)+sizeof(float3))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit );
-            else
-                kCalculateGBVISoftcoreCutoffForces2_kernel<<<gpu->sim.bornForce2_blocks, threadsPerBlock,
-                        (sizeof(Atom)+sizeof(float3))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit );
-            break;
-
-        case FREE_ENERGY_PERIODIC:
-
-            if (gpu->bOutputBufferPerWarp)
-                kCalculateGBVISoftcorePeriodicByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, threadsPerBlock,
-                        (sizeof(Atom)+sizeof(float3))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit );
-            else
-                kCalculateGBVISoftcorePeriodicForces2_kernel<<<gpu->sim.bornForce2_blocks, threadsPerBlock,
-                        (sizeof(Atom)+sizeof(float3))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit );
-            break;
-
-    }
-    LAUNCHERROR("kCalculateGBVISoftcoreForces2");
-}
--- a/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateGBVISoftcoreForces2.h
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateGBVISoftcoreForces2.h
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * Permission is hereby granted, free of charge, to any person obtaining a    *
- * copy of this software and associated documentation files (the "Software"), *
- * to deal in the Software without restriction, including without limitation  *
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
- * and/or sell copies of the Software, and to permit persons to whom the      *
- * Software is furnished to do so, subject to the following conditions:       *
- *                                                                            *
- * The above copyright notice and this permission notice shall be included in *
- * all copies or substantial portions of the Software.                        *
- *                                                                            *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
- * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
- * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
- * -------------------------------------------------------------------------- */
-
-#include "kCalculateGBVIAux.h"
-
-/**
- * This file contains the kernel for evaluating the second stage of GB/VI.  It is included
- * several times in kCalculateGBVIForces2.cu with different #defines to generate
- * different versions of the kernels.
- */
-
-__global__
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(GF1XX_BORNFORCE2_THREADS_PER_BLOCK, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(GT2XX_BORNFORCE2_THREADS_PER_BLOCK, 1)
-#else
-__launch_bounds__(G8X_BORNFORCE2_THREADS_PER_BLOCK, 1)
-#endif
-void METHOD_NAME(kCalculateGBVISoftcore, Forces2_kernel)(unsigned int* workUnit )
-{
-    extern __shared__ Atom sA[];
-    unsigned int totalWarps   = gridDim.x*blockDim.x/GRID;
-    unsigned int warp         = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
-
-    unsigned int numWorkUnits = cSim.pInteractionCount[0];
-    unsigned int pos          = warp*numWorkUnits/totalWarps;
-    unsigned int end          = (warp+1)*numWorkUnits/totalWarps;
-#ifdef USE_CUTOFF
-    float3* tempBuffer        = (float3*) &sA[blockDim.x];
-#endif
-
-    unsigned int lasty = -0xFFFFFFFF;
-    while (pos < end)
-    {
-
-        // Extract cell coordinates from appropriate work unit
-        unsigned int x                  = workUnit[pos];
-        unsigned int y                  = ((x >> 2) & 0x7fff) << GRIDBITS;
-        x                               = (x >> 17) << GRIDBITS;
-        unsigned int tgx                = threadIdx.x & (GRID - 1);
-        unsigned int i                  = x + tgx;
-        float4 apos                     = cSim.pPosq[i];
-        float4 ar                       = cSim.pGBVIData[i];
-        float fb                        = cSim.pBornForce[i];
-        unsigned int tbx                = threadIdx.x - tgx;
-        unsigned int tj                 = tgx;
-
-        Atom* psA                       = &sA[tbx];
-        sA[threadIdx.x].fx              = 0.0f;
-        sA[threadIdx.x].fy              = 0.0f;
-        sA[threadIdx.x].fz              = 0.0f;
-
-        float3 af;
-        af.x                            = 0.0f;
-        af.y                            = 0.0f;
-        af.z                            = 0.0f;
-
-        if (x == y) // Handle diagonals uniquely at 50% efficiency
-        {
-            // Read fixed atom data into registers and GRF
-
-            sA[threadIdx.x].x                       = apos.x;
-            sA[threadIdx.x].y                       = apos.y;
-            sA[threadIdx.x].z                       = apos.z;
-            sA[threadIdx.x].r                       = ar.x;
-            sA[threadIdx.x].sr                      = ar.y;
-            sA[threadIdx.x].bornRadiusScaleFactor   = ar.w;
-            sA[threadIdx.x].fb                      = fb;
-
-            for (unsigned int j = (tgx+1)&(GRID-1); j != tgx; j = (j+1)&(GRID-1))
-            {
-
-                float dx                = psA[j].x - apos.x;
-                float dy                = psA[j].y - apos.y;
-                float dz                = psA[j].z - apos.z;
-
-#ifdef USE_PERIODIC
-                dx                     -= floorf(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
-                dy                     -= floorf(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
-                dz                     -= floorf(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
-#endif
-                float r2                = dx * dx + dy * dy + dz * dz;
-                float r                 = sqrtf(r2);
-
-                // Atom I Born forces and sum
-                float dE                = psA[j].bornRadiusScaleFactor*getGBVI_dE2( r, ar.x, psA[j].sr, fb );
-#if defined USE_CUTOFF
-                if (i >= cSim.atoms || x+j >= cSim.atoms || (i == (x+j)) || r2 > cSim.nonbondedCutoffSqr)
-#else
-                if(i >= cSim.atoms || x+j >= cSim.atoms || (i == (x+j)) )
-#endif
-                {
-                    dE              = 0.0f;
-                }
-
-                float d             = dx * dE;
-                af.x               -= d;
-                psA[j].fx          += d;
-
-                d                   = dy * dE;
-                af.y               -= d;
-                psA[j].fy          += d;
-
-                d                   = dz * dE;
-                af.z               -= d;
-                psA[j].fz          += d;
-            }
-
-            // Write results
-
-            float4 of;
-#ifdef USE_OUTPUT_BUFFER_PER_WARP
-            unsigned int offset         = x + tgx + warp*cSim.stride;
-            of                          = cSim.pForce4[offset];
-            of.x                       += af.x + sA[threadIdx.x].fx;
-            of.y                       += af.y + sA[threadIdx.x].fy;
-            of.z                       += af.z + sA[threadIdx.x].fz;
-            cSim.pForce4[offset]        = of;
-#else
-            unsigned int offset         = x + tgx + (x >> GRIDBITS) * cSim.stride;
-            of                          = cSim.pForce4[offset];
-            of.x                       += af.x + sA[threadIdx.x].fx;
-            of.y                       += af.y + sA[threadIdx.x].fy;
-            of.z                       += af.z + sA[threadIdx.x].fz;
-            of.w                        = 0.0f;
-            cSim.pForce4[offset]        = of;
-#endif
-        }
-        else
-        {
-            // Read fixed atom data into registers and GRF
-            if (lasty != y)
-            {
-                unsigned int j                          = y + tgx;
-                float4 temp                             = cSim.pPosq[j];
-                float4 temp1                            = cSim.pGBVIData[j];
-                float fb                                = cSim.pBornForce[j];
-                sA[threadIdx.x].fb                      = fb;
-                sA[threadIdx.x].x                       = temp.x;
-                sA[threadIdx.x].y                       = temp.y;
-                sA[threadIdx.x].z                       = temp.z;
-                sA[threadIdx.x].r                       = temp1.x;
-                sA[threadIdx.x].sr                      = temp1.y;
-                sA[threadIdx.x].bornRadiusScaleFactor   = temp1.w;
-            }
-#ifdef USE_CUTOFF
-            unsigned int flags = cSim.pInteractionFlag[pos];
-            if (flags == 0)
-            {
-                // No interactions in this block.
-            }
-            //else if (flags == 0xFFFFFFFF)
-            else if (flags)
-#endif
-            {
-                // Compute all interactions within this block.
-
-                for (unsigned int j = 0; j < GRID; j++)
-                {
-                    float dx                = psA[tj].x - apos.x;
-                    float dy                = psA[tj].y - apos.y;
-                    float dz                = psA[tj].z - apos.z;
-#ifdef USE_PERIODIC
-                    dx                     -= floorf(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
-                    dy                     -= floorf(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
-                    dz                     -= floorf(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
-#endif
-                    float r2                = dx * dx + dy * dy + dz * dz;
-                    float r                 = sqrtf(r2);
-
-                    float dE                = psA[tj].bornRadiusScaleFactor*getGBVI_dE2( r, ar.x, psA[tj].sr, fb );
-
-#if defined USE_CUTOFF
-                    if (i >= cSim.atoms || y+tj >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
-#else
-                    if (i >= cSim.atoms || y+tj >= cSim.atoms )
-#endif
-                    {
-                        dE                  = 0.0f;
-                    }
-
-
-                    float d                 = dx * dE;
-                    af.x                   -= d;
-                    psA[tj].fx             += d;
-                    d                       = dy * dE;
-                    af.y                   -= d;
-                    psA[tj].fy             += d;
-                    d                       = dz * dE;
-                    af.z                   -= d;
-                    psA[tj].fz             += d;
-
-                    // Atom J Born sum term
-                    dE                      = ar.w*getGBVI_dE2( r, psA[tj].r, ar.y, psA[tj].fb );
-
-#if defined USE_CUTOFF
-                    if (i >= cSim.atoms || y+tj >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
-#else
-                    if (i >= cSim.atoms || y+tj >= cSim.atoms )
-#endif
-                    {
-                        dE                  = 0.0f;
-                    }
-
-                    dx                     *= dE;
-                    dy                     *= dE;
-                    dz                     *= dE;
-                    psA[tj].fx             += dx;
-                    psA[tj].fy             += dy;
-                    psA[tj].fz             += dz;
-                    af.x                   -= dx;
-                    af.y                   -= dy;
-                    af.z                   -= dz;
-                    tj                      = (tj + 1) & (GRID - 1);
-                }
-            }
-#ifdef USE_CUTOFF
-            else
-            {
-                // Compute only a subset of the interactions in this block.
-
-                for (unsigned int j = 0; j < GRID; j++)
-                {
-                    if ((flags&(1<<j)) != 0)
-                    {
-                        float dx                = psA[j].x - apos.x;
-                        float dy                = psA[j].y - apos.y;
-                        float dz                = psA[j].z - apos.z;
-#ifdef USE_PERIODIC
-                        dx                     -= floorf(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
-                        dy                     -= floorf(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
-                        dz                     -= floorf(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
-#endif
-                        float r2                = dx * dx + dy * dy + dz * dz;
-                        float r                 = sqrtf(r2);
-
-                        // Interleaved Atom I and J Born Forces and sum components
-                        float dE                = psA[j].bornRadiusScaleFactor*getGBVI_dE2( r, ar.x, psA[j].sr, fb );
-
-#if defined USE_CUTOFF
-                        if (i >= cSim.atoms || y+j >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
-#else
-                        if (i >= cSim.atoms || y+j >= cSim.atoms )
-#endif
-                        {
-                            dE                  = 0.0f;
-                        }
-
-                        float d                 = dx * dE;
-                        af.x                   -= d;
-                        tempBuffer[threadIdx.x].x = d;
-                        d                       = dy * dE;
-                        af.y                   -= d;
-                        tempBuffer[threadIdx.x].y = d;
-                        d                       = dz * dE;
-                        af.z                   -= d;
-                        tempBuffer[threadIdx.x].z = d;
-
-                        // Atom J Born sum term
-                        dE                      = ar.w*getGBVI_dE2( r, psA[j].r, ar.y, psA[j].fb );
-
-#if defined USE_CUTOFF
-                        if (i >= cSim.atoms || y+j >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
-#else
-                        if (i >= cSim.atoms || y+j >= cSim.atoms )
-#endif
-                        {
-                            dE                  = 0.0f;
-                        }
-
-                        dx                     *= dE;
-                        dy                     *= dE;
-                        dz                     *= dE;
-                        tempBuffer[threadIdx.x].x += dx;
-                        tempBuffer[threadIdx.x].y += dy;
-                        tempBuffer[threadIdx.x].z += dz;
-                        af.x                   -= dx;
-                        af.y                   -= dy;
-                        af.z                   -= dz;
-
-                        // Sum the forces on atom j.
-
-                        if (tgx % 2 == 0)
-                        {
-                            tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+1].x;
-                            tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+1].y;
-                            tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+1].z;
-                        }
-                        if (tgx % 4 == 0)
-                        {
-                            tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+2].x;
-                            tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+2].y;
-                            tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+2].z;
-                        }
-                        if (tgx % 8 == 0)
-                        {
-                            tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+4].x;
-                            tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+4].y;
-                            tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+4].z;
-                        }
-                        if (tgx % 16 == 0)
-                        {
-                            tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+8].x;
-                            tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+8].y;
-                            tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+8].z;
-                        }
-                        if (tgx == 0)
-                        {
-                            psA[j].fx += tempBuffer[threadIdx.x].x + tempBuffer[threadIdx.x+16].x;
-                            psA[j].fy += tempBuffer[threadIdx.x].y + tempBuffer[threadIdx.x+16].y;
-                            psA[j].fz += tempBuffer[threadIdx.x].z + tempBuffer[threadIdx.x+16].z;
-                        }
-                    }
-                }
-            }
-#endif
-
-            // Write results
-            float4 of;
-#ifdef USE_OUTPUT_BUFFER_PER_WARP
-            unsigned int offset         = x + tgx + warp*cSim.stride;
-            of                          = cSim.pForce4[offset];
-            of.x                       += af.x;
-            of.y                       += af.y;
-            of.z                       += af.z;
-            cSim.pForce4[offset]        = of;
-            offset                      = y + tgx + warp*cSim.stride;
-            of                          = cSim.pForce4[offset];
-            of.x                       += sA[threadIdx.x].fx;
-            of.y                       += sA[threadIdx.x].fy;
-            of.z                       += sA[threadIdx.x].fz;
-            cSim.pForce4[offset]        = of;
-#else
-            unsigned int offset         = x + tgx + (y >> GRIDBITS) * cSim.stride;
-            of                          = cSim.pForce4[offset];
-            of.x                       += af.x;
-            of.y                       += af.y;
-            of.z                       += af.z;
-            of.w                        = 0.0f;
-            cSim.pForce4[offset]        = of;
-
-            offset                      = y + tgx + (x >> GRIDBITS) * cSim.stride;
-            of                          = cSim.pForce4[offset];
-            of.x                       += sA[threadIdx.x].fx;
-            of.y                       += sA[threadIdx.x].fy;
-            of.z                       += sA[threadIdx.x].fz;
-            cSim.pForce4[offset]        = of;
-#endif
-        }
-        lasty = y;
-        pos++;
-    }
-}
--- a/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateLocalSoftcoreForces.cu
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateLocalSoftcoreForces.cu
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-
-#include "GpuFreeEnergyCudaKernels.h"
-#include "freeEnergyGpuTypes.h"
-#include <cudatypes.h>
-#include "kSoftcoreLJ.h"
-
-#define PARAMETER_PRINT 0
-#define MAX_PARAMETER_PRINT 10
-
-static __constant__ cudaGmxSimulation cSim;
-static __constant__ cudaFreeEnergyGmxSimulation feSim;
-
-void SetCalculateLocalSoftcoreGpuSim( freeEnergyGpuContext gpu)
-{
-    cudaError_t status;
-    status = cudaMemcpyToSymbol(cSim, &gpu->gpuContext->sim, sizeof(cudaGmxSimulation));     
-    RTERROR(status, "cudaMemcpyToSymbol: SetCalculateLocalSoftcoreGpuSim copy to cSim failed");
-
-    status = cudaMemcpyToSymbol(feSim, &gpu->freeEnergySim, sizeof(cudaFreeEnergyGmxSimulation));     
-    RTERROR(status, "cudaMemcpyToSymbol: SetCalculateLocalSoftcoreGpuSim copy to feSim failed");
-
-}
-
-extern "C"
-void gpuSetLJ14SoftcoreParameters( freeEnergyGpuContext gpu, float epsfac, const std::vector<int>& atom1, const std::vector<int>& atom2,
-                                   const std::vector<float>& c6, const std::vector<float>& c12, const std::vector<float>& qProd,
-                                   const std::vector<float>& softcoreLJLambdaArray ){
-
-    unsigned int LJ14s                                  = atom1.size();
-    gpu->freeEnergySim.LJ14_count                       = LJ14s;
-
-    gpu->psLJ14ID                                       = new CUDAStream<int4>(LJ14s, 1, "LJ14SoftcoreID");
-    CUDAStream<int4>* psLJ14ID                          = gpu->psLJ14ID;
-    gpu->freeEnergySim.pLJ14ID                          = psLJ14ID->_pDevData;
-
-    gpu->psLJ14Parameter                                = new CUDAStream<float4>(LJ14s, 1, "LJ14SoftcoreParameter");
-    CUDAStream<float4>* psLJ14Parameter                 = gpu->psLJ14Parameter;
-    gpu->freeEnergySim.pLJ14Parameter  = psLJ14Parameter->_pDevData;
-
-    std::vector<int> outputBufferCounter( gpu->gpuContext->sim.atoms, 0 );
-
-    for( int ii = 0; ii < LJ14s; ii++ ){
-
-        (*psLJ14ID)[ii].x          = atom1[ii];
-        (*psLJ14ID)[ii].y          = atom2[ii];
-        (*psLJ14ID)[ii].z          = outputBufferCounter[atom1[ii]]++;
-        (*psLJ14ID)[ii].w          = outputBufferCounter[atom2[ii]]++;
-
-        float p0, p1, p2, p3;
-        if( c12[ii] == 0.0f ){
-            p0 = 0.0f;
-            p1 = 1.0f;
-
-        } else {
-            p0 = c6[ii] * c6[ii] / c12[ii];
-            p1 = pow(c12[ii] / c6[ii], 1.0f / 6.0f);
-        }
-
-        p2                       = epsfac*qProd[ii];
-        p3                       = softcoreLJLambdaArray[ii];
-
-        (*psLJ14Parameter)[ii].x = p0;
-        (*psLJ14Parameter)[ii].y = p1;
-        (*psLJ14Parameter)[ii].z = p2;
-        (*psLJ14Parameter)[ii].w = p3;
-    }
-
-    // logging info
-
-    if( gpu->log ){
-        (void) fprintf( gpu->log, "gpuSetLJ14SoftcoreParameters: number of 1-4 bonds=%5u\n", LJ14s );
-#ifdef PARAMETER_PRINT
-        unsigned int maxPrint = MAX_PARAMETER_PRINT;
-        for( unsigned int ii = 0; ii < LJ14s; ii++ ){
-            (void) fprintf( gpu->log, "    %5d [%5d %5d %5d %5d] %15.7e %15.7e %15.7e %15.7e\n",
-                            ii, (*psLJ14ID)[ii].x, (*psLJ14ID)[ii].y, (*psLJ14ID)[ii].z, (*psLJ14ID)[ii].w, 
-                            (*psLJ14Parameter)[ii].x, (*psLJ14Parameter)[ii].y,
-                            (*psLJ14Parameter)[ii].z/epsfac, (*psLJ14Parameter)[ii].w );
-            if( ii == maxPrint ){
-                (void) fprintf( gpu->log, "\n" );
-                ii = LJ14s - maxPrint;
-                if( ii < maxPrint )ii = maxPrint;
-            }    
-        }    
-        (void) fprintf( gpu->log, "\n" );
-#endif
-        (void) fflush( gpu->log );
-    }    
-
-    psLJ14ID->Upload();
-    psLJ14Parameter->Upload();
-
-    return;
-}
-
-#define DOT3(v1, v2) (v1.x * v2.x + v1.y * v2.y + v1.z * v2.z)
-
-__global__ void kCalculateLocalSoftcoreForces_kernel()
-{
-    unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
-
-    float energy     = 0.0f;
-
-    if (feSim.nonbondedMethod == NO_CUTOFF)
-    {
-        while (pos < feSim.LJ14_count)
-        {
-            int4 atom               = feSim.pLJ14ID[pos];
-            float4 LJ14             = feSim.pLJ14Parameter[pos];
-
-            float4 a1               = cSim.pPosq[atom.x];
-            float4 a2               = cSim.pPosq[atom.y];
-
-            float3 d;
-            d.x                     = a1.x - a2.x;
-            d.y                     = a1.y - a2.y;
-            d.z                     = a1.z - a2.z;
-
-            float r2                = DOT3(d, d);
-            float inverseR          = 1.0f / sqrtf(r2);
-#ifdef USE_SOFTCORE_LJ
-            float CDLJ_energy       = 0.0f;
-            float dEdR              = getSoftCoreLJ( r2, LJ14.y, LJ14.x, LJ14.w, LJ14.w, &CDLJ_energy );
-            energy                 += CDLJ_energy;
-#else
-            float sig2              = inverseR * LJ14.y;
-            sig2                   *= sig2;
-            float sig6              = sig2 * sig2 * sig2;
-            float dEdR              = LJ14.x * (12.0f * sig6 - 6.0f) * sig6;
-            energy                 += LJ14.x * (sig6 - 1.0f) * sig6;
-#endif
-            energy                 += LJ14.z * inverseR;
-            dEdR                   += LJ14.z * inverseR;
-            dEdR                   *= inverseR * inverseR;
-            unsigned int offsetA    = atom.x + atom.z * cSim.stride;
-            unsigned int offsetB    = atom.y + atom.w * cSim.stride;
-            float4 forceA           = cSim.pForce4[offsetA];
-            float4 forceB           = cSim.pForce4[offsetB];
-            d.x                    *= dEdR;
-            d.y                    *= dEdR;
-            d.z                    *= dEdR;
-            forceA.x               += d.x;
-            forceA.y               += d.y;
-            forceA.z               += d.z;
-            forceB.x               -= d.x;
-            forceB.y               -= d.y;
-            forceB.z               -= d.z;
-            cSim.pForce4[offsetA]   = forceA;
-            cSim.pForce4[offsetB]   = forceB;
-            pos                    += blockDim.x * gridDim.x;
-        }
-
-    } else if (feSim.nonbondedMethod == CUTOFF) {
-        float LJ14_energy;
-        while (pos < feSim.LJ14_count ){
-            int4 atom               = feSim.pLJ14ID[pos];
-            float4 LJ14             = feSim.pLJ14Parameter[pos];
-            float4 a1               = cSim.pPosq[atom.x];
-            float4 a2               = cSim.pPosq[atom.y];
-            float3 d;
-            d.x                     = a1.x - a2.x;
-            d.y                     = a1.y - a2.y;
-            d.z                     = a1.z - a2.z;
-            float r2                = DOT3(d, d);
-            float inverseR          = 1.0f / sqrtf(r2);
-#ifdef USE_SOFTCORE_LJ
-            float dEdR              = getSoftCoreLJ( r2, LJ14.y, LJ14.x, LJ14.w, LJ14.w, &LJ14_energy);
-#else
-            float sig2              = inverseR * LJ14.y;
-            sig2                   *= sig2;
-            float sig6              = sig2 * sig2 * sig2;
-            float dEdR              = LJ14.x * (12.0f * sig6 - 6.0f) * sig6;                
-            LJ14_energy             = LJ14.x * (sig6 - 1.0f) * sig6;
-#endif
-            LJ14_energy            += LJ14.z * (inverseR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
-            dEdR                   += LJ14.z * (inverseR - 2.0f * cSim.reactionFieldK * r2);
-            dEdR                   *= inverseR * inverseR;
-            if (r2 > feSim.nonbondedCutoffSqr)
-            {                   
-                dEdR = 0.0f;
-                LJ14_energy = 0.0f;
-            }
-            energy                 += LJ14_energy;
- 
-            unsigned int offsetA    = atom.x + atom.z * cSim.stride;
-            unsigned int offsetB    = atom.y + atom.w * cSim.stride;
-            float4 forceA           = cSim.pForce4[offsetA];
-            float4 forceB           = cSim.pForce4[offsetB];
-            d.x                    *= dEdR;
-            d.y                    *= dEdR;
-            d.z                    *= dEdR;
-            forceA.x               += d.x;
-            forceA.y               += d.y;
-            forceA.z               += d.z;
-            forceB.x               -= d.x;
-            forceB.y               -= d.y;
-            forceB.z               -= d.z;
-            cSim.pForce4[offsetA]   = forceA;
-            cSim.pForce4[offsetB]   = forceB;
-            pos                    += blockDim.x * gridDim.x;
-        }
-    } else if (feSim.nonbondedMethod == PERIODIC ){
-        float LJ14_energy;
-        while (pos < feSim.LJ14_count ){
-            int4 atom               = feSim.pLJ14ID[pos];
-            float4 LJ14             = feSim.pLJ14Parameter[pos];
-            float4 a1               = cSim.pPosq[atom.x];
-            float4 a2               = cSim.pPosq[atom.y];
-            float3 d;
-            d.x                     = a1.x - a2.x;
-            d.y                     = a1.y - a2.y;
-            d.z                     = a1.z - a2.z;
-            d.x                    -= floorf(d.x/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
-            d.y                    -= floorf(d.y/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
-            d.z                    -= floorf(d.z/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
-            float r2                = DOT3(d, d);
-            float inverseR          = 1.0f / sqrtf(r2);
-#ifdef USE_SOFTCORE_LJ
-            float dEdR              = getSoftCoreLJ( r2, LJ14.y, LJ14.x, LJ14.w, LJ14.w, &LJ14_energy);
-#else
-            float sig2              = inverseR * LJ14.y;
-            sig2                   *= sig2;
-            float sig6              = sig2 * sig2 * sig2;
-            float dEdR              = LJ14.x * (12.0f * sig6 - 6.0f) * sig6;
-            LJ14_energy             = LJ14.x * (sig6 - 1.0f) * sig6;
-#endif
-            LJ14_energy            += LJ14.z * (inverseR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
-
-            dEdR                   += LJ14.z * (inverseR - 2.0f * cSim.reactionFieldK * r2);
-            dEdR                   *= inverseR * inverseR;
-            if (r2 > feSim.nonbondedCutoffSqr)
-            {
-                dEdR = 0.0f;
-                LJ14_energy = 0.0f;
-            }
-            energy                 += LJ14_energy;
-
-            unsigned int offsetA    = atom.x + atom.z * cSim.stride;
-            unsigned int offsetB    = atom.y + atom.w * cSim.stride;
-            float4 forceA           = cSim.pForce4[offsetA];
-            float4 forceB           = cSim.pForce4[offsetB];
-            d.x                    *= dEdR;
-            d.y                    *= dEdR;
-            d.z                    *= dEdR;
-            forceA.x               += d.x;
-            forceA.y               += d.y;
-            forceA.z               += d.z;
-            forceB.x               -= d.x;
-            forceB.y               -= d.y;
-            forceB.z               -= d.z;
-            cSim.pForce4[offsetA]   = forceA;
-            cSim.pForce4[offsetB]   = forceB;
-            pos                    += blockDim.x * gridDim.x;
-        }
-    }
-    cSim.pEnergy[blockIdx.x * blockDim.x + threadIdx.x] += energy;
-}
-
-void kCalculateLocalSoftcoreForces( freeEnergyGpuContext freeEnergyGpuContext )
-{
-    gpuContext gpu = freeEnergyGpuContext->gpuContext;
-    kCalculateLocalSoftcoreForces_kernel<<<gpu->sim.blocks, gpu->sim.localForces_threads_per_block, gpu->sim.localForces_threads_per_block * sizeof(Vectors)>>>();
-    LAUNCHERROR("kCalculateLocalSoftcoreForces");
-}
-
--- a/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateNonbondedSoftcore.cu
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateNonbondedSoftcore.cu
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-
-#include "freeEnergyGpuTypes.h"
-#include "GpuFreeEnergyCudaKernels.h"
-#include "openmm/OpenMMException.h"
-#include <iostream>
-#include <sstream>
-
-#define PARAMETER_PRINT 0
-#define MAX_PARAMETER_PRINT 10
-
-// device handles
-
-static __constant__ cudaGmxSimulation cSim;
-static __constant__ cudaFreeEnergyGmxSimulation feSimDev;
-
-// write address of structs to devices
-
-void SetCalculateCDLJSoftcoreGpuSim( freeEnergyGpuContext freeEnergyGpu ){
-    cudaError_t status;
-    status = cudaMemcpyToSymbol(cSim, &freeEnergyGpu->gpuContext->sim, sizeof(cudaGmxSimulation));
-    RTERROR(status, "cudaMemcpyToSymbol: SetCalculateCDLJSoftcoreGpuSim copy to cSim failed");
-
-    status = cudaMemcpyToSymbol( feSimDev, &freeEnergyGpu->freeEnergySim, sizeof(cudaFreeEnergyGmxSimulation));
-    RTERROR(status, "cudaMemcpyToSymbol: SetCalculateCDLJSoftcoreGpuSim copy to feSimDev failed");
-}
-
-extern "C"
-bool gpuIsAvailableSoftcore()
-{
-    int deviceCount;
-    cudaGetDeviceCount(&deviceCount);
-    return (deviceCount > 0);
-}
-
-struct Atom {
-    float x;
-    float y;
-    float z;
-    float q;
-    float sig;
-    float eps;
-    float softCoreLJLambda;
-    float fx;
-    float fy;
-    float fz;
-};
-
-// Include versions of the kernels for N^2 calculations with softcore LJ.
-
-#define USE_SOFTCORE_LJ
-#ifdef USE_SOFTCORE_LJ
-#include "kSoftcoreLJ.h"
-#endif
-
-#undef METHOD_NAME
-#define METHOD_NAME(a, b) a##N2SoftcoreLJ##b
-#undef USE_OUTPUT_BUFFER_PER_WARP
-#include "kCalculateNonbondedSoftcore.h"
-
-#define USE_OUTPUT_BUFFER_PER_WARP
-#undef METHOD_NAME
-#define METHOD_NAME(a, b) a##N2SoftcoreLJByWarp##b
-#include "kCalculateNonbondedSoftcore.h"
-
-// Include versions of the kernels with cutoffs.
-
-#undef METHOD_NAME
-#undef USE_OUTPUT_BUFFER_PER_WARP
-#define USE_CUTOFF
-#define METHOD_NAME(a, b) a##Cutoff##b
-#include "kCalculateNonbondedSoftcore.h"
-#include "kFindInteractingBlocks.h"
-#define USE_OUTPUT_BUFFER_PER_WARP
-#undef METHOD_NAME
-#define METHOD_NAME(a, b) a##CutoffByWarp##b
-#include "kCalculateNonbondedSoftcore.h"
-
-// Include versions of the kernels with periodic boundary conditions.
-
-#undef METHOD_NAME
-#undef USE_OUTPUT_BUFFER_PER_WARP
-#define USE_PERIODIC
-#define METHOD_NAME(a, b) a##Periodic##b
-#include "kCalculateNonbondedSoftcore.h"
-#include "kFindInteractingBlocks.h"
-#define USE_OUTPUT_BUFFER_PER_WARP
-#undef METHOD_NAME
-#define METHOD_NAME(a, b) a##PeriodicByWarp##b
-#include "kCalculateNonbondedSoftcore.h"
-
-void kCalculateCDLJSoftcoreForces( freeEnergyGpuContext freeEnergyGpu )
-{
-
-    gpuContext gpu = freeEnergyGpu->gpuContext;
-    switch (freeEnergyGpu->freeEnergySim.nonbondedMethod)
-    {
-        case FREE_ENERGY_NO_CUTOFF:
-
-           if (gpu->bOutputBufferPerWarp)
-                kCalculateCDLJSoftcoreN2SoftcoreLJByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                         sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit);
-           else
-                   kCalculateCDLJSoftcoreN2SoftcoreLJForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                           sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit );
-            LAUNCHERROR("kCalculateCDLJSoftcoreN2Forces");
-
-            break;
-
-        case FREE_ENERGY_CUTOFF:
-
-            kFindBlockBoundsCutoff_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
-            LAUNCHERROR("kFindBlockBoundsCutoff");
-            kFindBlocksWithInteractionsCutoff_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
-            LAUNCHERROR("kFindBlocksWithInteractionsCutoff");
-            compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
-            kFindInteractionsWithinBlocksCutoff_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                    sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
-
-            if (gpu->bOutputBufferPerWarp)
-                kCalculateCDLJSoftcoreCutoffByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
-            else
-                kCalculateCDLJSoftcoreCutoffForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
-
-            LAUNCHERROR("kCalculateCDLJSoftcoreCutoffForces");
-            break;
-
-        case FREE_ENERGY_PERIODIC:
-
-            kFindBlockBoundsPeriodic_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
-            LAUNCHERROR("kFindBlockBoundsPeriodic");
-            kFindBlocksWithInteractionsPeriodic_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
-            LAUNCHERROR("kFindBlocksWithInteractionsPeriodic");
-            compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
-            kFindInteractionsWithinBlocksPeriodic_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                    sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
-            if (gpu->bOutputBufferPerWarp)
-                kCalculateCDLJSoftcorePeriodicByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
-            else
-                kCalculateCDLJSoftcorePeriodicForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
-            LAUNCHERROR("kCalculateCDLJSoftcorePeriodicForces");
-            break;
-
-        default:
-            throw OpenMM::OpenMMException( "Nonbonded softcore method not recognized." );
-
-    }
-}
-
--- a/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateNonbondedSoftcore.h
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateNonbondedSoftcore.h
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-
-/**
- * This file contains the kernels for evaluating nonbonded softcore forces.  It is included
- * several times in kCalculateNonbondedSoftcore.cu with different #defines to generate
- * different versions of the kernels.
- */
-
-__global__ 
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(GF1XX_NONBOND_THREADS_PER_BLOCK, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1)
-#else
-__launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
-#endif
-void METHOD_NAME(kCalculateCDLJSoftcore, Forces_kernel)(unsigned int* workUnit )
-{
-    extern __shared__ Atom sA[];
-    unsigned int totalWarps   = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
-    unsigned int warp         = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
-    unsigned int numWorkUnits = cSim.pInteractionCount[0];
-    unsigned int pos          = warp*numWorkUnits/totalWarps;
-    unsigned int end          = (warp+1)*numWorkUnits/totalWarps;
-    float CDLJ_energy;
-    float energy              = 0.0f;
-#ifdef USE_CUTOFF
-    float3* tempBuffer        = (float3*) &sA[cSim.nonbond_threads_per_block];
-#endif
-
-    unsigned int lasty = 0xFFFFFFFF;
-    while (pos < end)
-    {
-
-        // Extract cell coordinates from appropriate work unit
-        unsigned int x = workUnit[pos];
-        unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
-        bool bExclusionFlag = (x & 0x1);
-        x = (x >> 17) << GRIDBITS;
-        float4      apos;   // Local atom x, y, z, q
-        float3      af;     // Local atom fx, fy, fz
-        float dx;
-        float dy;
-        float dz;
-        float r2;
-        float invR;
-        float sig;
-        float eps;
-        float dEdR;
-
-        unsigned int tgx             = threadIdx.x & (GRID - 1);
-        unsigned int tbx             = threadIdx.x - tgx;
-        unsigned int tj              = tgx;
-
-        Atom* psA                    = &sA[tbx];
-        unsigned int i               = x + tgx;
-
-        apos                         = cSim.pPosq[i];
-        float4 a                     = feSimDev.pSigEps4[i];
-        float softCoreLJLambda       = a.z;
-        af.x                         = 0.0f;
-        af.y                         = 0.0f;
-        af.z                         = 0.0f;
-        if (x == y) // Handle diagonals uniquely at 50% efficiency
-        {
-            // Read fixed atom data into registers and GRF
-            sA[threadIdx.x].x                     = apos.x;
-            sA[threadIdx.x].y                     = apos.y;
-            sA[threadIdx.x].z                     = apos.z;
-            sA[threadIdx.x].q                     = a.w;
-            sA[threadIdx.x].sig                   = a.x;
-            sA[threadIdx.x].eps                   = a.y;
-            sA[threadIdx.x].softCoreLJLambda      = a.z;
-            a.w                                  *= cSim.epsfac;
-            if (!bExclusionFlag)
-            {
-                for (unsigned int j = 0; j < GRID; j++)
-                {
-                    dx              = psA[j].x - apos.x;
-                    dy              = psA[j].y - apos.y;
-                    dz              = psA[j].z - apos.z;
-#ifdef USE_PERIODIC
-                    dx             -= floorf(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
-                    dy             -= floorf(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
-                    dz             -= floorf(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
-#endif
-                    r2              = dx * dx + dy * dy + dz * dz;
-                    invR            = 1.0f / sqrtf(r2);
-                    sig             = a.x + psA[j].sig;
-                    eps             = a.y * psA[j].eps;
-#ifdef USE_SOFTCORE_LJ
-                    dEdR            = getSoftCoreLJ( r2, sig, eps, softCoreLJLambda, psA[j].softCoreLJLambda, &CDLJ_energy );
-#else
-                    float sig2      = invR * sig;
-                    sig2           *= sig2;
-                    float sig6      = sig2 * sig2 * sig2;
-                    dEdR            = eps * (12.0f * sig6 - 6.0f) * sig6;
-	          	     CDLJ_energy     = eps * (sig6 - 1.0f) * sig6;
-#endif
-
-#ifdef USE_CUTOFF
-                    dEdR           += a.w * psA[j].q * (invR - 2.0f * feSimDev.reactionFieldK * r2);
-		              CDLJ_energy    += a.w * psA[j].q * (invR + feSimDev.reactionFieldK * r2 - feSimDev.reactionFieldC);
-#else
-                    dEdR           += a.w * psA[j].q * invR;
-		              CDLJ_energy    += a.w * psA[j].q * invR;
-#endif
-                    dEdR           *= invR * invR;
-#ifdef USE_CUTOFF
-                    if (r2 > cSim.nonbondedCutoffSqr)
-                    {
-                        dEdR        = 0.0f;
-                        CDLJ_energy = 0.0f;
-                    }
-#endif
-		              energy         += 0.5f*CDLJ_energy;
-                    dx             *= dEdR;
-                    dy             *= dEdR;
-                    dz             *= dEdR;
-                    af.x           -= dx;
-                    af.y           -= dy;
-                    af.z           -= dz;
-                }
-
-            } else  {
-
-                unsigned int xi   = x>>GRIDBITS;
-                unsigned int cell = xi+xi*cSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
-                unsigned int excl = cSim.pExclusion[cSim.pExclusionIndex[cell]+tgx];
-
-                for (unsigned int j = 0; j < GRID; j++)
-                {
-                    dx              = psA[j].x - apos.x;
-                    dy              = psA[j].y - apos.y;
-                    dz              = psA[j].z - apos.z;
-#ifdef USE_PERIODIC
-                    dx             -= floorf(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
-                    dy             -= floorf(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
-                    dz             -= floorf(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
-#endif
-                    r2              = dx * dx + dy * dy + dz * dz;
-                    invR            = 1.0f / sqrtf(r2);
-                    sig             = a.x + psA[j].sig;
-                    eps             = a.y * psA[j].eps;
-#ifdef USE_SOFTCORE_LJ
-                    dEdR            = getSoftCoreLJ( r2, sig, eps, softCoreLJLambda, psA[j].softCoreLJLambda, &CDLJ_energy );
-#else
-                    float sig2      = invR * sig;
-                    sig2           *= sig2;
-                    float sig6      = sig2 * sig2 * sig2;
-                    dEdR            = eps * (12.0f * sig6 - 6.0f) * sig6;
-                    CDLJ_energy     = eps * (sig6 - 1.0f) * sig6;
-#endif
-
-#ifdef USE_CUTOFF
-                    dEdR           += a.w * psA[j].q * (invR - 2.0f * feSimDev.reactionFieldK * r2);
-		              CDLJ_energy    += a.w * psA[j].q * (invR + feSimDev.reactionFieldK * r2 - feSimDev.reactionFieldC);
-#else
-                    dEdR           += a.w * psA[j].q * invR;
-		              CDLJ_energy    += a.w * psA[j].q * invR;
-#endif
-                    dEdR           *= invR * invR;
-#ifdef USE_CUTOFF
-                    if (!(excl & 0x1) || r2 > cSim.nonbondedCutoffSqr)
-#else
-                    if (!(excl & 0x1))
-#endif
-                    {
-                        dEdR = 0.0f;
-		                  CDLJ_energy  = 0.0f;
-                    }
-
-                    energy         += 0.5f*CDLJ_energy;
-                    dx             *= dEdR;
-                    dy             *= dEdR;
-                    dz             *= dEdR;
-                    af.x           -= dx;
-                    af.y           -= dy;
-                    af.z           -= dz;
-                    excl          >>= 1;
-                }
-            }
-
-            // Write results
-            float4 of;
-#ifdef USE_OUTPUT_BUFFER_PER_WARP
-            unsigned int offset                 = x + tgx + warp*cSim.stride;
-            of                                  = cSim.pForce4[offset];
-            of.x                               += af.x;
-            of.y                               += af.y;
-            of.z                               += af.z;
-            cSim.pForce4[offset]                = of;
-#else
-            of.x                                = af.x;
-            of.y                                = af.y;
-            of.z                                = af.z;
-            of.w                                = 0.0f;
-            unsigned int offset                 = x + tgx + (x >> GRIDBITS) * cSim.stride;
-            cSim.pForce4[offset]                = of;
-#endif
-
-        } else {
-
-            // Read fixed atom data into registers and GRF
-            if (lasty != y)
-            {
-                unsigned int j                   = y + tgx;
-                float4 temp                      = cSim.pPosq[j];
-                //float2 temp1                     = cSim.pAttr[j];
-                float4 temp1                     = feSimDev.pSigEps4[j];
-                //float  temp3                     = cSim.pSoftCoreLJLambda[j];
-                //float  temp3                     = softCoreLJLambdaArray[j];
-                float temp3                      = temp1.z;
-                sA[threadIdx.x].x                = temp.x;
-                sA[threadIdx.x].y                = temp.y;
-                sA[threadIdx.x].z                = temp.z;
-                sA[threadIdx.x].q                = temp1.w;
-                sA[threadIdx.x].sig              = temp1.x;
-                sA[threadIdx.x].eps              = temp1.y;
-                sA[threadIdx.x].softCoreLJLambda = temp3;
-            }
-            sA[threadIdx.x].fx      = 0.0f;
-            sA[threadIdx.x].fy      = 0.0f;
-            sA[threadIdx.x].fz      = 0.0f;
-            a.w                *= cSim.epsfac;
-            if (!bExclusionFlag)
-            {
-#ifdef USE_CUTOFF
-                unsigned int flags = cSim.pInteractionFlag[pos];
-                if (flags == 0)
-                {
-                    // No interactions in this block.
-                }
-                else if (flags == 0xFFFFFFFF)
-#endif
-                {
-                    // Compute all interactions within this block.
-
-                    for (unsigned int j = 0; j < GRID; j++)
-                    {
-                        dx              = psA[tj].x - apos.x;
-                        dy              = psA[tj].y - apos.y;
-                        dz              = psA[tj].z - apos.z;
-#ifdef USE_PERIODIC
-                        dx -= floorf(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
-                        dy -= floorf(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
-                        dz -= floorf(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
-#endif
-                        r2              = dx * dx + dy * dy + dz * dz;
-                        invR            = 1.0f / sqrtf(r2);
-                        sig             = a.x + psA[tj].sig;
-                        eps             = a.y * psA[tj].eps;
-#ifdef USE_SOFTCORE_LJ
-                        dEdR            = getSoftCoreLJ( r2, sig, eps, softCoreLJLambda, psA[tj].softCoreLJLambda, &CDLJ_energy );
-#else
-                        float sig2      = invR * sig;
-                        sig2           *= sig2;
-                        float sig6      = sig2 * sig2 * sig2;
-                        dEdR            = eps * (12.0f * sig6 - 6.0f) * sig6;
-			               CDLJ_energy     = eps * (sig6 - 1.0f) * sig6;
-#endif
-#ifdef USE_CUTOFF
-                        dEdR           += a.w * psA[tj].q * (invR - 2.0f * feSimDev.reactionFieldK * r2);
-                        CDLJ_energy    += a.w * psA[tj].q * (invR + feSimDev.reactionFieldK * r2 - feSimDev.reactionFieldC);
-#else
-                        dEdR           += a.w * psA[tj].q * invR;
-                        CDLJ_energy    += a.w * psA[tj].q * invR;
-#endif
-                        dEdR           *= invR * invR;
-#ifdef USE_CUTOFF
-                        if (r2 > cSim.nonbondedCutoffSqr)
-                        {
-                            dEdR = 0.0f;
-       			             CDLJ_energy = 0.0f;
-                        }
-#endif
-
-			               energy         += CDLJ_energy;
-                        dx             *= dEdR;
-                        dy             *= dEdR;
-                        dz             *= dEdR;
-                        af.x           -= dx;
-                        af.y           -= dy;
-                        af.z           -= dz;
-                        psA[tj].fx     += dx;
-                        psA[tj].fy     += dy;
-                        psA[tj].fz     += dz;
-                        tj              = (tj + 1) & (GRID - 1);
-                    }
-                }
-#ifdef USE_CUTOFF
-                else
-                {
-                    // Compute only a subset of the interactions in this block.
-
-                    for (unsigned int j = 0; j < GRID; j++)
-                    {
-                        if ((flags&(1<<j)) != 0)
-                        {
-                            dx              = psA[j].x - apos.x;
-                            dy              = psA[j].y - apos.y;
-                            dz              = psA[j].z - apos.z;
-#ifdef USE_PERIODIC
-                            dx -= floorf(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
-                            dy -= floorf(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
-                            dz -= floorf(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
-#endif
-                            r2              = dx * dx + dy * dy + dz * dz;
-                            invR            = 1.0f / sqrtf(r2);
-                            sig             = a.x + psA[j].sig;
-                            eps             = a.y * psA[j].eps;
-#ifdef USE_SOFTCORE_LJ
-                            dEdR            = getSoftCoreLJ( r2, sig, eps, softCoreLJLambda, psA[j].softCoreLJLambda, &CDLJ_energy );
-#else
-                            float sig2      = invR * sig;
-                            sig2           *= sig2;
-                            float sig6      = sig2 * sig2 * sig2;
-                            dEdR            = eps * (12.0f * sig6 - 6.0f) * sig6;
-                			    CDLJ_energy     = eps * (sig6 - 1.0f) * sig6;
-#endif
-#ifdef USE_CUTOFF
-                            dEdR           += a.w * psA[j].q * (invR - 2.0f * feSimDev.reactionFieldK * r2);
-                            CDLJ_energy    += a.w * psA[j].q * (invR + feSimDev.reactionFieldK * r2 - feSimDev.reactionFieldC);
-#else
-                            dEdR           += a.w * psA[j].q * invR;
-                            CDLJ_energy    += a.w * psA[j].q * invR;
-#endif
-                            dEdR           *= invR * invR;
-#ifdef USE_CUTOFF
-                            if (r2 > cSim.nonbondedCutoffSqr)
-                            {
-                                dEdR = 0.0f;
-				                    CDLJ_energy = 0.0f;
-                            }
-#endif
-			                   energy         += CDLJ_energy;
-                            dx             *= dEdR;
-                            dy             *= dEdR;
-                            dz             *= dEdR;
-                            af.x           -= dx;
-                            af.y           -= dy;
-                            af.z           -= dz;
-                            tempBuffer[threadIdx.x].x = dx;
-                            tempBuffer[threadIdx.x].y = dy;
-                            tempBuffer[threadIdx.x].z = dz;
-
-                            // Sum the forces on atom j.
-
-                            if (tgx % 2 == 0)
-                            {
-                                tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+1].x;
-                                tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+1].y;
-                                tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+1].z;
-                            }
-                            if (tgx % 4 == 0)
-                            {
-                                tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+2].x;
-                                tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+2].y;
-                                tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+2].z;
-                            }
-                            if (tgx % 8 == 0)
-                            {
-                                tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+4].x;
-                                tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+4].y;
-                                tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+4].z;
-                            }
-                            if (tgx % 16 == 0)
-                            {
-                                tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+8].x;
-                                tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+8].y;
-                                tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+8].z;
-                            }
-                            if (tgx == 0)
-                            {
-                                psA[j].fx += tempBuffer[threadIdx.x].x + tempBuffer[threadIdx.x+16].x;
-                                psA[j].fy += tempBuffer[threadIdx.x].y + tempBuffer[threadIdx.x+16].y;
-                                psA[j].fz += tempBuffer[threadIdx.x].z + tempBuffer[threadIdx.x+16].z;
-                            }
-                        }
-                    }
-                }
-#endif
-            }
-            else  // bExclusion
-            {
-                // Read fixed atom data into registers and GRF
-                unsigned int xi   = x>>GRIDBITS;
-                unsigned int yi   = y>>GRIDBITS;
-                unsigned int cell          = xi+yi*cSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
-                unsigned int excl = cSim.pExclusion[cSim.pExclusionIndex[cell]+tgx];
-                excl              = (excl >> tgx) | (excl << (GRID - tgx));
-                for (unsigned int j = 0; j < GRID; j++)
-                {
-                    dx              = psA[tj].x - apos.x;
-                    dy              = psA[tj].y - apos.y;
-                    dz              = psA[tj].z - apos.z;
-#ifdef USE_PERIODIC
-                    dx -= floorf(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
-                    dy -= floorf(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
-                    dz -= floorf(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
-#endif
-                    r2              = dx * dx + dy * dy + dz * dz;
-                    invR            = 1.0f / sqrtf(r2);
-                    sig             = a.x + psA[tj].sig;
-                    eps             = a.y * psA[tj].eps;
-#ifdef USE_SOFTCORE_LJ
-                    dEdR            = getSoftCoreLJ( r2, sig, eps, softCoreLJLambda, psA[tj].softCoreLJLambda, &CDLJ_energy );
-#else
-                    float sig2      = invR * sig;
-                    sig2           *= sig2;
-                    float sig6      = sig2 * sig2 * sig2;
-                    dEdR            = eps * (12.0f * sig6 - 6.0f) * sig6;
-		              CDLJ_energy     = eps * (sig6 - 1.0f) * sig6;
-#endif
-
-#ifdef USE_CUTOFF
-                    dEdR           += a.w * psA[tj].q * (invR - 2.0f * feSimDev.reactionFieldK * r2);
-	                 CDLJ_energy    += a.w * psA[tj].q * (invR + feSimDev.reactionFieldK * r2 - feSimDev.reactionFieldC);
-#else
-                    dEdR           += a.w * psA[tj].q * invR;
-                    CDLJ_energy    += a.w * psA[tj].q * invR;
-#endif
-                    dEdR           *= invR * invR;
-#ifdef USE_CUTOFF
-                    if (!(excl & 0x1) || r2 > cSim.nonbondedCutoffSqr)
-#else
-                    if (!(excl & 0x1))
-#endif
-                    {
-                        dEdR = 0.0f;			
-	                     CDLJ_energy  = 0.0f;
-                    }
-
-		              energy         += CDLJ_energy;
-                    dx             *= dEdR;
-                    dy             *= dEdR;
-                    dz             *= dEdR;
-                    af.x           -= dx;
-                    af.y           -= dy;
-                    af.z           -= dz;
-                    psA[tj].fx     += dx;
-                    psA[tj].fy     += dy;
-                    psA[tj].fz     += dz;
-                    excl          >>= 1;
-                    tj              = (tj + 1) & (GRID - 1);
-                }
-            }
-
-            // Write results
-            float4 of;
-#ifdef USE_OUTPUT_BUFFER_PER_WARP
-            unsigned int offset                          = x + tgx + warp*cSim.stride;
-            of                                  = cSim.pForce4[offset];
-            of.x                               += af.x;
-            of.y                               += af.y;
-            of.z                               += af.z;
-            cSim.pForce4[offset]                = of;
-            offset                              = y + tgx + warp*cSim.stride;
-            of                                  = cSim.pForce4[offset];
-            of.x                               += sA[threadIdx.x].fx;
-            of.y                               += sA[threadIdx.x].fy;
-            of.z                               += sA[threadIdx.x].fz;
-            cSim.pForce4[offset]                = of;
-#else
-            of.x                                = af.x;
-            of.y                                = af.y;
-            of.z                                = af.z;
-            of.w                                = 0.0f;
-            unsigned int offset                          = x + tgx + (y >> GRIDBITS) * cSim.stride;
-            cSim.pForce4[offset]                = of;
-            of.x                                = sA[threadIdx.x].fx;
-            of.y                                = sA[threadIdx.x].fy;
-            of.z                                = sA[threadIdx.x].fz;
-            offset                              = y + tgx + (x >> GRIDBITS) * cSim.stride;
-            cSim.pForce4[offset]                = of;
-#endif
-            lasty = y;
-        }
-
-        pos++;
-    }
-    cSim.pEnergy[blockIdx.x*blockDim.x+threadIdx.x] += energy;
-}
--- a/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateObcGbsaSoftcoreBornSum.cu
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateObcGbsaSoftcoreBornSum.cu
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-
-#include "gputypes.h"
-#include "freeEnergyGpuTypes.h"
-#include "GpuFreeEnergyCudaKernels.h"
-#include "kernels/cudaKernels.h"
-#include "openmm/OpenMMException.h"
-
-#include <cuda.h>
-
-#define PARAMETER_PRINT 0
-#define MAX_PARAMETER_PRINT 10
-
-struct Atom {
-    float x;
-    float y;
-    float z;
-    float r;
-    float sr;
-    float sum;
-    float polarScaleData;
-};
-
-static __constant__ cudaGmxSimulation cSim;
-static __constant__ cudaFreeEnergyGmxSimulation gbsaSimDev;
-
-extern "C" void SetCalculateObcGbsaSoftcoreBornSumSim( freeEnergyGpuContext freeEnergyGpu)
-{
-    cudaError_t status;
-    status = cudaMemcpyToSymbol( cSim, &freeEnergyGpu->gpuContext->sim, sizeof(cudaGmxSimulation));
-    RTERROR(status, "cudaMemcpyToSymbol: SetCalculateObcGbsaSoftcoreBornSumSim copy to cSim failed.");
-
-    status = cudaMemcpyToSymbol( gbsaSimDev, &freeEnergyGpu->freeEnergySim, sizeof(cudaFreeEnergyGmxSimulation));
-    RTERROR(status, "cudaMemcpyToSymbol: SetCalculateObcGbsaSoftcoreBornSumSim copy to gbsaSimDev failed.");
-}
-
-__global__ 
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
-#else
-__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
-#endif
-void kClearObcGbsaSoftcoreBornSum_kernel()
-{
-    unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
-    while (pos < cSim.stride * cSim.nonbondOutputBuffers)
-    {
-        ((float*)cSim.pBornSum)[pos]   = 0.0f;
-        pos                           += gridDim.x * blockDim.x;
-    }
-}
-
-__global__ 
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
-#else
-__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
-#endif
-void kClearSoftcoreBornForces_kernel()
-{
-    unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
-    while (pos < cSim.stride * cSim.nonbondOutputBuffers)
-    {   
-        ((float*)cSim.pBornForce)[pos]  = 0.0f;
-        pos                            += gridDim.x * blockDim.x;
-    }   
-}
-
-void kClearSoftcoreBornForces(gpuContext gpu)
-{
-  //  printf("kClearSoftcoreBornForces\n");
-    kClearSoftcoreBornForces_kernel<<<gpu->sim.blocks, gpu->sim.threads_per_block>>>();
-    LAUNCHERROR("kClearSoftcoreBornForces");
-}
-
-void kClearObcGbsaSoftcoreBornSum(gpuContext gpu)
-{
-  //  printf("kClearObcGbsaBornSum\n");
-    kClearObcGbsaSoftcoreBornSum_kernel<<<gpu->sim.blocks, gpu->sim.threads_per_block>>>();
-}
-
-__global__ 
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
-#else
-__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
-#endif
-void kReduceObcGbsaSoftcoreBornForces_kernel()
-{
-    unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
-    float energy     = 0.0f;
-
-    while (pos < cSim.atoms)
-    {
-        float bornRadius         = cSim.pBornRadii[pos];
-        float obcChain           = cSim.pObcChain[pos];
-        float2 obcData           = cSim.pObcData[pos];
-        float  nonPolarScaleData = gbsaSimDev.pNonPolarScalingFactors[pos];
-        float totalForce         = 0.0f;
-        float* pFt               = cSim.pBornForce + pos;
-
-        int i                    = cSim.nonbondOutputBuffers;
-        while (i >= 4)
-        {
-            float f1    = *pFt;
-            pFt        += cSim.stride;
-            float f2    = *pFt;
-            pFt        += cSim.stride;
-            float f3    = *pFt;
-            pFt        += cSim.stride;
-            float f4    = *pFt;
-            pFt        += cSim.stride;
-            totalForce += f1 + f2 + f3 + f4;
-            i -= 4;
-        }
-        if (i >= 2)
-        {
-            float f1    = *pFt;
-            pFt        += cSim.stride;
-            float f2    = *pFt;
-            pFt        += cSim.stride;
-            totalForce += f1 + f2;
-            i -= 2;
-        }
-        if (i > 0)
-        {
-            totalForce += *pFt;
-        }
-        float r            = (obcData.x + cSim.dielectricOffset + cSim.probeRadius);
-        float ratio6       = pow((obcData.x + cSim.dielectricOffset) / bornRadius, 6.0f);
-        float saTerm       = nonPolarScaleData*cSim.surfaceAreaFactor * r * r * ratio6;
-        totalForce        += saTerm / bornRadius; // 1.102 == Temp mysterious fudge factor, FIX FIX FIX
-
-        energy            += saTerm;
-
-        totalForce        *= bornRadius * bornRadius * obcChain;
-
-        pFt                = cSim.pBornForce + pos;
-        *pFt               = totalForce;
-        pos               += gridDim.x * blockDim.x;
-    }
-
-    // correct for surface area factor of -6
-    cSim.pEnergy[blockIdx.x * blockDim.x + threadIdx.x] += energy / -6.0f;
-}
-
-
-void kReduceObcGbsaSoftcoreBornForces( gpuContext gpu ){
-
-    kReduceObcGbsaSoftcoreBornForces_kernel<<<gpu->sim.blocks, gpu->sim.threads_per_block>>>();
-    LAUNCHERROR("kReduceObcGbsaSoftcoreBornForces");
-}
-
-
-// Include versions of the kernels for N^2 calculations.
-
-#define METHOD_NAME(a, b) a##N2##b
-#include "kCalculateObcGbsaSoftcoreBornSum.h"
-#define USE_OUTPUT_BUFFER_PER_WARP
-#undef METHOD_NAME
-#define METHOD_NAME(a, b) a##N2ByWarp##b
-#include "kCalculateObcGbsaSoftcoreBornSum.h"
-
-// Include versions of the kernels with cutoffs.
-
-#undef METHOD_NAME
-#undef USE_OUTPUT_BUFFER_PER_WARP
-#define USE_CUTOFF
-#define METHOD_NAME(a, b) a##Cutoff##b
-#include "kCalculateObcGbsaSoftcoreBornSum.h"
-#define USE_OUTPUT_BUFFER_PER_WARP
-#undef METHOD_NAME
-#define METHOD_NAME(a, b) a##CutoffByWarp##b
-#include "kCalculateObcGbsaSoftcoreBornSum.h"
-
-// Include versions of the kernels with periodic boundary conditions.
-
-#undef METHOD_NAME
-#undef USE_OUTPUT_BUFFER_PER_WARP
-#define USE_PERIODIC
-#define METHOD_NAME(a, b) a##Periodic##b
-#include "kCalculateObcGbsaSoftcoreBornSum.h"
-#define USE_OUTPUT_BUFFER_PER_WARP
-#undef METHOD_NAME
-#define METHOD_NAME(a, b) a##PeriodicByWarp##b
-#include "kCalculateObcGbsaSoftcoreBornSum.h"
-
-__global__ 
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
-#else
-__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
-#endif
-void kReduceObcGbsaSoftcoreBornSum_kernel()
-{
-    unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
-    
-    while (pos < cSim.atoms)
-    {
-        float sum   = 0.0f;
-        float* pSt  = cSim.pBornSum + pos;
-        float2 atom = cSim.pObcData[pos];
-        
-        // Get summed Born data
-        for( int i = 0; i < cSim.nonbondOutputBuffers; i++ ){
-            sum += *pSt;
-            pSt += cSim.stride;
-        }
-        
-        
-        // Now calculate Born radius and OBC term.
-        sum                    *= 0.5f * atom.x;
-        float sum2              = sum * sum;
-        float sum3              = sum * sum2;
-        float tanhSum           = tanh(cSim.alphaOBC * sum - cSim.betaOBC * sum2 + cSim.gammaOBC * sum3);
-        float nonOffsetRadii    = atom.x + cSim.dielectricOffset;
-        float bornRadius        = 1.0f / (1.0f / atom.x - tanhSum / nonOffsetRadii); 
-        float obcChain          = atom.x * (cSim.alphaOBC - 2.0f * cSim.betaOBC * sum + 3.0f * cSim.gammaOBC * sum2);
-        obcChain                = (1.0f - tanhSum * tanhSum) * obcChain / nonOffsetRadii;        
-        cSim.pBornRadii[pos]    = bornRadius;
-        cSim.pObcChain[pos]     = obcChain;
-        pos                    += gridDim.x * blockDim.x;
-    }   
-}
-
-void kReduceObcGbsaSoftcoreBornSum(gpuContext gpu)
-{
-//    printf("kReduceObcGbsaSoftcoreBornSum\n");
-    kReduceObcGbsaSoftcoreBornSum_kernel<<<gpu->sim.blocks,  gpu->sim.threads_per_block>>>();
-    gpu->bRecalculateBornRadii = false;
-    LAUNCHERROR("kReduceObcGbsaSoftcoreBornSum");
-}
-
-/** 
- * Initialize parameters for Cuda Obc softcore
- * 
- * @param freeEnergyGpu        freeEnergyGpu context
- * @param innerDielectric      solute dielectric
- * @param solventDielectric    solvent dielectric
- * @param radius               intrinsic Born radii
- * @param scale                Obc scaling factors
- * @param charge               atomic charges (possibly overwritten by other methods?)
- * @param nonPolarScalingFactors non-polar scaling factors
- *
- */
-
-extern "C"
-void  gpuSetObcSoftcoreParameters( freeEnergyGpuContext freeEnergyGpu, float innerDielectric, float solventDielectric, float nonPolarPrefactor,
-                                   const std::vector<float>& radius, const std::vector<float>& scale,
-                                   const std::vector<float>& charge, const std::vector<float>& nonPolarScalingFactors)
-{
-
-// ---------------------------------------------------------------------------------------
-
-   static const float dielectricOffset    =    0.009f;
-   static const float electricConstant    = -166.02691f;
-   static const std::string methodName    = "gpuSetObcSoftcoreParameters";
-
-// ---------------------------------------------------------------------------------------
-
-    unsigned int numberOfParticles                       = radius.size();
-    gpuContext gpu                                       = freeEnergyGpu->gpuContext;
-
-    // initialize parameters
-
-    freeEnergyGpu->psNonPolarScalingFactors              = new CUDAStream<float>( gpu->sim.paddedNumberOfAtoms, 1, "ObcSoftcoreNonPolarScaling");
-    freeEnergyGpu->freeEnergySim.pNonPolarScalingFactors = freeEnergyGpu->psNonPolarScalingFactors->_pDevData;
-
-    gpu->sim.surfaceAreaFactor                           =  -6.0f*PI*4.0f*nonPolarPrefactor;
-    gpu->sim.preFactor                                   = 2.0f*electricConstant*((1.0f/innerDielectric)-(1.0f/solventDielectric))*gpu->sim.forceConversionFactor;
-
-    for( unsigned int ii = 0; ii < numberOfParticles; ii++ ){
-        (*gpu->psObcData)[ii].x                        = radius[ii] - dielectricOffset;
-        (*gpu->psObcData)[ii].y                        = scale[ii] * (*gpu->psObcData)[ii].x;
-        (*gpu->psPosq4)[ii].w                          = charge[ii];
-        (*gpu->psBornRadii)[ii]                        = 0.0f;
-        (*freeEnergyGpu->psNonPolarScalingFactors)[ii] = nonPolarScalingFactors[ii];
-    }
-
-    // diagnostics
-
-    if( freeEnergyGpu->log ){
-        (void) fprintf( freeEnergyGpu->log, "%s %u %u\n", methodName.c_str(), gpu->natoms, gpu->sim.paddedNumberOfAtoms );
-        (void) fprintf( freeEnergyGpu->log, "surfaceAreaFactor=%15.7e preFactor=%15.7e\n", gpu->sim.surfaceAreaFactor, gpu->sim.preFactor);
-#ifdef PARAMETER_PRINT
-        int maxPrint = MAX_PARAMETER_PRINT;
-        for( unsigned int ii = 0; ii < numberOfParticles; ii++ ){
-            (void) fprintf( freeEnergyGpu->log, "%6u %13.6e %13.6e %8.3f %8.3f\n", ii, 
-                            (*gpu->psObcData)[ii].x, (*gpu->psObcData)[ii].y, (*gpu->psPosq4)[ii].w, (*freeEnergyGpu->psNonPolarScalingFactors)[ii] );
-             if( ii == maxPrint ){
-                ii = numberOfParticles - maxPrint;
-                if( ii < maxPrint )ii = maxPrint;
-            }
-        }
-#endif
-    }
-
-    // dummy out extra atom data
-
-    for (unsigned int ii = gpu->natoms; ii < gpu->sim.paddedNumberOfAtoms; ii++ ){
-        (*gpu->psObcData)[ii].x                         = 0.01f;
-        (*gpu->psObcData)[ii].y                         = 0.01f;
-        (*freeEnergyGpu->psNonPolarScalingFactors)[ii]  = 0.0f;
-        (*gpu->psBornRadii)[ii]                         = 0.0f;
-    }
-
-    // load data to board
-
-    gpu->psObcData->Upload();
-    gpu->psPosq4->Upload();
-    gpu->psBornRadii->Upload();
-    freeEnergyGpu->psNonPolarScalingFactors->Upload();
-
-    return;
-}
-
-void kPrintObcGbsaSoftcore( freeEnergyGpuContext freeEnergyGpu, std::string callId, int call, FILE* log){
-
-    gpuContext gpu = freeEnergyGpu->gpuContext;
-    int maxPrint   = gpu->natoms;
-
-    (void) fprintf( log, "kPrintObcGbsaSoftcore %s %d\n", callId.c_str(), call );
-
-    gpu->psObcData->Download();
-    gpu->psBornRadii->Download();
-    gpu->psBornForce->Download();
-    gpu->psPosq4->Download();
-    freeEnergyGpu->psNonPolarScalingFactors->Download();
-
-    CUDAStream<float4>* sigEps4          = freeEnergyGpu->psSigEps4;
-    sigEps4->Download();
-
-    (void) fprintf( log, "BornSum Born radii & params\n" );
-    for( int ii = 0; ii < gpu->sim.paddedNumberOfAtoms; ii++ ){
-        //(void) fprintf( log, "%6d prm[%15.7e %15.7e %15.7e %15.7e] [%15.7e %15.7e %15.7e %15.7e] bR=%15.7e bF=%15.7e swDrv=%3.1f x[%8.3f %8.3f %8.3f %15.7f]\n",
-        (void) fprintf( log, "%6d prm[%15.7e %15.7e %15.7e] sig/eps4[%15.7e %15.7e %15.7e %15.7e] bR=%15.7e bF=%15.7e\n",
-                        ii,
-
-                        gpu->psObcData->_pSysData[ii].x,
-                        gpu->psObcData->_pSysData[ii].y,
-                        freeEnergyGpu->psNonPolarScalingFactors->_pSysData[ii],
-
-                        sigEps4->_pSysData[ii].x,
-                        sigEps4->_pSysData[ii].y,
-                        sigEps4->_pSysData[ii].z,
-                        sigEps4->_pSysData[ii].w,
-
-                        gpu->psBornRadii->_pSysData[ii],
-                        gpu->psBornForce->_pSysData[ii]
-/*
-                        gpu->psPosq4->_pSysData[ii].x,
-                        gpu->psPosq4->_pSysData[ii].y,
-                        gpu->psPosq4->_pSysData[ii].z,
-                        gpu->psPosq4->_pSysData[ii].w );
-*/
-                        );
-
-        if( (ii == maxPrint) && ( ii < (gpu->natoms - maxPrint)) ){
-            ii = gpu->natoms - maxPrint;
-        }
-    }
-
-}
-extern __global__ void kFindBlockBoundsCutoff_kernel();
-extern __global__ void kFindBlockBoundsPeriodic_kernel();
-
-extern __global__ void kFindBlocksWithInteractionsCutoff_kernel();
-extern __global__ void kFindBlocksWithInteractionsPeriodic_kernel();
-
-extern __global__ void kFindInteractionsWithinBlocksCutoff_kernel(unsigned int*);
-extern __global__ void kFindInteractionsWithinBlocksPeriodic_kernel(unsigned int*);
-
-void kCalculateObcGbsaSoftcoreBornSum( freeEnergyGpuContext freeEnergyGpu )
-{
-  //  printf("kCalculateObcGbsaSoftcoreBornSum\n");
-    gpuContext gpu = freeEnergyGpu->gpuContext;
-
-    kClearObcGbsaSoftcoreBornSum(gpu);
-    LAUNCHERROR("kClearBornSum from kCalculateObcGbsaSoftcoreBornSum");
-
-    switch ( freeEnergyGpu->freeEnergySim.nonbondedMethod )
-    {
-        case FREE_ENERGY_NO_CUTOFF:
-
-            if (gpu->bOutputBufferPerWarp)
-                kCalculateObcGbsaSoftcoreN2ByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit);
-            else
-                kCalculateObcGbsaSoftcoreN2BornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit);
-
-            break;
-
-        case FREE_ENERGY_CUTOFF:
-
-            kFindBlockBoundsCutoff_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
-            LAUNCHERROR("kFindBlockBoundsCutoff");
-            kFindBlocksWithInteractionsCutoff_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
-            LAUNCHERROR("kFindBlocksWithInteractionsCutoff");
-            compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
-            kFindInteractionsWithinBlocksCutoff_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                    sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
-
-
-            if (gpu->bOutputBufferPerWarp)
-                kCalculateObcGbsaSoftcoreCutoffByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
-            else
-                kCalculateObcGbsaSoftcoreCutoffBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
-
-            break;
-
-        case FREE_ENERGY_PERIODIC:
-
-            kFindBlockBoundsPeriodic_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
-            LAUNCHERROR("kFindBlockBoundsPeriodic");
-            kFindBlocksWithInteractionsPeriodic_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
-            LAUNCHERROR("kFindBlocksWithInteractionsPeriodic");
-            compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
-            kFindInteractionsWithinBlocksPeriodic_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                    sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
-
-            if (gpu->bOutputBufferPerWarp)
-                kCalculateObcGbsaSoftcorePeriodicByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
-            else
-                kCalculateObcGbsaSoftcorePeriodicBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
-
-
-            break;
-
-        default:
-            throw OpenMM::OpenMMException( "Nonbonded softcore method not recognized." );
-
-    }
-    LAUNCHERROR("kCalculateObcGbsaSoftcoreBornSum");
-
-}
--- a/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateObcGbsaSoftcoreBornSum.h
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateObcGbsaSoftcoreBornSum.h
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-
-/**
- * This file contains the kernel for calculating Born sums.  It is included
- * several times in kCalculateObcGbsaBornSum.cu with different #defines to generate
- * different versions of the kernels.
- */
-
-#undef TARGET
-//#define TARGET 1
-
-__global__ 
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(GF1XX_NONBOND_THREADS_PER_BLOCK, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1)
-#else
-__launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
-#endif
-void METHOD_NAME(kCalculateObcGbsaSoftcore, BornSum_kernel)(unsigned int* workUnit)
-{
-    extern __shared__ Atom sA[];
-    unsigned int totalWarps   = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
-    unsigned int warp         = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
-    unsigned int numWorkUnits = cSim.pInteractionCount[0];
-    unsigned int pos          = warp*numWorkUnits/totalWarps;
-    unsigned int end          = (warp+1)*numWorkUnits/totalWarps;
-
-#ifdef USE_CUTOFF
-    float* tempBuffer         = (float*) &sA[cSim.nonbond_threads_per_block];
-#endif
-
-    while (pos < end)
-    {
-        // Extract cell coordinates from appropriate work unit
-        
-        //unsigned int x = workUnit[pos + (blockIdx.x*numWorkUnits)/gridDim.x];
-        unsigned int x = workUnit[pos];
-        unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
-        x              = (x >> 17) << GRIDBITS;
-
-        float       dx;
-        float       dy;
-        float       dz;
-
-        float       r2;
-        float       r;
-
-        unsigned int tgx = threadIdx.x & (GRID - 1);
-        unsigned int tbx = threadIdx.x - tgx;
-        unsigned int tj  = tgx;
-
-        Atom* psA        = &sA[tbx];
-
-        if (x == y) // Handle diagonals uniquely at 50% efficiency
-        {
-            // Read fixed atom data into registers and GRF
-            unsigned int i                          = x + tgx;
-            float4 apos                             = cSim.pPosq[i];    // Local atom x, y, z, sum
-            float2 ar                               = cSim.pObcData[i];   // Local atom vr, sr
-            float polarScaleData                    = gbsaSimDev.pNonPolarScalingFactors[i];  // scale contribution
-
-            sA[threadIdx.x].x                       = apos.x;
-            sA[threadIdx.x].y                       = apos.y;
-            sA[threadIdx.x].z                       = apos.z;
-
-            sA[threadIdx.x].r                       = ar.x;
-            sA[threadIdx.x].sr                      = ar.y;
-            sA[threadIdx.x].polarScaleData          = polarScaleData;
-            apos.w                                  = 0.0f;
-
-            for (unsigned int j = 0; j < GRID; j++)
-            {
-                dx                      = psA[j].x - apos.x;
-                dy                      = psA[j].y - apos.y;
-                dz                      = psA[j].z - apos.z;
-#ifdef USE_PERIODIC
-                dx                     -= floorf(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
-                dy                     -= floorf(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
-                dz                     -= floorf(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
-#endif
-
-                r2                      = dx * dx + dy * dy + dz * dz;
-
-#if defined USE_CUTOFF
-                if (i < cSim.atoms && x+j < cSim.atoms && r2 < cSim.nonbondedCutoffSqr)
-#else
-                if (i < cSim.atoms && x+j < cSim.atoms )
-#endif
-                {
-                    r                       = sqrtf(r2);
-                    float rInverse          = 1.0f/r;
-                    float rScaledRadiusJ    = r + psA[j].sr;
-                    if( (j != tgx) && (ar.x < rScaledRadiusJ) ){
-                        float l_ij     = 1.0f / max(ar.x, fabs(r - psA[j].sr));
-                        float u_ij     = 1.0f / rScaledRadiusJ;
-                        float l_ij2    = l_ij * l_ij;
-                        float u_ij2    = u_ij * u_ij;
-                        float ratio    = logf(u_ij / l_ij);
-                        float term     = l_ij - u_ij +
-                                         0.25f * r * (u_ij2 - l_ij2) +
-                                         (0.50f * rInverse * ratio) +
-                                         (0.25f * psA[j].sr * psA[j].sr * rInverse) *
-                                         (l_ij2 - u_ij2);
-                        float rj       = psA[j].sr;
-                        if( ar.x < (rj - r) ){
-                            term += 2.0f * ((1.0f / ar.x) - l_ij);
-                        }
-                        apos.w += psA[j].polarScaleData*term;
-                    }
-                }
-            }
-
-            // Write results
-#ifdef USE_OUTPUT_BUFFER_PER_WARP
-            unsigned int offset    = x + tgx + warp*cSim.stride;
-            cSim.pBornSum[offset] += apos.w;
-#else
-            unsigned int offset   = x + tgx + (x >> GRIDBITS) * cSim.stride;
-            cSim.pBornSum[offset] = apos.w;
-#endif
-
-        } else {
-
-            // Read fixed atom data into registers and GRF
-
-            unsigned int j                  = y + tgx;
-            unsigned int i                  = x + tgx;
-
-            float4 temp                     = cSim.pPosq[j];
-            float2 temp1                    = cSim.pObcData[j];
-            float polarScaleDataJ           = gbsaSimDev.pNonPolarScalingFactors[j];  // scale contribution
-
-            float4 apos                     = cSim.pPosq[i];        // Local atom x, y, z, sum
-            apos.w                          = 0.0f;
-
-            float2 ar                       = cSim.pObcData[i];    // Local atom vr, sr
-            float polarScaleDataI           = gbsaSimDev.pNonPolarScalingFactors[i];  // scale contribution
-
-            sA[threadIdx.x].x               = temp.x;
-            sA[threadIdx.x].y               = temp.y;
-            sA[threadIdx.x].z               = temp.z;
-
-            sA[threadIdx.x].r               = temp1.x;
-            sA[threadIdx.x].sr              = temp1.y;
-
-            sA[threadIdx.x].polarScaleData  = polarScaleDataJ;
-
-            sA[threadIdx.x].sum             = 0.0f;
-
-#ifdef USE_CUTOFF
-            unsigned int flags              = cSim.pInteractionFlag[pos];
-            if (flags == 0)
-            {
-                // No interactions in this block.
-            }
-//            else if (flags == 0xFFFFFFFF)
-            else if (flags )
-#endif
-            {
-                // Compute all interactions within this block.
-
-                for( unsigned int j = 0; j < GRID; j++ ){
-
-                    dx  = psA[tj].x - apos.x;
-                    dy  = psA[tj].y - apos.y;
-                    dz  = psA[tj].z - apos.z;
-#ifdef USE_PERIODIC
-                    dx -= floorf(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
-                    dy -= floorf(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
-                    dz -= floorf(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
-#endif
-                    r2                      = dx * dx + dy * dy + dz * dz;
-#ifdef USE_CUTOFF
-                    if (i < cSim.atoms && y+tj < cSim.atoms && r2 < cSim.nonbondedCutoffSqr)
-#else
-                    if (i < cSim.atoms && y+tj < cSim.atoms )
-#endif
-                    {
-                        r                       = sqrtf(r2);
-                        float rInverse          = 1.0f / r;
-                        float rScaledRadiusJ    = r + psA[tj].sr;
-                        if (ar.x < rScaledRadiusJ)
-                        {
-                            float l_ij     = 1.0f / max(ar.x, fabs(r - psA[tj].sr));
-                            float u_ij     = 1.0f / rScaledRadiusJ;
-                            float l_ij2    = l_ij * l_ij;
-                            float u_ij2    = u_ij * u_ij;
-                            float ratio    = logf(u_ij / l_ij);
-                            float term     = l_ij - u_ij +
-                                             0.25f * r * (u_ij2 - l_ij2) +
-                                             (0.50f * rInverse * ratio) +
-                                             (0.25f * psA[tj].sr * psA[tj].sr * rInverse) *
-                                             (l_ij2 - u_ij2);
-                            float srj      = psA[tj].sr;
-                            float scale    = psA[tj].polarScaleData;
-                            if (ar.x < (srj - r))
-                            {
-                                term += 2.0f * ((1.0f / ar.x) - l_ij);
-                            }
-                            apos.w        += (scale*term);
-
-                        }
-                        float rScaledRadiusI    = r + ar.y;
-                        if (psA[tj].r < rScaledRadiusI)
-                        {
-                            float l_ij     = 1.0f / max(psA[tj].r, fabs(r - ar.y));
-                            float u_ij     = 1.0f / rScaledRadiusI;
-                            float l_ij2    = l_ij * l_ij;
-                            float u_ij2    = u_ij * u_ij;
-                            float ratio    = logf(u_ij / l_ij);
-                            float term     = l_ij - u_ij +
-                                             0.25f * r * (u_ij2 - l_ij2) +
-                                             (0.50f * rInverse * ratio) +
-                                             (0.25f * ar.y * ar.y * rInverse) *
-                                             (l_ij2 - u_ij2);
-
-                            float rj = psA[tj].r;
-                            if (rj < (ar.y - r))
-                            {
-                                term += 2.0f * ((1.0f / psA[tj].r) - l_ij);
-                            }
-                            psA[tj].sum    += polarScaleDataI*term;
-
-                        }
-                    }
-                    tj = (tj - 1) & (GRID - 1);
-                }
-            }
-#ifdef USE_CUTOFF
-            else
-            {
-                // Compute only a subset of the interactions in this block.
-
-                for (unsigned int j = 0; j < GRID; j++)
-                {
-                    if ((flags&(1<<j)) != 0)
-                    {
-                        tempBuffer[threadIdx.x] = 0.0f;
-                        dx                      = psA[j].x - apos.x;
-                        dy                      = psA[j].y - apos.y;
-                        dz                      = psA[j].z - apos.z;
-#ifdef USE_PERIODIC
-                        dx                     -= floorf(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
-                        dy                     -= floorf(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
-                        dz                     -= floorf(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
-#endif
-                        r2                      = dx * dx + dy * dy + dz * dz;
-#ifdef USE_CUTOFF
-                        if (i < cSim.atoms && y+j < cSim.atoms && r2 < cSim.nonbondedCutoffSqr)
-#else
-                        if (i < cSim.atoms && y+j < cSim.atoms )
-#endif
-                        {
-                            r                       = sqrtf(r2);
-                            float rInverse          = 1.0f / r;
-                            float rScaledRadiusJ    = r + psA[j].sr;
-                            if (ar.x < rScaledRadiusJ)
-                            {
-                                float l_ij     = 1.0f / max(ar.x, fabs(r - psA[j].sr));
-                                float u_ij     = 1.0f / rScaledRadiusJ;
-                                float l_ij2    = l_ij * l_ij;
-                                float u_ij2    = u_ij * u_ij;
-                                float ratio    = logf(u_ij / l_ij);
-                                float term     = l_ij -
-                                                 u_ij +
-                                                 0.25f * r * (u_ij2 - l_ij2) +
-                                                 (0.50f * rInverse * ratio) +
-                                                 (0.25f * psA[j].sr * psA[j].sr * rInverse) *
-                                                 (l_ij2 - u_ij2);
-                                float srj = psA[j].sr;
-                                if (ar.x < (srj - r))
-                                {
-                                    term += 2.0f * ((1.0f / ar.x) - l_ij);
-                                }
-                                apos.w        += psA[j].polarScaleData*term;
-                            }
-                            float rScaledRadiusI    = r + ar.y;
-                            if (psA[j].r < rScaledRadiusI)
-                            {
-                                float l_ij     = 1.0f / max(psA[j].r, fabs(r - ar.y));
-                                float u_ij     = 1.0f / rScaledRadiusI;
-                                float l_ij2    = l_ij * l_ij;
-                                float u_ij2    = u_ij * u_ij;
-                                float ratio    = logf(u_ij / l_ij);
-                                float term     = l_ij -
-                                                 u_ij +
-                                                 0.25f * r * (u_ij2 - l_ij2) +
-                                                 (0.50f * rInverse * ratio) +
-                                                 (0.25f * ar.y * ar.y * rInverse) *
-                                                 (l_ij2 - u_ij2);
-                                float rj = psA[j].r;
-                                if (rj < (ar.y - r))
-                                {
-                                    term += 2.0f * ((1.0f / psA[j].r) - l_ij);
-                                }
-                                tempBuffer[threadIdx.x] = polarScaleDataI*term;
-                            }
-                        }
-
-                        // Sum the terms.
-
-                        if (tgx % 2 == 0)
-                            tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+1];
-                        if (tgx % 4 == 0)
-                            tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+2];
-                        if (tgx % 8 == 0)
-                            tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+4];
-                        if (tgx % 16 == 0)
-                            tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+8];
-                        if (tgx == 0)
-                            psA[j].sum += tempBuffer[threadIdx.x] + tempBuffer[threadIdx.x+16];
-                    }
-                }
-            }
-#endif
-
-            // Write results
-#ifdef USE_OUTPUT_BUFFER_PER_WARP
-            unsigned int offset = x + tgx + warp*cSim.stride;
-            cSim.pBornSum[offset] += apos.w;
-            offset = y + tgx + warp*cSim.stride;
-            cSim.pBornSum[offset] += sA[threadIdx.x].sum;
-#else
-            unsigned int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
-            cSim.pBornSum[offset] = apos.w;
-            offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
-            cSim.pBornSum[offset] = sA[threadIdx.x].sum;
-#endif
-        }
-
-        pos++;
-    }
-}