dded GB/VI to Cuda platform

Free energy plugin added Plugin will not run w/ Obc or GB/VI forces unless line 2004 of gpu.cpp (gpu->sim.totalNonbondOutputBuffers = 2*gpu->sim.nonbondOutputBuffers;) is commented in -- working on removing this constraint Also unit tests for GB/VI currently fail

dded GB/VI to Cuda platform
Free energy plugin added Plugin will not run w/ Obc or GB/VI forces unless line 2004 of gpu.cpp (gpu->sim.totalNonbondOutputBuffers = 2*gpu->sim.nonbondOutputBuffers;) is commented in -- working on removing this constraint Also unit tests for GB/VI currently fail
7a36f461 · Mark Friedrichs · 43ebedfb · 7a36f461 · 7a36f461 · 7a36f461
Commit 7a36f461 authored Nov 03, 2009 by Mark Friedrichs
20 changed files
--- a/plugins/freeEnergy/platforms/cuda/cuda-cmake/empty.depend.in
+++ b/plugins/freeEnergy/platforms/cuda/cuda-cmake/empty.depend.in
--- a/plugins/freeEnergy/platforms/cuda/cuda-cmake/make2cmake.cmake
+++ b/plugins/freeEnergy/platforms/cuda/cuda-cmake/make2cmake.cmake
+#  For more information, please see: http://software.sci.utah.edu
+#
+#  The MIT License
+#
+#  Copyright (c) 2007
+#  Scientific Computing and Imaging Institute, University of Utah
+#
+#  License for the specific language governing rights and limitations under
+#  Permission is hereby granted, free of charge, to any person obtaining a
+#  copy of this software and associated documentation files (the "Software"),
+#  to deal in the Software without restriction, including without limitation
+#  the rights to use, copy, modify, merge, publish, distribute, sublicense,
+#  and/or sell copies of the Software, and to permit persons to whom the
+#  Software is furnished to do so, subject to the following conditions:
+#
+#  The above copyright notice and this permission notice shall be included
+#  in all copies or substantial portions of the Software.
+#
+#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+#  OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+#  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+#  THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+#  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+#  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+#  DEALINGS IN THE SOFTWARE.
+# Make2cmake CMake Script
+# Abe Stephens and James Bigler
+# (c) 2007 Scientific Computing and Imaging Institute, University of Utah
+# Note that the REGEX expressions may need to be tweaked for different dependency generators.
+FILE(READ ${input_file} depend_text)
+IF (${depend_text} MATCHES ".+")
+  # MESSAGE("FOUND DEPENDS")
+  # Remember, four backslashes is escaped to one backslash in the string.
+  STRING(REGEX REPLACE "\\\\ " " " depend_text ${depend_text})
+  # This works for the nvcc -M generated dependency files.
+  STRING(REGEX REPLACE "^.* : " "" depend_text ${depend_text})
+  STRING(REGEX REPLACE "[ \\\\]*\n" ";" depend_text ${depend_text})
+  FOREACH(file ${depend_text})
+    STRING(REGEX REPLACE "^ +" "" file ${file})
+    # IF (EXISTS ${file})
+	  #   MESSAGE("DEPEND = ${file}")    
+    # ELSE (EXISTS ${file})
+	  #   MESSAGE("ERROR = ${file}")
+    # ENDIF(EXISTS ${file})
+    SET(cuda_nvcc_depend "${cuda_nvcc_depend} \"${file}\"\n")
+  ENDFOREACH(file) 
+ELSE(${depend_text} MATCHES ".+") 
+  # MESSAGE("FOUND NO DEPENDS")
+ENDIF(${depend_text} MATCHES ".+")
+FILE(WRITE ${output_file} "# Generated by: make2cmake.cmake\nSET(CUDA_NVCC_DEPEND\n ${cuda_nvcc_depend})\n\n")
--- a/plugins/freeEnergy/platforms/cuda/cuda-cmake/parse_cubin.cmake
+++ b/plugins/freeEnergy/platforms/cuda/cuda-cmake/parse_cubin.cmake
+#  For more information, please see: http://software.sci.utah.edu
+#
+#  The MIT License
+#
+#  Copyright (c) 2007
+#  Scientific Computing and Imaging Institute, University of Utah
+#
+#  License for the specific language governing rights and limitations under
+#  Permission is hereby granted, free of charge, to any person obtaining a
+#  copy of this software and associated documentation files (the "Software"),
+#  to deal in the Software without restriction, including without limitation
+#  the rights to use, copy, modify, merge, publish, distribute, sublicense,
+#  and/or sell copies of the Software, and to permit persons to whom the
+#  Software is furnished to do so, subject to the following conditions:
+#
+#  The above copyright notice and this permission notice shall be included
+#  in all copies or substantial portions of the Software.
+#
+#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+#  OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+#  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+#  THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+#  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+#  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+#  DEALINGS IN THE SOFTWARE.
+# .cubin Parsing CMake Script
+# Abe Stephens
+# (c) 2007 Scientific Computing and Imaging Institute, University of Utah
+FILE(READ ${input_file} file_text)
+IF (${file_text} MATCHES ".+")
+  # Remember, four backslashes is escaped to one backslash in the string.
+  STRING(REGEX REPLACE ";" "\\\\;" file_text ${file_text})
+  STRING(REGEX REPLACE "\ncode" ";code" file_text ${file_text})
+  LIST(LENGTH file_text len)
+  FOREACH(line ${file_text})
+    # Only look at "code { }" blocks.
+    IF(line MATCHES "^code")
+      # Break into individual lines.
+      STRING(REGEX REPLACE "\n" ";" line ${line})
+      FOREACH(entry ${line})
+        # Extract kernel names.
+        IF (${entry} MATCHES "[^g]name = ([^ ]+)")
+          STRING(REGEX REPLACE ".* = ([^ ]+)" "\\1" entry ${entry})
+          # Check to see if the kernel name starts with "_"
+          SET(skip FALSE)
+          # IF (${entry} MATCHES "^_")
+            # Skip the rest of this block.
+            # MESSAGE("Skipping ${entry}")
+            # SET(skip TRUE)
+          # ELSE (${entry} MATCHES "^_")
+            MESSAGE("Kernel:    ${entry}")  
+          # ENDIF (${entry} MATCHES "^_")
+        ENDIF(${entry} MATCHES "[^g]name = ([^ ]+)")
+        # Skip the rest of the block if necessary
+        IF(NOT skip)
+          # Registers
+          IF (${entry} MATCHES "reg = ([^ ]+)")
+            STRING(REGEX REPLACE ".* = ([^ ]+)" "\\1" entry ${entry})
+            MESSAGE("Registers: ${entry}")
+          ENDIF(${entry} MATCHES "reg = ([^ ]+)")
+          # Local memory
+          IF (${entry} MATCHES "lmem = ([^ ]+)")
+            STRING(REGEX REPLACE ".* = ([^ ]+)" "\\1" entry ${entry})
+            MESSAGE("Local:     ${entry}")
+          ENDIF(${entry} MATCHES "lmem = ([^ ]+)")
+          # Shared memory
+          IF (${entry} MATCHES "smem = ([^ ]+)")
+            STRING(REGEX REPLACE ".* = ([^ ]+)" "\\1" entry ${entry})
+            MESSAGE("Shared:    ${entry}")
+          ENDIF(${entry} MATCHES "smem = ([^ ]+)")
+          IF (${entry} MATCHES "^}")
+            MESSAGE("")
+          ENDIF(${entry} MATCHES "^}")
+        ENDIF(NOT skip)
+      ENDFOREACH(entry)
+    ENDIF(line MATCHES "^code")
+  ENDFOREACH(line) 
+ELSE(${file_text} MATCHES ".+") 
+  # MESSAGE("FOUND NO DEPENDS")
+ENDIF(${file_text} MATCHES ".+")
--- a/plugins/freeEnergy/platforms/cuda/include/CudaFreeEnergyKernelFactory.h
+++ b/plugins/freeEnergy/platforms/cuda/include/CudaFreeEnergyKernelFactory.h
+#ifndef OPENMM_FREE_ENERGY_CUDA_KERNEL_FACTORY_H_
+#define OPENMM_FREE_ENERGY_CUDA_KERNEL_FACTORY_H_
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "openmm/KernelFactory.h"
+namespace OpenMM {
+/**
+ * This KernelFactory creates all kernels for CudaFreeEnergyPlatform.
+ */
+class CudaFreeEnergyKernelFactory : public KernelFactory {
+public:
+    KernelImpl* createKernelImpl(std::string name, const Platform& platform, ContextImpl& context) const;
+};
+} // namespace OpenMM
+#endif /*OPENMM_FREE_ENERGY_CUDA_KERNEL_FACTORY_H_*/
--- a/plugins/freeEnergy/platforms/cuda/sharedTarget/CMakeLists.txt
+++ b/plugins/freeEnergy/platforms/cuda/sharedTarget/CMakeLists.txt
+#
+# Include CUDA related files.
+#
+# ----------------------------------------------------------------------------
+# logging
+SET(LOG TRUE)
+IF(LOG)
+   SET(LOG_FILE "CMakeLog.txt" )
+   FILE( WRITE ${LOG_FILE} "In freeEnergy/platforms/cuda/sharedTarget Cmake\n")
+#  FILE( APPEND ${LOG_FILE} "BROOK_LIB_PATH=${BROOK_LIB_PATH}\n")
+ENDIF(LOG)
+IF(LOG)
+   MACRO(LOG_DIR LOG_FILE DIR_LIST )
+      FILE( APPEND ${LOG_FILE} "\n${DIR_LIST}\n")
+      FOREACH(currentFile ${ARGN})
+         FILE( APPEND ${LOG_FILE} "   ${currentFile}\n" )
+      ENDFOREACH(currentFile)
+   ENDMACRO(LOG_DIR) 
+ENDIF(LOG)
+# ----------------------------------------------------------------------------
+SET(OPENMM_BUILD_FREE_ENERGY_PATH ${CMAKE_SOURCE_DIR}/plugins/freeEnergy) 
+# ----------------------------------------------------------------------------
+IF(LOG)
+   LOG_DIR( ${LOG_FILE} "Pre OPENMM_SOURCE_SUBDIRS" ${OPENMM_SOURCE_SUBDIRS} )
+   LOG_DIR( ${LOG_FILE} "Pre OPENMM_FREE_ENERGY_SOURCE_SUBDIRS " ${OPENMM_FREE_ENERGY_SOURCE_SUBDIRS} )
+   LOG_DIR( ${LOG_FILE} "Pre SOURCE_FILES" ${SOURCE_FILES} )
+ENDIF(LOG)
+## ----------------------------------------------------------------------------
+SET(CUDA_NVCC_BUILD_FLAGS) 
+INCLUDE(${FINDCUDA_DIR}/FindCuda.cmake)
+INCLUDE_DIRECTORIES(${CUDA_INCLUDE})
+LINK_DIRECTORIES(${CUDA_TARGET_LINK})
+FOREACH(subdir ${OPENMM_FREE_ENERGY_SOURCE_SUBDIRS})
+    FILE(GLOB src_files ${OPENMM_BUILD_FREE_ENERGY_PATH}/platforms/cuda/${subdir}/src/*.cu ${OPENMM_BUILD_FREE_ENERGY_PATH}/platforms/cuda/src/*/*.cu)
+    FOREACH(file ${src_files})
+        FILE(RELATIVE_PATH file ${OPENMM_BUILD_FREE_ENERGY_PATH}/platforms/cuda ${file})
+        SET(SOURCE_FILES         ${SOURCE_FILES}         ${file})   #append
+    ENDFOREACH(file)
+    CUDA_INCLUDE_DIRECTORIES(BEFORE ${OPENMM_BUILD_FREE_ENERGY_PATH}/platforms/cuda/../${subdir}/include)
+ENDFOREACH(subdir)
+# ----------------------------------------------------------------------------
+IF(LOG)
+   LOG_DIR( ${LOG_FILE} "OPENMM_BUILD_FREE_ENERGY_PATH" ${OPENMM_BUILD_FREE_ENERGY_PATH} )
+   FILE( APPEND ${LOG_FILE} "OPENMM_BUILD_FREE_ENERGY_PATH=${OPENMM_BUILD_FREE_ENERGY_PATH}\n")
+   LOG_DIR( ${LOG_FILE} "OPENMM_SOURCE_SUBDIRS" ${OPENMM_SOURCE_SUBDIRS} )
+   LOG_DIR( ${LOG_FILE} "CMAKE_SOURCE_DIR" ${CMAKE_SOURCE_DIR} )
+   LOG_DIR( ${LOG_FILE} "CUDA_INCLUDE" ${CUDA_INCLUDE} )
+   LOG_DIR( ${LOG_FILE} "CUDA_TARGET_LINK" ${CUDA_TARGET_LINK} )
+   LOG_DIR( ${LOG_FILE} "SHARED_TARGET" ${SHARED_TARGET} )
+   LOG_DIR( ${LOG_FILE} "OPENMM_DIR" ${OPENMM_DIR} )
+   LOG_DIR( ${LOG_FILE} "SOURCE_FILES" ${SOURCE_FILES} )
+ENDIF(LOG)
+## ----------------------------------------------------------------------------
+# CUDA_INCLUDE_DIRECTORIES(BEFORE ${CMAKE_SOURCE_DIR}/jama/include)
+CUDA_INCLUDE_DIRECTORIES(${OPENMM_BUILD_FREE_ENERGY_PATH}/platforms/cuda/../src
+                         ${OPENMM_DIR}/platforms/cuda/src
+                         ${OPENMM_DIR}/platforms/cuda/src/kernels
+                         ${OPENMM_DIR}/openmmapi/include ) 
+CUDA_ADD_LIBRARY(${SHARED_TARGET} SHARED ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})
+TARGET_LINK_LIBRARIES(${SHARED_TARGET} debug ${OPENMM_LIBRARY_NAME}_d optimized ${OPENMM_LIBRARY_NAME} )
+SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES COMPILE_FLAGS "-DOPENMM_BUILDING_SHARED_LIBRARY")
+INSTALL_TARGETS(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${SHARED_TARGET})
--- a/plugins/freeEnergy/platforms/cuda/src/CudaFreeEnergyKernelFactory.cpp
+++ b/plugins/freeEnergy/platforms/cuda/src/CudaFreeEnergyKernelFactory.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "CudaFreeEnergyKernelFactory.h"
+#include "CudaFreeEnergyKernels.h"
+#include "openmm/freeEnergyKernels.h"
+#include "openmm/internal/ContextImpl.h"
+#include "openmm/OpenMMException.h"
+#if defined(OPENMM_BUILDING_SHARED_LIBRARY)
+    #if defined(WIN32)
+      #include <windows.h>
+        extern "C" void initOpenMMCudaFreeEnergyPlugin();
+        BOOL WINAPI DllMain(HANDLE hModule, DWORD  ul_reason_for_call, LPVOID lpReserved) {
+            if (ul_reason_for_call == DLL_PROCESS_ATTACH)
+                initOpenMMCudaFreeEnergyPlugin();
+            return TRUE;
+        }   
+    #else
+        extern "C" void __attribute__((constructor)) initOpenMMCudaFreeEnergyPlugin();
+    #endif
+#endif
+using namespace OpenMM;
+extern "C" void initOpenMMCudaFreeEnergyPlugin() {
+    if ( gpuIsAvailable() ){
+        CudaPlatform* cudaPlatform            = new CudaPlatform();
+        CudaFreeEnergyKernelFactory* factory  = new CudaFreeEnergyKernelFactory();
+        cudaPlatform->registerKernelFactory(CalcNonbondedSoftcoreForceKernel::Name(), factory);
+        cudaPlatform->registerKernelFactory(CalcGBSAOBCSoftcoreForceKernel::Name(), factory);
+        cudaPlatform->registerKernelFactory(CalcGBVISoftcoreForceKernel::Name(), factory);
+        Platform::registerPlatform(cudaPlatform);
+    }
+}
+KernelImpl* CudaFreeEnergyKernelFactory::createKernelImpl(std::string name, const Platform& platform, ContextImpl& context) const {
+    CudaPlatform::PlatformData& data = *static_cast<CudaPlatform::PlatformData*>(context.getPlatformData());
+    if (name == CalcNonbondedSoftcoreForceKernel::Name())
+        return new CudaFreeEnergyCalcNonbondedSoftcoreForceKernel(name, platform, data, context.getSystem());
+    if (name == CalcGBSAOBCSoftcoreForceKernel::Name())
+        return new CudaFreeEnergyCalcGBSAOBCSoftcoreForceKernel(name, platform, data);
+    if (name == CalcGBVISoftcoreForceKernel::Name())
+        return new CudaFreeEnergyCalcGBVISoftcoreForceKernel(name, platform, data);
+    throw OpenMMException( (std::string("Tried to create kernel with illegal kernel name '") + name + "'").c_str() );
+}
--- a/plugins/freeEnergy/platforms/cuda/src/CudaFreeEnergyKernels.cpp
+++ b/plugins/freeEnergy/platforms/cuda/src/CudaFreeEnergyKernels.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008-2009 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "CudaFreeEnergyKernels.h"
+#include "openmm/Context.h"
+#include "openmm/OpenMMException.h"
+#include "openmm/internal/ContextImpl.h"
+#include "kernels/gputypes.h"
+#include "kernels/cudaKernels.h"
+#include "kernels/GpuFreeEnergyCudaKernels.h" 
+#include "kernels/GpuLJ14Softcore.h" 
+#include <cmath>
+using namespace OpenMM;
+#include <map>
+typedef std::map< std::string, int > MapStringInt;
+typedef MapStringInt::iterator MapStringIntI;
+typedef MapStringInt::const_iterator MapStringIntCI;
+// force names
+const std::string HARMONIC_BOND_FORCE             = "HarmonicBond";
+const std::string HARMONIC_ANGLE_FORCE            = "HarmonicBond";
+const std::string PERIODIC_TORSION_FORCE          = "PeriodicTorsion";
+const std::string RB_TORSION_FORCE                = "RbTorsion";
+const std::string NB_FORCE                        = "Nb";
+const std::string NB_SOFTCORE_FORCE               = "NbSoftcore";
+const std::string NB_EXCEPTION_FORCE              = "NbException";
+const std::string NB_EXCEPTION_SOFTCORE_FORCE     = "NbSoftcoreException";
+const std::string GBSA_OBC_FORCE                  = "Obc";
+const std::string GBSA_OBC_SOFTCORE_FORCE         = "ObcSoftcore";
+const std::string GBVI_FORCE                      = "GBVI";
+const std::string GBVI_SOFTCORE_FORCE             = "GBVISoftcore";
+static void getForceMap(const System& system, MapStringInt& forceMap) { 
+    // check forces and relevant parameters
+    for(int i = 0; i < system.getNumForces(); ++i) {
+        int hit                 = 0;
+        const Force& force      = system.getForce(i);
+        // bond
+        if( !hit ){
+            try {
+               const HarmonicBondForce& harmonicBondForce = dynamic_cast<const HarmonicBondForce&>(force);
+               forceMap[HARMONIC_BOND_FORCE]              = 1;
+               hit++;
+            } catch( std::bad_cast ){
+            }
+        }
+        // angle
+        if( !hit ){
+            try {
+               const HarmonicAngleForce& harmonicAngleForce = dynamic_cast<const HarmonicAngleForce&>(force);
+               forceMap[HARMONIC_ANGLE_FORCE]               = 1;
+               hit++;
+            } catch( std::bad_cast ){
+            }
+        }
+        // PeriodicTorsionForce
+        if( !hit ){
+            try {
+               const PeriodicTorsionForce & periodicTorsionForce = dynamic_cast<const PeriodicTorsionForce&>(force);
+               forceMap[PERIODIC_TORSION_FORCE]                  = 1;
+               hit++;
+            } catch( std::bad_cast ){
+            }
+        }
+        // RBTorsionForce
+        if( !hit ){
+            try {
+               const RBTorsionForce& rBTorsionForce = dynamic_cast<const RBTorsionForce&>(force);
+               forceMap[RB_TORSION_FORCE]           = 1;
+               hit++;
+            } catch( std::bad_cast ){
+            }
+        }
+        // nonbonded
+        if( !hit ){
+            try {
+               const NonbondedForce& nbForce = dynamic_cast<const NonbondedForce&>(force);
+               forceMap[NB_FORCE]            = 1;
+            } catch( std::bad_cast ){
+            }
+        }
+        // nonbonded softcore
+        if( !hit ){
+            try {
+               const NonbondedSoftcoreForce& nbForce = dynamic_cast<const NonbondedSoftcoreForce&>(force);
+               forceMap[NB_SOFTCORE_FORCE]           = 1;
+            } catch( std::bad_cast ){
+            }
+        }
+        // GBSA OBC
+        if( !hit ){
+            try {
+               const GBSAOBCForce& obcForce       = dynamic_cast<const GBSAOBCForce&>(force);
+               forceMap[GBSA_OBC_FORCE]           = 1;
+               hit++;
+            } catch( std::bad_cast ){
+            }
+        }
+        // GBSA OBC softcore
+        if( !hit ){
+            try {
+               const GBSAOBCSoftcoreForce& obcForce = dynamic_cast<const GBSAOBCSoftcoreForce&>(force);
+               forceMap[GBSA_OBC_SOFTCORE_FORCE]    = 1;
+               hit++;
+            } catch( std::bad_cast ){
+            }
+        }
+        // GB/VI
+        if( !hit ){
+            try {
+               const GBVIForce& obcForce  = dynamic_cast<const GBVIForce&>(force);
+               forceMap[GBVI_FORCE] = 1;
+               hit++;
+            } catch( std::bad_cast ){
+            }
+        }
+        // GB/VI softcore
+        if( !hit ){
+            try {
+               const GBVISoftcoreForce& gbviForce = dynamic_cast<const GBVISoftcoreForce&>(force);
+               forceMap[GBVI_SOFTCORE_FORCE]      = 1;
+               hit++;
+            } catch( std::bad_cast ){
+            }
+        }
+     }
+}
+CudaFreeEnergyCalcNonbondedSoftcoreForceKernel::~CudaFreeEnergyCalcNonbondedSoftcoreForceKernel() {
+    if( log ){
+        (void) fprintf( log, "CudaFreeEnergyCalcNonbondedSoftcoreForceKernel destructor called.\n" );
+        (void) fflush( log );
+    }
+    delete gpuNonbondedSoftcore;
+    delete gpuLJ14Softcore;
+}
+void CudaFreeEnergyCalcNonbondedSoftcoreForceKernel::initialize(const System& system, const NonbondedSoftcoreForce& force) {
+// ---------------------------------------------------------------------------------------
+   static const std::string methodName      = "CudaFreeEnergyCalcNonbondedSoftcoreForceKernel::initialize";
+// ---------------------------------------------------------------------------------------
+    if( log ){
+        (void) fprintf( log, "%s called.\n", methodName.c_str() );
+        (void) fflush( log );
+    }
+    // check forces and relevant parameters
+    MapStringInt forceMap;
+    getForceMap( system, forceMap);
+    int softcore        = 0;
+    if( forceMap.find( GBSA_OBC_FORCE ) != forceMap.end() ){
+       setIncludeGBSA( true );
+    }
+    if( forceMap.find( GBSA_OBC_SOFTCORE_FORCE ) != forceMap.end() ){
+       setIncludeGBSA( true );
+       softcore++;
+    }
+    if( forceMap.find( GBVI_FORCE ) != forceMap.end() ){
+       setIncludeGBVI( true );
+    }
+    if( forceMap.find( GBVI_SOFTCORE_FORCE ) != forceMap.end() ){
+       setIncludeGBVI( true );
+       softcore++;
+    }
+    if( forceMap.find( NB_SOFTCORE_FORCE ) != forceMap.end() ){
+       softcore++;
+    }
+    setIncludeSoftcore( softcore );
+    numParticles      = force.getNumParticles();
+    _gpuContext* gpu  = data.gpu;
+    // Identify which exceptions are 1-4 interactions.
+    std::vector<pair<int, int> > exclusions;
+    std::vector<int> exceptions;
+    for (int i = 0; i < force.getNumExceptions(); i++) {
+        int particle1, particle2;
+        double chargeProd, sigma, epsilon, softcoreLJLambda;
+        force.getExceptionParameters(i, particle1, particle2, chargeProd, sigma, epsilon, softcoreLJLambda);
+        exclusions.push_back(pair<int, int>(particle1, particle2));
+        if (chargeProd != 0.0 || epsilon != 0.0)
+            exceptions.push_back(i);
+    }
+    // Initialize nonbonded interactions.
+    {
+        std::vector<int> particle(numParticles);
+        std::vector<float> c6(numParticles);
+        std::vector<float> c12(numParticles);
+        std::vector<float> q(numParticles);
+        std::vector<float> softcoreLJLambdaArray(numParticles);
+        std::vector<char> symbol;
+        std::vector<std::vector<int> > exclusionList(numParticles);
+        float minSoftcoreLJLambda = 1.0e+20f;
+        for (int i = 0; i < numParticles; i++) {
+            double charge, radius, depth, softcoreLJLambda;
+            force.getParticleParameters(i, charge, radius, depth, softcoreLJLambda);
+            particle[i]              = i;
+            q[i]                     = static_cast<float>( charge );
+            c6[i]                    = static_cast<float>( (4*depth*pow(radius, 6.0)) );
+            c12[i]                   = static_cast<float>( (4*depth*pow(radius, 12.0)) );
+            softcoreLJLambdaArray[i] = static_cast<float>( softcoreLJLambda );
+            if( minSoftcoreLJLambda > softcoreLJLambda ){
+               minSoftcoreLJLambda = softcoreLJLambda;
+            }
+            exclusionList[i].push_back(i);
+        }
+        for (int i = 0; i < (int)exclusions.size(); i++) {
+            exclusionList[exclusions[i].first].push_back(exclusions[i].second);
+            exclusionList[exclusions[i].second].push_back(exclusions[i].first);
+        }
+        Vec3 boxVectors[3];
+        system.getPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
+        gpuSetPeriodicBoxSize(gpu, static_cast<float>(boxVectors[0][0] ), static_cast<float>(boxVectors[1][1] ), static_cast<float>(boxVectors[2][2] ));
+        CudaNonbondedMethod method = NO_CUTOFF;
+        if (force.getNonbondedMethod() != NonbondedSoftcoreForce::NoCutoff) {
+            gpuSetNonbondedCutoff(gpu, static_cast<float>(force.getCutoffDistance() ), force.getReactionFieldDielectric());
+            method = CUTOFF;
+        }
+        if (force.getNonbondedMethod() == NonbondedSoftcoreForce::CutoffPeriodic) {
+            method = PERIODIC;
+        }
+        if (force.getNonbondedMethod() == NonbondedSoftcoreForce::Ewald || force.getNonbondedMethod() == NonbondedSoftcoreForce::PME) {
+            double ewaldErrorTol = force.getEwaldErrorTolerance();
+            double alpha = (1.0/force.getCutoffDistance())*std::sqrt(-std::log(ewaldErrorTol));
+            double mx = boxVectors[0][0]/force.getCutoffDistance();
+            double my = boxVectors[1][1]/force.getCutoffDistance();
+            double mz = boxVectors[2][2]/force.getCutoffDistance();
+            double pi = 3.1415926535897932385;
+            int kmaxx = (int)std::ceil(-(mx/pi)*std::log(ewaldErrorTol));
+            int kmaxy = (int)std::ceil(-(my/pi)*std::log(ewaldErrorTol));
+            int kmaxz = (int)std::ceil(-(mz/pi)*std::log(ewaldErrorTol));
+            if (force.getNonbondedMethod() == NonbondedSoftcoreForce::Ewald) {
+                if (kmaxx%2 == 0)
+                    kmaxx++;
+                if (kmaxy%2 == 0)
+                    kmaxy++;
+                if (kmaxz%2 == 0)
+                    kmaxz++;
+                gpuSetEwaldParameters(gpu, static_cast<float>( alpha ), kmaxx, kmaxy, kmaxz);
+                method = EWALD;
+            }
+            else {
+                int gridSizeX = kmaxx*3;
+                int gridSizeY = kmaxy*3;
+                int gridSizeZ = kmaxz*3;
+                gridSizeX = ((gridSizeX+3)/4)*4;
+                gridSizeY = ((gridSizeY+3)/4)*4;
+                gridSizeZ = ((gridSizeZ+3)/4)*4;
+                gpuSetPMEParameters(gpu, static_cast<float>( alpha ), gridSizeX, gridSizeY, gridSizeZ);
+                method = PARTICLE_MESH_EWALD;
+            }
+        }
+        data.nonbondedMethod = method;
+        // setup parameters
+        gpuNonbondedSoftcore = gpuSetNonbondedSoftcoreParameters(gpu, 138.935485f, particle, c6, c12, q,
+                                                                 softcoreLJLambdaArray, symbol, exclusionList, method);
+        // Compute the Ewald self energy.
+        data.ewaldSelfEnergy = 0.0;
+        if (force.getNonbondedMethod() == NonbondedSoftcoreForce::Ewald || force.getNonbondedMethod() == NonbondedSoftcoreForce::PME) {
+            double selfEnergyScale = gpu->sim.epsfac*gpu->sim.alphaEwald/std::sqrt(PI);
+                for (int i = 0; i < numParticles; i++)
+                    data.ewaldSelfEnergy -= selfEnergyScale*q[i]*q[i];
+        }
+    }
+    // Initialize 1-4 nonbonded interactions.
+    {
+        numExceptions = exceptions.size();
+        std::vector<int> particle1(numExceptions);
+        std::vector<int> particle2(numExceptions);
+        std::vector<float> c6(numExceptions);
+        std::vector<float> c12(numExceptions);
+        std::vector<float> q1(numExceptions);
+        std::vector<float> q2(numExceptions);
+        std::vector<float> softcoreLJLambdaArray(numExceptions);
+        for (int i = 0; i < numExceptions; i++) {
+            double charge, sig, eps, softcoreLJLambda;
+            force.getExceptionParameters(exceptions[i], particle1[i], particle2[i], charge, sig, eps, softcoreLJLambda);
+            c6[i]                    = static_cast<float>( (4*eps*pow(sig, 6.0)) );
+            c12[i]                   = static_cast<float>( (4*eps*pow(sig, 12.0)) );
+            q1[i]                    = static_cast<float>( charge );
+            q2[i]                    = 1.0f;
+            softcoreLJLambdaArray[i] = static_cast<float>( softcoreLJLambda );
+        }
+        gpuLJ14Softcore = gpuSetLJ14SoftcoreParameters(gpu, 138.935485f, 1.0f, particle1, particle2, c6, c12, q1, q2, softcoreLJLambdaArray);
+    }
+}
+void CudaFreeEnergyCalcNonbondedSoftcoreForceKernel::executeForces(ContextImpl& context) {
+// ---------------------------------------------------------------------------------------
+   static const std::string methodName      = "CudaFreeEnergyCalcNonbondedSoftcoreForceKernel::executeForces";
+// ---------------------------------------------------------------------------------------
+    _gpuContext* gpu = data.gpu;
+    // write array, ... address's to board
+    if( setSim == 0 ){
+        setSim++;
+        if( log ){
+            (void) fprintf( log, "%s Obc=%d GB/VI=%d exceptions=%d\n",
+                            methodName.c_str(), getIncludeGBSA(), getIncludeGBVI(), getNumExceptions() );
+            (void) fflush( log );
+        }
+        SetCalculateLocalSoftcoreGpuSim( gpu );
+        SetCalculateCDLJSoftcoreGpuSim( gpu );
+// (void) fprintf( log, "Calling SetCalculateLocalSoftcoreGpuSim\n" ); fflush( stderr );
+        // flip strides (unsure if this is needed)
+#if 0
+        (void) fprintf( stderr, "flipping gpuLJ14Softcore\n" ); fflush( stderr );
+        GpuLJ14Softcore* gpuLJ14Softcore = getGpuLJ14Softcore( );
+        if( gpuLJ14Softcore ){
+            gpuLJ14Softcore->flipStrides( gpu );
+            if( log ){
+                (void) fprintf( log, "CudaFreeEnergyCalcNonbondedSoftcoreForceKernel::executeForces flipping LJ14\n" );
+                (void) fflush( log );
+            }
+        }
+#endif
+    }
+    // calculate nonbonded ixns here, only if implicit solvent is inactive
+    if ( !getIncludeGBSA() && !getIncludeGBVI() ) {
+//kClearForces(gpu);
+        kCalculateCDLJSoftcoreForces(gpu);
+    }
+    // local LJ-14 forces
+    kCalculateLocalSoftcoreForces(gpu);
+//kReduceForces(gpu);
+//exit(0);
+}
+double CudaFreeEnergyCalcNonbondedSoftcoreForceKernel::executeEnergy(ContextImpl& context) {
+    executeForces(context);
+    return 0.0;
+}
+bool CudaFreeEnergyCalcNonbondedSoftcoreForceKernel::getIncludeGBSA( void ) const {
+    return bIncludeGBSA;
+}
+void CudaFreeEnergyCalcNonbondedSoftcoreForceKernel::setIncludeGBSA( bool inputIncludeGBSA ){
+    bIncludeGBSA = inputIncludeGBSA;
+}
+bool CudaFreeEnergyCalcNonbondedSoftcoreForceKernel::getIncludeGBVI( void ) const {
+    return bIncludeGBVI;
+}
+void CudaFreeEnergyCalcNonbondedSoftcoreForceKernel::setIncludeGBVI( bool inputIncludeGBVI ){
+    bIncludeGBVI = inputIncludeGBVI;
+}
+bool CudaFreeEnergyCalcNonbondedSoftcoreForceKernel::getIncludeSoftcore( void ) const {
+    return bIncludeSoftcore;
+}
+int CudaFreeEnergyCalcNonbondedSoftcoreForceKernel::getNumExceptions( void ) const {
+    return numExceptions;
+}
+void CudaFreeEnergyCalcNonbondedSoftcoreForceKernel::setIncludeSoftcore( bool inputIncludeSoftcore ){
+    bIncludeSoftcore = inputIncludeSoftcore;
+}
+GpuLJ14Softcore* CudaFreeEnergyCalcNonbondedSoftcoreForceKernel::getGpuLJ14Softcore( void ) const {
+    return gpuLJ14Softcore;
+}
+void CudaFreeEnergyCalcNonbondedSoftcoreForceKernel::setGpuLJ14Softcore( GpuLJ14Softcore* inputGpuLJ14Softcore ){
+    gpuLJ14Softcore = inputGpuLJ14Softcore;
+}
+CudaFreeEnergyCalcGBSAOBCSoftcoreForceKernel::~CudaFreeEnergyCalcGBSAOBCSoftcoreForceKernel() {
+    delete gpuObcGbsaSoftcore;
+}
+void CudaFreeEnergyCalcGBSAOBCSoftcoreForceKernel::initialize(const System& system, const GBSAOBCSoftcoreForce& force) {
+// ---------------------------------------------------------------------------------------
+   //static const std::string methodName      = "CudaFreeEnergyCalcGBSAOBCSoftcoreForceKernel::initialize";
+// ---------------------------------------------------------------------------------------
+    _gpuContext* gpu = data.gpu;
+    MapStringInt forceMap;
+    getForceMap( system, forceMap);
+    // check that nonbonded (non-softcore is not active)
+    if( forceMap.find( NB_FORCE ) != forceMap.end() ){ 
+        throw OpenMMException( "Mixing NonbondedForce and GBSAOBCSoftoreForce not allowed -- use NonbondedSoftcoreForce " );
+    }
+    int numParticles = system.getNumParticles();
+    std::vector<float> radius(numParticles);
+    std::vector<float> scale(numParticles);
+    std::vector<float> charge(numParticles);
+    std::vector<float> nonPolarScalingFactors(numParticles);
+    for (int i = 0; i < numParticles; i++) {
+        double particleCharge, particleRadius, scalingFactor, particleNonPolarScalingFactor;
+        force.getParticleParameters(i, particleCharge, particleRadius, scalingFactor, particleNonPolarScalingFactor);
+        radius[i]                 = static_cast<float>( particleRadius);
+        scale[i]                  = static_cast<float>( scalingFactor);
+        charge[i]                 = static_cast<float>( particleCharge);
+        nonPolarScalingFactors[i] = static_cast<float>( particleNonPolarScalingFactor);
+    }
+    gpuObcGbsaSoftcore = gpuSetObcSoftcoreParameters(gpu, static_cast<float>( force.getSoluteDielectric()),
+                                                     static_cast<float>( force.getSolventDielectric()),
+                                                     radius, scale, charge, nonPolarScalingFactors  );
+}
+void CudaFreeEnergyCalcGBSAOBCSoftcoreForceKernel::executeForces(ContextImpl& context) {
+// ---------------------------------------------------------------------------------------
+   static const std::string methodName      = "CudaFreeEnergyCalcGBSAOBCSoftcoreForceKernel::executeForces";
+// ---------------------------------------------------------------------------------------
+    _gpuContext* gpu = data.gpu;
+    int debug        = 1;
+    // send address's of arrays, ... to device on first call
+    // required since force/energy buffers not set when CudaFreeEnergyCalcGBSAOBCSoftcoreForceKernel::initialize() was called
+    if( setSim == 0 ){
+       setSim++;
+       SetCalculateObcGbsaSoftcoreBornSumSim( gpu );
+       SetCalculateCDLJObcGbsaSoftcoreGpu1Sim( gpu );
+       //SetCalculateObcGbsaForces2Sim( gpu );
+       SetCalculateObcGbsaSoftcoreForces2Sim( gpu );
+    }
+    // required!!
+    gpu->bRecalculateBornRadii = true;
+    // calculate Born radii and first loop of Obc forces
+    if( debug && log ){
+        (void) fprintf( stderr, "\n%s: calling kCalculateCDLJObcGbsaSoftcoreForces1\n", methodName.c_str() );
+        (void) fflush( stderr );
+    }
+    kClearBornForces(gpu);
+    kCalculateObcGbsaSoftcoreBornSum(gpu);
+    kReduceObcGbsaBornSum(gpu);
+    kCalculateCDLJObcGbsaSoftcoreForces1(gpu);
+    if( debug && log ){
+        (void) fprintf( stderr, "\n%s: calling kReduceObcGbsaBornForces\n", methodName.c_str()  );
+        (void) fflush( stderr );
+    }
+    // compute Born forces
+    gpu->bIncludeGBSA = true;
+    kReduceObcGbsaBornForces(gpu);
+    gpu->bIncludeGBSA = false;
+    if( debug && log ){
+        (void) fprintf( stderr, "\n%s calling kCalculateObcGbsaForces2\n", methodName.c_str() );
+        (void) fflush( stderr );
+    }
+    // second loop of Obc GBSA forces
+    //kCalculateObcGbsaForces2(gpu);
+    kCalculateObcGbsaSoftcoreForces2(gpu);
+}
+double CudaFreeEnergyCalcGBSAOBCSoftcoreForceKernel::executeEnergy(ContextImpl& context) {
+    executeForces( context );
+	 return 0.0;
+}
+CudaFreeEnergyCalcGBVISoftcoreForceKernel::~CudaFreeEnergyCalcGBVISoftcoreForceKernel() {
+    if( log ){
+        (void) fprintf( log, "CudaFreeEnergyCalcGBVISoftcoreForceKernel destructor called -- freeing gpuGBVISoftcore.\n" );
+        (void) fflush( log );
+    }
+    delete gpuGBVISoftcore;
+}
+void CudaFreeEnergyCalcGBVISoftcoreForceKernel::initialize(const System& system, const GBVISoftcoreForce& force, const std::vector<double> & inputScaledRadii) {
+// ---------------------------------------------------------------------------------------
+   //static const std::string methodName      = "CudaFreeEnergyCalcGBVISoftcoreForceKernel::initialize";
+// ---------------------------------------------------------------------------------------
+    int numParticles = system.getNumParticles();
+    _gpuContext* gpu = data.gpu;
+    // check forces and relevant parameters
+    MapStringInt forceMap;
+    getForceMap( system, forceMap);
+    // check that nonbonded (non-softcore is not active)
+    if( forceMap.find( NB_FORCE ) != forceMap.end() ){ 
+        throw OpenMMException( "Mixing NonbondedForce and GBVISoftoreForce not allowed -- use NonbondedSoftcoreForce " );
+    }
+    std::vector<int>   particle(numParticles);
+    std::vector<float> radius(numParticles);
+    std::vector<float> scaledRadii(numParticles);
+    std::vector<float> gammas(numParticles);
+    std::vector<float> bornRadiusScaleFactors(numParticles);
+    for (int i = 0; i < numParticles; i++) {
+        double charge, particleRadius, gamma, bornRadiusScaleFactor;
+        force.getParticleParameters(i, charge, particleRadius, gamma, bornRadiusScaleFactor);
+        particle[i]                  = i;
+        radius[i]                    = static_cast<float>( particleRadius );
+        gammas[i]                    = static_cast<float>( gamma );
+        scaledRadii[i]               = static_cast<float>( inputScaledRadii[i] );
+        bornRadiusScaleFactors[i]    = static_cast<float>( bornRadiusScaleFactor );
+    }
+    // tanh not implemented
+//    std::vector<float> tanhScaleFactors;
+    std::vector<float> quinticSplineParameters;
+    if( force.getBornRadiusScalingMethod() == GBVISoftcoreForce::Tanh ){
+/*
+        double alpha, beta, gamma;
+        force.getTanhParameters( alpha, beta, gamma );
+        tanhScaleFactors.resize( 3 );
+        tanhScaleFactors[0] = static_cast<float>(alpha);
+        tanhScaleFactors[1] = static_cast<float>(beta);
+        tanhScaleFactors[2] = static_cast<float>(gamma);
+*/
+    } else if( force.getBornRadiusScalingMethod() == GBVISoftcoreForce::QuinticSpline ){
+        // quintic spline
+        quinticSplineParameters.resize(2);
+        quinticSplineParameters[0] = static_cast<float>(force.getQuinticLowerLimitFactor());
+        quinticSplineParameters[1] = static_cast<float>(force.getQuinticUpperBornRadiusLimit());
+        quinticSplineParameters[1] = powf( quinticSplineParameters[1], -3.0f ); 
+        setQuinticScaling( 1 );
+    }
+    // load parameters onto board
+    // defined in kCalculateGBVISoftcore.cu
+    gpuGBVISoftcore = gpuSetGBVISoftcoreParameters(gpu, static_cast<float>( force.getSoluteDielectric() ), static_cast<float>( force.getSolventDielectric() ),
+                                                    particle, radius, gammas, scaledRadii, bornRadiusScaleFactors, quinticSplineParameters);
+}
+void CudaFreeEnergyCalcGBVISoftcoreForceKernel::executeForces(ContextImpl& context) {
+// ---------------------------------------------------------------------------------------
+   static const std::string methodName      = "CudaFreeEnergyCalcGBVISoftcoreForceKernel::executeForces";
+// ---------------------------------------------------------------------------------------
+    _gpuContext* gpu = data.gpu;
+    int debug        = 1;
+    // send address's of arrays, ... to device on first call
+    // required since force/energy buffers not set when CudaFreeEnergyCalcGBVISoftcoreForceKernel::initialize() was called
+    if( setSim == 0 ){
+       setSim++;
+       SetCalculateGBVISoftcoreBornSumGpuSim( gpu );
+       SetCalculateCDLJObcGbsaSoftcoreGpu1Sim( gpu );
+       SetCalculateGBVIForces2Sim( gpu );
+    }
+    // required!!
+    gpu->bRecalculateBornRadii = true; // fixed
+//kClearForces(gpu);
+    // calculate Born radii and first loop of GB/VI forces
+    if( debug && log ){
+        (void) fprintf( stderr, "\n%s: calling kCalculateCDLJObcGbsaSoftcoreForces1 & %s\n", methodName.c_str(),
+                        getQuinticScaling() ? "kReduceGBVIBornSumQuinticScaling" : "kReduceGBVIBornSum" );
+        (void) fflush( stderr );
+    }
+    kClearBornForces(gpu);
+    kCalculateGBVISoftcoreBornSum(gpu);
+    if( getQuinticScaling() ){
+//(void) fprintf( stderr, "\n%s: calling kReduceGBVIBornSumQuinticScaling\n", methodName.c_str() ); fflush( stderr );
+        kReduceGBVIBornSumQuinticScaling(gpu, gpuGBVISoftcore );
+    } else {
+        kReduceGBVIBornSum(gpu);
+    }
+    kCalculateCDLJObcGbsaSoftcoreForces1(gpu);
+    if( debug && log ){
+        (void) fprintf( stderr, "\n%s: calling %s\n", methodName.c_str(),
+                        getQuinticScaling() ? "kReduceGBVIBornForcesQuinticScaling" : "kReduceObcGbsaBornForces" );
+        (void) fflush( stderr );
+    }
+    // compute Born forces
+    if( getQuinticScaling() ){
+        kReduceGBVIBornForcesQuinticScaling(gpu);
+    } else {
+        gpu->bIncludeGBVI = true;
+        kReduceObcGbsaBornForces(gpu);
+        gpu->bIncludeGBVI = false;
+    }
+    if( debug && log ){
+        (void) fprintf( stderr, "\n%s: calling kCalculateGBVIForces2\n", methodName.c_str() );
+        (void) fflush( stderr );
+    }
+    // second loop of GB/VI forces
+    kCalculateGBVIForces2(gpu);
+//kReduceForces(gpu);
+}
+double CudaFreeEnergyCalcGBVISoftcoreForceKernel::executeEnergy(ContextImpl& context) {
+    executeForces( context );
+    return 0.0;
+}
+int CudaFreeEnergyCalcGBVISoftcoreForceKernel::getQuinticScaling( void ) const {
+// ---------------------------------------------------------------------------------------
+   //static const std::string methodName      = "CudaFreeEnergyCalcGBVISoftcoreForceKernel::getQuinticScaling";
+// ---------------------------------------------------------------------------------------
+    return quinticScaling;
+}
+void CudaFreeEnergyCalcGBVISoftcoreForceKernel::setQuinticScaling( int inputQuinticScaling) {
+// ---------------------------------------------------------------------------------------
+   //static const std::string methodName      = "CudaFreeEnergyCalcGBVISoftcoreForceKernel::setQuinticScaling";
+// ---------------------------------------------------------------------------------------
+    quinticScaling = inputQuinticScaling;
+}
--- a/plugins/freeEnergy/platforms/cuda/src/CudaFreeEnergyKernels.h
+++ b/plugins/freeEnergy/platforms/cuda/src/CudaFreeEnergyKernels.h
+#ifndef OPENMM_FREE_ENERGY_CUDA_KERNELS_H_
+#define OPENMM_FREE_ENERGY_CUDA_KERNELS_H_
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "CudaPlatform.h"
+#include "openmm/kernels.h"
+#include "kernels/gputypes.h"
+#include "openmm/System.h"
+#include "OpenMMFreeEnergy.h"
+#include "openmm/freeEnergyKernels.h"
+#include "kernels/GpuNonbondedSoftcore.h"
+#include "kernels/GpuLJ14Softcore.h"
+#include "kernels/GpuObcGbsaSoftcore.h"
+#include "kernels/GpuGBVISoftcore.h"
+//#define FreeEnergyDebug
+namespace OpenMM {
+/**
+ * This kernel is invoked by NonbondedSoftcoreForce to calculate the forces acting on the system.
+ */
+class CudaFreeEnergyCalcNonbondedSoftcoreForceKernel : public CalcNonbondedSoftcoreForceKernel {
+public:
+    CudaFreeEnergyCalcNonbondedSoftcoreForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, System& system) :
+             CalcNonbondedSoftcoreForceKernel(name, platform), data(data), system(system) {
+        gpuNonbondedSoftcore = NULL;
+        gpuLJ14Softcore      = NULL;
+#ifdef FreeEnergyDebug
+        log                  = stderr;
+#else
+        log                  = NULL;
+#endif
+        setSim               = 0;
+        numExceptions        = 0;
+        numParticles         = 0;
+        bIncludeGBSA         = false;
+        bIncludeGBVI         = false;
+        bIncludeSoftcore     = false;
+    }
+    ~CudaFreeEnergyCalcNonbondedSoftcoreForceKernel();
+    /**
+     * Initialize the kernel.
+     * 
+     * @param system     the System this kernel will be applied to
+     * @param force      the NonbondedForce this kernel will be used for
+     */
+    void initialize(const System& system, const NonbondedSoftcoreForce& force);
+    /**
+     * Execute the kernel to calculate the forces.
+     * 
+     * @param context    the context in which to execute this kernel
+     */
+    void executeForces(ContextImpl& context);
+    /**
+     * Execute the kernel to calculate the energy.
+     * 
+     * @param context    the context in which to execute this kernel
+     * @return the potential energy due to the NonbondedForce
+     */
+    double executeEnergy(ContextImpl& context);
+    /**
+     * Get flag signalling whether GBSA/OBC force is included
+     *
+     * @return flag
+     */
+    bool getIncludeGBSA( void ) const;
+    /**
+     * Set flag signalling whether GBSA/OBC force is included
+     *
+     * @param inputIncludeGBSA input flag value
+     */
+    void setIncludeGBSA( bool inputIncludeGBSA );
+    /**
+     * Get flag signalling whether GB/VI force is included
+     *
+     * @return flag
+     */
+    bool getIncludeGBVI( void ) const;
+    /**
+     * Set flag signalling whether GB/VI force is included
+     *
+     * @param inputIncludeGBVI input flag value
+     */
+    void setIncludeGBVI( bool inputIncludeGBVI );
+    /**
+     * Get flag signalling whether softcore force is included
+     *
+     * @return flag
+     */
+    bool getIncludeSoftcore( void ) const;
+    /**
+     * Set flag signalling whether GB/VI force is included
+     *
+     * @param inputIncludeGBVI input flag value
+     */
+    void setIncludeSoftcore( bool inputSoftcore);
+    /**
+     * Get number of exceptions
+     *
+     * @return number of exceptions
+     */
+    int getNumExceptions( void ) const;
+    /**
+     * Get GpuLJ14Softcore
+     *
+     * @return GpuLJ14Softcore object
+     */
+    GpuLJ14Softcore* getGpuLJ14Softcore( void ) const;
+    /**
+     * Set GpuLJ14Softcore
+     *
+     * @param GpuLJ14Softcore object
+     */
+    void setGpuLJ14Softcore( GpuLJ14Softcore* gpuLJ14Softcore );
+private:
+    CudaPlatform::PlatformData& data;
+    int numParticles;
+    System& system;
+    GpuNonbondedSoftcore* gpuNonbondedSoftcore;
+    GpuLJ14Softcore* gpuLJ14Softcore;
+    bool bIncludeGBSA;
+    bool bIncludeGBVI;
+    bool bIncludeSoftcore;
+    int numExceptions;
+    FILE* log;
+    int setSim;
+};
+/**
+ * This kernel is invoked by GBSAOBCForce to calculate the forces acting on the system.
+ */
+class CudaFreeEnergyCalcGBSAOBCSoftcoreForceKernel : public CalcGBSAOBCSoftcoreForceKernel {
+public:
+    CudaFreeEnergyCalcGBSAOBCSoftcoreForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data) :
+       CalcGBSAOBCSoftcoreForceKernel(name, platform), data(data) {
+#ifdef FreeEnergyDebug
+        log                  = stderr;
+#else
+        log                  = NULL;
+#endif
+        setSim               = 0;
+        gpuObcGbsaSoftcore   = NULL;
+    }
+    ~CudaFreeEnergyCalcGBSAOBCSoftcoreForceKernel();
+    /**
+     * Initialize the kernel.
+     * 
+     * @param system     the System this kernel will be applied to
+     * @param force      the GBSAOBCForce this kernel will be used for
+     */
+    void initialize(const System& system, const GBSAOBCSoftcoreForce& force);
+    /**
+     * Execute the kernel to calculate the forces.
+     * 
+     * @param context    the context in which to execute this kernel
+     */
+    void executeForces(ContextImpl& context);
+    /**
+     * Execute the kernel to calculate the energy.
+     * 
+     * @param context    the context in which to execute this kernel
+     * @return the potential energy due to the GBSAOBCForce
+     */
+    double executeEnergy(ContextImpl& context);
+private:
+    CudaPlatform::PlatformData& data;
+    FILE* log;
+    int setSim;
+    GpuObcGbsaSoftcore* gpuObcGbsaSoftcore;
+};
+/**
+ * This kernel is invoked by GBVIForce to calculate the forces acting on the system.
+ */
+class CudaFreeEnergyCalcGBVISoftcoreForceKernel : public CalcGBVISoftcoreForceKernel {
+public:
+    CudaFreeEnergyCalcGBVISoftcoreForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data) :
+         CalcGBVISoftcoreForceKernel(name, platform), data(data) {
+#ifdef FreeEnergyDebug
+        log                  = stderr;
+#else
+        log                  = NULL;
+#endif
+        setSim               = 0;
+        quinticScaling       = 0;
+        gpuGBVISoftcore      = NULL;
+    }
+    ~CudaFreeEnergyCalcGBVISoftcoreForceKernel();
+    /**
+     * Initialize the kernel.
+     * 
+     * @param system      the System this kernel will be applied to
+     * @param force       the GBVIForce this kernel will be used for
+     * @param scaledRadii the scaled radii (Eq. 5 of Labute paper)
+     */
+    void initialize(const System& system, const GBVISoftcoreForce& force, const std::vector<double> & scaledRadii);
+    /**
+     * Execute the kernel to calculate the forces.
+     * 
+     * @param context    the context in which to execute this kernel
+     */
+    void executeForces(ContextImpl& context);
+    /**
+     * Execute the kernel to calculate the energy.
+     * 
+     * @param context    the context in which to execute this kernel
+     * @return the potential energy due to the GBVIForce
+     */
+    double executeEnergy(ContextImpl& context);
+    /**
+     * Apply quintic scaling for Born radii
+     * 
+     * @return nonzero value if scaling is to be applied
+     */
+    int getQuinticScaling(void) const;
+    /**
+     * Set flag for quintic scaling for Born radii
+     * 
+     * @param nonzero value if scaling is to be applied
+     */
+    void setQuinticScaling(int quinticScaling );
+private:
+    CudaPlatform::PlatformData& data;
+    GpuGBVISoftcore* gpuGBVISoftcore;
+    FILE* log;
+    int setSim;
+    int quinticScaling;
+};
+} // namespace OpenMM
+#endif /*OPENMM_FREE_ENERGY_CUDA_KERNELS_H_*/
--- a/plugins/freeEnergy/platforms/cuda/src/kernels/GpuFreeEnergyCudaKernels.h
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/GpuFreeEnergyCudaKernels.h
+#ifndef __GPU_FREE_ENERGY_KERNELS_H__
+#define __GPU_FREE_ENERGY_KERNELS_H__
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "gputypes.h"
+#include "cudatypes.h"
+#include "GpuNonbondedSoftcore.h"
+#include "GpuLJ14Softcore.h"
+#include "GpuObcGbsaSoftcore.h"
+#include "GpuGBVISoftcore.h"
+#include <vector>
+// Function prototypes
+// CDLJ softcore
+// setup methods called from CudaFreeEnergyKernels
+// nonbonded and 1-4 ixns
+extern "C" 
+GpuNonbondedSoftcore* gpuSetNonbondedSoftcoreParameters(gpuContext gpu, float epsfac, const std::vector<int>& atom, const std::vector<float>& c6,
+                                                        const std::vector<float>& c12, const std::vector<float>& q,
+                                                        const std::vector<float>& softcoreLJLambdaArray, const std::vector<char>& symbol,
+                                                        const std::vector<std::vector<int> >& exclusions, CudaNonbondedMethod method);
+extern "C" 
+GpuLJ14Softcore* gpuSetLJ14SoftcoreParameters(gpuContext gpu, float epsfac, float fudge, const std::vector<int>& atom1,
+                                              const std::vector<int>& atom2, const std::vector<float>& c6, const std::vector<float>& c12,
+                                              const std::vector<float>& q1, const std::vector<float>& q2, const std::vector<float>& softcoreLJLambdaArray);
+// delete supplemtentary objects, ...
+extern "C"
+void gpuDeleteNonbondedSoftcoreParameters( void* gpuNonbondedSoftcore);
+// write address's to device
+extern "C"
+void SetCalculateCDLJSoftcoreGpuSim( gpuContext gpu );
+extern "C"
+void SetCalculateCDLJSoftcoreSupplementarySim( float* gpuParticleSoftCoreLJLambda);
+extern "C"
+void SetCalculateLocalSoftcoreGpuSim( gpuContext gpu );
+// kernel calls to device
+extern "C"
+void kCalculateCDLJSoftcoreForces(gpuContext gpu );
+extern void kCalculateLocalSoftcoreForces( gpuContext gpu );
+// GB/VI softcore
+// setup method called from CudaFreeEnergyKernels
+extern "C" 
+GpuGBVISoftcore* gpuSetGBVISoftcoreParameters(gpuContext gpu, float innerDielectric, float solventDielectric, const std::vector<int>& atom, const std::vector<float>& radius, 
+                                              const std::vector<float>& gamma, const std::vector<float>& scaledRadii,
+                                              const std::vector<float>& bornRadiusScaleFactors, const std::vector<float>& quinticSplineParameters);
+// delete supplemtentary objects, ...
+extern "C"
+void gpuDeleteGBVISoftcoreParameters( void* gpuNonbondedSoftcore);
+// write address's to device
+extern "C"
+void SetCalculateGBVISoftcoreForcesSim( gpuContext gpu, float* softCoreLJLambda);
+extern "C"
+void SetCalculateGBVISoftcoreBornSumGpuSim( gpuContext gpu);
+extern "C"
+void SetCalculateGBVISoftcoreSupplementarySim( GpuGBVISoftcore* gpuGBVISoftcore );
+// kernel calls to device
+extern void kReduceGBVIBornSumQuinticScaling( gpuContext gpu, GpuGBVISoftcore* gpuGBVISoftcore );
+extern void kCalculateGBVISoftcoreBornSum( gpuContext gpu );
+extern void kReduceGBVIBornForcesQuinticScaling( gpuContext gpu );
+// Obc softcore
+// setup method called from CudaFreeEnergyKernels
+/** 
+ * Initialize parameters for Cuda Obc softcore
+ * 
+ * @param gpu                  gpu context
+ * @param innerDielectric      solute dielectric
+ * @param solventDielectric    solvent dielectric
+ * @param radius               intrinsic Born radii
+ * @param scale                Obc scaling factors
+ * @param charge               atomic charges (possibly overwritten by other methods?)
+ * @param nonPolarScalingFactors non-polar scaling factors
+ *
+ */
+extern "C" 
+GpuObcGbsaSoftcore* gpuSetObcSoftcoreParameters(gpuContext gpu, float innerDielectric, float solventDielectric, const std::vector<float>& radius, const std::vector<float>& scale,
+                                                const std::vector<float>& charge, const std::vector<float>& nonPolarScalingFactors);
+// delete supplemtentary objects, ...
+extern "C"
+void gpuDeleteObcSoftcoreParameters( void* gpuNonbondedSoftcore);
+// write address's to device
+extern "C"
+void SetCalculateObcGbsaSoftcoreBornSumSim( gpuContext gpu );
+extern "C"
+void SetCalculateObcGbsaSoftcoreNonPolarScalingFactorsSim( float* nonPolarScalingFactors );
+// this method and kCalculateObcGbsaSoftcoreForces2() are being
+// used until changes in OpenMM version are made 
+extern "C"
+void SetCalculateObcGbsaSoftcoreForces2Sim( gpuContext gpu );
+// kernel calls to device
+extern void kCalculateObcGbsaSoftcoreBornSum( gpuContext gpu );
+// this method is not needed; the OpenMM version can be used
+extern void kCalculateObcGbsaSoftcoreForces2( gpuContext gpu );
+// shared
+extern "C"
+void SetCalculateCDLJObcGbsaSoftcoreGpu1Sim( gpuContext gpu );
+extern "C"
+void SetCalculateCDLJObcGbsaSoftcoreSupplementary1Sim( float* gpuParticleSoftCoreLJLambda);
+extern void kCalculateCDLJObcGbsaSoftcoreForces1( gpuContext gpu );
+#endif //__GPU_FREE_ENERGY_KERNELS_H__
--- a/plugins/freeEnergy/platforms/cuda/src/kernels/GpuGBVISoftcore.cpp
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/GpuGBVISoftcore.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+using namespace std;
+#include "GpuGBVISoftcore.h"
+#include "GpuFreeEnergyCudaKernels.h"
+// GpuGBVISoftcore constructor
+GpuGBVISoftcore::GpuGBVISoftcore( ){
+    _bornRadiiScalingMethod   = 0;
+    _quinticLowerLimitFactor  = 0.8;
+    _quinticUpperLimit        = 0.008;
+    _psSwitchDerivative       = NULL;
+}
+// GpuGBVISoftcore destructor
+GpuGBVISoftcore::~GpuGBVISoftcore( ){
+    delete _psSwitchDerivative;
+}
+// set quintic lower limit factor value
+int GpuGBVISoftcore::setQuinticLowerLimitFactor( float inputQuinticLowerLimitFactor ){
+    _quinticLowerLimitFactor = inputQuinticLowerLimitFactor;
+    return 0;
+}
+// get quintic lower limit factor value
+float GpuGBVISoftcore::getQuinticLowerLimitFactor( void ) const {
+    return _quinticLowerLimitFactor;
+}
+// set quintic upper limit value
+int GpuGBVISoftcore::setQuinticUpperLimit( float inputQuinticUpperLimit ){
+    _quinticUpperLimit = inputQuinticUpperLimit;
+    return 0;
+}
+// get quintic upper limit value
+float GpuGBVISoftcore::getQuinticUpperLimit( void ) const {
+    return _quinticUpperLimit;
+}
+// get Born radii scaling method
+int GpuGBVISoftcore::getBornRadiiScalingMethod( void ) const {
+    return _bornRadiiScalingMethod;
+}
+// set Born radii scaling method
+int GpuGBVISoftcore::setBornRadiiScalingMethod( int inputBornRadiiScalingMethod ){
+    _bornRadiiScalingMethod = inputBornRadiiScalingMethod;
+    return 0;
+}
+// get address for SwitchDerivative array on board
+float* GpuGBVISoftcore::getGpuSwitchDerivative( void ) const {
+    return _psSwitchDerivative->_pDevStream[0];
+}
+// get SwitchDerivative array 
+CUDAStream<float>* GpuGBVISoftcore::getSwitchDerivative( void ) const {
+    return _psSwitchDerivative;
+}
+// initialize SwitchDerivative array
+int GpuGBVISoftcore::initializeGpuSwitchDerivative( unsigned int numberOfParticles ){
+    _psSwitchDerivative = new CUDAStream<float>( numberOfParticles, 1, "SwitchDerivative");
+    for( unsigned int ii = 0; ii < numberOfParticles; ii++ ){
+        (*_psSwitchDerivative)[ii] = 1.0f;
+    }   
+    return 0;
+}
+// upload SoftCoreLambda array
+int GpuGBVISoftcore::upload( gpuContext gpu ){
+    if( getBornRadiiScalingMethod() > 0 ){
+        SetCalculateGBVISoftcoreSupplementarySim( this );
+    }
+    return 0;
+}
--- a/plugins/freeEnergy/platforms/cuda/src/kernels/GpuGBVISoftcore.h
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/GpuGBVISoftcore.h
+#ifndef OPENMM_FREE_ENERGY_GPU_GBVI_SOFTCORE_
+#define OPENMM_FREE_ENERGY_GPU_GBVI_SOFTCORE_
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "gputypes.h"
+#include "cudatypes.h"
+#include "cudaKernels.h"
+#include "openmm/OpenMMException.h"
+// info related to nonbonded softcore
+class GpuGBVISoftcore {
+    public:
+   /** 
+     * This is an enumeration of the different methods that may be used for scaling of the Born radii.
+     */
+        /**
+         * No scaling method is applied.
+         */
+        static const int NoScaling          = 0;
+        /**
+         * Use the method outlined in Proteins 55, 383-394 (2004), Eq. 6
+         */
+        static const int Tanh               = 1;
+        /**
+         * Use quintic spline scaling function
+         */
+        static const int QuinticSpline      = 2;
+        GpuGBVISoftcore();
+        ~GpuGBVISoftcore();
+        /** 
+         * Set softcore value
+         */
+        int setSoftCoreLambda( float softCoreLambda );
+        /** 
+         * Get softcore value
+         */
+        float getSoftCoreLambda( void ) const;
+        /** 
+         * Set quintic lower limit factor value
+         */
+        int setQuinticLowerLimitFactor( float quinticLowerLimitFactor );
+        /** 
+         * Get quintic lower limit factor value
+         */
+        float getQuinticLowerLimitFactor( void ) const;
+        /** 
+         * Set quintic upper limit value
+         */
+        int setQuinticUpperLimit( float quinticUpperLimit );
+        /** 
+         * Get quintic upper limit value
+         */
+        float getQuinticUpperLimit( void ) const;
+        /** 
+         * Get Born radii scaling method
+         */
+        int getBornRadiiScalingMethod( void ) const;
+        /** 
+         * Set Born radii scaling method
+         */
+        int setBornRadiiScalingMethod( int bornRadiiScalingMethod );
+        // initialize SoftCoreLJLambda particle array
+        int initializeGpuSwitchDerivative( unsigned int numberOfParticles );
+       /** 
+         * Get address for switch derivative array
+         * 
+         * @return address
+         */
+         float* getGpuSwitchDerivative( void ) const;
+       /** 
+         * Get switch derivative array
+         * 
+         * @return address
+         */
+         CUDAStream<float>* getSwitchDerivative( void ) const;
+        /** 
+         * Upload data
+         * 
+         * @return 0 always
+         */
+        int upload( gpuContext gpu );
+    private:
+       float _quinticLowerLimitFactor;
+       float _quinticUpperLimit;
+       unsigned int _bornRadiiScalingMethod;
+       CUDAStream<float>*  _psSwitchDerivative;
+};
+#endif // OPENMM_FREE_ENERGY_GPU_GBVI_SOFTCORE_
--- a/plugins/freeEnergy/platforms/cuda/src/kernels/GpuLJ14Softcore.cpp
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/GpuLJ14Softcore.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "GpuLJ14Softcore.h"
+#include "GpuFreeEnergyCudaKernels.h"
+// GpuLJ14Softcore constructor
+GpuLJ14Softcore::GpuLJ14Softcore( ){
+    psLJ14SoftcoreID          = NULL;
+    psLJ14SoftcoreParameter   = NULL;
+}
+// GpuLJ14Softcore destructor
+GpuLJ14Softcore::~GpuLJ14Softcore( ){
+    delete psLJ14SoftcoreID;
+    delete psLJ14SoftcoreParameter;
+}
+int GpuLJ14Softcore::flipStrides( gpuContext gpu ){
+    int flip = gpu->sim.outputBuffers - 1;
+    for (int ii = 0; ii < psLJ14SoftcoreID->_stride; ii++)
+    {
+        (*psLJ14SoftcoreID)[ii].z = flip - (*psLJ14SoftcoreID)[ii].z;
+        (*psLJ14SoftcoreID)[ii].w = flip - (*psLJ14SoftcoreID)[ii].w;
+    }   
+    psLJ14SoftcoreID->Upload();
+    return 0;
+}
--- a/plugins/freeEnergy/platforms/cuda/src/kernels/GpuLJ14Softcore.h
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/GpuLJ14Softcore.h
+#ifndef OPENMM_FREE_ENERGY_GPU_LJ14_SOFTCORE_
+#define OPENMM_FREE_ENERGY_GPU_LJ14_SOFTCORE_
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "gputypes.h"
+struct cudaFreeEnergySimulationNonbonded14 {
+    unsigned int    LJ14s;                          // Number of Lennard Jones 1-4 interactions
+    unsigned int    LJ14_offset;                    // Offset to end of Lennard Jones 1-4 parameters
+    CudaNonbondedMethod nonbondedMethod;            // How to handle nonbonded interactions
+    int4*           pLJ14ID;                        // Lennard Jones 1-4 atom and output buffer IDs
+    float4*         pLJ14Parameter;                 // Lennard Jones 1-4 parameters
+};
+// info related to nonbonded 1-4 softcore
+class GpuLJ14Softcore {
+    public:
+        GpuLJ14Softcore();
+        ~GpuLJ14Softcore();
+        CUDAStream<int4>* psLJ14SoftcoreID;
+        CUDAStream<float4>* psLJ14SoftcoreParameter;
+        cudaFreeEnergySimulationNonbonded14 feSim;
+        int flipStrides(gpuContext gpu);
+    private:
+};
+#endif // OPENMM_FREE_ENERGY_GPU_LJ14_SOFTCORE_
--- a/plugins/freeEnergy/platforms/cuda/src/kernels/GpuNonbondedSoftcore.cpp
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/GpuNonbondedSoftcore.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+using namespace std;
+#include "GpuNonbondedSoftcore.h"
+#include "GpuFreeEnergyCudaKernels.h"
+// GpuNonbondedSoftcore constructor
+GpuNonbondedSoftcore::GpuNonbondedSoftcore( ){
+    _softcoreLJLambda     = 1.0f;
+    _psSoftcoreLJLambda   = NULL;
+}
+GpuNonbondedSoftcore::~GpuNonbondedSoftcore( ){
+    delete _psSoftcoreLJLambda;
+}
+// set global softCoreLJLambda
+int GpuNonbondedSoftcore::setSoftCoreLJLambda( float softCoreLJLambda ){
+    _softcoreLJLambda  = softCoreLJLambda;
+    return 0;
+}
+// get global softCoreLJLambda
+float GpuNonbondedSoftcore::getSoftCoreLJLambda( void ) const {
+    return _softcoreLJLambda;
+}
+// initialize SoftCoreLJLambda particle array
+int GpuNonbondedSoftcore::initializeParticleSoftCoreLJLambda( unsigned int numberOfParticles ){
+    _psSoftcoreLJLambda = new CUDAStream<float>( numberOfParticles, 1, "SoftcoreLJLambda");
+    for( unsigned int ii = 0; ii < numberOfParticles; ii++ ){
+        (*_psSoftcoreLJLambda)[ii] = 1.0f;
+    }
+    return 0;
+}
+// set entry in SoftCoreLJLambda particle array
+int GpuNonbondedSoftcore::setParticleSoftCoreLJLambda( unsigned int particleIndex, float softCoreLJLambda ){
+    (*_psSoftcoreLJLambda)[particleIndex] = softCoreLJLambda;
+    return 0;
+}
+// upload SoftCoreLJLambda array
+int GpuNonbondedSoftcore::upload( gpuContext gpu ){
+    _psSoftcoreLJLambda->Upload();
+    SetCalculateCDLJSoftcoreSupplementarySim( getGpuParticleSoftCoreLJLambda() );
+    SetCalculateCDLJObcGbsaSoftcoreSupplementary1Sim( getGpuParticleSoftCoreLJLambda() );
+    return 0;
+}
+// get address for SoftCoreLJLambda particle array on board
+float* GpuNonbondedSoftcore::getGpuParticleSoftCoreLJLambda( void ) const {
+    return _psSoftcoreLJLambda->_pDevStream[0];
+}
--- a/plugins/freeEnergy/platforms/cuda/src/kernels/GpuNonbondedSoftcore.h
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/GpuNonbondedSoftcore.h
+#ifndef OPENMM_FREE_ENERGY_GPU_NONBONDED_SOFTCORE_
+#define OPENMM_FREE_ENERGY_GPU_NONBONDED_SOFTCORE_
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "gputypes.h"
+// info related to nonbonded softcore
+class GpuNonbondedSoftcore {
+    public:
+        GpuNonbondedSoftcore();
+        ~GpuNonbondedSoftcore();
+        /** 
+         * Set softcore value
+         */
+        int setSoftCoreLJLambda( float softCoreLJLambda );
+        /** 
+         * Get softcore value
+         */
+        float getSoftCoreLJLambda( void ) const;
+        /** 
+         * Initialize ParticleSoftCoreLJLambda array
+         * 
+         * @param numberOfParticles number of particles
+         *
+         * @return 0 always
+         */
+        int initializeParticleSoftCoreLJLambda( unsigned int numberOfParticles );
+        /** 
+         * Upload data
+         * 
+         * @param implicitSolvent set if implicit solvent is included in system
+         *
+         * @return 0 always
+         */
+        int upload( gpuContext gpu );
+        /** 
+         * Set particle softCoreLJLambda entry
+         * 
+         * @param particleIndex     index of particle
+         * @param softCoreLJLambda  softCoreLJLambda value
+         *
+         * @return 0 always
+         */
+        int setParticleSoftCoreLJLambda( unsigned int particleIndex, float softCoreLJLambda );
+        /** 
+         * Get address for SoftCoreLJLambda particle array on board
+         * 
+         * @return address
+         */
+         float* getGpuParticleSoftCoreLJLambda( void ) const;
+    private:
+       float _softcoreLJLambda;
+       CUDAStream<float>*  _psSoftcoreLJLambda;
+};
+#endif // OPENMM_FREE_ENERGY_GPU_NONBONDED_SOFTCORE_
--- a/plugins/freeEnergy/platforms/cuda/src/kernels/GpuObcGbsaSoftcore.cpp
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/GpuObcGbsaSoftcore.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+using namespace std;
+#include "GpuObcGbsaSoftcore.h"
+#include "GpuFreeEnergyCudaKernels.h"
+// GpuObcGbsaSoftcore constructor
+GpuObcGbsaSoftcore::GpuObcGbsaSoftcore( ){
+    _psNonPolarScalingFactors   = NULL;
+}
+GpuObcGbsaSoftcore::~GpuObcGbsaSoftcore( ){
+    delete _psNonPolarScalingFactors;
+}
+// initialize NonPolarScalingFactors array
+int GpuObcGbsaSoftcore::initializeNonPolarScalingFactors( unsigned int numberOfParticles ){
+    _psNonPolarScalingFactors = new CUDAStream<float>( numberOfParticles, 1, "ObcSoftcoreNonPolarScalingFactors");
+    for( unsigned int ii = 0; ii < numberOfParticles; ii++ ){
+        (*_psNonPolarScalingFactors)[ii] = 1.0f;
+    }
+    return 0;
+}
+// set entry in NonPolarScalingFactors array
+int GpuObcGbsaSoftcore::setNonPolarScalingFactors( unsigned int particleIndex, float nonPolarScalingFactor ){
+    (*_psNonPolarScalingFactors)[particleIndex] = nonPolarScalingFactor;
+    return 0;
+}
+// upload NonPolarScalingFactors array
+int GpuObcGbsaSoftcore::upload( gpuContext gpu ){
+    _psNonPolarScalingFactors->Upload();
+    SetCalculateObcGbsaSoftcoreNonPolarScalingFactorsSim( getGpuNonPolarScalingFactors() );
+    return 0;
+}
+// get address for NonPolarScalingFactors array on board
+float* GpuObcGbsaSoftcore::getGpuNonPolarScalingFactors( void ) const {
+    return _psNonPolarScalingFactors->_pDevStream[0];
+}
--- a/plugins/freeEnergy/platforms/cuda/src/kernels/GpuObcGbsaSoftcore.h
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/GpuObcGbsaSoftcore.h
+#ifndef OPENMM_FREE_ENERGY_GPU_OBC_GBSA_SOFTCORE_
+#define OPENMM_FREE_ENERGY_GPU_OBC_GBSA_SOFTCORE_
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "gputypes.h"
+// info related to nonbonded softcore
+class GpuObcGbsaSoftcore {
+    public:
+        GpuObcGbsaSoftcore();
+        ~GpuObcGbsaSoftcore();
+        /** 
+         * Initialize NonPolarScalingFactors array
+         * 
+         * @param numberOfParticles number of particles
+         *
+         * @return 0 always
+         */
+        int initializeNonPolarScalingFactors( unsigned int numberOfParticles );
+        /** 
+         * Upload data
+         * 
+         * @param implicitSolvent set if implicit solvent is included in system
+         *
+         * @return 0 always
+         */
+        int upload( gpuContext gpu );
+        /** 
+         * Set nonPolarScalingFactor entry
+         * 
+         * @param particleIndex             index of particle
+         * @param nonPolarScalingFactor     nonPolarScalingFactor value
+         *
+         * @return 0 always
+         */
+        int setNonPolarScalingFactors( unsigned int particleIndex, float nonPolarScalingFactor );
+        /** 
+         * Get address for NonPolarScalingFactors array on board
+         * 
+         * @return address
+         */
+         float* getGpuNonPolarScalingFactors( void ) const;
+    private:
+       CUDAStream<float>*  _psNonPolarScalingFactors;
+};
+#endif // OPENMM_FREE_ENERGY_GPU_OBC_GBSA_SOFTCORE_
--- a/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateCDLJObcGbsaSoftcoreForces1.cu
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateCDLJObcGbsaSoftcoreForces1.cu
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+#include "gputypes.h"
+#include "cudatypes.h"
+#include "cudaKernels.h"
+#include "GpuFreeEnergyCudaKernels.h"
+#include <stdio.h>
+#include <cuda.h>
+#include <vector_functions.h>
+#include <cstdlib>
+struct Atom {
+    float x;
+    float y;
+    float z;
+    float q;
+    float sig;
+    float eps;
+    float br;
+    float softCoreLJLambda;
+    float fx;
+    float fy;
+    float fz;
+    float fb;
+};
+struct cudaFreeEnergySimulation {
+    float* pParticleSoftCoreLJLambda;
+};
+static __constant__ cudaGmxSimulation cSim;
+static __constant__ cudaFreeEnergySimulation feSimDev;
+void SetCalculateCDLJObcGbsaSoftcoreGpu1Sim( gpuContext gpu )
+{
+    cudaError_t status;
+    (void) fprintf( stderr, "SetCalculateCDLJObcGbsaSoftcoreGpu1Sim gpu=%p cSim=%p sizeof=%u\n", gpu, &gpu->sim, sizeof(cudaGmxSimulation) ); fflush( stderr );
+    status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));     
+    RTERROR(status, "cudaMemcpyToSymbol: SetCalculateCDLJObcGbsaSoftcoreGpu1Sim copy to cSim failed");
+}
+void SetCalculateCDLJObcGbsaSoftcoreSupplementary1Sim( float* gpuParticleSoftCoreLJLambda)
+{
+    cudaError_t status;
+    (void) fprintf( stderr, "SetCalculateCDLJObcGbsaSoftcoreSupplementary1Sim\n" );
+    struct cudaFreeEnergySimulation feSim;
+    feSim.pParticleSoftCoreLJLambda = gpuParticleSoftCoreLJLambda;
+    status = cudaMemcpyToSymbol(feSimDev, &feSim, sizeof(cudaFreeEnergySimulation));
+    RTERROR(status, "cudaMemcpyToSymbol: SetCalculateCDLJObcGbsaSoftcoreSupplementary1Sim failed");
+}
+void GetCalculateCDLJObcGbsaSoftcoreForces1Sim( gpuContext gpu )
+{
+    cudaError_t status;
+    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));     
+    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
+}
+texture<float, 1, cudaReadModeElementType> tabulatedErfcRef;
+__device__ float fastErfc(float r)
+{
+    float normalized = cSim.tabulatedErfcScale*r;
+    int index = (int) normalized;
+    float fract2 = normalized-index;
+    float fract1 = 1.0f-fract2;
+    return fract1*tex1Dfetch(tabulatedErfcRef, index) + fract2*tex1Dfetch(tabulatedErfcRef, index+1);
+}
+// Include versions of the kernel for N^2 calculations.
+#if 0
+#define METHOD_NAME(a, b) a##N2##b
+#include "kCalculateCDLJObcGbsaForces1.h"
+#define USE_OUTPUT_BUFFER_PER_WARP
+#undef METHOD_NAME
+#define METHOD_NAME(a, b) a##N2ByWarp##b
+#include "kCalculateCDLJObcGbsaForces1.h"
+#endif
+// Include versions of the kernel for N^2 calculations with softcore LJ.
+#undef METHOD_NAME
+#define METHOD_NAME(a, b) a##N2##b
+#undef USE_OUTPUT_BUFFER_PER_WARP
+#define USE_SOFTCORE_LJ
+#include "kCalculateCDLJObcGbsaSoftcoreForces1.h"
+#define USE_OUTPUT_BUFFER_PER_WARP
+#undef METHOD_NAME
+#define METHOD_NAME(a, b) a##N2ByWarp##b
+#include "kCalculateCDLJObcGbsaSoftcoreForces1.h"
+#undef USE_SOFTCORE_LJ
+// Include versions of the kernel with cutoffs.
+#if 0
+#undef METHOD_NAME
+#undef USE_OUTPUT_BUFFER_PER_WARP
+#define USE_CUTOFF
+#define METHOD_NAME(a, b) a##Cutoff##b
+#include "kCalculateCDLJObcGbsaForces1.h"
+#define USE_OUTPUT_BUFFER_PER_WARP
+#undef METHOD_NAME
+#define METHOD_NAME(a, b) a##CutoffByWarp##b
+#include "kCalculateCDLJObcGbsaForces1.h"
+// Include versions of the kernel with periodic boundary conditions.
+#undef METHOD_NAME
+#undef USE_OUTPUT_BUFFER_PER_WARP
+#define USE_PERIODIC
+#define METHOD_NAME(a, b) a##Periodic##b
+#include "kCalculateCDLJObcGbsaForces1.h"
+#define USE_OUTPUT_BUFFER_PER_WARP
+#undef METHOD_NAME
+#define METHOD_NAME(a, b) a##PeriodicByWarp##b
+#include "kCalculateCDLJObcGbsaForces1.h"
+// Include versions of the kernels for Ewald
+#undef METHOD_NAME
+#undef USE_OUTPUT_BUFFER_PER_WARP
+#define USE_PERIODIC
+#define USE_EWALD
+#define METHOD_NAME(a, b) a##Ewald##b
+#include "kCalculateCDLJObcGbsaForces1.h"
+#define USE_OUTPUT_BUFFER_PER_WARP
+#undef METHOD_NAME
+#define METHOD_NAME(a, b) a##EwaldByWarp##b
+#include "kCalculateCDLJObcGbsaForces1.h"
+extern __global__ void kFindBlockBoundsCutoff_kernel();
+extern __global__ void kFindBlockBoundsPeriodic_kernel();
+extern __global__ void kFindBlocksWithInteractionsCutoff_kernel();
+extern __global__ void kFindBlocksWithInteractionsPeriodic_kernel();
+extern __global__ void kFindInteractionsWithinBlocksCutoff_kernel(unsigned int*);
+extern __global__ void kFindInteractionsWithinBlocksPeriodic_kernel(unsigned int*);
+extern __global__ void kCalculateEwaldFastCosSinSums_kernel();
+extern __global__ void kCalculateEwaldFastForces_kernel();
+extern void kCalculatePME(gpuContext gpu);
+#endif
+/**
+ * 
+ * Calculate Born radii and first GBSA loop forces/energy
+ *
+ * @param gpu     gpu contexct
+ * @param gbsaObc if set, calculate Born radii for OBC
+ *                otherwise calculate Born radii for GB/VI
+ *
+ */
+void kCalculateCDLJObcGbsaSoftcoreForces1(gpuContext gpu )
+{
+//    printf("kCalculateCDLJObcGbsaForces1\n");
+    switch (gpu->sim.nonbondedMethod)
+    {
+        case NO_CUTOFF:
+            // use softcore LJ potential
+            if (gpu->bOutputBufferPerWarp)
+                   kCalculateCDLJObcGbsaSoftcoreN2ByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                           sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit);
+            else
+                   kCalculateCDLJObcGbsaSoftcoreN2Forces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                           sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit);
+            LAUNCHERROR("kCalculateCDLJObcGbsaN2Forces1");
+            break;
+#if 0
+        case CUTOFF:
+            kFindBlockBoundsCutoff_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
+            LAUNCHERROR("kFindBlockBoundsCutoff");
+            kFindBlocksWithInteractionsCutoff_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
+            LAUNCHERROR("kFindBlocksWithInteractionsCutoff");
+            compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
+            kFindInteractionsWithinBlocksCutoff_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                    sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
+            if (gpu->bRecalculateBornRadii)
+            {
+                kCalculateObcGbsaBornSum(gpu);
+                kReduceObcGbsaBornSum(gpu);
+            }
+            if (gpu->bOutputBufferPerWarp)
+                kCalculateCDLJObcGbsaCutoffByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
+            else
+                kCalculateCDLJObcGbsaCutoffForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
+            LAUNCHERROR("kCalculateCDLJObcGbsaCutoffForces1");
+            break;
+        case PERIODIC:
+            kFindBlockBoundsPeriodic_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
+            LAUNCHERROR("kFindBlockBoundsPeriodic");
+            kFindBlocksWithInteractionsPeriodic_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
+            LAUNCHERROR("kFindBlocksWithInteractionsPeriodic");
+            compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
+            kFindInteractionsWithinBlocksPeriodic_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                    sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
+            if (gpu->bRecalculateBornRadii)
+            {
+                kCalculateObcGbsaBornSum(gpu);
+                kReduceObcGbsaBornSum(gpu);
+            }
+            if (gpu->bOutputBufferPerWarp)
+                kCalculateCDLJObcGbsaPeriodicByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
+            else
+                kCalculateCDLJObcGbsaPeriodicForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
+            LAUNCHERROR("kCalculateCDLJObcGbsaPeriodicForces1");
+            break;
+        case EWALD:
+        case PARTICLE_MESH_EWALD:
+            kFindBlockBoundsPeriodic_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
+            LAUNCHERROR("kFindBlockBoundsPeriodic");
+            kFindBlocksWithInteractionsPeriodic_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
+            LAUNCHERROR("kFindBlocksWithInteractionsPeriodic");
+            compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
+            kFindInteractionsWithinBlocksPeriodic_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                    sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
+            LAUNCHERROR("kFindInteractionsWithinBlocksPeriodic");
+            if (gpu->bRecalculateBornRadii)
+            {
+                kCalculateObcGbsaBornSum(gpu);
+                kReduceObcGbsaBornSum(gpu);
+            }
+            cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+            cudaBindTexture(NULL, &tabulatedErfcRef, gpu->psTabulatedErfc->_pDevData, &channelDesc, gpu->psTabulatedErfc->_length*sizeof(float));
+            if (gpu->bOutputBufferPerWarp)
+                kCalculateCDLJObcGbsaEwaldByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
+            else
+                kCalculateCDLJObcGbsaEwaldForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
+            LAUNCHERROR("kCalculateCDLJObcGbsaEwaldForces");
+            if (gpu->sim.nonbondedMethod == EWALD)
+            {
+                // Ewald summation
+                kCalculateEwaldFastCosSinSums_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
+                LAUNCHERROR("kCalculateEwaldFastCosSinSums");
+                kCalculateEwaldFastForces_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
+                LAUNCHERROR("kCalculateEwaldFastForces");
+            }
+            else
+                kCalculatePME(gpu);
+#endif
+    }
+}
--- a/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateCDLJObcGbsaSoftcoreForces1.h
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateCDLJObcGbsaSoftcoreForces1.h
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+/**
+ * This file contains the kernel for evalauating nonbonded forces and the first stage of GBSA.
+ * It is included several times in kCalculateCDLJObcGbsaForces1.cu with different #defines to generate
+ * different versions of the kernels.
+ */
+#ifdef USE_SOFTCORE_LJ
+#include "kSoftcoreLJ.h"
+#endif
+/* Cuda compiler on Windows does not recognized "static const float" values */
+#define LOCAL_HACK_PI 3.1415926535897932384626433832795
+#define COULOMB_ON
+__global__ void METHOD_NAME(kCalculateCDLJObcGbsaSoftcore, Forces1_kernel)(unsigned int* workUnit)
+{
+    extern __shared__ Atom sA[];
+    unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
+    unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
+    unsigned int numWorkUnits = cSim.pInteractionCount[0];
+    unsigned int pos = warp*numWorkUnits/totalWarps;
+    unsigned int end = (warp+1)*numWorkUnits/totalWarps;
+    float CDLJObcGbsa_energy;
+    float energy = 0.0f;
+#ifdef USE_CUTOFF
+    float* tempBuffer = (float*) &sA[cSim.nonbond_threads_per_block];
+#endif
+#ifdef USE_EWALD
+    const float TWO_OVER_SQRT_PI = 2.0f/sqrt(LOCAL_HACK_PI);
+#endif
+    unsigned int lasty = -0xFFFFFFFF;
+    while (pos < end)
+    {
+        // Extract cell coordinates from appropriate work unit
+        unsigned int x                      = workUnit[pos];
+        unsigned int y                      = ((x >> 2) & 0x7fff) << GRIDBITS;
+        bool bExclusionFlag                 = (x & 0x1);
+        x                                   = (x >> 17) << GRIDBITS;
+        unsigned int tgx                    = threadIdx.x & (GRID - 1);
+        unsigned int i                      = x + tgx;
+        float4 apos                         = cSim.pPosq[i];
+        float2 a                            = cSim.pAttr[i];
+        float  softCoreLJLambda             = feSimDev.pParticleSoftCoreLJLambda[i];
+        float br                            = cSim.pBornRadii[i];
+        unsigned int tbx                    = threadIdx.x - tgx;
+        unsigned int tj                     = tgx;
+        Atom* psA                           = &sA[tbx];
+        float4 af;
+        af.x                        = 0.0f;
+        af.y                        = 0.0f;
+        af.z                        = 0.0f;
+        af.w                        = 0.0f;
+        if (x == y) // Handle diagonals uniquely at 50% efficiency
+        {
+            // Read fixed atom data into registers and GRF
+            sA[threadIdx.x].x                    = apos.x;
+            sA[threadIdx.x].y                    = apos.y;
+            sA[threadIdx.x].z                    = apos.z;
+            sA[threadIdx.x].q                    = apos.w;
+            float q2                             = cSim.preFactor * apos.w;
+            apos.w                              *= cSim.epsfac;
+            sA[threadIdx.x].sig                  = a.x;
+            sA[threadIdx.x].eps                  = a.y;
+            sA[threadIdx.x].br                   = br;
+            sA[threadIdx.x].softCoreLJLambda     = softCoreLJLambda;
+            if (!bExclusionFlag)
+            {
+                for (unsigned int j = 0; j < GRID; j++)
+                {
+                    float dx                = psA[j].x - apos.x;
+                    float dy                = psA[j].y - apos.y;
+                    float dz                = psA[j].z - apos.z;
+#ifdef USE_PERIODIC
+                    dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
+                    dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
+                    dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
+#endif
+                    float r2                = dx * dx + dy * dy + dz * dz;
+                    float invR              = 1.0f / sqrt(r2);
+                    float sig               = a.x + psA[j].sig;
+                    float eps               = a.y * psA[j].eps;
+#ifdef USE_SOFTCORE_LJ
+                    float dEdR              = getSoftCoreLJ( r2, sig, eps, softCoreLJLambda, psA[j].softCoreLJLambda, &CDLJObcGbsa_energy );
+#else
+                    // CDLJ part
+                    float sig2              = invR * sig;
+                    sig2                   *= sig2;
+                    float sig6              = sig2 * sig2 * sig2;
+                    float dEdR              = eps * (12.0f * sig6 - 6.0f) * sig6;
+                    /* E */ 
+		              CDLJObcGbsa_energy      = eps * (sig6 - 1.0f) * sig6;
+#endif
+#ifdef USE_CUTOFF
+    #ifdef USE_EWALD
+                    float r                 = sqrt(r2);
+                    float alphaR            = cSim.alphaEwald * r;
+                    float erfcAlphaR        = fastErfc(alphaR);
+                    dEdR                   += apos.w * psA[j].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
+		    /* E */
+                    CDLJObcGbsa_energy     += apos.w * psA[j].q * invR * erfcAlphaR;
+    #else
+                    dEdR                   += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2);
+                    /* E */
+                    CDLJObcGbsa_energy     += apos.w * psA[j].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
+    #endif
+#else
+#ifdef COULOMB_ON
+                    float factorX           = apos.w * psA[j].q * invR;
+                    dEdR                   += factorX;
+                    CDLJObcGbsa_energy     += factorX;
+#endif
+#endif
+                    dEdR                   *= invR * invR;
+                    // ObcGbsaForce1 part
+                    float alpha2_ij         = br * psA[j].br;
+                    float D_ij              = r2 / (4.0f * alpha2_ij);
+                    float expTerm           = exp(-D_ij);
+                    float denominator2      = r2 + alpha2_ij * expTerm;
+                    float denominator       = sqrt(denominator2);
+                    float Gpol              = (q2 * psA[j].q) / (denominator * denominator2);
+                    float dGpol_dalpha2_ij  = -0.5f * Gpol * expTerm * (1.0f + D_ij);
+                    af.w                   += dGpol_dalpha2_ij * psA[j].br;
+                    dEdR                   += Gpol * (1.0f - 0.25f * expTerm);
+		    /* E */
+		    CDLJObcGbsa_energy     += (q2 * psA[j].q) / denominator;
+#ifdef USE_CUTOFF
+                    if (r2 > cSim.nonbondedCutoffSqr)
+                    {
+                        dEdR = 0.0f;
+ 			/* E */
+			CDLJObcGbsa_energy  = 0.0f;
+                   }
+#endif
+		    /* E */
+                    if (i < cSim.atoms)
+                    {
+                        energy             += 0.5f*CDLJObcGbsa_energy;
+                    }
+                    // Add Forces
+                    dx                     *= dEdR;
+                    dy                     *= dEdR;
+                    dz                     *= dEdR;
+                    af.x                   -= dx;
+                    af.y                   -= dy;
+                    af.z                   -= dz;
+                }
+            }
+            else  // bExclusion
+            {
+                unsigned int xi   = x>>GRIDBITS;
+                unsigned int cell = xi+xi*cSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
+                unsigned int excl = cSim.pExclusion[cSim.pExclusionIndex[cell]+tgx];
+                for (unsigned int j = 0; j < GRID; j++)
+                {
+                    float dx                = psA[j].x - apos.x;
+                    float dy                = psA[j].y - apos.y;
+                    float dz                = psA[j].z - apos.z;
+#ifdef USE_PERIODIC
+                    dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
+                    dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
+                    dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
+#endif
+                    float r2                = dx * dx + dy * dy + dz * dz;
+                    float invR              = 1.0f / sqrt(r2);
+                    float sig               = a.x + psA[j].sig;
+                    float eps               = a.y * psA[j].eps;
+#ifdef USE_SOFTCORE_LJ
+                    float dEdR              = getSoftCoreLJ( r2, sig, eps, softCoreLJLambda, psA[j].softCoreLJLambda, &CDLJObcGbsa_energy );
+#else
+                    // CDLJ part
+                    float sig2              = invR * sig;
+                    sig2                   *= sig2;
+                    float sig6              = sig2 * sig2 * sig2;
+                    float dEdR              = eps * (12.0f * sig6 - 6.0f) * sig6;
+                    /* E */ 
+		              CDLJObcGbsa_energy      = eps * (sig6 - 1.0f) * sig6;
+#endif
+#ifdef USE_CUTOFF
+    #ifdef USE_EWALD
+                    float r                 = sqrt(r2);
+                    float alphaR            = cSim.alphaEwald * r;
+                    float erfcAlphaR        = fastErfc(alphaR);
+                    dEdR                   += apos.w * psA[j].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
+		    /* E */
+                    CDLJObcGbsa_energy     += apos.w * psA[j].q * invR * erfcAlphaR;
+                    bool needCorrection = !(excl & 0x1) && x+tgx != y+j && x+tgx < cSim.atoms && y+j < cSim.atoms;
+                    if (needCorrection)
+                    {
+                        // Subtract off the part of this interaction that was included in the reciprocal space contribution.
+                        dEdR               = -apos.w * psA[j].q * invR * ((1.0f-erfcAlphaR) - alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
+                        CDLJObcGbsa_energy = -apos.w * psA[j].q * invR * (1.0f-erfcAlphaR);
+                    }
+    #else
+                    dEdR                   += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2);
+                    /* E */
+                    CDLJObcGbsa_energy     += apos.w * psA[j].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
+    #endif
+#else
+#ifdef COULOMB_ON
+                    float factorX           = apos.w * psA[j].q * invR;
+                    dEdR                   += factorX;
+                    CDLJObcGbsa_energy     += factorX;
+#endif
+#endif
+                    dEdR                   *= invR * invR;
+#ifdef USE_EWALD
+                    if (!(excl & 0x1) && !needCorrection)
+#else
+                    if (!(excl & 0x1))
+#endif
+                    {
+                        dEdR                = 0.0f;
+                        CDLJObcGbsa_energy  = 0.0f;
+                    }
+                    // ObcGbsaForce1 part
+                    float alpha2_ij         = br * psA[j].br;
+                    float D_ij              = r2 / (4.0f * alpha2_ij);
+                    float expTerm           = exp(-D_ij);
+                    float denominator2      = r2 + alpha2_ij * expTerm;
+                    float denominator       = sqrt(denominator2);
+                    float Gpol              = (q2 * psA[j].q) / (denominator * denominator2);
+                    float dGpol_dalpha2_ij  = -0.5f * Gpol * expTerm * (1.0f + D_ij);
+                    af.w                   += dGpol_dalpha2_ij * psA[j].br;
+                    dEdR                   += Gpol * (1.0f - 0.25f * expTerm);
+                    /* E */
+                    CDLJObcGbsa_energy     += (q2 * psA[j].q) / denominator;
+#if defined USE_PERIODIC
+                    if (i >= cSim.atoms || x+j >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
+                    {
+                        dEdR               = 0.0f;
+		                  CDLJObcGbsa_energy = 0.0f;
+                    }
+#elif defined USE_CUTOFF
+                    if (r2 > cSim.nonbondedCutoffSqr)
+                    {
+                        dEdR               = 0.0f;
+		                  CDLJObcGbsa_energy = 0.0f;
+                    }
+#endif
+                    if (i < cSim.atoms)
+                    {
+                        energy         += 0.5f*CDLJObcGbsa_energy;
+                    }
+                    // Add Forces
+                    dx                     *= dEdR;
+                    dy                     *= dEdR;
+                    dz                     *= dEdR;
+                    af.x                   -= dx;
+                    af.y                   -= dy;
+                    af.z                   -= dz;
+                    excl                  >>= 1;
+                }
+            }
+            // Write results
+#ifdef USE_OUTPUT_BUFFER_PER_WARP
+            unsigned int offset         = x + tgx + warp*cSim.stride;
+#else
+            unsigned int offset         = x + tgx + (x >> GRIDBITS) * cSim.stride;
+#endif
+            float4 of                   = cSim.pForce4a[offset];
+            float  bf                   = cSim.pBornForce[offset];
+            of.x                       += af.x;
+            of.y                       += af.y;
+            of.z                       += af.z;
+            bf                         += af.w;
+            cSim.pForce4a[offset]       = of;
+            cSim.pBornForce[offset]     = bf;
+        }
+        else        // 100% utilization
+        {
+            // Read fixed atom data into registers and GRF
+            if (lasty != y)
+            {
+                unsigned int j                       = y + tgx;
+                float4 temp                          = cSim.pPosq[j];
+                float2 temp1                         = cSim.pAttr[j];
+                float  temp2                         = feSimDev.pParticleSoftCoreLJLambda[j];
+//float  temp2 = 1.0f;
+                sA[threadIdx.x].br                   = cSim.pBornRadii[j];
+                sA[threadIdx.x].x                    = temp.x;
+                sA[threadIdx.x].y                    = temp.y;
+                sA[threadIdx.x].z                    = temp.z;
+                sA[threadIdx.x].q                    = temp.w;
+                sA[threadIdx.x].sig                  = temp1.x;
+                sA[threadIdx.x].eps                  = temp1.y;
+                sA[threadIdx.x].softCoreLJLambda     = temp2;
+            }
+            sA[threadIdx.x].fx          = 0.0f;
+            sA[threadIdx.x].fy          = 0.0f;
+            sA[threadIdx.x].fz          = 0.0f;
+            sA[threadIdx.x].fb          = 0.0f;
+            float q2                    = apos.w * cSim.preFactor;
+            apos.w                     *= cSim.epsfac;
+            if (!bExclusionFlag)
+            {
+#ifdef USE_CUTOFF
+                unsigned int flags = cSim.pInteractionFlag[pos];
+                if (flags == 0)
+                {
+                    // No interactions in this block.
+                }
+                else if (flags == 0xFFFFFFFF)
+#endif
+                {
+                    // Compute all interactions within this block.
+                    for (unsigned int j = 0; j < GRID; j++)
+                    {
+                        float dx                = psA[tj].x - apos.x;
+                        float dy                = psA[tj].y - apos.y;
+                        float dz                = psA[tj].z - apos.z;
+#ifdef USE_PERIODIC
+                        dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
+                        dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
+                        dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
+#endif
+                        float r2                = dx * dx + dy * dy + dz * dz;
+                        float invR              = 1.0f / sqrt(r2);
+                        float sig               = a.x + psA[tj].sig;
+                        float eps               = a.y * psA[tj].eps;
+#ifdef USE_SOFTCORE_LJ
+                        float dEdR              = getSoftCoreLJ( r2, sig, eps, softCoreLJLambda, psA[tj].softCoreLJLambda, &CDLJObcGbsa_energy );
+#else
+                        // CDLJ part
+                        float sig2              = invR * sig;
+                        sig2                   *= sig2;
+                        float sig6              = sig2 * sig2 * sig2;
+                        float dEdR              = eps * (12.0f * sig6 - 6.0f) * sig6;
+                        /* E */ 
+                        CDLJObcGbsa_energy      = eps * (sig6 - 1.0f) * sig6;
+#endif
+#ifdef USE_CUTOFF
+    #ifdef USE_EWALD
+                        float r                 = sqrt(r2);
+                        float alphaR            = cSim.alphaEwald * r;
+                        float erfcAlphaR        = fastErfc(alphaR);
+                        dEdR                   += apos.w * psA[tj].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
+                        /* E */
+                        CDLJObcGbsa_energy     += apos.w * psA[tj].q * invR * erfcAlphaR;
+    #else
+                        dEdR                   += apos.w * psA[tj].q * (invR - 2.0f * cSim.reactionFieldK * r2);
+                        /* E */
+                        CDLJObcGbsa_energy     += apos.w * psA[tj].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
+    #endif
+#else
+#ifdef COULOMB_ON
+                        float factorX           = apos.w * psA[tj].q * invR;
+                        dEdR                   += factorX;
+                        CDLJObcGbsa_energy     += factorX;
+#endif
+#endif
+                        dEdR                   *= invR * invR;
+                        // ObcGbsaForce1 part
+                        float alpha2_ij         = br * psA[tj].br;
+                        float D_ij              = r2 / (4.0f * alpha2_ij);
+                        float expTerm           = exp(-D_ij);
+                        float denominator2      = r2 + alpha2_ij * expTerm;
+                        float denominator       = sqrt(denominator2);
+                        float Gpol              = (q2 * psA[tj].q) / (denominator * denominator2);
+                        float dGpol_dalpha2_ij  = -0.5f * Gpol * expTerm * (1.0f + D_ij);
+                        af.w                   += dGpol_dalpha2_ij * psA[tj].br;
+                        psA[tj].fb             += dGpol_dalpha2_ij * br;
+                        dEdR                   += Gpol * (1.0f - 0.25f * expTerm);
+                        /* E */
+                        CDLJObcGbsa_energy     += (q2 * psA[tj].q) / denominator;
+#ifdef USE_CUTOFF
+                        if (r2 > cSim.nonbondedCutoffSqr)
+                        {
+                            dEdR               = 0.0f;
+       			             CDLJObcGbsa_energy = 0.0f;
+                        }
+#endif
+                        if (i < cSim.atoms)
+                        {
+                            energy            += CDLJObcGbsa_energy;
+                        }
+                        // Add forces
+                        dx                     *= dEdR;
+                        dy                     *= dEdR;
+                        dz                     *= dEdR;
+                        af.x                   -= dx;
+                        af.y                   -= dy;
+                        af.z                   -= dz;
+                        psA[tj].fx             += dx;
+                        psA[tj].fy             += dy;
+                        psA[tj].fz             += dz;
+                        tj                      = (tj + 1) & (GRID - 1);
+                    }
+                }
+#ifdef USE_CUTOFF
+                else
+                {
+                    // Compute only a subset of the interactions in this block.
+                    for (unsigned int j = 0; j < GRID; j++)
+                    {
+                        if ((flags&(1<<j)) != 0)
+                        {
+                            float dx                = psA[j].x - apos.x;
+                            float dy                = psA[j].y - apos.y;
+                            float dz                = psA[j].z - apos.z;
+#ifdef USE_PERIODIC
+                            dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
+                            dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
+                            dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
+#endif
+                            float r2                = dx * dx + dy * dy + dz * dz;
+                            float invR              = 1.0f / sqrt(r2);
+                            float sig               = a.x + psA[j].sig;
+                            float eps               = a.y * psA[j].eps;
+#ifdef USE_SOFTCORE_LJ
+                            float dEdR              = getSoftCoreLJ( r2, sig, eps, softCoreLJLambda, psA[j].softCoreLJLambda, &CDLJObcGbsa_energy );
+#else
+                            // CDLJ part
+                            float sig2              = invR * sig;
+                            sig2                   *= sig2;
+                            float sig6              = sig2 * sig2 * sig2;
+                            float dEdR              = eps * (12.0f * sig6 - 6.0f) * sig6;
+                            /* E */ 
+                            CDLJObcGbsa_energy      = eps * (sig6 - 1.0f) * sig6;
+#endif
+#ifdef USE_CUTOFF
+    #ifdef USE_EWALD
+                            float r                 = sqrt(r2);
+                            float alphaR            = cSim.alphaEwald * r;
+                            float erfcAlphaR        = fastErfc(alphaR);
+                            dEdR                   += apos.w * psA[j].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
+                            CDLJObcGbsa_energy     += apos.w * psA[j].q * invR * erfcAlphaR;
+    #else
+                            dEdR                   += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2);
+                            /* E */
+                            CDLJObcGbsa_energy     += apos.w * psA[j].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
+    #endif
+#else
+#ifdef COULOMB_ON
+                            float factorX           = apos.w * psA[j].q * invR;
+                            dEdR                   += factorX;
+                            CDLJObcGbsa_energy     += factorX;
+#endif
+#endif
+                            dEdR                   *= invR * invR;
+                            // ObcGbsaForce1 part
+                            float alpha2_ij         = br * psA[j].br;
+                            float D_ij              = r2 / (4.0f * alpha2_ij);
+                            float expTerm           = exp(-D_ij);
+                            float denominator2      = r2 + alpha2_ij * expTerm;
+                            float denominator       = sqrt(denominator2);
+                            float Gpol              = (q2 * psA[j].q) / (denominator * denominator2);
+                            float dGpol_dalpha2_ij  = -0.5f * Gpol * expTerm * (1.0f + D_ij);
+                            af.w                   += dGpol_dalpha2_ij * psA[j].br;
+                            dEdR                   += Gpol * (1.0f - 0.25f * expTerm);
+                            /* E */
+                            CDLJObcGbsa_energy     += (q2 * psA[j].q) / denominator;
+                            // Sum the Born forces.
+                            tempBuffer[threadIdx.x] = dGpol_dalpha2_ij * br;
+                            if (tgx % 2 == 0)
+                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+1];
+                            if (tgx % 4 == 0)
+                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+2];
+                            if (tgx % 8 == 0)
+                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+4];
+                            if (tgx % 16 == 0)
+                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+8];
+                            if (tgx == 0)
+                                psA[j].fb += tempBuffer[threadIdx.x] + tempBuffer[threadIdx.x+16];
+#ifdef USE_CUTOFF
+                            if (r2 > cSim.nonbondedCutoffSqr)
+                            {
+                                dEdR                = 0.0f;
+				                    CDLJObcGbsa_energy  = 0.0f;
+                            }
+#endif
+                            if (i < cSim.atoms)
+                            {
+                                energy         += CDLJObcGbsa_energy;
+                            }
+                            // Add forces
+                            dx                     *= dEdR;
+                            dy                     *= dEdR;
+                            dz                     *= dEdR;
+                            af.x                   -= dx;
+                            af.y                   -= dy;
+                            af.z                   -= dz;
+                            tempBuffer[threadIdx.x] = dx;
+                            if (tgx % 2 == 0)
+                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+1];
+                            if (tgx % 4 == 0)
+                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+2];
+                            if (tgx % 8 == 0)
+                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+4];
+                            if (tgx % 16 == 0)
+                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+8];
+                            if (tgx == 0)
+                                psA[j].fx += tempBuffer[threadIdx.x] + tempBuffer[threadIdx.x+16];
+                            tempBuffer[threadIdx.x] = dy;
+                            if (tgx % 2 == 0)
+                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+1];
+                            if (tgx % 4 == 0)
+                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+2];
+                            if (tgx % 8 == 0)
+                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+4];
+                            if (tgx % 16 == 0)
+                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+8];
+                            if (tgx == 0)
+                                psA[j].fy += tempBuffer[threadIdx.x] + tempBuffer[threadIdx.x+16];
+                            tempBuffer[threadIdx.x] = dz;
+                            if (tgx % 2 == 0)
+                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+1];
+                            if (tgx % 4 == 0)
+                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+2];
+                            if (tgx % 8 == 0)
+                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+4];
+                            if (tgx % 16 == 0)
+                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+8];
+                            if (tgx == 0)
+                                psA[j].fz += tempBuffer[threadIdx.x] + tempBuffer[threadIdx.x+16];
+                        }
+                    }
+                }
+#endif
+            }
+            else  // bExclusion
+            {
+                unsigned int xi   = x>>GRIDBITS;
+                unsigned int yi   = y>>GRIDBITS;
+                unsigned int cell = xi+yi*cSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
+                unsigned int excl = cSim.pExclusion[cSim.pExclusionIndex[cell]+tgx];
+                excl              = (excl >> tgx) | (excl << (GRID - tgx));
+                for (unsigned int j = 0; j < GRID; j++)
+                {
+                    float dx                = psA[tj].x - apos.x;
+                    float dy                = psA[tj].y - apos.y;
+                    float dz                = psA[tj].z - apos.z;
+#ifdef USE_PERIODIC
+                    dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
+                    dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
+                    dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
+#endif
+                    float r2                = dx * dx + dy * dy + dz * dz;
+                    float invR              = 1.0f / sqrt(r2);
+                    float sig               = a.x + psA[tj].sig;
+                    float eps               = a.y * psA[tj].eps;
+#ifdef USE_SOFTCORE_LJ
+                    float dEdR              = getSoftCoreLJ( r2, sig, eps, softCoreLJLambda, psA[tj].softCoreLJLambda, &CDLJObcGbsa_energy );
+#else
+                    // CDLJ part
+                    float sig2              = invR * sig;
+                    sig2                   *= sig2;
+                    float sig6              = sig2 * sig2 * sig2;
+                    float dEdR              = eps * (12.0f * sig6 - 6.0f) * sig6;
+                    /* E */ 
+		              CDLJObcGbsa_energy      = eps * (sig6 - 1.0f) * sig6;
+#endif
+#ifdef USE_CUTOFF
+    #ifdef USE_EWALD
+                    float r                 = sqrt(r2);
+                    float alphaR            = cSim.alphaEwald * r;
+                    float erfcAlphaR        = fastErfc(alphaR);
+                    dEdR                   += apos.w * psA[tj].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
+                    /* E */
+                    CDLJObcGbsa_energy     += apos.w * psA[tj].q * invR * erfcAlphaR;
+                    bool needCorrection = !(excl & 0x1) && x+tgx != y+tj && x+tgx < cSim.atoms && y+tj < cSim.atoms;
+                    if (needCorrection)
+                    {
+                        // Subtract off the part of this interaction that was included in the reciprocal space contribution.
+                        dEdR               = -apos.w * psA[tj].q * invR * ((1.0f-erfcAlphaR) - alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
+                        CDLJObcGbsa_energy = -apos.w * psA[tj].q * invR * (1.0f-erfcAlphaR);
+                    }
+    #else
+                    dEdR                   += apos.w * psA[tj].q * (invR - 2.0f * cSim.reactionFieldK * r2);
+                    /* E */
+                    CDLJObcGbsa_energy     += apos.w * psA[tj].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
+    #endif
+#else
+#ifdef COULOMB_ON
+                    float factorX           = apos.w * psA[tj].q * invR;
+                    dEdR                   += factorX;
+                    CDLJObcGbsa_energy     += factorX;
+#endif
+#endif
+                    dEdR                   *= invR * invR;
+#ifdef USE_EWALD
+                    if (!(excl & 0x1) && !needCorrection)
+#else
+                    if (!(excl & 0x1))
+#endif
+                    {
+                        dEdR               = 0.0f;
+			               CDLJObcGbsa_energy = 0.0f;
+                    }
+                    // ObcGbsaForce1 part
+                    float alpha2_ij         = br * psA[tj].br;
+                    float D_ij              = r2 / (4.0f * alpha2_ij);
+                    float expTerm           = exp(-D_ij);
+                    float denominator2      = r2 + alpha2_ij * expTerm;
+                    float denominator       = sqrt(denominator2);
+                    float Gpol              = (q2 * psA[tj].q) / (denominator * denominator2);
+                    float dGpol_dalpha2_ij  = -0.5f * Gpol * expTerm * (1.0f + D_ij);
+                    af.w                   += dGpol_dalpha2_ij * psA[tj].br;
+                    psA[tj].fb             += dGpol_dalpha2_ij * br;
+                    dEdR                   += Gpol * (1.0f - 0.25f * expTerm);
+		              CDLJObcGbsa_energy     += (q2 * psA[tj].q) / denominator;
+#if defined USE_PERIODIC
+                    if (i >= cSim.atoms || y+tj >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
+                    {
+                        dEdR               = 0.0f;
+			               CDLJObcGbsa_energy = 0.0f;
+                    }
+#elif defined USE_CUTOFF
+                    if (r2 > cSim.nonbondedCutoffSqr)
+                    {
+                        dEdR = 0.0f;
+			               CDLJObcGbsa_energy = 0.0f;
+                    }
+#endif
+                    if (i < cSim.atoms)
+                    {
+                        energy             += CDLJObcGbsa_energy;
+                    }
+                    // Add forces
+                    dx                     *= dEdR;
+                    dy                     *= dEdR;
+                    dz                     *= dEdR;
+                    af.x                   -= dx;
+                    af.y                   -= dy;
+                    af.z                   -= dz;
+                    psA[tj].fx             += dx;
+                    psA[tj].fy             += dy;
+                    psA[tj].fz             += dz;
+                    excl                  >>= 1;
+                    tj                      = (tj + 1) & (GRID - 1);
+                }
+            }
+            // Write results
+#ifdef USE_OUTPUT_BUFFER_PER_WARP
+            unsigned int offset         = x + tgx + warp*cSim.stride;
+#else
+            unsigned int offset         = x + tgx + (y >> GRIDBITS) * cSim.stride;
+#endif
+            float4 of                   = cSim.pForce4a[offset];
+            float  bf                   = cSim.pBornForce[offset];
+            of.x                       += af.x;
+            of.y                       += af.y;
+            of.z                       += af.z;
+            bf                         += af.w;
+            cSim.pForce4a[offset]       = of;
+            cSim.pBornForce[offset]     = bf;
+#ifdef USE_OUTPUT_BUFFER_PER_WARP
+            offset                      = y + tgx + warp*cSim.stride;
+#else
+            offset                      = y + tgx + (x >> GRIDBITS) * cSim.stride;
+#endif
+            of                          = cSim.pForce4a[offset];
+            bf                          = cSim.pBornForce[offset];
+            of.x                       += sA[threadIdx.x].fx;
+            of.y                       += sA[threadIdx.x].fy;
+            of.z                       += sA[threadIdx.x].fz;
+            bf                         += sA[threadIdx.x].fb;
+            cSim.pForce4a[offset]       = of;
+            cSim.pBornForce[offset]     = bf;
+#if 0
+#ifdef USE_OUTPUT_BUFFER_PER_WARP
+            unsigned int offset         = x + tgx + warp*cSim.stride;
+            float4 of                   = cSim.pForce4a[offset];
+            of.x                       += af.x;
+            of.y                       += af.y;
+            of.z                       += af.z;
+            of.w                       += af.w;
+            cSim.pForce4a[offset]       = of; 
+            cSim.pBornForce[offset]     = af.w;
+            offset                      = y + tgx + warp*cSim.stride;
+            of                          = cSim.pForce4a[offset];
+            of.x                       += sA[threadIdx.x].fx;
+            of.y                       += sA[threadIdx.x].fy;
+            of.z                       += sA[threadIdx.x].fz;
+            of.w                       += sA[threadIdx.x].fb;
+            cSim.pForce4a[offset]       = of; 
+            cSim.pBornForce[offset]     = af.w;
+#else
+            unsigned int offset         = x + tgx + (y >> GRIDBITS) * cSim.stride;
+            cSim.pForce4a[offset]       = af; 
+            cSim.pBornForce[offset]     = af.w;
+            af.x                        = sA[threadIdx.x].fx;
+            af.y                        = sA[threadIdx.x].fy;
+            af.z                        = sA[threadIdx.x].fz;
+            af.w                        = sA[threadIdx.x].fb;
+            offset                      = y + tgx + (x >> GRIDBITS) * cSim.stride;
+            cSim.pForce4a[offset]       = af; 
+            cSim.pBornForce[offset]     = af.w;
+#endif
+#endif
+            lasty = y;
+        }
+        pos++;
+    }
+    cSim.pEnergy[blockIdx.x*blockDim.x+threadIdx.x] += energy;
+}
--- a/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateGBVISoftcoreBornSum.cu
+++ b/plugins/freeEnergy/platforms/cuda/src/kernels/kCalculateGBVISoftcoreBornSum.cu
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+#include "GpuGBVISoftcore.h"
+#include "GpuFreeEnergyCudaKernels.h"
+struct cudaFreeEnergySimulationGBVI {
+    float quinticLowerLimitFactor;
+    float quinticUpperLimit;
+    float* pSwitchDerivative;
+};
+struct cudaFreeEnergySimulationGBVI gbviSim;
+static __constant__ cudaGmxSimulation cSim;
+static __constant__ cudaFreeEnergySimulationGBVI gbviSimDev;
+void SetCalculateGBVISoftcoreBornSumGpuSim(gpuContext gpu)
+{
+    cudaError_t status;
+    status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));     
+    RTERROR(status, "cudaMemcpyToSymbol: SetCalculateGBVISoftcoreBornSumGpuSim copy to cSim failed");
+    (void) fprintf( stderr, "SetCalculateGBVISoftcoreBornSumGpuSim\n" );
+}
+void GetCalculateGBVISoftcoreBornSumSim(gpuContext gpu)
+{
+    cudaError_t status;
+    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));     
+    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
+}
+void SetCalculateGBVISoftcoreSupplementarySim( GpuGBVISoftcore* gpuGBVISoftcore )
+{
+    cudaError_t status;
+    gbviSim.pSwitchDerivative        = gpuGBVISoftcore->getGpuSwitchDerivative();
+    gbviSim.quinticLowerLimitFactor  = gpuGBVISoftcore->getQuinticLowerLimitFactor();
+    gbviSim.quinticUpperLimit        = gpuGBVISoftcore->getQuinticUpperLimit();
+    status                           = cudaMemcpyToSymbol(gbviSimDev, &gbviSim, sizeof(cudaFreeEnergySimulationGBVI));
+    RTERROR(status, "cudaMemcpyToSymbol: SetCalculateGBVISoftcoreSupplementarySim");
+    (void) fprintf( stderr, "SetCalculateGBVISoftcoreSupplementarySim %14.6e %14.6e swDerv=%p\n",
+                    gbviSim.quinticLowerLimitFactor, gbviSim.quinticUpperLimit, gbviSim.pSwitchDerivative );
+}
+// create, initialize and enter BornRadiusScaleFactors values (used to scale contribution of atoms to Born sum of other atoms)
+// return handle to GpuGBVISoftcore object
+extern "C"
+GpuGBVISoftcore* gpuSetGBVISoftcoreParameters(gpuContext gpu, float innerDielectric, float solventDielectric, const std::vector<int>& atom,
+                                              const std::vector<float>& radius, const std::vector<float>& gamma,
+                                              const std::vector<float>& scaledRadii, const std::vector<float>& bornRadiusScaleFactors,
+                                              const std::vector<float>& quinticSplineParameters)
+{
+    static const float electricConstant         = -166.02691f;
+    unsigned int atoms                          = atom.size();
+    double tau                                  = ((1.0f/innerDielectric)-(1.0f/solventDielectric)); 
+    // create gpuGBVISoftcore, load parameters, and track minimum softcore value
+    // gpuGBVISoftcore is not really being used (it was in the initial implementation) -- 
+    // will be removed in future once confirmed not needed
+    GpuGBVISoftcore* gpuGBVISoftcore            = new GpuGBVISoftcore();
+    unsigned int numberOfParticles              = radius.size();
+    // check if quintic scaling to be applied
+    if( quinticSplineParameters.size() == 2 ){
+       gpuGBVISoftcore->setBornRadiiScalingMethod( 1 );
+       gpuGBVISoftcore->setQuinticLowerLimitFactor( quinticSplineParameters[0] );
+       gpuGBVISoftcore->setQuinticUpperLimit(       quinticSplineParameters[1] );
+       gpuGBVISoftcore->initializeGpuSwitchDerivative(  gpu->sim.paddedNumberOfAtoms );
+    }
+    for (unsigned int i = 0; i < bornRadiusScaleFactors.size(); i++) 
+    {
+            (*gpu->psGBVIData)[i].x = radius[i];
+            (*gpu->psGBVIData)[i].y = scaledRadii[i];
+            (*gpu->psGBVIData)[i].z = tau*gamma[i];
+            (*gpu->psGBVIData)[i].w = bornRadiusScaleFactors[i];
+(*gpu->psObcData)[i].x  = radius[i];
+(*gpu->psObcData)[i].y  = 0.9f*radius[i];
+    }
+    // Dummy out extra atom data
+    for (unsigned int i = atoms; i < gpu->sim.paddedNumberOfAtoms; i++)
+    {
+        (*gpu->psBornRadii)[i]      = 0.2f;
+        (*gpu->psGBVIData)[i].x     = 0.01f;
+        (*gpu->psGBVIData)[i].y     = 0.01f;
+        (*gpu->psGBVIData)[i].z     = 0.01f;
+        (*gpu->psGBVIData)[i].w     = 1.00f;
+    }
+#undef DUMP_PARAMETERS
+#define DUMP_PARAMETERS 1
+#if (DUMP_PARAMETERS == 1)
+    (void) fprintf( stderr,"GBVI softcore param %u %u sclMeth=%d LwFct=%8.3f UpLmt=[%12.5e (nm) %12.5e]\nR scaledR gamma*tau= bornRadiusScaleFactor \n",
+                    bornRadiusScaleFactors.size(), gpu->sim.paddedNumberOfAtoms,
+                    gpuGBVISoftcore->getBornRadiiScalingMethod(), gpuGBVISoftcore->getQuinticLowerLimitFactor(),
+                    powf( gpuGBVISoftcore->getQuinticUpperLimit(), -0.3333333f ), gpuGBVISoftcore->getQuinticUpperLimit() );
+    int maxPrint = 31;
+    for (unsigned int ii = 0; ii < gpu->sim.paddedNumberOfAtoms; ii++) 
+    {
+        (void) fprintf( stderr,"%6u %14.7e %14.7e %14.7e %14.7e\n",
+                        ii, (*gpu->psGBVIData)[ii].x, (*gpu->psGBVIData)[ii].y, (*gpu->psGBVIData)[ii].z, (*gpu->psGBVIData)[ii].w ); 
+        if( ii == maxPrint ){
+            ii = gpu->sim.paddedNumberOfAtoms - maxPrint;
+            if( ii < maxPrint )ii = maxPrint;
+        }
+    }
+#endif
+    gpu->psBornRadii->Upload();
+    gpu->psGBVIData->Upload();
+gpu->psObcData->Upload();
+    gpu->sim.preFactor              = 2.0f*electricConstant*((1.0f/innerDielectric)-(1.0f/solventDielectric))*gpu->sim.forceConversionFactor;
+    gpuGBVISoftcore->upload( gpu );
+#if (DUMP_PARAMETERS == 1)
+(void) fprintf( stderr, "gpuSetGBVISoftcoreParameters: preFactor=%14.6e elecCnstnt=%.4f frcCnvrsnFctr=%.4f tau=%.4f.\n",
+                gpu->sim.preFactor, 2.0f*electricConstant, gpu->sim.forceConversionFactor, ((1.0f/innerDielectric)-(1.0f/solventDielectric)) );
+#endif
+    return gpuGBVISoftcore;
+}
+// delete gpuGBVISoftcore
+extern "C"
+void gpuDeleteGBVISoftcoreParameters( void* gpuGBVISoftcore)
+{
+   delete gpuGBVISoftcore;
+}
+struct Atom {
+    float x;
+    float y;
+    float z;
+    float r;
+    float sr;
+    float sum;
+    float gamma;
+    float bornRadiusScaleFactor;
+};
+// Include versions of the kernels for N^2 calculations.
+#define METHOD_NAME(a, b) a##N2##b
+#include "kCalculateGBVISoftcoreBornSum.h"
+#define USE_OUTPUT_BUFFER_PER_WARP
+#undef METHOD_NAME
+#define METHOD_NAME(a, b) a##N2ByWarp##b
+#include "kCalculateGBVISoftcoreBornSum.h"
+// Include versions of the kernels with cutoffs.
+#if 0
+#undef METHOD_NAME
+#undef USE_OUTPUT_BUFFER_PER_WARP
+#define USE_CUTOFF
+#define METHOD_NAME(a, b) a##Cutoff##b
+#include "kCalculateGBVISoftcoreBornSum.h"
+#define USE_OUTPUT_BUFFER_PER_WARP
+#undef METHOD_NAME
+#define METHOD_NAME(a, b) a##CutoffByWarp##b
+#include "kCalculateGBVISoftcoreBornSum.h"
+// Include versions of the kernels with periodic boundary conditions.
+#undef METHOD_NAME
+#undef USE_OUTPUT_BUFFER_PER_WARP
+#define USE_PERIODIC
+#define METHOD_NAME(a, b) a##Periodic##b
+#include "kCalculateGBVISoftcoreBornSum.h"
+#define USE_OUTPUT_BUFFER_PER_WARP
+#undef METHOD_NAME
+#define METHOD_NAME(a, b) a##PeriodicByWarp##b
+#include "kCalculateGBVISoftcoreBornSum.h"
+#endif
+#if 0
+__global__ void kClearGBVISoftcoreBornSum_kernel()
+{
+    unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
+    while (pos < cSim.stride * cSim.nonbondOutputBuffers)
+    {
+        ((float*)cSim.pBornSum)[pos] = 0.0f;
+        pos += gridDim.x * blockDim.x;
+    }
+}
+#endif
+ __device__ void quinticSpline( float  x, float rl, float ru, float* outValue, float* outDerivative )
+{
+   float numerator    = x  - rl;
+   float denominator  = ru - rl;
+   float ratio        = numerator/denominator;
+   float ratio2       = ratio*ratio;
+   float ratio3       = ratio2*ratio;
+   *outValue          = 1.0f + ratio3*(-10.f + 15.0f*ratio - 6.0f*ratio2);
+   *outDerivative     = ratio2*(-30.0f + 60.0f*ratio - 30.0f*ratio2)/denominator;
+}
+__global__ void kReduceGBVIBornSumQuinticScaling_kernel()
+{
+    unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
+    while (pos < cSim.atoms)
+    {
+        float sum = 0.0f;
+        float* pSt = cSim.pBornSum + pos;
+        float4 atom = cSim.pGBVIData[pos];
+        // Get summed Born data
+        for (int i = 0; i < cSim.nonbondOutputBuffers; i++)
+        {
+            sum += *pSt;
+       //     printf("%4d %4d A: %9.4f\n", pos, i, *pSt);
+            pSt += cSim.stride;
+        }
+        // Now calculate Born radius
+        float Rinv           = 1.0f/atom.x;
+        float r3             = Rinv*Rinv*Rinv;
+        float splineL        = gbviSimDev.quinticLowerLimitFactor*r3;
+//float bSum           = sum;
+        float switchDeriviative;
+        if( sum > splineL ){
+            if( sum < r3 ){
+                float splineValue, splineDerivative;
+                quinticSpline( sum, splineL, r3, &splineValue, &splineDerivative ); 
+                switchDeriviative  = splineValue - (r3 - sum)*splineDerivative;
+                sum                = (r3 - sum)*splineValue + gbviSimDev.quinticUpperLimit;
+            } else {
+                sum                = gbviSimDev.quinticUpperLimit;
+                switchDeriviative  = 0.0f;
+            }
+        } else {
+            sum               = r3 - sum;
+            switchDeriviative = 1.0f;
+        }
+        cSim.pBornRadii[pos]              = pow( sum, (-1.0f/3.0f) ); 
+//cSim.pBornSum[pos]              = bSum;
+        gbviSimDev.pSwitchDerivative[pos] = switchDeriviative;
+        pos += gridDim.x * blockDim.x;
+    }   
+}
+void kReduceGBVIBornSumQuinticScaling(gpuContext gpu, GpuGBVISoftcore* gpuGBVISoftcore)
+{
+    //printf("kReduceGBVIBornSumQuinticScaling_kernel\n");
+    kReduceGBVIBornSumQuinticScaling_kernel<<<gpu->sim.blocks, 384>>>();
+    gpu->bRecalculateBornRadii = false;
+    LAUNCHERROR("kReduceGBVIBornSumQuinticScaling_kernel");
+#define GBVI_DEBUG 0
+#if ( GBVI_DEBUG == 1 )
+               gpu->psGBVIData->Download();
+               gpu->psBornSum->Download();
+               gpu->psBornRadii->Download();
+               gpu->psPosq4->Download();
+               CUDAStream<float>* psSwitchDerivative = gpuGBVISoftcore->getSwitchDerivative();
+               psSwitchDerivative->Download();
+                (void) fprintf( stderr, "\nkReduceGBVIBornSumQuinticScaling: Post BornSum %s Born radii & params\n", 
+                               (gpu->bIncludeGBVI ? "GBVI" : "Obc") );
+                for( int ii = 0; ii < gpu->natoms; ii++ ){
+                   (void) fprintf( stderr, "%6d bSum=%14.6e bR=%14.6e swDerv=%14.6e param[%14.6e %14.6e %14.6e] x[%14.6f %14.6f %14.6f %14.6f] %s\n",
+                                   ii, 
+                                   gpu->psBornSum->_pSysStream[0][ii],
+                                   gpu->psBornRadii->_pSysStream[0][ii],
+                                   psSwitchDerivative->_pSysStream[0][ii],
+                                   gpu->psGBVIData->_pSysStream[0][ii].x,
+                                   gpu->psGBVIData->_pSysStream[0][ii].y,
+                                   gpu->psGBVIData->_pSysStream[0][ii].z,
+                                   gpu->psPosq4->_pSysStream[0][ii].x, gpu->psPosq4->_pSysStream[0][ii].y,
+                                   gpu->psPosq4->_pSysStream[0][ii].z, gpu->psPosq4->_pSysStream[0][ii].w,
+                                   (fabs( psSwitchDerivative->_pSysStream[0][ii] - 1.0 ) > 1.0e-05 ? "SWWWWW" : "")
+                                 );  
+                }   
+#endif
+#undef GBVI_DEBUG
+}
+__global__ void kReduceGBVIBornForcesQuinticScaling_kernel()
+{
+    unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
+    float energy = 0.0f;
+    while (pos < cSim.atoms)
+    {
+        float bornRadius    = cSim.pBornRadii[pos];
+        float4 gbviData     = cSim.pGBVIData[pos];
+        float  switchDeriv  = gbviSimDev.pSwitchDerivative[pos];
+        float totalForce    = 0.0f;
+        float* pFt          = cSim.pBornForce + pos;
+        int i = cSim.nonbondOutputBuffers;
+        while (i >= 4)
+        {
+            float f1    = *pFt;
+            pFt        += cSim.stride;
+            float f2    = *pFt;
+            pFt        += cSim.stride;
+            float f3    = *pFt;
+            pFt        += cSim.stride;
+            float f4    = *pFt;
+            pFt        += cSim.stride;
+            totalForce += f1 + f2 + f3 + f4;
+            i -= 4;
+        }
+        if (i >= 2)
+        {
+            float f1    = *pFt;
+            pFt        += cSim.stride;
+            float f2    = *pFt;
+            pFt        += cSim.stride;
+            totalForce += f1 + f2;
+            i -= 2;
+        }
+        if (i > 0)
+        {
+            totalForce += *pFt;
+        }
+        float ratio         = (gbviData.x/bornRadius);
+        float ratio3        = ratio*ratio*ratio;
+        energy             -= gbviData.z*ratio3;
+        totalForce         += (3.0f*gbviData.z*ratio3)/bornRadius; // 'cavity' term
+        float br2           = bornRadius*bornRadius;
+        totalForce         *= (1.0f/3.0f)*br2*br2*switchDeriv;
+        pFt = cSim.pBornForce + pos;
+        *pFt = totalForce;
+        pos += gridDim.x * blockDim.x;
+    }
+    cSim.pEnergy[blockIdx.x * blockDim.x + threadIdx.x] += energy;
+}
+void kReduceGBVIBornForcesQuinticScaling(gpuContext gpu)
+{
+    //printf("kReduceObcGbsaBornForces\n");
+    kReduceGBVIBornForcesQuinticScaling_kernel<<<gpu->sim.blocks, gpu->sim.bf_reduce_threads_per_block>>>();
+    LAUNCHERROR("kReduceGBVIBornForcesQuinticScaling");
+}
+void kCalculateGBVISoftcoreBornSum(gpuContext gpu)
+{
+    //printf("kCalculateGBVIBornSum\n");
+    kClearGBVIBornSum( gpu );
+    LAUNCHERROR("kClearGBVIBornSum from kCalculateGBVISoftcoreBornSum");
+    size_t numWithInteractions;
+    switch (gpu->sim.nonbondedMethod)
+    {
+        case NO_CUTOFF:
+#define GBVI 0
+#if GBVI == 1
+int maxPrint = 31;
+gpu->psWorkUnit->Download();
+fprintf( stderr, "kCalculateGBVISoftcoreBornSum: bOutputBufferPerWarp=%u blks=%u th/blk=%u wu=%u %u shrd=%u\n", gpu->bOutputBufferPerWarp,
+                 gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, gpu->sim.workUnits, gpu->psWorkUnit->_pSysStream[0][0],
+        sizeof(Atom)*gpu->sim.nonbond_threads_per_block );
+               gpu->psGBVIData->Download();
+               gpu->psBornSum->Download();
+               gpu->psPosq4->Download();
+                (void) fprintf( stderr, "\nkCalculateGBVISoftcoreBornSum: pre BornSum %s Born radii & params\n",
+                               (gpu->bIncludeGBVI ? "GBVI" : "Obc") );
+                for( int ii = 0; ii < gpu->natoms; ii++ ){
+                   (void) fprintf( stderr, "%6d bSum=%14.6e param[%14.6e %14.6e %14.6e] x[%14.6f %14.6f %14.6f %14.6f]\n",
+                                   ii,
+                                   gpu->psBornSum->_pSysStream[0][ii],
+                                   gpu->psGBVIData->_pSysStream[0][ii].x,
+                                   gpu->psGBVIData->_pSysStream[0][ii].y,
+                                   gpu->psGBVIData->_pSysStream[0][ii].z,
+                                   gpu->psPosq4->_pSysStream[0][ii].x, gpu->psPosq4->_pSysStream[0][ii].y,
+                                   gpu->psPosq4->_pSysStream[0][ii].z, gpu->psPosq4->_pSysStream[0][ii].w
+                                 );
+                   if( (ii == maxPrint) && ( ii < (gpu->natoms - maxPrint)) ){
+                      ii = gpu->natoms - maxPrint;
+                   }
+                }
+#endif
+#undef GBVI
+            if (gpu->bOutputBufferPerWarp){
+                kCalculateGBVISoftcoreN2ByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit);
+            } else {
+                kCalculateGBVISoftcoreN2BornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit);
+            }
+            break;
+#if 0
+        case CUTOFF:
+            if (gpu->bOutputBufferPerWarp)
+                kCalculateGBVICutoffByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
+            else
+                kCalculateGBVICutoffBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit );
+            break;
+        case PERIODIC:
+            if (gpu->bOutputBufferPerWarp)
+                kCalculateGBVIPeriodicByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit );
+            else
+                kCalculateGBVIPeriodicBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit );
+            break;
+#endif
+    }
+    LAUNCHERROR("kCalculateGBVIBornSum");
+}