Checking in Cuda implementation of explicit solvent

1010df33 · Peter Eastman · df4b64cb · 1010df33 · 1010df33 · 1010df33
Commit 1010df33 authored Feb 19, 2009 by Peter Eastman
20 changed files
--- a/platforms/cuda/CMakeLists.txt
+++ b/platforms/cuda/CMakeLists.txt
-#---------------------------------------------------
-# OpenMM CUDA Platform
-#
-# Creates OpenMM library, base name=OpenMMCuda.
-# Default libraries are shared & optimized. Variants
-# are created for static (_static) and debug (_d).
-#
-# Windows:
-#   OpenMMCuda[_d].dll
-#   OpenMMCuda[_d].lib
-#   OpenMMCuda_static[_d].lib
-# Unix:
-#   libOpenMMCuda[_d].so
-#   libOpenMMCuda_static[_d].a
-#----------------------------------------------------
-
-SUBDIRS (tests)
-
-# The source is organized into subdirectories, but we handle them all from
-# this CMakeLists file rather than letting CMake visit them as SUBDIRS.
-SET(OPENMM_SOURCE_SUBDIRS .)
-
-
-# Collect up information about the version of the OpenMM library we're building
-# and make it available to the code so it can be built into the binaries.
-
-SET(OPENMMCUDA_LIBRARY_NAME OpenMMCuda)
-
-SET(SHARED_TARGET ${OPENMMCUDA_LIBRARY_NAME})
-SET(STATIC_TARGET ${OPENMMCUDA_LIBRARY_NAME}_static)
-
-
-# Ensure that debug libraries have "_d" appended to their names.
-# CMake gets this right on Windows automatically with this definition.
-IF (${CMAKE_GENERATOR} MATCHES "Visual Studio")
-    SET(CMAKE_DEBUG_POSTFIX "_d" CACHE INTERNAL "" FORCE)
-ENDIF (${CMAKE_GENERATOR} MATCHES "Visual Studio")
-
-# But on Unix or Cygwin we have to add the suffix manually
-IF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
-    SET(SHARED_TARGET ${SHARED_TARGET}_d)
-    SET(STATIC_TARGET ${STATIC_TARGET}_d)
-ENDIF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
-
-
-# These are all the places to search for header files which are
-# to be part of the API.
-SET(API_INCLUDE_DIRS) # start empty
-FOREACH(subdir ${OPENMM_SOURCE_SUBDIRS})
-    # append
-    SET(API_INCLUDE_DIRS ${API_INCLUDE_DIRS}
-                         ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/include 
-                         ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/include/internal)
-ENDFOREACH(subdir)
-
-# We'll need both *relative* path names, starting with their API_INCLUDE_DIRS,
-# and absolute pathnames.
-SET(API_REL_INCLUDE_FILES)   # start these out empty
-SET(API_ABS_INCLUDE_FILES)
-
-FOREACH(dir ${API_INCLUDE_DIRS})
-    FILE(GLOB fullpaths ${dir}/*.h)	# returns full pathnames
-    SET(API_ABS_INCLUDE_FILES ${API_ABS_INCLUDE_FILES} ${fullpaths})
-
-    FOREACH(pathname ${fullpaths})
-        GET_FILENAME_COMPONENT(filename ${pathname} NAME)
-        SET(API_REL_INCLUDE_FILES ${API_REL_INCLUDE_FILES} ${dir}/${filename})
-    ENDFOREACH(pathname)
-ENDFOREACH(dir)
-
-# collect up source files
-SET(SOURCE_FILES) # empty
-SET(SOURCE_INCLUDE_FILES)
-
-FOREACH(subdir ${OPENMM_SOURCE_SUBDIRS})
-    FILE(GLOB src_files  ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/src/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/src/*/*.cpp)
-    FILE(GLOB incl_files ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/src/*.h)
-    SET(SOURCE_FILES         ${SOURCE_FILES}         ${src_files})   #append
-    SET(SOURCE_INCLUDE_FILES ${SOURCE_INCLUDE_FILES} ${incl_files})
-    INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/include)
-ENDFOREACH(subdir)
-
-INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/src)
-
-SET(FINDCUDA_DIR ${CMAKE_CURRENT_SOURCE_DIR}/cuda-cmake)
-
-SUBDIRS (sharedTarget staticTarget)
+#---------------------------------------------------
+# OpenMM CUDA Platform
+#
+# Creates OpenMM library, base name=OpenMMCuda.
+# Default libraries are shared & optimized. Variants
+# are created for static (_static) and debug (_d).
+#
+# Windows:
+#   OpenMMCuda[_d].dll
+#   OpenMMCuda[_d].lib
+#   OpenMMCuda_static[_d].lib
+# Unix:
+#   libOpenMMCuda[_d].so
+#   libOpenMMCuda_static[_d].a
+#----------------------------------------------------
+
+SUBDIRS (tests)
+
+# The source is organized into subdirectories, but we handle them all from
+# this CMakeLists file rather than letting CMake visit them as SUBDIRS.
+SET(OPENMM_SOURCE_SUBDIRS .)
+
+
+# Collect up information about the version of the OpenMM library we're building
+# and make it available to the code so it can be built into the binaries.
+
+SET(OPENMMCUDA_LIBRARY_NAME OpenMMCuda)
+
+SET(SHARED_TARGET ${OPENMMCUDA_LIBRARY_NAME})
+SET(STATIC_TARGET ${OPENMMCUDA_LIBRARY_NAME}_static)
+
+
+# Ensure that debug libraries have "_d" appended to their names.
+# CMake gets this right on Windows automatically with this definition.
+IF (${CMAKE_GENERATOR} MATCHES "Visual Studio")
+    SET(CMAKE_DEBUG_POSTFIX "_d" CACHE INTERNAL "" FORCE)
+ENDIF (${CMAKE_GENERATOR} MATCHES "Visual Studio")
+
+# But on Unix or Cygwin we have to add the suffix manually
+IF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
+    SET(SHARED_TARGET ${SHARED_TARGET}_d)
+    SET(STATIC_TARGET ${STATIC_TARGET}_d)
+ENDIF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
+
+
+# These are all the places to search for header files which are
+# to be part of the API.
+SET(API_INCLUDE_DIRS) # start empty
+FOREACH(subdir ${OPENMM_SOURCE_SUBDIRS})
+    # append
+    SET(API_INCLUDE_DIRS ${API_INCLUDE_DIRS}
+                         ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/include 
+                         ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/include/internal)
+ENDFOREACH(subdir)
+
+# We'll need both *relative* path names, starting with their API_INCLUDE_DIRS,
+# and absolute pathnames.
+SET(API_REL_INCLUDE_FILES)   # start these out empty
+SET(API_ABS_INCLUDE_FILES)
+
+FOREACH(dir ${API_INCLUDE_DIRS})
+    FILE(GLOB fullpaths ${dir}/*.h)	# returns full pathnames
+    SET(API_ABS_INCLUDE_FILES ${API_ABS_INCLUDE_FILES} ${fullpaths})
+
+    FOREACH(pathname ${fullpaths})
+        GET_FILENAME_COMPONENT(filename ${pathname} NAME)
+        SET(API_REL_INCLUDE_FILES ${API_REL_INCLUDE_FILES} ${dir}/${filename})
+    ENDFOREACH(pathname)
+ENDFOREACH(dir)
+
+# collect up source files
+SET(SOURCE_FILES) # empty
+SET(SOURCE_INCLUDE_FILES)
+
+FOREACH(subdir ${OPENMM_SOURCE_SUBDIRS})
+    FILE(GLOB src_files  ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/src/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/src/*/*.cpp)
+    FILE(GLOB incl_files ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/src/*.h)
+    SET(SOURCE_FILES         ${SOURCE_FILES}         ${src_files})   #append
+    SET(SOURCE_INCLUDE_FILES ${SOURCE_INCLUDE_FILES} ${incl_files})
+    INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/include)
+ENDFOREACH(subdir)
+
+INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/src)
+
+SET(FINDCUDA_DIR ${CMAKE_CURRENT_SOURCE_DIR}/cuda-cmake)
+
+IF (APPLE)
+    LINK_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/cudpp/mac)
+ELSE (APPLE)
+    IF (WIN32)
+        LINK_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/cudpp/win)
+        INSTALL_FILES(/lib FILES ${CMAKE_CURRENT_SOURCE_DIR}/cudpp/win/cudpp32.dll)
+    ELSE (WIN32)
+        LINK_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/cudpp/linux)
+    ENDIF (WIN32)
+ENDIF(APPLE)
+SUBDIRS (sharedTarget staticTarget)
--- a/platforms/cuda/cudpp/mac/libcudpp.a
+++ b/platforms/cuda/cudpp/mac/libcudpp.a
--- a/platforms/cuda/cudpp/mac/libcutil.a
+++ b/platforms/cuda/cudpp/mac/libcutil.a
--- a/platforms/cuda/cudpp/win/cudpp.lib
+++ b/platforms/cuda/cudpp/win/cudpp.lib
--- a/platforms/cuda/cudpp/win/cudpp32.dll
+++ b/platforms/cuda/cudpp/win/cudpp32.dll
--- a/platforms/cuda/cudpp/win/cutil.lib
+++ b/platforms/cuda/cudpp/win/cutil.lib
--- a/platforms/cuda/src/CudaStreamFactory.cpp
+++ b/platforms/cuda/src/CudaStreamFactory.cpp
@@ -39,42 +39,40 @@
 using namespace OpenMM;

 StreamImpl* CudaStreamFactory::createStreamImpl(std::string name, int size, Stream::DataType type, const Platform& platform, OpenMMContextImpl& context) const {
+    CudaPlatform::PlatformData& data = *static_cast<CudaPlatform::PlatformData*>(context.getPlatformData());
    if (name == "particlePositions") {
-        CudaPlatform::PlatformData& data = *static_cast<CudaPlatform::PlatformData*>(context.getPlatformData());
        float padding[] = {100000.0f, 100000.0f, 100000.0f, 0.2f};
        return new CudaStreamImpl<float4>(name, size, type, platform, data.gpu->psPosq4, 4, padding, data.gpu);
    }
    if (name == "particleVelocities") {
-        CudaPlatform::PlatformData& data = *static_cast<CudaPlatform::PlatformData*>(context.getPlatformData());
        float padding[] = {0.0f, 0.0f, 0.0f, 0.0f};
        return new CudaStreamImpl<float4>(name, size, type, platform, data.gpu->psVelm4, 4, padding, data.gpu);
    }
    if (name == "particleForces") {
-        CudaPlatform::PlatformData& data = *static_cast<CudaPlatform::PlatformData*>(context.getPlatformData());
        float padding[] = {0.0f, 0.0f, 0.0f, 0.0f};
        return new CudaStreamImpl<float4>(name, size, type, platform, data.gpu->psForce4, 4, padding, data.gpu);
    }
    switch (type) {
    case Stream::Float:
    case Stream::Double:
-        return new CudaStreamImpl<float1>(name, size, type, platform, 1, NULL);
+        return new CudaStreamImpl<float1>(name, size, type, platform, 1, data.gpu);
    case Stream::Float2:
    case Stream::Double2:
-        return new CudaStreamImpl<float2>(name, size, type, platform, 1, NULL);
+        return new CudaStreamImpl<float2>(name, size, type, platform, 1, data.gpu);
    case Stream::Float3:
    case Stream::Double3:
-        return new CudaStreamImpl<float3>(name, size, type, platform, 1, NULL);
+        return new CudaStreamImpl<float3>(name, size, type, platform, 1, data.gpu);
    case Stream::Float4:
    case Stream::Double4:
-        return new CudaStreamImpl<float4>(name, size, type, platform, 1, NULL);
+        return new CudaStreamImpl<float4>(name, size, type, platform, 1, data.gpu);
    case Stream::Integer:
-        return new CudaStreamImpl<int1>(name, size, type, platform, 1, NULL);
+        return new CudaStreamImpl<int1>(name, size, type, platform, 1, data.gpu);
    case Stream::Integer2:
-        return new CudaStreamImpl<int2>(name, size, type, platform, 1, NULL);
+        return new CudaStreamImpl<int2>(name, size, type, platform, 1, data.gpu);
    case Stream::Integer3:
-        return new CudaStreamImpl<int3>(name, size, type, platform, 1, NULL);
+        return new CudaStreamImpl<int3>(name, size, type, platform, 1, data.gpu);
    case Stream::Integer4:
-        return new CudaStreamImpl<int4>(name, size, type, platform, 1, NULL);
+        return new CudaStreamImpl<int4>(name, size, type, platform, 1, data.gpu);
    }
    throw OpenMMException("Tried to create a Stream with an illegal DataType.");
 }
--- a/platforms/cuda/src/CudaStreamImpl.h
+++ b/platforms/cuda/src/CudaStreamImpl.h
@@ -55,7 +55,7 @@ public:
 private:
    void initType();
    CUDAStream<T>* stream;
-	 _gpuContext* gpu;
+    _gpuContext* gpu;
    bool ownStream;
    int width, rowOffset;
    float paddingValues[4];
@@ -132,58 +132,60 @@ CudaStreamImpl<T>::~CudaStreamImpl() {
 template <class T>
 void CudaStreamImpl<T>::loadFromArray(const void* array) {
    float* data = reinterpret_cast<float*>(stream->_pSysData);
+    int* order = gpu->psAtomIndex->_pSysData;
    if (baseType == Stream::Float) {
        float* arrayData = (float*) array;
        for (int i = 0; i < getSize(); ++i)
            for (int j = 0; j < width; ++j)
-                data[i*rowOffset+j] = arrayData[i*width+j];
+                data[i*rowOffset+j] = arrayData[order[i]*width+j];
    }
    else if (baseType == Stream::Double) {
        double* arrayData = (double*) array;
        for (int i = 0; i < getSize(); ++i)
            for (int j = 0; j < width; ++j)
-                data[i*rowOffset+j] = (float) arrayData[i*width+j];
+                data[i*rowOffset+j] = (float) arrayData[order[i]*width+j];
    }
    else {
        int* arrayData = (int*) array;
        for (int i = 0; i < getSize(); ++i)
            for (int j = 0; j < width; ++j)
-                data[i*rowOffset+j] = (float) arrayData[i*width+j];
+                data[i*rowOffset+j] = (float) arrayData[order[i]*width+j];
    }
    for (int i = getSize(); i < (int) stream->_length; ++i)
        for (int j = 0; j < rowOffset; ++j)
            data[i*rowOffset+j] = paddingValues[j];
    stream->Upload();

-	 // VisualStudio compiler did not like stream == gpu->psPosq4 
-	 //if( gpu && stream == gpu->psPosq4 ){
+    // VisualStudio compiler did not like stream == gpu->psPosq4
+    //if( gpu && stream == gpu->psPosq4 ){

-	 if( gpu && getName() == "particlePositions" ){
-	    gpu->bRecalculateBornRadii = true;
-	 }
+    if( gpu && getName() == "particlePositions" ){
+        gpu->bRecalculateBornRadii = true;
+    }
 }

 template <class T>
 void CudaStreamImpl<T>::saveToArray(void* array) {
    stream->Download();
    float* data = reinterpret_cast<float*>(stream->_pSysData);
+    int* order = gpu->psAtomIndex->_pSysData;
    if (baseType == Stream::Float) {
        float* arrayData = (float*) array;
        for (int i = 0; i < getSize(); ++i)
            for (int j = 0; j < width; ++j)
-                arrayData[i*width+j] = data[i*rowOffset+j];
+                arrayData[order[i]*width+j] = data[i*rowOffset+j];
    }
    else if (baseType == Stream::Double) {
        double* arrayData = (double*) array;
        for (int i = 0; i < getSize(); ++i)
            for (int j = 0; j < width; ++j)
-                arrayData[i*width+j] = data[i*rowOffset+j];
+                arrayData[order[i]*width+j] = data[i*rowOffset+j];
    }
    else {
        int* arrayData = (int*) array;
        for (int i = 0; i < getSize(); ++i)
            for (int j = 0; j < width; ++j)
-                arrayData[i*width+j] = (int) data[i*rowOffset+j];
+                arrayData[order[i]*width+j] = (int) data[i*rowOffset+j];
    }
 }


--- a/platforms/cuda/src/kernels/cudaKernels.h
+++ b/platforms/cuda/src/kernels/cudaKernels.h
@@ -41,19 +41,19 @@ extern void kGenerateRandoms(gpuContext gpu);
 extern void kCalculateCDLJObcGbsaForces1(gpuContext gpu);
 extern void kCalculateCDLJObcGbsaForces1_12(gpuContext gpu);
 extern void kCalculateCDLJForces(gpuContext gpu);
-extern void kCalculateCDLJForces_12(gpuContext gpu);
 extern void kCalculateObcGbsaForces1(gpuContext gpu);
 extern void kCalculateObcGbsaForces1_12(gpuContext gpu);
 extern void kReduceObcGbsaBornForces(gpuContext gpu);
 extern void kCalculateObcGbsaForces2(gpuContext gpu);
-extern void kCalculateObcGbsaForces2_12(gpuContext gpu);
 extern void kCalculateLocalForces(gpuContext gpu);
 extern void kCalculateAndersenThermostat(gpuContext gpu);
 extern void kReduceBornSumAndForces(gpuContext gpu);
 extern void kUpdatePart1(gpuContext gpu);
 extern void kApplyFirstShake(gpuContext gpu);
+extern void kApplyFirstSettle(gpuContext gpu);
 extern void kUpdatePart2(gpuContext gpu);
 extern void kApplySecondShake(gpuContext gpu);
+extern void kApplySecondSettle(gpuContext gpu);
 extern void kVerletUpdatePart1(gpuContext gpu);
 extern void kVerletUpdatePart2(gpuContext gpu);
 extern void kBrownianUpdatePart1(gpuContext gpu);
@@ -66,12 +66,8 @@ extern void kClearBornForces(gpuContext gpu);
 // Initializers
 extern void SetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu);
 extern void GetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu);
-extern void SetCalculateCDLJObcGbsaForces1_12Sim(gpuContext gpu);
-extern void GetCalculateCDLJObcGbsaForces1_12Sim(gpuContext gpu);
 extern void SetCalculateCDLJForcesSim(gpuContext gpu);
 extern void GetCalculateCDLJForcesSim(gpuContext gpu);
-extern void SetCalculateCDLJForces_12Sim(gpuContext gpu);
-extern void GetCalculateCDLJForces_12Sim(gpuContext gpu);
 extern void SetCalculateLocalForcesSim(gpuContext gpu);
 extern void GetCalculateLocalForcesSim(gpuContext gpu);
 extern void SetCalculateObcGbsaBornSumSim(gpuContext gpu);
@@ -82,14 +78,14 @@ extern void SetCalculateObcGbsaForces1_12Sim(gpuContext gpu);
 extern void GetCalculateObcGbsaForces1_12Sim(gpuContext gpu);
 extern void SetCalculateObcGbsaForces2Sim(gpuContext gpu);
 extern void GetCalculateObcGbsaForces2Sim(gpuContext gpu);
-extern void SetCalculateObcGbsaForces2_12Sim(gpuContext gpu);
-extern void GetCalculateObcGbsaForces2_12Sim(gpuContext gpu);
 extern void SetCalculateAndersenThermostatSim(gpuContext gpu);
 extern void GetCalculateAndersenThermostatSim(gpuContext gpu);
 extern void SetForcesSim(gpuContext gpu);
 extern void GetForcesSim(gpuContext gpu);
 extern void SetUpdateShakeHSim(gpuContext gpu);
 extern void GetUpdateShakeHSim(gpuContext gpu);
+extern void SetSettleSim(gpuContext gpu);
+extern void GetSettleSim(gpuContext gpu);
 extern void SetVerletUpdateSim(gpuContext gpu);
 extern void GetVerletUpdateSim(gpuContext gpu);
 extern void SetBrownianUpdateSim(gpuContext gpu);

--- a/platforms/cuda/src/kernels/cudatypes.h
+++ b/platforms/cuda/src/kernels/cudatypes.h
@@ -36,11 +36,12 @@
 #include <limits>
 #include <iostream>
 #include <stdio.h>
+#include <stdlib.h>
+#include <string>
 #include <cuda.h>
 #include <cuda_runtime_api.h>
 #include <builtin_types.h>
 #include <vector_functions.h>
-using namespace std;

 #define RTERROR(status, s) \
    if (status != cudaSuccess) { \
@@ -122,7 +123,7 @@ CUDAStream<T>::~CUDAStream()
 template <typename T>
 void CUDAStream<T>::Allocate()
 {
-     cudaError_t status;
+    cudaError_t status;
    _pSysStream =   new T*[_subStreams];
    _pDevStream =   new T*[_subStreams];
    _pSysData =     new T[_subStreams * _stride];
@@ -228,6 +229,12 @@ static const int GT2XX_RANDOM_THREADS_PER_BLOCK         = 384;
 static const int G8X_NONBOND_WORKUNITS_PER_SM           = 220;
 static const int GT2XX_NONBOND_WORKUNITS_PER_SM         = 256;

+enum CudaNonbondedMethod
+{
+    NO_CUTOFF,
+    CUTOFF,
+    PERIODIC
+};

 struct cudaGmxSimulation {
    // Constants
@@ -236,6 +243,7 @@ struct cudaGmxSimulation {
    unsigned int    blocks;                         // Number of blocks to launch across linear kernels
    unsigned int    nonbond_blocks;                 // Number of blocks to launch across CDLJ and Born Force Part1
    unsigned int    bornForce2_blocks;              // Number of blocks to launch across Born Force 2
+    unsigned int    interaction_blocks;             // Number of blocks to launch when identifying interacting tiles
    unsigned int    threads_per_block;              // Threads per block to launch
    unsigned int    nonbond_threads_per_block;      // Threads per block in nonbond kernel calls
    unsigned int    bornForce2_threads_per_block;   // Threads per block in nonbond kernel calls
@@ -245,12 +253,17 @@ struct cudaGmxSimulation {
    unsigned int    bsf_reduce_threads_per_block;   // Threads per block in Born Sum And Forces reduction calls
    unsigned int    max_shake_threads_per_block;    // Maximum threads per block in shake kernel calls
    unsigned int    shake_threads_per_block;        // Threads per block in shake kernel calls
+    unsigned int    settle_threads_per_block;       // Threads per block in SETTLE kernel calls
    unsigned int    nonshake_threads_per_block;     // Threads per block in nonshaking kernel call
    unsigned int    max_localForces_threads_per_block;  // Threads per block in local forces kernel calls
    unsigned int    localForces_threads_per_block;  // Threads per block in local forces kernel calls
    unsigned int    random_threads_per_block;       // Threads per block in RNG kernel calls
+    unsigned int    interaction_threads_per_block;  // Threads per block when identifying interacting tiles
    unsigned int    workUnits;                      // Number of work units
    unsigned int*   pWorkUnit;                      // Pointer to work units
+    unsigned int*   pInteractingWorkUnit;           // Pointer to work units that have interactions
+    unsigned int*   pInteractionFlag;               // Flags for which work units have interactions
+    size_t*         pInteractionCount;              // A count of the number of work units which have interactions
    unsigned int    nonbond_workBlock;              // Number of work units running simultaneously per block in CDLJ and Born Force Part 1
    unsigned int    bornForce2_workBlock;           // Number of work units running second half of Born Forces calculation
    unsigned int    workUnitsPerSM;                 // Number of workblocks per SM
@@ -270,6 +283,12 @@ struct cudaGmxSimulation {
    unsigned int    outputBuffers;                  // Number of output buffers
    float           bigFloat;                       // Floating point value used as a flag for Shaken atoms 
    float           epsfac;                         // Epsilon factor for CDLJ calculations
+    CudaNonbondedMethod nonbondedMethod;            // How to handle nonbonded interactions
+    float           nonbondedCutoffSqr;             // Cutoff distance for CDLJ calculations
+    float           periodicBoxSizeX;               // The X dimension of the periodic box
+    float           periodicBoxSizeY;               // The Y dimension of the periodic box
+    float           periodicBoxSizeZ;               // The Z dimension of the periodic box
+    float           reactionFieldK;                 // Constant for reaction field correction
    float           probeRadius;                    // SASA probe radius
    float           surfaceAreaFactor;              // ACE approximation surface area factor
    float           electricConstant;               // ACE approximation electric constant
@@ -326,6 +345,7 @@ struct cudaGmxSimulation {
    float4*         pLJ14Parameter;                 // Lennard Jones 1-4 parameters
    float           inverseTotalMass;               // Used in linear momentum removal
    unsigned int    ShakeConstraints;               // Total number of Shake constraints
+    unsigned int    settleConstraints;              // Total number of Settle constraints
    unsigned int    NonShakeConstraints;            // Total number of NonShake atoms
    unsigned int    maxShakeIterations;             // Maximum shake iterations
    unsigned int    degreesOfFreedom;               // Number of degrees of freedom in system
@@ -334,12 +354,17 @@ struct cudaGmxSimulation {
    int*            pNonShakeID;                    // Not Shaking atoms
    int4*           pShakeID;                       // Shake atoms and phase
    float4*         pShakeParameter;                // Shake parameters
+    int4*           pSettleID;                      // Settle atoms
+    float2*         pSettleParameter;               // Settle parameters
    unsigned int*   pExclusion;                     // Nonbond exclusion data
    unsigned int    bond_offset;                    // Offset to end of bonds
    unsigned int    bond_angle_offset;              // Offset to end of bond angles
    unsigned int    dihedral_offset;                // Offset to end of dihedrals
    unsigned int    rb_dihedral_offset;             // Offset to end of Ryckaert Bellemans dihedrals
    unsigned int    LJ14_offset;                    // Offset to end of Lennard Jones 1-4 parameters
+    int*            pAtomIndex;                     // The original index of each atom
+    float4*         pGridBoundingBox;               // The size of each grid cell
+    float4*         pGridCenter;                    // The center of each grid cell

    // Mutable stuff
    float4*         pPosq;                          // Pointer to atom positions and charges

--- a/platforms/cuda/src/kernels/gpu.cpp
+++ b/platforms/cuda/src/kernels/gpu.cpp
@@ -30,6 +30,7 @@
 * -------------------------------------------------------------------------- */

 #include <stdio.h>
+#include <string.h>
 #include <cuda.h>
 #include <vector_functions.h>
 #include <cstdlib>
@@ -40,6 +41,7 @@
 #include <ctime>
 #include <cmath>
 #include <map>
+#include <algorithm>
 #ifdef WIN32
  #include <windows.h>
 #else
@@ -148,6 +150,23 @@ struct ShakeCluster {
    }
 };

+struct Constraint
+{
+    Constraint(int atom1, int atom2, float distance2) : atom1(atom1), atom2(atom2), distance2(distance2) {
+    }
+    int atom1, atom2;
+    float distance2;
+};
+
+struct Molecule {
+    vector<int> atoms;
+    vector<int> bonds;
+    vector<int> angles;
+    vector<int> periodicTorsions;
+    vector<int> rbTorsions;
+    vector<int> constraints;
+};
+
 static const float dielectricOffset         =    0.009f;
 static const float PI                       =    3.1415926535f;
 static const float probeRadius              =    0.14f;
@@ -793,7 +812,7 @@ int gpuReadCoulombParameters(gpuContext gpu, char* fname)
                }
        }
        cout << total_exclusions << " total exclusions.\n";
-        gpuSetCoulombParameters(gpu, epsfac, atom, c6, c12, q, symbol, exclusions);
+        gpuSetCoulombParameters(gpu, epsfac, atom, c6, c12, q, symbol, exclusions, NO_CUTOFF);
        gpuSetObcParameters(gpu, defaultInnerDielectric, defaultSolventDielectric, atom, radius, scale);
        return coulombs;
    }
@@ -807,11 +826,12 @@ int gpuReadCoulombParameters(gpuContext gpu, char* fname)

 extern "C"
 void gpuSetCoulombParameters(gpuContext gpu, float epsfac, const vector<int>& atom, const vector<float>& c6, const vector<float>& c12, const vector<float>& q,
-        const vector<char>& symbol, const vector<vector<int> >& exclusions)
+        const vector<char>& symbol, const vector<vector<int> >& exclusions, CudaNonbondedMethod method)
 {
    unsigned int coulombs = atom.size();
    gpu->sim.epsfac = epsfac;
-    unsigned int total_exclusions = 0;
+    gpu->sim.nonbondedMethod = method;
+    gpu->exclusions = exclusions;

    for (unsigned int i = 0; i < coulombs; i++)
    {
@@ -827,34 +847,6 @@ void gpuSetCoulombParameters(gpuContext gpu, float epsfac, const vector<int>& at
            gpu->psPosq4->_pSysStream[0][i].w = p0;
            gpu->psSigEps2->_pSysStream[0][i].x = p1;
            gpu->psSigEps2->_pSysStream[0][i].y = p2;
-
-#if (DUMP_PARAMETERS == 1)
-        cout << 
-            i << " " << 
-            gpu->psPosq4->_pSysStream[0][i].w << " " << 
-            gpu->psSigEps2->_pSysStream[0][i].x << " " <<
-            gpu->psSigEps2->_pSysStream[0][i].y << " " << 
-            p0 << " " <<
-            p1 << " " <<
-            p2 << " " <<
-            exclusions;
-#endif
-            for (int j = 0; j < (int) exclusions[i].size(); j++)
-            {
-#if (DUMP_PARAMETERS == 1)
-                cout << " " << exclusions[i][j];
-#endif
-
-                gpu->pExclusion[i * gpu->sim.paddedNumberOfAtoms + exclusions[i][j]] = 0;
-                if (i >= (int) exclusions[i][j])
-                {
-                    total_exclusions++;
-                }
-            }
-#if (DUMP_PARAMETERS == 1)
-            cout << endl;
-#endif
-
    }

    // Dummy out extra atom data
@@ -868,36 +860,31 @@ void gpuSetCoulombParameters(gpuContext gpu, float epsfac, const vector<int>& at
        gpu->psSigEps2->_pSysStream[0][i].y     = 0.0f;   
    }

-    // Add in remaining exclusions
-    for (unsigned int i = coulombs; i < gpu->sim.paddedNumberOfAtoms; i++)
-    {
-        for (unsigned int j = 0; j < gpu->sim.paddedNumberOfAtoms; j++)
-        {
-            gpu->pExclusion[i * gpu->sim.paddedNumberOfAtoms + j] = 0;
-            gpu->pExclusion[j * gpu->sim.paddedNumberOfAtoms + i] = 0;
-        }
-    }
-
    gpu->psPosq4->Upload();
    gpu->psSigEps2->Upload();
+}

-    // Check for exclusion consistency
-    for (unsigned int i = 0; i < coulombs; i++)
-    {
-        for (unsigned int j = i; j < coulombs; j++)
-        {
+extern "C"
+void gpuSetNonbondedCutoff(gpuContext gpu, float cutoffDistance, float solventDielectric)
+{
+    gpu->sim.nonbondedCutoffSqr = cutoffDistance*cutoffDistance;
+    gpu->sim.reactionFieldK = pow(cutoffDistance, -3.0f)*(solventDielectric-1.0f)/(2.0f*solventDielectric+1.0f);
+}

-            if (gpu->pExclusion[i * gpu->sim.paddedNumberOfAtoms + j] != gpu->pExclusion[j * gpu->sim.paddedNumberOfAtoms + i])
-                cout << "Warning: inconsistent exclusion betweens atoms " << i << " and " << j << endl;
-        }
-    }
+extern "C"
+void gpuSetPeriodicBoxSize(gpuContext gpu, float xsize, float ysize, float zsize)
+{
+    gpu->sim.periodicBoxSizeX = xsize;
+    gpu->sim.periodicBoxSizeY = ysize;
+    gpu->sim.periodicBoxSizeZ = zsize;
 }

 extern "C"
 void gpuSetObcParameters(gpuContext gpu, float innerDielectric, float solventDielectric, const vector<int>& atom, const vector<float>& radius, const vector<float>& scale)
 {
    unsigned int atoms = atom.size();
-    
+
+    gpu->bIncludeGBSA = true;
    for (unsigned int i = 0; i < atoms; i++)
    {
            gpu->psObcData->_pSysStream[0][i].x = radius[i] - dielectricOffset;
@@ -973,11 +960,91 @@ void gpuSetShakeParameters(gpuContext gpu, const vector<int>& atom1, const vecto
        constraintCount[atom1[i]]++;
        constraintCount[atom2[i]]++;
    }
-    
+
+    // Identify clusters of three atoms that can be treated with SETTLE.  First, for every
+    // atom that might be part of such a cluster, make a list of the two other atoms it is
+    // connected to.
+
+    vector<map<int, float> > settleConstraints(gpu->natoms);
+    for (int i = 0; i < atom1.size(); i++) {
+        if (constraintCount[atom1[i]] == 2 && constraintCount[atom2[i]] == 2) {
+            settleConstraints[atom1[i]][atom2[i]] = distance[i];
+            settleConstraints[atom2[i]][atom1[i]] = distance[i];
+        }
+    }
+
+    // Now remove the ones that don't actually form closed loops of three atoms.
+
+    vector<int> settleClusters;
+    for (int i = 0; i < settleConstraints.size(); i++) {
+        if (settleConstraints[i].size() == 2) {
+            int partner1 = settleConstraints[i].begin()->first;
+            int partner2 = (++settleConstraints[i].begin())->first;
+            if (settleConstraints[partner1].size() != 2 || settleConstraints[partner2].size() != 2 ||
+                    settleConstraints[partner1].find(partner2) == settleConstraints[partner1].end())
+                settleConstraints[i].clear();
+            else if (i < partner1 && i < partner2)
+                settleClusters.push_back(i);
+        }
+        else
+            settleConstraints[i].clear();
+    }
+
+    // Record the actual SETTLE clusters.
+
+    CUDAStream<int4>* psSettleID          = new CUDAStream<int4>((int) settleClusters.size(), 1);
+    gpu->psSettleID                       = psSettleID;
+    gpu->sim.pSettleID                    = psSettleID->_pDevStream[0];
+    CUDAStream<float2>* psSettleParameter = new CUDAStream<float2>((int) settleClusters.size(), 1);
+    gpu->psSettleParameter                = psSettleParameter;
+    gpu->sim.pSettleParameter             = psSettleParameter->_pDevStream[0];
+    gpu->sim.settleConstraints            = settleClusters.size();
+    for (int i = 0; i < settleClusters.size(); i++) {
+        int atom1 = settleClusters[i];
+        int atom2 = settleConstraints[atom1].begin()->first;
+        int atom3 = (++settleConstraints[atom1].begin())->first;
+        float dist12 = settleConstraints[atom1].find(atom2)->second;
+        float dist13 = settleConstraints[atom1].find(atom3)->second;
+        float dist23 = settleConstraints[atom2].find(atom3)->second;
+        if (dist12 == dist13) { // atom1 is the central atom
+            psSettleID->_pSysData[i].x = atom1;
+            psSettleID->_pSysData[i].y = atom2;
+            psSettleID->_pSysData[i].z = atom3;
+            psSettleParameter->_pSysData[i].x = dist12;
+            psSettleParameter->_pSysData[i].y = dist23;
+        }
+        else if (dist12 == dist23) { // atom2 is the central atom
+            psSettleID->_pSysData[i].x = atom2;
+            psSettleID->_pSysData[i].y = atom1;
+            psSettleID->_pSysData[i].z = atom3;
+            psSettleParameter->_pSysData[i].x = dist12;
+            psSettleParameter->_pSysData[i].y = dist13;
+        }
+        else if (dist13 == dist23) { // atom3 is the central atom
+            psSettleID->_pSysData[i].x = atom3;
+            psSettleID->_pSysData[i].y = atom1;
+            psSettleID->_pSysData[i].z = atom2;
+            psSettleParameter->_pSysData[i].x = dist13;
+            psSettleParameter->_pSysData[i].y = dist12;
+        }
+        else
+            throw OpenMMException("Two of the three distances constrained with SETTLE must be the same.");
+    }
+    psSettleID->Upload();
+    psSettleParameter->Upload();
+    gpu->sim.settle_threads_per_block     = (gpu->sim.settleConstraints + gpu->sim.blocks - 1) / gpu->sim.blocks;
+    if (gpu->sim.settle_threads_per_block > gpu->sim.max_shake_threads_per_block)
+        gpu->sim.settle_threads_per_block = gpu->sim.max_shake_threads_per_block;
+    if (gpu->sim.settle_threads_per_block < 1)
+        gpu->sim.settle_threads_per_block = 1;
+
    // Find clusters consisting of a central atom with up to three peripheral atoms.
    
    map<int, ShakeCluster> clusters;
    for (int i = 0; i < atom1.size(); i++) {
+        if (settleConstraints[atom1[i]].size() == 2)
+            continue; // This is being taken care of with SETTLE.
+
        // Determine which is the central atom.
        
        bool firstIsCentral;
@@ -1096,9 +1163,6 @@ int gpuAllocateInitialBuffers(gpuContext gpu)
    gpu->sim.degreesOfFreedom           = 3 * gpu->sim.atoms - 6;
    gpu->gpAtomTable                    = NULL;
    gpu->gAtomTypes                     = 0;
-    gpu->sim.nonbondOutputBuffers       = gpu->sim.paddedNumberOfAtoms / GRID;
-    gpu->sim.totalNonbondOutputBuffers  = 2 * gpu->sim.nonbondOutputBuffers;
-    gpu->sim.outputBuffers              = gpu->sim.totalNonbondOutputBuffers;
    gpu->psPosq4                        = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, 1);
    gpu->sim.stride                     = gpu->psPosq4->_stride;
    gpu->sim.stride2                    = gpu->sim.stride * 2;
@@ -1129,10 +1193,14 @@ int gpuAllocateInitialBuffers(gpuContext gpu)
    gpu->psObcData                      = new CUDAStream<float2>(gpu->sim.paddedNumberOfAtoms, 1);
    gpu->sim.pObcData                   = gpu->psObcData->_pDevStream[0];
    gpu->pAtomSymbol                    = new unsigned char[gpu->natoms];
-
+    gpu->psAtomIndex                    = new CUDAStream<int>(gpu->sim.paddedNumberOfAtoms, 1);
+    gpu->sim.pAtomIndex                 = gpu->psAtomIndex->_pDevStream[0];
+    for (int i = 0; i < (int) gpu->sim.paddedNumberOfAtoms; i++)
+        gpu->psAtomIndex->_pSysStream[0][i] = i;
+    gpu->psAtomIndex->Upload();
    // Determine randoms
    gpu->seed                           = (unsigned long)time(NULL) & 0x000fffff;
-    gpu->sim.randomFrames               = 995;
+    gpu->sim.randomFrames               = 95;
    gpu->sim.randomIterations           = gpu->sim.randomFrames;
    gpu->sim.randoms                    = gpu->sim.randomFrames * gpu->sim.paddedNumberOfAtoms - 5 * GRID;
    gpu->sim.totalRandoms               = gpu->sim.randoms + gpu->sim.paddedNumberOfAtoms;
@@ -1351,8 +1419,8 @@ void* gpuInit(int numAtoms)
    {
        sscanf(pAdapter, "%d", &device);
    }
-    cudaError_t status = cudaSetDevice(device);
-    RTERROR(status, "Error setting CUDA device")
+//    cudaError_t status = cudaSetDevice(device);
+//    RTERROR(status, "Error setting CUDA device")

    // Determine which core to run on
 #if 0
@@ -1445,6 +1513,8 @@ void* gpuInit(int numAtoms)
    gpu->sim.probeRadius            = probeRadius;
    gpu->sim.surfaceAreaFactor      = surfaceAreaFactor;
    gpu->sim.electricConstant       = electricConstant;
+    gpu->sim.nonbondedMethod        = NO_CUTOFF;
+    gpu->sim.nonbondedCutoffSqr     = 0.0f;

    gpu->sim.bigFloat               = 99999999.0f;
    gpu->sim.forceConversionFactor  = forceConversionFactor;
@@ -1461,6 +1531,7 @@ void* gpuInit(int numAtoms)
    gpu->bCalculateCM               = false;
    gpu->bRemoveCM                  = false;
    gpu->bRecalculateBornRadii      = true;
+    gpu->bIncludeGBSA               = false;
    gpuInitializeRandoms(gpu);

    // To be determined later
@@ -1489,18 +1560,20 @@ void* gpuInit(int numAtoms)
    gpu->psLJ14Parameter            = NULL;
    gpu->psShakeID                  = NULL;
    gpu->psShakeParameter           = NULL;
+    gpu->psSettleID                 = NULL;
+    gpu->psSettleParameter          = NULL;
    gpu->psExclusion                = NULL;
    gpu->psWorkUnit                 = NULL;
+    gpu->psInteractingWorkUnit      = NULL;
+    gpu->psInteractionFlag          = NULL;
+    gpu->psInteractionCount         = NULL;
+    gpu->psGridBoundingBox          = NULL;
+    gpu->psGridCenter               = NULL;


    // Initialize output buffer before reading parameters
    gpu->pOutputBufferCounter       = new unsigned int[gpu->sim.paddedNumberOfAtoms];
    memset(gpu->pOutputBufferCounter, 0, gpu->sim.paddedNumberOfAtoms * sizeof(unsigned int));
-    
-    // Initialize exclusion array
-    gpu->pExclusion = new unsigned int[gpu->sim.paddedNumberOfAtoms * gpu->sim.paddedNumberOfAtoms];
-    for (unsigned int i = 0; i < gpu->sim.paddedNumberOfAtoms * gpu->sim.paddedNumberOfAtoms; i++)
-        gpu->pExclusion[i] = 1;

    return (void*)gpu;
 }
@@ -1586,7 +1659,6 @@ void gpuShutDown(gpuContext gpu)
 {
    // Delete sysmem pointers
    delete[] gpu->pOutputBufferCounter;
-    delete[] gpu->pExclusion;
    delete[] gpu->gpAtomTable;
    delete[] gpu->pAtomSymbol;

@@ -1620,13 +1692,23 @@ void gpuShutDown(gpuContext gpu)
    delete gpu->psLJ14Parameter;
    delete gpu->psShakeID;
    delete gpu->psShakeParameter;
+    delete gpu->psSettleID;
+    delete gpu->psSettleParameter;
    delete gpu->psExclusion;
    delete gpu->psWorkUnit;
+    delete gpu->psInteractingWorkUnit;
+    delete gpu->psInteractionFlag;
+    delete gpu->psInteractionCount;
    delete gpu->psRandom4;
    delete gpu->psRandom2;
    delete gpu->psRandomPosition;    
    delete gpu->psRandomSeed;
    delete gpu->psLinearMomentum;
+    delete gpu->psAtomIndex;
+    delete gpu->psGridBoundingBox;
+    delete gpu->psGridCenter;
+    if (gpu->cudpp != 0)
+        cudppDestroyPlan(gpu->cudpp);

    // Wrap up
    delete gpu;
@@ -1636,6 +1718,19 @@ void gpuShutDown(gpuContext gpu)
 extern "C"
 int gpuBuildOutputBuffers(gpuContext gpu)
 {
+    // Select the number of output buffer to use.
+    gpu->bOutputBufferPerWarp           = true;
+    gpu->sim.nonbondOutputBuffers       = gpu->sim.nonbond_blocks * gpu->sim.nonbond_threads_per_block / GRID;
+    if (gpu->sim.nonbondOutputBuffers >= gpu->sim.paddedNumberOfAtoms/GRID)
+    {
+        // For small systems, it is more efficient to have one output buffer per block of 32 atoms instead of one per warp.
+        gpu->bOutputBufferPerWarp           = false;
+        gpu->sim.nonbondOutputBuffers       = gpu->sim.paddedNumberOfAtoms / GRID;
+    }
+    gpu->sim.totalNonbondOutputBuffers  = (gpu->bIncludeGBSA ? 2 * gpu->sim.nonbondOutputBuffers : gpu->sim.nonbondOutputBuffers);
+    gpu->sim.outputBuffers              = gpu->sim.totalNonbondOutputBuffers;
+
+
    unsigned int outputBuffers = gpu->sim.totalNonbondOutputBuffers;
    for (unsigned int i = 0; i < gpu->sim.paddedNumberOfAtoms; i++)
    {
@@ -1715,30 +1810,59 @@ int gpuBuildThreadBlockWorkList(gpuContext gpu)
    const unsigned int grid = gpu->grid;
    const unsigned int dim = (atoms + (grid - 1)) / grid;
    const unsigned int cells = dim * (dim + 1) / 2;
-    const unsigned int* pExclusion = gpu->pExclusion;
    CUDAStream<unsigned int>* psWorkUnit = new CUDAStream<unsigned int>(cells, 1u);
    unsigned int* pWorkList = psWorkUnit->_pSysStream[0];
    gpu->psWorkUnit = psWorkUnit;
    gpu->sim.pWorkUnit = psWorkUnit->_pDevStream[0];
+    CUDAStream<unsigned int>* psInteractingWorkUnit = new CUDAStream<unsigned int>(cells, 1u);
+    gpu->psInteractingWorkUnit = psInteractingWorkUnit;
+    gpu->sim.pInteractingWorkUnit = psInteractingWorkUnit->_pDevStream[0];
+    CUDAStream<unsigned int>* psInteractionFlag = new CUDAStream<unsigned int>(cells, 1u);
+    gpu->psInteractionFlag = psInteractionFlag;
+    gpu->sim.pInteractionFlag = psInteractionFlag->_pDevStream[0];
+    CUDAStream<size_t>* psInteractionCount = new CUDAStream<size_t>(1, 1u);
+    gpu->psInteractionCount = psInteractionCount;
+    gpu->sim.pInteractionCount = psInteractionCount->_pDevStream[0];
+    CUDAStream<float4>* psGridBoundingBox = new CUDAStream<float4>(dim, 1u);
+    gpu->psGridBoundingBox = psGridBoundingBox;
+    gpu->sim.pGridBoundingBox = psGridBoundingBox->_pDevStream[0];
+    CUDAStream<float4>* psGridCenter = new CUDAStream<float4>(dim, 1u);
+    gpu->psGridCenter = psGridCenter;
+    gpu->sim.pGridCenter = psGridCenter->_pDevStream[0];
    gpu->sim.nonbond_workBlock      = gpu->sim.nonbond_threads_per_block / GRID;
    gpu->sim.bornForce2_workBlock   = gpu->sim.bornForce2_threads_per_block / GRID;
    gpu->sim.workUnits = cells;

-    // Increase block count if necessary for extra large molecules that would
-    // otherwise overflow the SM workunit buffers
-    int minimumBlocks = (cells + gpu->sim.workUnitsPerSM - 1) / gpu->sim.workUnitsPerSM;
-    if ((int) gpu->sim.nonbond_blocks < minimumBlocks)
+    // Initialize the CUDPP workspace.
+    gpu->cudpp = 0;
+    CUDPPConfiguration config;
+    config.datatype = CUDPP_UINT;
+    config.algorithm = CUDPP_COMPACT;
+    config.options = CUDPP_OPTION_FORWARD;
+    CUDPPResult result = cudppPlan(&gpu->cudpp, config, cells, 1, 0);
+    if (CUDPP_SUCCESS != result)
    {
-        gpu->sim.nonbond_blocks = gpu->sim.nonbond_blocks * ((minimumBlocks + gpu->sim.nonbond_blocks - 1) / gpu->sim.nonbond_blocks);
-    }
-    if ((int) gpu->sim.bornForce2_blocks < minimumBlocks)
-    {
-        gpu->sim.bornForce2_blocks = gpu->sim.bornForce2_blocks * ((minimumBlocks + gpu->sim.bornForce2_blocks - 1) / gpu->sim.bornForce2_blocks);
+        printf("Error initializing CUDPP: %d\n", result);
+        exit(-1);
    }
+
+    // Increase block count if necessary for extra large molecules that would
+    // otherwise overflow the SM workunit buffers
+//    int minimumBlocks = (cells + gpu->sim.workUnitsPerSM - 1) / gpu->sim.workUnitsPerSM;
+//    if ((int) gpu->sim.nonbond_blocks < minimumBlocks)
+//    {
+//        gpu->sim.nonbond_blocks = gpu->sim.nonbond_blocks * ((minimumBlocks + gpu->sim.nonbond_blocks - 1) / gpu->sim.nonbond_blocks);
+//    }
+//    if ((int) gpu->sim.bornForce2_blocks < minimumBlocks)
+//    {
+//        gpu->sim.bornForce2_blocks = gpu->sim.bornForce2_blocks * ((minimumBlocks + gpu->sim.bornForce2_blocks - 1) / gpu->sim.bornForce2_blocks);
+//    }
    gpu->sim.nbWorkUnitsPerBlock            = cells / gpu->sim.nonbond_blocks;
    gpu->sim.nbWorkUnitsPerBlockRemainder   = cells - gpu->sim.nonbond_blocks * gpu->sim.nbWorkUnitsPerBlock;
    gpu->sim.bf2WorkUnitsPerBlock           = cells / gpu->sim.bornForce2_blocks;
    gpu->sim.bf2WorkUnitsPerBlockRemainder  = cells - gpu->sim.bornForce2_blocks * gpu->sim.bf2WorkUnitsPerBlock;
+    gpu->sim.interaction_threads_per_block = 64;
+    gpu->sim.interaction_blocks = (gpu->sim.workUnits + gpu->sim.interaction_threads_per_block - 1) / gpu->sim.interaction_threads_per_block;

    // Decrease thread count for extra small molecules to spread computation
    // across entire chip
@@ -1763,23 +1887,6 @@ int gpuBuildThreadBlockWorkList(gpuContext gpu)
        for (unsigned int x = y; x < dim; x++)
        {
            pWorkList[count] = (x << 17) | (y << 2);
-
-            // Check for exclusions
-            int exclusions = 0;
-            for (unsigned int i = y * grid; i < y * grid + grid; i++)
-            {
-                for (unsigned int j = x * grid; j < x * grid + grid; j++)
-                {
-                    if (!pExclusion[i * atoms + j])
-                    {
-                        exclusions++;
-                    }
-                }
-            }
-
-            // Signal exclusions if they exist
-            if (exclusions > 0)
-                pWorkList[count] |= 0x1;
            count++;
        }
    }
@@ -1790,42 +1897,58 @@ int gpuBuildThreadBlockWorkList(gpuContext gpu)
 }

 extern "C"
-int gpuBuildExclusionList(gpuContext gpu)
+void gpuBuildExclusionList(gpuContext gpu)
 {
-    unsigned int atoms = gpu->sim.paddedNumberOfAtoms;
-    CUDAStream<unsigned int>* psExclusion = new CUDAStream<unsigned int>(atoms * atoms, 1u);
+    const unsigned int atoms = gpu->sim.paddedNumberOfAtoms;
+    const unsigned int grid = gpu->grid;
+    const unsigned int dim = (atoms+(grid-1))/grid;
+    CUDAStream<unsigned int>* psExclusion = new CUDAStream<unsigned int>((atoms*atoms+grid-1) / grid, 1u);
    gpu->psExclusion = psExclusion;
    gpu->sim.pExclusion = psExclusion->_pDevStream[0];
    unsigned int* pExList = psExclusion->_pSysStream[0];
-    int exclusions = 0;
-    unsigned int pos = 0;    
+    unsigned int* pWorkList = gpu->psWorkUnit->_pSysStream[0];
+    for (int i = 0; i < psExclusion->_length; ++i)
+        pExList[i] = 0xFFFFFFFF;
+
+    // Fill in the exclusions.

-    for (unsigned int x = 0; x < atoms; x += gpu->grid)
+    for (int atom1 = 0; atom1 < gpu->exclusions.size(); ++atom1)
    {
-        for (unsigned int y = 0; y < atoms; y += gpu->grid)
-        {       
-            for (unsigned x1 = x; x1 < x + gpu->grid; x1++)
-            {
-                unsigned int mask = 0;
-                for (unsigned int y1 = y ; y1 < y + gpu->grid; y1++)
-                {
-                    mask >>= 1;
-                    if (gpu->pExclusion[x1 * atoms + y1] == 0)
-                    {
-                        if (x1 >= y1)
-                            exclusions++;
-                    }
-                    else
-                        mask |= 0x80000000;
-                }
-                pExList[pos++] = mask;
-            }
+        int x = atom1/grid;
+        int offset = atom1-x*grid;
+        for (int j = 0; j < gpu->exclusions[atom1].size(); ++j)
+        {
+            int atom2 = gpu->exclusions[atom1][j];
+            int y = atom2/grid;
+            int index = x*atoms+y*grid+offset;
+            pExList[index] &= 0xFFFFFFFF-(1<<(atom2-y*grid));
+            int cell = (x > y ? x+y*dim-y*(y+1)/2 : y+x*dim-x*(x+1)/2);
+            pWorkList[cell] |= 1;
+        }
+    }
+
+    // Mark all interactions that involve a padding atom as being excluded.
+
+    for (int atom1 = gpu->natoms; atom1 < atoms; ++atom1)
+    {
+        int x = atom1/grid;
+        int offset1 = atom1-x*grid;
+        for (int atom2 = 0; atom2 < atoms; ++atom2)
+        {
+            int y = atom2/grid;
+            int index = x*atoms+y*grid+offset1;
+            pExList[index] &= 0xFFFFFFFF-(1<<(atom2-y*grid));
+            int offset2 = atom2-y*grid;
+            index = y*atoms+x*grid+offset2;
+            pExList[index] &= 0xFFFFFFFF-(1<<(atom1-x*grid));
+            int cell = (x > y ? x+y*dim-y*(y+1)/2 : y+x*dim-x*(x+1)/2);
+            pWorkList[cell] |= 1;
        }
    }
    
    psExclusion->Upload();
+    gpu->psWorkUnit->Upload();
    gpuSetConstants(gpu);
-    return exclusions;
 }

 extern "C"
@@ -1842,14 +1965,12 @@ int gpuSetConstants(gpuContext gpu)
    SetUpdateShakeHSim(gpu);
    SetVerletUpdateSim(gpu);
    SetBrownianUpdateSim(gpu);
+    SetSettleSim(gpu);
    SetRandomSim(gpu);

    if (gpu->sm_version >= SM_12)
    {
-        SetCalculateCDLJForces_12Sim(gpu);
-        SetCalculateCDLJObcGbsaForces1_12Sim(gpu);
        SetCalculateObcGbsaForces1_12Sim(gpu);
-        SetCalculateObcGbsaForces2_12Sim(gpu);
    }

    return 1;
@@ -2705,3 +2826,329 @@ void gpuDumpObcLoop1(gpuContext gpu)
        );
    }
 }
+
+static void tagAtomsInMolecule(int atom, int molecule, vector<int>& atomMolecule, vector<vector<int> >& atomBonds)
+{
+    // Recursively tag atoms as belonging to a particular molecule.
+
+    atomMolecule[atom] = molecule;
+    for (int i = 0; i < atomBonds[atom].size(); i++)
+        if (atomMolecule[atomBonds[atom][i]] == -1)
+            tagAtomsInMolecule(atomBonds[atom][i], molecule, atomMolecule, atomBonds);
+}
+
+static void findMoleculeGroups(gpuContext gpu)
+{
+    // First make a list of constraints for future use.
+
+    vector<Constraint> constraints;
+    for (int i = 0; i < gpu->sim.ShakeConstraints; i++)
+    {
+        int atom1 = gpu->psShakeID->_pSysData[i].x;
+        int atom2 = gpu->psShakeID->_pSysData[i].y;
+        int atom3 = gpu->psShakeID->_pSysData[i].z;
+        int atom4 = gpu->psShakeID->_pSysData[i].w;
+        float distance2 = gpu->psShakeParameter->_pSysData[i].z;
+        constraints.push_back(Constraint(atom1, atom2, distance2));
+        if (atom3 != -1)
+            constraints.push_back(Constraint(atom1, atom3, distance2));
+        if (atom4 != -1)
+            constraints.push_back(Constraint(atom1, atom3, distance2));
+    }
+    for (int i = 0; i < gpu->sim.settleConstraints; i++)
+    {
+        int atom1 = gpu->psSettleID->_pSysData[i].x;
+        int atom2 = gpu->psSettleID->_pSysData[i].y;
+        int atom3 = gpu->psSettleID->_pSysData[i].z;
+        float distance12 = gpu->psSettleParameter->_pSysData[i].x;
+        float distance23 = gpu->psSettleParameter->_pSysData[i].y;
+        constraints.push_back(Constraint(atom1, atom2, distance12*distance12));
+        constraints.push_back(Constraint(atom1, atom3, distance12*distance12));
+        constraints.push_back(Constraint(atom2, atom3, distance23*distance23));
+    }
+
+    // First make a list of every other atom to which each atom is connect by a bond or constraint.
+
+    int numAtoms = gpu->natoms;
+    vector<vector<int> > atomBonds(numAtoms);
+    for (int i = 0; i < gpu->sim.bonds; i++)
+    {
+        int atom1 = gpu->psBondID->_pSysData[i].x;
+        int atom2 = gpu->psBondID->_pSysData[i].y;
+        atomBonds[atom1].push_back(atom2);
+        atomBonds[atom2].push_back(atom1);
+    }
+    for (int i = 0; i < constraints.size(); i++)
+    {
+        int atom1 = constraints[i].atom1;
+        int atom2 = constraints[i].atom2;
+        atomBonds[atom1].push_back(atom2);
+        atomBonds[atom2].push_back(atom1);
+    }
+
+    // Now tag atoms by which molecule they belong to.
+
+    vector<int> atomMolecule(numAtoms, -1);
+    int numMolecules = 0;
+    for (int i = 0; i < numAtoms; i++)
+        if (atomMolecule[i] == -1)
+            tagAtomsInMolecule(i, numMolecules++, atomMolecule, atomBonds);
+    vector<vector<int> > atomIndices(numMolecules);
+    for (int i = 0; i < numAtoms; i++)
+        atomIndices[atomMolecule[i]].push_back(i);
+
+    // Construct a description of each molecule.
+
+    vector<Molecule> molecules(numMolecules);
+    for (int i = 0; i < numMolecules; i++)
+        molecules[i].atoms = atomIndices[i];
+    for (int i = 0; i < gpu->sim.bonds; i++)
+    {
+        int atom1 = gpu->psBondID->_pSysData[i].x;
+        molecules[atomMolecule[atom1]].bonds.push_back(i);
+    }
+    for (int i = 0; i < gpu->sim.bond_angles; i++)
+    {
+        int atom1 = gpu->psBondAngleID1->_pSysData[i].x;
+        molecules[atomMolecule[atom1]].angles.push_back(i);
+    }
+    for (int i = 0; i < gpu->sim.dihedrals; i++)
+    {
+        int atom1 = gpu->psDihedralID1->_pSysData[i].x;
+        molecules[atomMolecule[atom1]].periodicTorsions.push_back(i);
+    }
+    for (int i = 0; i < gpu->sim.rb_dihedrals; i++)
+    {
+        int atom1 = gpu->psRbDihedralID1->_pSysData[i].x;
+        molecules[atomMolecule[atom1]].rbTorsions.push_back(i);
+    }
+    for (int i = 0; i < constraints.size(); i++)
+    {
+        molecules[atomMolecule[constraints[i].atom1]].constraints.push_back(i);
+    }
+
+    // Sort them into groups of identical molecules.
+
+    vector<Molecule> uniqueMolecules;
+    vector<vector<int> > moleculeInstances;
+    for (int molIndex = 0; molIndex < molecules.size(); molIndex++)
+    {
+        Molecule& mol = molecules[molIndex];
+
+        // See if it is identical to another molecule.
+
+        bool isNew = true;
+        for (int j = 0; j < uniqueMolecules.size() && isNew; j++)
+        {
+            Molecule& mol2 = uniqueMolecules[j];
+            bool identical = true;
+            if (mol.atoms.size() != mol2.atoms.size() || mol.bonds.size() != mol2.bonds.size()
+                    || mol.angles.size() != mol2.angles.size() || mol.periodicTorsions.size() != mol2.periodicTorsions.size()
+                    || mol.rbTorsions.size() != mol2.rbTorsions.size() || mol.constraints.size() != mol2.constraints.size())
+                identical = false;
+            int atomOffset = mol2.atoms[0]-mol.atoms[0];
+            float4* posq = gpu->psPosq4->_pSysData;
+            float4* velm = gpu->psVelm4->_pSysData;
+            float2* sigeps = gpu->psSigEps2->_pSysData;
+            for (int i = 0; i < mol.atoms.size() && identical; i++)
+                if (mol.atoms[i] != mol2.atoms[i]-atomOffset || posq[mol.atoms[i]].w != posq[mol2.atoms[i]].w ||
+                        velm[mol.atoms[i]].w != velm[mol2.atoms[i]].w || sigeps[mol.atoms[i]].x != sigeps[mol2.atoms[i]].x ||
+                        sigeps[mol.atoms[i]].y != sigeps[mol2.atoms[i]].y)
+                    identical = false;
+            int4* bondID = gpu->psBondID->_pSysData;
+            float2* bondParam = gpu->psBondParameter->_pSysData;
+            for (int i = 0; i < mol.bonds.size() && identical; i++)
+                if (bondID[mol.bonds[i]].x != bondID[mol2.bonds[i]].x-atomOffset || bondID[mol.bonds[i]].y != bondID[mol2.bonds[i]].y-atomOffset ||
+                        bondParam[mol.bonds[i]].x != bondParam[mol2.bonds[i]].x || bondParam[mol.bonds[i]].y != bondParam[mol2.bonds[i]].y)
+                    identical = false;
+            int4* angleID = gpu->psBondAngleID1->_pSysData;
+            float2* angleParam = gpu->psBondAngleParameter->_pSysData;
+            for (int i = 0; i < mol.angles.size() && identical; i++)
+                if (angleID[mol.angles[i]].x != angleID[mol2.angles[i]].x-atomOffset ||
+                        angleID[mol.angles[i]].y != angleID[mol2.angles[i]].y-atomOffset ||
+                        angleID[mol.angles[i]].z != angleID[mol2.angles[i]].z-atomOffset ||
+                        angleParam[mol.angles[i]].x != angleParam[mol2.angles[i]].x ||
+                        angleParam[mol.angles[i]].y != angleParam[mol2.angles[i]].y)
+                    identical = false;
+            int4* periodicID = gpu->psDihedralID1->_pSysData;
+            float4* periodicParam = gpu->psDihedralParameter->_pSysData;
+            for (int i = 0; i < mol.periodicTorsions.size() && identical; i++)
+                if (periodicID[mol.periodicTorsions[i]].x != periodicID[mol2.periodicTorsions[i]].x-atomOffset ||
+                        periodicID[mol.periodicTorsions[i]].y != periodicID[mol2.periodicTorsions[i]].y-atomOffset ||
+                        periodicID[mol.periodicTorsions[i]].z != periodicID[mol2.periodicTorsions[i]].z-atomOffset ||
+                        periodicID[mol.periodicTorsions[i]].w != periodicID[mol2.periodicTorsions[i]].w-atomOffset ||
+                        periodicParam[mol.periodicTorsions[i]].x != periodicParam[mol2.periodicTorsions[i]].x ||
+                        periodicParam[mol.periodicTorsions[i]].y != periodicParam[mol2.periodicTorsions[i]].y ||
+                        periodicParam[mol.periodicTorsions[i]].z != periodicParam[mol2.periodicTorsions[i]].z)
+                    identical = false;
+            int4* rbID = gpu->psRbDihedralID1->_pSysData;
+            float4* rbParam1 = gpu->psRbDihedralParameter1->_pSysData;
+            float2* rbParam2 = gpu->psRbDihedralParameter2->_pSysData;
+            for (int i = 0; i < mol.rbTorsions.size() && identical; i++)
+                if (rbID[mol.rbTorsions[i]].x != rbID[mol2.rbTorsions[i]].x-atomOffset ||
+                        rbID[mol.rbTorsions[i]].y != rbID[mol2.rbTorsions[i]].y-atomOffset ||
+                        rbID[mol.rbTorsions[i]].z != rbID[mol2.rbTorsions[i]].z-atomOffset ||
+                        rbID[mol.rbTorsions[i]].w != rbID[mol2.rbTorsions[i]].w-atomOffset ||
+                        rbParam1[mol.rbTorsions[i]].x != rbParam1[mol2.rbTorsions[i]].x ||
+                        rbParam1[mol.rbTorsions[i]].y != rbParam1[mol2.rbTorsions[i]].y ||
+                        rbParam1[mol.rbTorsions[i]].z != rbParam1[mol2.rbTorsions[i]].z ||
+                        rbParam1[mol.rbTorsions[i]].w != rbParam1[mol2.rbTorsions[i]].w ||
+                        rbParam2[mol.rbTorsions[i]].x != rbParam2[mol2.rbTorsions[i]].x ||
+                        rbParam2[mol.rbTorsions[i]].y != rbParam2[mol2.rbTorsions[i]].y)
+                    identical = false;
+            for (int i = 0; i < mol.constraints.size() && identical; i++)
+                if (constraints[mol.constraints[i]].atom1 != constraints[mol2.constraints[i]].atom1-atomOffset ||
+                        constraints[mol.constraints[i]].atom2 != constraints[mol2.constraints[i]].atom2-atomOffset ||
+                        constraints[mol.constraints[i]].distance2 != constraints[mol2.constraints[i]].distance2)
+                    identical = false;
+            if (identical)
+            {
+                moleculeInstances[j].push_back(mol.atoms[0]);
+                isNew = false;
+            }
+        }
+        if (isNew)
+        {
+            uniqueMolecules.push_back(mol);
+            moleculeInstances.push_back(vector<int>());
+            moleculeInstances[moleculeInstances.size()-1].push_back(mol.atoms[0]);
+        }
+    }
+    gpu->moleculeGroups.resize(moleculeInstances.size());
+    for (int i = 0; i < moleculeInstances.size(); i++)
+    {
+        gpu->moleculeGroups[i].instances = moleculeInstances[i];
+        vector<int>& atoms = uniqueMolecules[i].atoms;
+        gpu->moleculeGroups[i].atoms.resize(atoms.size());
+        for (int j = 0; j < atoms.size(); j++)
+            gpu->moleculeGroups[i].atoms[j] = atoms[j]-atoms[0];
+    }
+}
+
+extern "C"
+void gpuReorderAtoms(gpuContext gpu)
+{
+    if (gpu->natoms == 0 || gpu->sim.nonbondedCutoffSqr == 0.0)
+        return;
+    if (gpu->moleculeGroups.size() == 0)
+        findMoleculeGroups(gpu);
+
+    // Find the range of positions and the number of bins along each axis.
+
+    int numAtoms = gpu->natoms;
+    gpu->psPosq4->Download();
+    gpu->psVelm4->Download();
+    float4* posq = gpu->psPosq4->_pSysData;
+    float4* velm = gpu->psVelm4->_pSysData;
+    float minx = posq[0].x, maxx = posq[0].x;
+    float miny = posq[0].y, maxy = posq[0].y;
+    float minz = posq[0].z, maxz = posq[0].z;
+    if (gpu->sim.nonbondedMethod == PERIODIC)
+    {
+        minx = miny = minz = 0.0;
+        maxx = gpu->sim.periodicBoxSizeX;
+        maxy = gpu->sim.periodicBoxSizeY;
+        maxz = gpu->sim.periodicBoxSizeZ;
+    }
+    else
+    {
+        for (int i = 1; i < numAtoms; i++)
+        {
+            minx = min(minx, posq[i].x);
+            maxx = max(maxx, posq[i].x);
+            miny = min(miny, posq[i].y);
+            maxy = max(maxy, posq[i].y);
+            minz = min(minz, posq[i].z);
+            maxz = max(maxz, posq[i].z);
+        }
+    }
+    float binWidth = 0.2*sqrt(gpu->sim.nonbondedCutoffSqr);
+    int xbins = 1 + (int) ((maxx-minx)/binWidth);
+    int ybins = 1 + (int) ((maxy-miny)/binWidth);
+    int zbins = 1 + (int) ((maxz-minz)/binWidth);
+
+    // Loop over each group of identical molecules and reorder them.
+
+    vector<int> originalIndex(numAtoms);
+    vector<float4> newPosq(numAtoms);
+    vector<float4> newVelm(numAtoms);
+    for (int group = 0; group < gpu->moleculeGroups.size(); group++)
+    {
+        // Find the center of each molecule.
+
+        gpuMoleculeGroup& mol = gpu->moleculeGroups[group];
+        int numMolecules = mol.instances.size();
+        vector<int>& atoms = mol.atoms;
+        vector<float3> molPos(numMolecules);
+        for (int i = 0; i < numMolecules; i++)
+        {
+            molPos[i].x = 0.0f;
+            molPos[i].y = 0.0f;
+            molPos[i].z = 0.0f;
+            for (int j = 0; j < atoms.size(); j++)
+            {
+                int atom = atoms[j]+mol.instances[i];
+                molPos[i].x += posq[atom].x;
+                molPos[i].y += posq[atom].y;
+                molPos[i].z += posq[atom].z;
+            }
+            molPos[i].x /= atoms.size();
+            molPos[i].y /= atoms.size();
+            molPos[i].z /= atoms.size();
+        }
+        if (gpu->sim.nonbondedMethod == PERIODIC)
+        {
+            // Move each molecule position into the same box.
+
+            for (int i = 0; i < numMolecules; i++)
+            {
+                molPos[i].x -= floor(molPos[i].x/gpu->sim.periodicBoxSizeX)*gpu->sim.periodicBoxSizeX;
+                molPos[i].y -= floor(molPos[i].y/gpu->sim.periodicBoxSizeY)*gpu->sim.periodicBoxSizeY;
+                molPos[i].z -= floor(molPos[i].z/gpu->sim.periodicBoxSizeZ)*gpu->sim.periodicBoxSizeZ;
+            }
+        }
+
+        // Select a bin for each molecule, then sort them by bin.
+
+        vector<pair<int, int> > molBins(numMolecules);
+        for (int i = 0; i < numMolecules; i++)
+        {
+            int x = (int) ((molPos[i].x-minx)/binWidth);
+            int y = (int) ((molPos[i].y-miny)/binWidth);
+            int z = (int) ((molPos[i].z-minz)/binWidth);
+            int yodd = y&1;
+            int zodd = z&1;
+            int bin = z*xbins*ybins;
+            bin += (zodd ? ybins-y : y)*xbins;
+            bin += (yodd ? xbins-x : x);
+            molBins[i] = pair<int, int>(bin, i);
+        }
+        sort(molBins.begin(), molBins.end());
+
+        // Reorder the atoms.
+
+        for (int i = 0; i < numMolecules; i++)
+        {
+            for (int j = 0; j < atoms.size(); j++)
+            {
+                int oldIndex = mol.instances[molBins[i].second]+atoms[j];
+                int newIndex = mol.instances[i]+atoms[j];
+                originalIndex[newIndex] = gpu->psAtomIndex->_pSysStream[0][oldIndex];
+                newPosq[newIndex] = posq[oldIndex];
+                newVelm[newIndex] = velm[oldIndex];
+            }
+        }
+    }
+
+    // Update the streams.
+
+    for (int i = 0; i < numAtoms; i++)
+        posq[i] = newPosq[i];
+    gpu->psPosq4->Upload();
+    for (int i = 0; i < numAtoms; i++)
+        velm[i] = newVelm[i];
+    gpu->psVelm4->Upload();
+    for (int i = 0; i < numAtoms; i++)
+        gpu->psAtomIndex->_pSysData[i] = originalIndex[i];
+    gpu->psAtomIndex->Upload();
+}
--- a/platforms/cuda/src/kernels/gputypes.h
+++ b/platforms/cuda/src/kernels/gputypes.h
@@ -33,14 +33,20 @@
 * -------------------------------------------------------------------------- */

 #include "cudatypes.h"
+#include "cudpp.h"
 #include <vector>

 struct gpuAtomType {
-    string name;
+    std::string name;
    char symbol;
    float r;
 };

+struct gpuMoleculeGroup {
+    std::vector<int> atoms;
+    std::vector<int> instances;
+};
+
 enum SM_VERSION
 {
    SM_10,
@@ -61,8 +67,9 @@ struct _gpuContext {
    int gAtomTypes;
    cudaGmxSimulation sim;
    unsigned int* pOutputBufferCounter;
-    unsigned int* pExclusion;
+    std::vector<std::vector<int> > exclusions;
    unsigned char* pAtomSymbol;
+    std::vector<gpuMoleculeGroup> moleculeGroups;
    float iterations;
    float epsfac;
    float solventDielectric;
@@ -70,9 +77,12 @@ struct _gpuContext {
    int grid;
    bool bCalculateCM;
    bool bRemoveCM;
-	 bool bRecalculateBornRadii;
+    bool bRecalculateBornRadii;
+    bool bOutputBufferPerWarp;
+    bool bIncludeGBSA;
    unsigned long seed;
    SM_VERSION sm_version;
+    CUDPPHandle cudpp;
    CUDAStream<float4>* psPosq4;
    CUDAStream<float4>* psPosqP4;
    CUDAStream<float4>* psOldPosq4;
@@ -103,15 +113,21 @@ struct _gpuContext {
    CUDAStream<int>* psNonShakeID;
    CUDAStream<int4>* psShakeID;
    CUDAStream<float4>* psShakeParameter;
+    CUDAStream<int4>* psSettleID;
+    CUDAStream<float2>* psSettleParameter;
    CUDAStream<unsigned int>* psExclusion;
    CUDAStream<unsigned int>* psWorkUnit;
+    CUDAStream<unsigned int>* psInteractingWorkUnit;
+    CUDAStream<unsigned int>* psInteractionFlag;
+    CUDAStream<size_t>* psInteractionCount;
    CUDAStream<float4>* psRandom4;          // Pointer to sets of 4 random numbers for MD integration
    CUDAStream<float2>* psRandom2;          // Pointer to sets of 2 random numbers for MD integration
    CUDAStream<uint4>* psRandomSeed;        // Pointer to each random seed
    CUDAStream<int>* psRandomPosition;      // Pointer to random number positions
    CUDAStream<float4>* psLinearMomentum;   // Pointer to total linear momentum per CTA
-   
-
+    CUDAStream<int>* psAtomIndex;           // The original index of each atom
+    CUDAStream<float4>* psGridBoundingBox;  // The size of each grid cell
+    CUDAStream<float4>* psGridCenter;       // The center and radius for each grid cell
 };

 typedef struct _gpuContext *gpuContext;
@@ -156,10 +172,10 @@ void gpuSetLJ14Parameters(gpuContext gpu, float epsfac, float fudge, const std::
        const std::vector<float>& c6, const std::vector<float>& c12, const std::vector<float>& q1, const std::vector<float>& q2);

 extern "C"
-float gpuGetAtomicRadius(gpuContext gpu, string s);
+float gpuGetAtomicRadius(gpuContext gpu, std::string s);

 extern "C"
-unsigned char gpuGetAtomicSymbol(gpuContext gpu, string s);
+unsigned char gpuGetAtomicSymbol(gpuContext gpu, std::string s);

 extern "C"
 int gpuReadAtomicParameters(gpuContext gpu, char* fname);
@@ -169,7 +185,13 @@ int gpuReadCoulombParameters(gpuContext gpu, char* fname);

 extern "C"
 void gpuSetCoulombParameters(gpuContext gpu, float epsfac, const std::vector<int>& atom, const std::vector<float>& c6, const std::vector<float>& c12, const std::vector<float>& q,
-        const std::vector<char>& symbol, const std::vector<vector<int> >& exclusions);
+        const std::vector<char>& symbol, const std::vector<std::vector<int> >& exclusions, CudaNonbondedMethod method);
+
+extern "C"
+void gpuSetNonbondedCutoff(gpuContext gpu, float cutoffDistance, float solventDielectric);
+
+extern "C"
+void gpuSetPeriodicBoxSize(gpuContext gpu, float xsize, float ysize, float zsize);

 extern "C"
 void gpuSetObcParameters(gpuContext gpu, float innerDielectric, float solventDielectric, const std::vector<int>& atom, const std::vector<float>& radius, const std::vector<float>& scale);
@@ -227,7 +249,7 @@ extern "C"
 int gpuBuildThreadBlockWorkList(gpuContext gpu);

 extern "C"
-int gpuBuildExclusionList(gpuContext gpu);
+void gpuBuildExclusionList(gpuContext gpu);

 extern "C"
 int gpuSetConstants(gpuContext gpu);
@@ -274,4 +296,7 @@ void gpuDumpObcInfo(gpuContext gpu);
 extern "C"
 void gpuDumpObcLoop1(gpuContext gpu); 

+extern "C"
+void gpuReorderAtoms(gpuContext gpu);
+
 #endif //__GPUTYPES_H__
--- a/platforms/cuda/src/kernels/kCalculateCDLJForces.cu
+++ b/platforms/cuda/src/kernels/kCalculateCDLJForces.cu
@@ -54,335 +54,120 @@ struct Atom {
    float fx;
    float fy;
    float fz;
-    float eps2;
-    float sig2;
 };

-
-__shared__ Atom sA[G8X_NONBOND_THREADS_PER_BLOCK];
-__shared__ unsigned int sWorkUnit[G8X_NONBOND_WORKUNITS_PER_SM];
-__shared__ unsigned int sNext[GRID];
-
 static __constant__ cudaGmxSimulation cSim;

 void SetCalculateCDLJForcesSim(gpuContext gpu)
 {
    cudaError_t status;
-    status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));     
+    status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
 }

 void GetCalculateCDLJForcesSim(gpuContext gpu)
 {
    cudaError_t status;
-    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));     
+    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
 }

-__global__ void kCalculateCDLJForces_kernel()
+// Include versions of the kernels for N^2 calculations.
+
+#define METHOD_NAME(a, b) a##N2##b
+#include "kCalculateCDLJForces.h"
+#define USE_OUTPUT_BUFFER_PER_WARP
+#undef METHOD_NAME
+#define METHOD_NAME(a, b) a##N2ByWarp##b
+#include "kCalculateCDLJForces.h"
+
+// Include versions of the kernels with cutoffs.
+
+#undef METHOD_NAME
+#undef USE_OUTPUT_BUFFER_PER_WARP
+#define USE_CUTOFF
+#define METHOD_NAME(a, b) a##Cutoff##b
+#include "kCalculateCDLJForces.h"
+#include "kFindInteractingBlocks.h"
+#define USE_OUTPUT_BUFFER_PER_WARP
+#undef METHOD_NAME
+#define METHOD_NAME(a, b) a##CutoffByWarp##b
+#include "kCalculateCDLJForces.h"
+
+// Include versions of the kernels with periodic boundary conditions.
+
+#undef METHOD_NAME
+#undef USE_OUTPUT_BUFFER_PER_WARP
+#define USE_PERIODIC
+#define METHOD_NAME(a, b) a##Periodic##b
+#include "kCalculateCDLJForces.h"
+#include "kFindInteractingBlocks.h"
+#define USE_OUTPUT_BUFFER_PER_WARP
+#undef METHOD_NAME
+#define METHOD_NAME(a, b) a##PeriodicByWarp##b
+#include "kCalculateCDLJForces.h"
+
+
+__global__ extern void kCalculateCDLJCutoffForces_12_kernel();
+
+void kCalculateCDLJForces(gpuContext gpu)
 {
-    // Read queue of work blocks once so the remainder of
-    // kernel can run asynchronously    
-    int pos = cSim.nbWorkUnitsPerBlock * blockIdx.x + min(blockIdx.x, cSim.nbWorkUnitsPerBlockRemainder);
-    int end = cSim.nbWorkUnitsPerBlock * (blockIdx.x + 1) + min((blockIdx.x + 1), cSim.nbWorkUnitsPerBlockRemainder);    
-    if (threadIdx.x < end - pos)
+//    printf("kCalculateCDLJCutoffForces\n");
+    CUDPPResult result;
+    size_t numWithInteractions;
+    switch (gpu->sim.nonbondedMethod)
    {
-        sWorkUnit[threadIdx.x] = cSim.pWorkUnit[pos + threadIdx.x];
-    }
-    if (threadIdx.x < GRID)
-    {
-        sNext[threadIdx.x] = (threadIdx.x + 1) & (GRID - 1);
-    }
-    __syncthreads();
-
-    // Now change pos and end to reflect work queue just read
-    // into shared memory
-    end = end - pos; 
-    pos = end - (threadIdx.x >> GRIDBITS) - 1;
-       
-    while (pos >= 0)
-    {  
-    
-        // Extract cell coordinates from appropriate work unit
-        unsigned int x = sWorkUnit[pos];
-        unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
-        bool bExclusionFlag = (x & 0x1);
-        x = (x >> 17) << GRIDBITS;
-        float4      apos;   // Local atom x, y, z, q
-        float3      af;     // Local atom fx, fy, fz
-        float dx; 
-        float dy; 
-        float dz; 
-        float r2; 
-        float invR; 
-        float sig; 
-        float sig2; 
-        float sig6; 
-        float eps; 
-        float dEdR;  
-        unsigned int tgx = threadIdx.x & (GRID - 1);
-        unsigned int tbx = threadIdx.x - tgx;
-        int tj = tgx; 
-        Atom* psA = &sA[tbx];
-        if (!bExclusionFlag)
-        {
-            if (x == y) // Handle diagonals uniquely at 50% efficiency
-            { 
-                // Read fixed atom data into registers and GRF
-                unsigned int i      = x + tgx;
-                apos                = cSim.pPosq[i];
-                float2 a            = cSim.pAttr[i];
-                sA[threadIdx.x].x   = apos.x;
-                sA[threadIdx.x].y   = apos.y;
-                sA[threadIdx.x].z   = apos.z;
-                sA[threadIdx.x].q   = apos.w;
-                sA[threadIdx.x].sig = a.x;
-                sA[threadIdx.x].eps = a.y;
-                af.x                = 0.0f;
-                af.y                = 0.0f;
-                af.z                = 0.0f;
-                apos.w             *= cSim.epsfac;
-                for (unsigned int j = 0; j < GRID; j++)
-                {
-                    dx              = psA[j].x - apos.x; 
-                    dy              = psA[j].y - apos.y; 
-                    dz              = psA[j].z - apos.z; 
-                    r2              = dx * dx + dy * dy + dz * dz; 
-                    invR            = 1.0f / sqrt(r2);
-                    sig             = a.x + psA[j].sig; 
-                    sig2            = invR * sig; 
-                    sig2           *= sig2;
-                    sig6            = sig2 * sig2 * sig2; 
-                    eps             = a.y * psA[j].eps; 
-                    dEdR            = eps * (12.0f * sig6 - 6.0f) * sig6; 
-                    dEdR           += apos.w * psA[j].q * invR; 
-                    dEdR           *= invR * invR; 
-                    dx             *= dEdR; 
-                    dy             *= dEdR; 
-                    dz             *= dEdR; 
-                    af.x           -= dx; 
-                    af.y           -= dy; 
-                    af.z           -= dz; 
-                }
-                
-                // Write results
-                float4 of;
-                of.x                                = af.x;
-                of.y                                = af.y;
-                of.z                                = af.z;
-                of.w                                = 0.0f;
-                int offset                          = x + tgx + (x >> GRIDBITS) * cSim.stride;
-                cSim.pForce4a[offset]               = of;
-            }         
-            else        // 100% utilization
+        case NO_CUTOFF:
+            if (gpu->bOutputBufferPerWarp)
+                kCalculateCDLJN2ByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits);
+            else
+                kCalculateCDLJN2Forces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits);
+            LAUNCHERROR("kCalculateCDLJN2Forces");
+            break;
+        case CUTOFF:
+            kFindBlockBoundsCutoff_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
+            LAUNCHERROR("kFindBlockBoundsCutoff");
+            kFindBlocksWithInteractionsCutoff_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
+            LAUNCHERROR("kFindBlocksWithInteractionsCutoff");
+            result = cudppCompact(gpu->cudpp, gpu->sim.pInteractingWorkUnit, gpu->sim.pInteractionCount,
+                    gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits);
+            if (result != CUDPP_SUCCESS)
            {
-                // Read fixed atom data into registers and GRF
-                int j                   = y + tgx;
-                unsigned int i          = x + tgx;
-                float4 temp             = cSim.pPosq[j];
-                float2 temp1            = cSim.pAttr[j];
-                apos                    = cSim.pPosq[i];
-                float2 a                = cSim.pAttr[i];
-                sA[threadIdx.x].x       = temp.x;
-                sA[threadIdx.x].y       = temp.y;
-                sA[threadIdx.x].z       = temp.z;
-                sA[threadIdx.x].q       = temp.w;
-                sA[threadIdx.x].sig     = temp1.x;
-                sA[threadIdx.x].eps     = temp1.y;
-                sA[threadIdx.x].fx      = af.x = 0.0f;
-                sA[threadIdx.x].fy      = af.y = 0.0f;
-                sA[threadIdx.x].fz      = af.z = 0.0f;
-                sA[threadIdx.x].sig2    = a.x;
-                sA[threadIdx.x].eps2    = a.y;
-                apos.w                 *= cSim.epsfac;
-                
-                for (j = 0; j < GRID; j++)
-                {
-                    dx              = psA[tj].x - apos.x; 
-                    dy              = psA[tj].y - apos.y; 
-                    dz              = psA[tj].z - apos.z; 
-                    r2              = dx * dx + dy * dy + dz * dz; 
-                    invR            = 1.0f / sqrt(r2);
-                    sig             = a.x + psA[tj].sig; 
-                    sig2            = invR * sig; 
-                    sig2           *= sig2;
-                    sig6            = sig2 * sig2 * sig2; 
-                    eps             = a.y * psA[tj].eps; 
-                    dEdR            = eps * (12.0f * sig6 - 6.0f) * sig6; 
-                    dEdR           += apos.w * psA[tj].q * invR; 
-                    dEdR           *= invR * invR; 
-                    dx             *= dEdR; 
-                    dy             *= dEdR; 
-                    dz             *= dEdR; 
-                    af.x           -= dx; 
-                    af.y           -= dy; 
-                    af.z           -= dz; 
-                    psA[tj].fx     += dx; 
-                    psA[tj].fy     += dy; 
-                    psA[tj].fz     += dz;
-                    tj              = sNext[tj]; 
-                }
-                
-                // Write results
-                float4 of;
-                of.x                                = af.x;
-                of.y                                = af.y;
-                of.z                                = af.z;
-                of.w                                = 0.0f;
-                int offset                          = x + tgx + (y >> GRIDBITS) * cSim.stride;
-                cSim.pForce4a[offset]               = of;
-                of.x                                = sA[threadIdx.x].fx;
-                of.y                                = sA[threadIdx.x].fy;
-                of.z                                = sA[threadIdx.x].fz;
-                offset                              = y + tgx + (x >> GRIDBITS) * cSim.stride;
-                cSim.pForce4a[offset]               = of;
+                printf("Error in cudppCompact: %d\n", result);
+                exit(-1);
            }
-        }
-        else  // bExclusion
-        {
-            // Read exclusion data
-            
-            if (x == y) // Handle diagonals uniquely at 50% efficiency
-            { 
-                // Read fixed atom data into registers and GRF
-                unsigned int excl       = cSim.pExclusion[x * cSim.exclusionStride + y + tgx];                          
-                unsigned int i          = x + tgx;
-                apos                    = cSim.pPosq[i];
-                float2 a                = cSim.pAttr[i];
-                sA[threadIdx.x].x       = apos.x;
-                sA[threadIdx.x].y       = apos.y;
-                sA[threadIdx.x].z       = apos.z;
-                sA[threadIdx.x].q       = apos.w;
-                sA[threadIdx.x].sig     = a.x;
-                sA[threadIdx.x].eps     = a.y;
-                af.x                    = 0.0f;
-                af.y                    = 0.0f;
-                af.z                    = 0.0f;
-                sA[threadIdx.x].sig2    = a.x;
-                sA[threadIdx.x].eps2    = a.y;
-                apos.w                 *= cSim.epsfac;
-                
-                for (unsigned int j = 0; j < GRID; j++)
-                {
-                    dx              = psA[j].x - apos.x; 
-                    dy              = psA[j].y - apos.y; 
-                    dz              = psA[j].z - apos.z; 
-                    r2              = dx * dx + dy * dy + dz * dz; 
-                    invR            = 1.0f / sqrt(r2);
-                    sig             = psA[tgx].sig2 + psA[j].sig; 
-                    sig2            = invR * sig; 
-                    sig2           *= sig2;
-                    sig6            = sig2 * sig2 * sig2; 
-                    eps             = psA[tgx].eps2 * psA[j].eps; 
-                    dEdR            = eps * (12.0f * sig6 - 6.0f) * sig6; 
-                    dEdR           += apos.w * psA[j].q * invR; 
-                    dEdR           *= invR * invR; 
-                    if (!(excl & 0x1))
-                    {
-                        dEdR = 0.0f;
-                    }
-                    dx             *= dEdR; 
-                    dy             *= dEdR; 
-                    dz             *= dEdR; 
-                    af.x           -= dx; 
-                    af.y           -= dy; 
-                    af.z           -= dz;
-                    excl          >>= 1;               
-                }
-                
-                // Write results
-                float4 of;
-                of.x                                = af.x;
-                of.y                                = af.y;
-                of.z                                = af.z;
-                of.w                                = 0.0f;
-                int offset                          = x + tgx + (x >> GRIDBITS) * cSim.stride;
-                cSim.pForce4a[offset]               = of;
-            }         
-            else        // 100% utilization
+            gpu->psInteractionCount->Download();
+            numWithInteractions = gpu->psInteractionCount->_pSysData[0];
+            if (gpu->bOutputBufferPerWarp)
+                kCalculateCDLJCutoffByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
+            else
+                kCalculateCDLJCutoffForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
+            LAUNCHERROR("kCalculateCDLJCutoffForces");
+            break;
+        case PERIODIC:
+            kFindBlockBoundsPeriodic_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
+            LAUNCHERROR("kFindBlockBoundsPeriodic");
+            kFindBlocksWithInteractionsPeriodic_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
+            LAUNCHERROR("kFindBlocksWithInteractionsPeriodic");
+            result = cudppCompact(gpu->cudpp, gpu->sim.pInteractingWorkUnit, gpu->sim.pInteractionCount,
+                    gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits);
+            if (result != CUDPP_SUCCESS)
            {
-                // Read fixed atom data into registers and GRF        
-                unsigned int excl       = cSim.pExclusion[x * cSim.exclusionStride + y + tgx];
-                excl                    = (excl >> tgx) | (excl << (GRID - tgx));
-                int j                   = y + tgx;
-                unsigned int i          = x + tgx;
-                float4 temp             = cSim.pPosq[j];
-                float2 temp1            = cSim.pAttr[j];
-                apos                    = cSim.pPosq[i];
-                float2 a                = cSim.pAttr[i];
-                sA[threadIdx.x].x       = temp.x;
-                sA[threadIdx.x].y       = temp.y;
-                sA[threadIdx.x].z       = temp.z;
-                sA[threadIdx.x].q       = temp.w;
-                sA[threadIdx.x].sig     = temp1.x;
-                sA[threadIdx.x].eps     = temp1.y;
-                sA[threadIdx.x].fx      = af.x = 0.0f;
-                sA[threadIdx.x].fy      = af.y = 0.0f;
-                sA[threadIdx.x].fz      = af.z = 0.0f;
-                sA[threadIdx.x].sig2    = a.x;
-                sA[threadIdx.x].eps2    = a.y;
-                apos.w                 *= cSim.epsfac;
-                
-                for (j = 0; j < GRID; j++)
-                {
-                    dx              = psA[tj].x - apos.x; 
-                    dy              = psA[tj].y - apos.y; 
-                    dz              = psA[tj].z - apos.z; 
-                    r2              = dx * dx + dy * dy + dz * dz; 
-                    invR            = 1.0f / sqrt(r2);
-                    sig             = psA[tgx].sig2 + psA[tj].sig; 
-                    sig2            = invR * sig; 
-                    sig2           *= sig2;
-                    sig6            = sig2 * sig2 * sig2; 
-                    eps             = psA[tgx].eps2 * psA[tj].eps; 
-                    dEdR            = eps * (12.0f * sig6 - 6.0f) * sig6; 
-                    dEdR           += apos.w * psA[tj].q * invR; 
-                    dEdR           *= invR * invR; 
-                    if (!(excl & 0x1))
-                    {
-                        dEdR = 0.0f;
-                    }
-                    dx             *= dEdR; 
-                    dy             *= dEdR; 
-                    dz             *= dEdR; 
-                    af.x           -= dx; 
-                    af.y           -= dy; 
-                    af.z           -= dz; 
-                    psA[tj].fx     += dx; 
-                    psA[tj].fy     += dy; 
-                    psA[tj].fz     += dz;
-                    excl          >>= 1;
-                    tj              = sNext[tj]; 
-                }
-                
-                // Write results
-                float4 of;
-                of.x                                = af.x;
-                of.y                                = af.y;
-                of.z                                = af.z;
-                of.w                                = 0.0f;
-                int offset                          = x + tgx + (y >> GRIDBITS) * cSim.stride;
-                cSim.pForce4a[offset]               = of;
-                of.x                                = sA[threadIdx.x].fx;
-                of.y                                = sA[threadIdx.x].fy;
-                of.z                                = sA[threadIdx.x].fz;
-                offset                              = y + tgx + (x >> GRIDBITS) * cSim.stride;
-                cSim.pForce4a[offset]               = of;
+                printf("Error in cudppCompact: %d\n", result);
+                exit(-1);
            }
-        }
-
-        pos -= cSim.nonbond_workBlock;     
+            gpu->psInteractionCount->Download();
+            numWithInteractions = gpu->psInteractionCount->_pSysData[0];
+            if (gpu->bOutputBufferPerWarp)
+                kCalculateCDLJPeriodicByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
+            else
+                kCalculateCDLJPeriodicForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
+            LAUNCHERROR("kCalculateCDLJPeriodicForces");
    }
-}
-
-__global__ extern void kCalculateCDLJForces_12_kernel();
-
-void kCalculateCDLJForces(gpuContext gpu)
-{
-//    printf("kCalculateCDLJForces\n");
-    if (gpu->sm_version < SM_12)
-        kCalculateCDLJForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
-    else
-        kCalculateCDLJForces_12_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
-    LAUNCHERROR("kCalculateCDLJForces");
 }
\ No newline at end of file
--- a/platforms/cuda/src/kernels/kCalculateCDLJObcGbsaForces1.cu
+++ b/platforms/cuda/src/kernels/kCalculateCDLJObcGbsaForces1.cu
@@ -33,9 +33,6 @@
 #include <cuda.h>
 #include <vector_functions.h>
 #include <cstdlib>
-#include <string>
-#include <iostream>
-#include <fstream>
 using namespace std;

 #include "gputypes.h"
@@ -54,15 +51,8 @@ struct Atom {
    float fy;
    float fz;
    float fb;
-    float q2;
-    float junk;
 };

-
-__shared__ Atom sA[G8X_NONBOND_THREADS_PER_BLOCK];
-__shared__ unsigned int sWorkUnit[G8X_NONBOND_WORKUNITS_PER_SM];
-__shared__ unsigned int sNext[GRID];
-
 static __constant__ cudaGmxSimulation cSim;

 void SetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu)
@@ -79,376 +69,122 @@ void GetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu)
    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
 }

-__global__ void kCalculateCDLJObcGbsaForces1_kernel()
+// Include versions of the kernel for N^2 calculations.
+
+#define METHOD_NAME(a, b) a##N2##b
+#include "kCalculateCDLJObcGbsaForces1.h"
+#define USE_OUTPUT_BUFFER_PER_WARP
+#undef METHOD_NAME
+#define METHOD_NAME(a, b) a##N2ByWarp##b
+#include "kCalculateCDLJObcGbsaForces1.h"
+
+// Include versions of the kernel with cutoffs.
+
+#undef METHOD_NAME
+#undef USE_OUTPUT_BUFFER_PER_WARP
+#define USE_CUTOFF
+#define METHOD_NAME(a, b) a##Cutoff##b
+#include "kCalculateCDLJObcGbsaForces1.h"
+#define USE_OUTPUT_BUFFER_PER_WARP
+#undef METHOD_NAME
+#define METHOD_NAME(a, b) a##CutoffByWarp##b
+#include "kCalculateCDLJObcGbsaForces1.h"
+
+// Include versions of the kernel with periodic boundary conditions.
+
+#undef METHOD_NAME
+#undef USE_OUTPUT_BUFFER_PER_WARP
+#define USE_PERIODIC
+#define METHOD_NAME(a, b) a##Periodic##b
+#include "kCalculateCDLJObcGbsaForces1.h"
+#define USE_OUTPUT_BUFFER_PER_WARP
+#undef METHOD_NAME
+#define METHOD_NAME(a, b) a##PeriodicByWarp##b
+#include "kCalculateCDLJObcGbsaForces1.h"
+
+extern __global__ void kFindBlockBoundsCutoff_kernel();
+extern __global__ void kFindBlockBoundsPeriodic_kernel();
+extern __global__ void kFindBlocksWithInteractionsCutoff_kernel();
+extern __global__ void kFindBlocksWithInteractionsPeriodic_kernel();
+
+void kCalculateCDLJObcGbsaForces1(gpuContext gpu)
 {
-    // Read queue of work blocks once so the remainder of
-    // kernel can run asynchronously    
-    int pos = cSim.nbWorkUnitsPerBlock * blockIdx.x + min(blockIdx.x, cSim.nbWorkUnitsPerBlockRemainder);
-    int end = cSim.nbWorkUnitsPerBlock * (blockIdx.x + 1) + min((blockIdx.x + 1), cSim.nbWorkUnitsPerBlockRemainder);    
-    if (threadIdx.x < end - pos)
-    {
-        sWorkUnit[threadIdx.x] = cSim.pWorkUnit[pos + threadIdx.x];
-    }
-    if (threadIdx.x < GRID)
-    {
-        sNext[threadIdx.x]                  = (threadIdx.x + 1) & (GRID - 1);
-    }
-    __syncthreads();
+//    printf("kCalculateCDLJObcGbsaForces1\n");

-    // Now change pos and end to reflect work queue just read
-    // into shared memory
-    end                                     = end - pos; 
-    pos                                     = end - (threadIdx.x >> GRIDBITS) - 1;
-       
-    while (pos >= 0)
-    {  
-    
-        // Extract cell coordinates from appropriate work unit
-        unsigned int x                      = sWorkUnit[pos];
-        unsigned int y                      = ((x >> 2) & 0x7fff) << GRIDBITS;
-        bool bExclusionFlag                 = (x & 0x1);
-        x                                   = (x >> 17) << GRIDBITS;
-        unsigned int tgx                    = threadIdx.x & (GRID - 1);
-        unsigned int i                      = x + tgx;
-        float4 apos                         = cSim.pPosq[i];
-        float2 a                            = cSim.pAttr[i];
-        float br                            = cSim.pBornRadii[i];        
-        unsigned int tbx                    = threadIdx.x - tgx;
-        int tj                              = tgx; 
-        Atom* psA                           = &sA[tbx];
-        if (!bExclusionFlag)
-        {
-            if (x == y) // Handle diagonals uniquely at 50% efficiency
-            { 
-                // Read fixed atom data into registers and GRF
-                sA[threadIdx.x].x           = apos.x;
-                sA[threadIdx.x].y           = apos.y;
-                sA[threadIdx.x].z           = apos.z;
-                sA[threadIdx.x].q           = cSim.epsfac * apos.w;
-                sA[threadIdx.x].q2          = cSim.preFactor * apos.w;
-                sA[threadIdx.x].sig         = a.x;
-                sA[threadIdx.x].eps         = a.y;
-                sA[threadIdx.x].br          = br; 
-                float4 af;
-                af.x                        = 0.0f;
-                af.y                        = 0.0f;
-                af.z                        = 0.0f;
-                af.w                        = 0.0f;
-                for (unsigned int j = 0; j < GRID; j++)
-                {
-                    float dx                = psA[j].x - apos.x; 
-                    float dy                = psA[j].y - apos.y; 
-                    float dz                = psA[j].z - apos.z; 
-                    float r2                = dx * dx + dy * dy + dz * dz; 
-                                     
-                    // CDLJ part
-						  
-                    float invR              = 1.0f / sqrt(r2);
-                    float sig               = a.x + psA[j].sig; 
-                    float sig2              = invR * sig; 
-                    sig2                   *= sig2;
-                    float sig6              = sig2 * sig2 * sig2; 
-                    float eps               = a.y * psA[j].eps; 
-                    float dEdR              = eps * (12.0f * sig6 - 6.0f) * sig6; 
-                    dEdR                   += apos.w * psA[j].q * invR; 
-                    dEdR                   *= invR * invR; 
-						 
-//float dEdR = 0.0f;
+    // check if Born radii need to be calculated

-                    // ObcGbsaForce1 part
-                    float alpha2_ij         = br * psA[j].br;
-                    float D_ij              = r2 / (4.0f * alpha2_ij);
-                    float expTerm           = exp(-D_ij);
-                    float denominator2      = r2 + alpha2_ij * expTerm;
-                    float denominator       = sqrt(denominator2);
-                    float Gpol              = (apos.w * psA[j].q2) / (denominator * denominator2);
-                    float dGpol_dalpha2_ij  = -0.5f * Gpol * expTerm * (1.0f + D_ij); 
-                    af.w                   += dGpol_dalpha2_ij * psA[j].br;   
-                    dEdR                   += Gpol * (1.0f - 0.25f * expTerm); 
-                    
-                    // Add Forces
-                    dx                     *= dEdR; 
-                    dy                     *= dEdR; 
-                    dz                     *= dEdR; 
-                    af.x                   -= dx; 
-                    af.y                   -= dy; 
-                    af.z                   -= dz; 
-                }
-                
-                // Write results
-                int offset                  = x + tgx + (x >> GRIDBITS) * cSim.stride;
-                cSim.pForce4a[offset]       = af;
-                cSim.pBornForce[offset]     = af.w;
-            }         
-            else        // 100% utilization
+    kClearBornForces(gpu);
+    CUDPPResult result;
+    size_t numWithInteractions;
+    switch (gpu->sim.nonbondedMethod)
+    {
+        case NO_CUTOFF:
+            if (gpu->bRecalculateBornRadii)
            {
-                // Read fixed atom data into registers and GRF
-                int j                       = y + tgx;
-                float4 temp                 = cSim.pPosq[j];
-                float2 temp1                = cSim.pAttr[j];
-                sA[threadIdx.x].br          = cSim.pBornRadii[j];
-                float4 af;
-                sA[threadIdx.x].fx          = af.x = 0.0f;
-                sA[threadIdx.x].fy          = af.y = 0.0f;
-                sA[threadIdx.x].fz          = af.z = 0.0f;
-                sA[threadIdx.x].fb          = af.w = 0.0f;      
-                sA[threadIdx.x].x           = temp.x;
-                sA[threadIdx.x].y           = temp.y;
-                sA[threadIdx.x].z           = temp.z;
-                sA[threadIdx.x].q           = cSim.epsfac * temp.w;
-                sA[threadIdx.x].q2          = cSim.preFactor * temp.w;
-                sA[threadIdx.x].sig         = temp1.x;
-                sA[threadIdx.x].eps         = temp1.y;
-   
-                for (j = 0; j < GRID; j++)
-                {
-                    float dx                = psA[tj].x - apos.x; 
-                    float dy                = psA[tj].y - apos.y; 
-                    float dz                = psA[tj].z - apos.z; 
-                    float r2                = dx * dx + dy * dy + dz * dz; 
-                    
-                    // CDLJ part
-						  
-                    float invR              = 1.0f / sqrt(r2);
-                    float sig               = a.x + psA[tj].sig; 
-                    float sig2              = invR * sig; 
-                    sig2                   *= sig2;
-                    float sig6              = sig2 * sig2 * sig2; 
-                    float eps               = a.y * psA[tj].eps; 
-                    float dEdR              = eps * (12.0f * sig6 - 6.0f) * sig6; 
-                    dEdR                   += apos.w * psA[tj].q * invR; 
-                    dEdR                   *= invR * invR; 
-                  
-//float dEdR = 0.0f;
-                    // ObcGbsaForce1 part
-                    float alpha2_ij         = br * psA[tj].br;
-                    float D_ij              = r2 / (4.0f * alpha2_ij);
-                    float expTerm           = exp(-D_ij);
-                    float denominator2      = r2 + alpha2_ij * expTerm;
-                    float denominator       = sqrt(denominator2);
-                    float Gpol              = (apos.w * psA[tj].q2) / (denominator * denominator2);
-                    float dGpol_dalpha2_ij  = -0.5f * Gpol * expTerm * (1.0f + D_ij); 
-                    af.w                   += dGpol_dalpha2_ij * psA[tj].br;  
-                    psA[tj].fb             += dGpol_dalpha2_ij * br;      
-                    dEdR                   += Gpol * (1.0f - 0.25f * expTerm); 
-                    
-                    // Add forces
-                    dx                     *= dEdR; 
-                    dy                     *= dEdR; 
-                    dz                     *= dEdR; 
-                    af.x                   -= dx; 
-                    af.y                   -= dy; 
-                    af.z                   -= dz;    
-                    psA[tj].fx             += dx; 
-                    psA[tj].fy             += dy; 
-                    psA[tj].fz             += dz;
-                    tj                      = sNext[tj]; 
-                }
-                
-                // Write results
-                int offset                  = x + tgx + (y >> GRIDBITS) * cSim.stride;
-                cSim.pForce4a[offset]       = af;
-                cSim.pBornForce[offset]     = af.w;
-                af.x                        = sA[threadIdx.x].fx;
-                af.y                        = sA[threadIdx.x].fy;
-                af.z                        = sA[threadIdx.x].fz;
-                offset                      = y + tgx + (x >> GRIDBITS) * cSim.stride;
-                cSim.pForce4a[offset]       = af;
-                cSim.pBornForce[offset]     = sA[threadIdx.x].fb;
+                kCalculateObcGbsaBornSum(gpu);
+                kReduceObcGbsaBornSum(gpu);
            }
-        }
-        else  // bExclusion
-        {
-            // Read exclusion data
-            
-            if (x == y) // Handle diagonals uniquely at 50% efficiency
-            { 
-                // Read fixed atom data into registers and GRF
-                unsigned int excl           = cSim.pExclusion[x * cSim.exclusionStride + y + tgx];
-                float4 af;
-                af.x                        = 0.0f;
-                af.y                        = 0.0f;
-                af.z                        = 0.0f;
-                af.w                        = 0.0f;                                      
-                sA[threadIdx.x].x           = apos.x;
-                sA[threadIdx.x].y           = apos.y;
-                sA[threadIdx.x].z           = apos.z;
-                sA[threadIdx.x].q           = cSim.epsfac * apos.w;
-                sA[threadIdx.x].q2          = cSim.preFactor * apos.w;
-                sA[threadIdx.x].sig         = a.x;
-                sA[threadIdx.x].eps         = a.y;
-                sA[threadIdx.x].br          = br;
-
-                
-                for (unsigned int j = 0; j < GRID; j++)
-                {
-                    float dx                = psA[j].x - apos.x; 
-                    float dy                = psA[j].y - apos.y; 
-                    float dz                = psA[j].z - apos.z; 
-                    float r2                = dx * dx + dy * dy + dz * dz; 
-                   
-                    // CDLJ part
-						  
-                    float invR              = 1.0f / sqrt(r2);
-                    float sig               = a.x + psA[j].sig; 
-                    float sig2              = invR * sig; 
-                    sig2                   *= sig2;
-                    float sig6              = sig2 * sig2 * sig2; 
-                    float eps               = a.y * psA[j].eps; 
-                    float dEdR              = eps * (12.0f * sig6 - 6.0f) * sig6; 
-                    dEdR                   += apos.w * psA[j].q * invR; 
-                    dEdR                   *= invR * invR;
-                    if (!(excl & 0x1))
-                    {
-                        dEdR = 0.0f;
-                    } 
-						  
-                  
-//float dEdR = 0.0f;
-
-                    // ObcGbsaForce1 part
-                    float alpha2_ij         = br * psA[j].br;
-                    float D_ij              = r2 / (4.0f * alpha2_ij);
-                    float expTerm           = exp(-D_ij);
-                    float denominator2      = r2 + alpha2_ij * expTerm;
-                    float denominator       = sqrt(denominator2);
-                    float Gpol              = (apos.w * psA[j].q2) / (denominator * denominator2);
-                    float dGpol_dalpha2_ij  = -0.5f * Gpol * expTerm * (1.0f + D_ij); 
-                    af.w                   += dGpol_dalpha2_ij * psA[j].br;   
-                    dEdR                   += Gpol * (1.0f - 0.25f * expTerm); 
-
-                    // Add Forces
-                    dx                     *= dEdR; 
-                    dy                     *= dEdR; 
-                    dz                     *= dEdR; 
-                    af.x                   -= dx; 
-                    af.y                   -= dy; 
-                    af.z                   -= dz; 
-                    excl                  >>= 1;               
-                }
-                
-                // Write results
-                int offset                  = x + tgx + (x >> GRIDBITS) * cSim.stride;
-                cSim.pForce4a[offset]       = af;
-                cSim.pBornForce[offset]     = af.w;
-            }         
-            else        // 100% utilization
+            if (gpu->bOutputBufferPerWarp)
+                kCalculateCDLJObcGbsaN2ByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits);
+            else
+                kCalculateCDLJObcGbsaN2Forces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits);
+            LAUNCHERROR("kCalculateCDLJObcGbsaN2Forces1");
+            break;
+        case CUTOFF:
+            kFindBlockBoundsCutoff_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
+            LAUNCHERROR("kFindBlockBoundsCutoff");
+            kFindBlocksWithInteractionsCutoff_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
+            LAUNCHERROR("kFindBlocksWithInteractionsCutoff");
+            result = cudppCompact(gpu->cudpp, gpu->sim.pInteractingWorkUnit, gpu->sim.pInteractionCount,
+                    gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits);
+            if (result != CUDPP_SUCCESS)
            {
-                // Read fixed atom data into registers and GRF        
-                unsigned int excl           = cSim.pExclusion[x * cSim.exclusionStride + y + tgx];
-                float4 af;
-                sA[threadIdx.x].fx          = af.x = 0.0f;
-                sA[threadIdx.x].fy          = af.y = 0.0f;
-                sA[threadIdx.x].fz          = af.z = 0.0f;
-                sA[threadIdx.x].fb          = af.w = 0.0f;
-                int j                       = y + tgx;
-                float4 temp                 = cSim.pPosq[j];
-                float2 temp1                = cSim.pAttr[j];
-                sA[threadIdx.x].br          = cSim.pBornRadii[j];
-                excl                        = (excl >> tgx) | (excl << (GRID - tgx));                
-                sA[threadIdx.x].x           = temp.x;
-                sA[threadIdx.x].y           = temp.y;
-                sA[threadIdx.x].z           = temp.z;
-                sA[threadIdx.x].q           = cSim.epsfac * temp.w;
-                sA[threadIdx.x].q2          = cSim.preFactor * temp.w;
-                sA[threadIdx.x].sig         = temp1.x;
-                sA[threadIdx.x].eps         = temp1.y;
-                
-                for (j = 0; j < GRID; j++)
-                {
-                    float dx                = psA[tj].x - apos.x; 
-                    float dy                = psA[tj].y - apos.y; 
-                    float dz                = psA[tj].z - apos.z; 
-                    float r2                = dx * dx + dy * dy + dz * dz; 
-                    
-                    // CDLJ part
-                    float invR              = 1.0f / sqrt(r2);
-                    float sig               = a.x + psA[tj].sig; 
-                    float sig2              = invR * sig; 
-                    sig2                   *= sig2;
-                    float sig6              = sig2 * sig2 * sig2; 
-                    float eps               = a.y * psA[tj].eps; 
-						  
-                    float dEdR              = eps * (12.0f * sig6 - 6.0f) * sig6; 
-                    dEdR                   += apos.w * psA[tj].q * invR; 
-                    dEdR                   *= invR * invR;
-                    if (!(excl & 0x1))
-                    {
-                        dEdR = 0.0f;
-                    } 
-						
-   
-//float dEdR = 0.0f;
-                    // ObcGbsaForce1 part
-                    float alpha2_ij         = br * psA[tj].br;
-                    float D_ij              = r2 / (4.0f * alpha2_ij);
-                    float expTerm           = exp(-D_ij);
-                    float denominator2      = r2 + alpha2_ij * expTerm;
-                    float denominator       = sqrt(denominator2);
-                    float Gpol              = (apos.w * psA[tj].q2) / (denominator * denominator2);
-                    float dGpol_dalpha2_ij  = -0.5f * Gpol * expTerm * (1.0f + D_ij); 
-                    af.w                   += dGpol_dalpha2_ij * psA[tj].br;  
-                    psA[tj].fb             += dGpol_dalpha2_ij * br;      
-                    dEdR                   += Gpol * (1.0f - 0.25f * expTerm); 
-                   
-                    // Add forces
-                    dx                     *= dEdR; 
-                    dy                     *= dEdR; 
-                    dz                     *= dEdR; 
-                    af.x                   -= dx; 
-                    af.y                   -= dy; 
-                    af.z                   -= dz;    
-                    psA[tj].fx             += dx; 
-                    psA[tj].fy             += dy; 
-                    psA[tj].fz             += dz;
-                    excl                  >>= 1;
-                    tj                      = sNext[tj]; 
-                }
-                
-                // Write results
-                int offset                  = x + tgx + (y >> GRIDBITS) * cSim.stride;
-                cSim.pForce4a[offset]       = af;
-                cSim.pBornForce[offset]     = af.w;
-                offset                      = y + tgx + (x >> GRIDBITS) * cSim.stride;
-                af.x                        = sA[threadIdx.x].fx;
-                af.y                        = sA[threadIdx.x].fy;
-                af.z                        = sA[threadIdx.x].fz;
-                cSim.pForce4a[offset]       = af;
-                cSim.pBornForce[offset]     = sA[threadIdx.x].fb;
+                printf("Error in cudppCompact: %d\n", result);
+                exit(-1);
            }
-        }
-
-        pos -= cSim.nonbond_workBlock;     
+            gpu->psInteractionCount->Download();
+            if (gpu->bRecalculateBornRadii)
+            {
+                kCalculateObcGbsaBornSum(gpu);
+                kReduceObcGbsaBornSum(gpu);
+            }
+            numWithInteractions = gpu->psInteractionCount->_pSysData[0];
+            if (gpu->bOutputBufferPerWarp)
+                kCalculateCDLJObcGbsaCutoffByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
+            else
+                kCalculateCDLJObcGbsaCutoffForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
+            LAUNCHERROR("kCalculateCDLJCutoffForces");
+            break;
+        case PERIODIC:
+            kFindBlockBoundsPeriodic_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
+            LAUNCHERROR("kFindBlockBoundsPeriodic");
+            kFindBlocksWithInteractionsPeriodic_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
+            LAUNCHERROR("kFindBlocksWithInteractionsPeriodic");
+            result = cudppCompact(gpu->cudpp, gpu->sim.pInteractingWorkUnit, gpu->sim.pInteractionCount,
+                    gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits);
+            if (result != CUDPP_SUCCESS)
+            {
+                printf("Error in cudppCompact: %d\n", result);
+                exit(-1);
+            }
+            gpu->psInteractionCount->Download();
+            if (gpu->bRecalculateBornRadii)
+            {
+                kCalculateObcGbsaBornSum(gpu);
+                kReduceObcGbsaBornSum(gpu);
+            }
+            numWithInteractions = gpu->psInteractionCount->_pSysData[0];
+            if (gpu->bOutputBufferPerWarp)
+                kCalculateCDLJObcGbsaPeriodicByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
+            else
+                kCalculateCDLJObcGbsaPeriodicForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
+            LAUNCHERROR("kCalculateCDLJPeriodicForces");
+            break;
    }
 }
-
-__global__ extern void kCalculateCDLJObcGbsaForces1_12_kernel();
-
-void kCalculateCDLJObcGbsaForces1(gpuContext gpu)
-{
-    //printf("In kCalculateCDLJObcGbsaForces1 QQQ\n");
-
-    // check if Born radii need to be calculated
-
-    if( gpu->bRecalculateBornRadii ){
-	    kCalculateObcGbsaBornSum(gpu);
-		 kReduceObcGbsaBornSum(gpu);
-	 }
-
-    if (gpu->sm_version < SM_12)
-        kCalculateCDLJObcGbsaForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
-    else
-        kCalculateCDLJObcGbsaForces1_12_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
-
-if( 0 ){
-   static int step = 0;
-//   int numPrint    = -1;
-   step++;
-   //WriteArrayToFile1( gpu, "ObcGbsaBornBRad", step, gpu->psBornRadii, numPrint );
-   //gpuDumpCoordinates( gpu );
-	kReduceBornSumAndForces( gpu );
-   gpuDumpObcLoop1( gpu );
-}
-
-    LAUNCHERROR("kCalculateCDLJObcGbsaForces1");
-}
--- a/platforms/cuda/src/kernels/kCalculateLocalForces.cu
+++ b/platforms/cuda/src/kernels/kCalculateLocalForces.cu
@@ -440,48 +440,154 @@ __global__ void kCalculateLocalForces_kernel()
        pos += blockDim.x * gridDim.x;
    }   

-    while (pos < cSim.LJ14_offset)
-    {  
-        unsigned int pos1       = pos - cSim.rb_dihedral_offset;
-        if (pos1 < cSim.LJ14s)
+    if (cSim.nonbondedMethod == NO_CUTOFF)
+    {
+        while (pos < cSim.LJ14_offset)
        {
-            int4 atom               = cSim.pLJ14ID[pos1];
-            float4 LJ14             = cSim.pLJ14Parameter[pos1];
-            float4 a1               = cSim.pPosq[atom.x];
-            float4 a2               = cSim.pPosq[atom.y];
-            float3 d;
-            d.x                     = a1.x - a2.x;
-            d.y                     = a1.y - a2.y;
-            d.z                     = a1.z - a2.z;
-            float r2                = DOT3(d, d);
-            float inverseR          = 1.0f / sqrt(r2);
-            float sig2              = inverseR * LJ14.y;
-            sig2                   *= sig2;
-            float sig6              = sig2 * sig2 * sig2;
-            float dEdR              = LJ14.x * (12.0f * sig6 - 6.0f) * sig6;
-            dEdR                   += LJ14.z * inverseR;
-            dEdR                   *= inverseR * inverseR;
-            unsigned int offsetA    = atom.x + atom.z * cSim.stride;
-            unsigned int offsetB    = atom.y + atom.w * cSim.stride;
-            float4 forceA           = {0.0f, 0.0f, 0.0f, 0.0f}; 
-            if (atom.z < cSim.totalNonbondOutputBuffers)
-                forceA              = cSim.pForce4[offsetA]; 
-            float4 forceB           = {0.0f, 0.0f, 0.0f, 0.0f}; 
-            if (atom.w < cSim.totalNonbondOutputBuffers)
-                forceB              = cSim.pForce4[offsetB]; 
-            d.x                    *= dEdR;
-            d.y                    *= dEdR;
-            d.z                    *= dEdR;
-            forceA.x               += d.x;
-            forceA.y               += d.y;
-            forceA.z               += d.z;
-            forceB.x               -= d.x;
-            forceB.y               -= d.y;
-            forceB.z               -= d.z;        
-            cSim.pForce4[offsetA]   = forceA;
-            cSim.pForce4[offsetB]   = forceB;
-        }        
-        pos                    += blockDim.x * gridDim.x;
+            unsigned int pos1       = pos - cSim.rb_dihedral_offset;
+            if (pos1 < cSim.LJ14s)
+            {
+                int4 atom               = cSim.pLJ14ID[pos1];
+                float4 LJ14             = cSim.pLJ14Parameter[pos1];
+                float4 a1               = cSim.pPosq[atom.x];
+                float4 a2               = cSim.pPosq[atom.y];
+                float3 d;
+                d.x                     = a1.x - a2.x;
+                d.y                     = a1.y - a2.y;
+                d.z                     = a1.z - a2.z;
+                float r2                = DOT3(d, d);
+                float inverseR          = 1.0f / sqrt(r2);
+                float sig2              = inverseR * LJ14.y;
+                sig2                   *= sig2;
+                float sig6              = sig2 * sig2 * sig2;
+                float dEdR              = LJ14.x * (12.0f * sig6 - 6.0f) * sig6;
+                dEdR                   += LJ14.z * inverseR;
+                dEdR                   *= inverseR * inverseR;
+                unsigned int offsetA    = atom.x + atom.z * cSim.stride;
+                unsigned int offsetB    = atom.y + atom.w * cSim.stride;
+                float4 forceA           = {0.0f, 0.0f, 0.0f, 0.0f};
+                if (atom.z < cSim.totalNonbondOutputBuffers)
+                    forceA              = cSim.pForce4[offsetA];
+                float4 forceB           = {0.0f, 0.0f, 0.0f, 0.0f};
+                if (atom.w < cSim.totalNonbondOutputBuffers)
+                    forceB              = cSim.pForce4[offsetB];
+                d.x                    *= dEdR;
+                d.y                    *= dEdR;
+                d.z                    *= dEdR;
+                forceA.x               += d.x;
+                forceA.y               += d.y;
+                forceA.z               += d.z;
+                forceB.x               -= d.x;
+                forceB.y               -= d.y;
+                forceB.z               -= d.z;
+                cSim.pForce4[offsetA]   = forceA;
+                cSim.pForce4[offsetB]   = forceB;
+            }
+            pos                    += blockDim.x * gridDim.x;
+        }
+    }
+    else if (cSim.nonbondedMethod == CUTOFF)
+    {
+        while (pos < cSim.LJ14_offset)
+        {
+            unsigned int pos1       = pos - cSim.rb_dihedral_offset;
+            if (pos1 < cSim.LJ14s)
+            {
+                int4 atom               = cSim.pLJ14ID[pos1];
+                float4 LJ14             = cSim.pLJ14Parameter[pos1];
+                float4 a1               = cSim.pPosq[atom.x];
+                float4 a2               = cSim.pPosq[atom.y];
+                float3 d;
+                d.x                     = a1.x - a2.x;
+                d.y                     = a1.y - a2.y;
+                d.z                     = a1.z - a2.z;
+                float r2                = DOT3(d, d);
+                float inverseR          = 1.0f / sqrt(r2);
+                float sig2              = inverseR * LJ14.y;
+                sig2                   *= sig2;
+                float sig6              = sig2 * sig2 * sig2;
+                float dEdR              = LJ14.x * (12.0f * sig6 - 6.0f) * sig6;
+                dEdR                   += LJ14.z * (inverseR - 2.0f * cSim.reactionFieldK * r2);
+                dEdR                   *= inverseR * inverseR;
+                if (r2 > cSim.nonbondedCutoffSqr)
+                {
+                    dEdR = 0.0f;
+                }
+                unsigned int offsetA    = atom.x + atom.z * cSim.stride;
+                unsigned int offsetB    = atom.y + atom.w * cSim.stride;
+                float4 forceA           = {0.0f, 0.0f, 0.0f, 0.0f};
+                if (atom.z < cSim.totalNonbondOutputBuffers)
+                    forceA              = cSim.pForce4[offsetA];
+                float4 forceB           = {0.0f, 0.0f, 0.0f, 0.0f};
+                if (atom.w < cSim.totalNonbondOutputBuffers)
+                    forceB              = cSim.pForce4[offsetB];
+                d.x                    *= dEdR;
+                d.y                    *= dEdR;
+                d.z                    *= dEdR;
+                forceA.x               += d.x;
+                forceA.y               += d.y;
+                forceA.z               += d.z;
+                forceB.x               -= d.x;
+                forceB.y               -= d.y;
+                forceB.z               -= d.z;
+                cSim.pForce4[offsetA]   = forceA;
+                cSim.pForce4[offsetB]   = forceB;
+            }
+            pos                    += blockDim.x * gridDim.x;
+        }
+    }
+    else if (cSim.nonbondedMethod == PERIODIC)
+    {
+        while (pos < cSim.LJ14_offset)
+        {
+            unsigned int pos1       = pos - cSim.rb_dihedral_offset;
+            if (pos1 < cSim.LJ14s)
+            {
+                int4 atom               = cSim.pLJ14ID[pos1];
+                float4 LJ14             = cSim.pLJ14Parameter[pos1];
+                float4 a1               = cSim.pPosq[atom.x];
+                float4 a2               = cSim.pPosq[atom.y];
+                float3 d;
+                d.x                     = a1.x - a2.x;
+                d.y                     = a1.y - a2.y;
+                d.z                     = a1.z - a2.z;
+                d.x                     -= floor(d.x/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
+                d.y                     -= floor(d.x/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
+                d.z                     -= floor(d.x/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
+                float r2                = DOT3(d, d);
+                float inverseR          = 1.0f / sqrt(r2);
+                float sig2              = inverseR * LJ14.y;
+                sig2                   *= sig2;
+                float sig6              = sig2 * sig2 * sig2;
+                float dEdR              = LJ14.x * (12.0f * sig6 - 6.0f) * sig6;
+                dEdR                   += LJ14.z * (inverseR - 2.0f * cSim.reactionFieldK * r2);
+                dEdR                   *= inverseR * inverseR;
+                if (r2 > cSim.nonbondedCutoffSqr)
+                {
+                    dEdR = 0.0f;
+                }
+                unsigned int offsetA    = atom.x + atom.z * cSim.stride;
+                unsigned int offsetB    = atom.y + atom.w * cSim.stride;
+                float4 forceA           = {0.0f, 0.0f, 0.0f, 0.0f};
+                if (atom.z < cSim.totalNonbondOutputBuffers)
+                    forceA              = cSim.pForce4[offsetA];
+                float4 forceB           = {0.0f, 0.0f, 0.0f, 0.0f};
+                if (atom.w < cSim.totalNonbondOutputBuffers)
+                    forceB              = cSim.pForce4[offsetB];
+                d.x                    *= dEdR;
+                d.y                    *= dEdR;
+                d.z                    *= dEdR;
+                forceA.x               += d.x;
+                forceA.y               += d.y;
+                forceA.z               += d.z;
+                forceB.x               -= d.x;
+                forceB.y               -= d.y;
+                forceB.z               -= d.z;
+                cSim.pForce4[offsetA]   = forceA;
+                cSim.pForce4[offsetB]   = forceB;
+            }
+            pos                    += blockDim.x * gridDim.x;
+        }
    }

 }

--- a/platforms/cuda/src/kernels/kCalculateObcGbsaBornSum.cu
+++ b/platforms/cuda/src/kernels/kCalculateObcGbsaBornSum.cu
@@ -53,10 +53,6 @@ struct Atom {
    float junk;
 };

-__shared__ Atom sA[GT2XX_NONBOND_THREADS_PER_BLOCK];
-__shared__ unsigned int sWorkUnit[GT2XX_NONBOND_WORKUNITS_PER_SM];
-__shared__ unsigned int sNext[GRID];
-
 static __constant__ cudaGmxSimulation cSim;

 void SetCalculateObcGbsaBornSumSim(gpuContext gpu)
@@ -73,6 +69,50 @@ void GetCalculateObcGbsaBornSumSim(gpuContext gpu)
    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
 }

+// Include versions of the kernels for N^2 calculations.
+
+#define METHOD_NAME(a, b) a##N2##b
+#include "kCalculateObcGbsaBornSum.h"
+#define USE_OUTPUT_BUFFER_PER_WARP
+#undef METHOD_NAME
+#define METHOD_NAME(a, b) a##N2ByWarp##b
+#include "kCalculateObcGbsaBornSum.h"
+
+// Include versions of the kernels with cutoffs.
+
+#undef METHOD_NAME
+#undef USE_OUTPUT_BUFFER_PER_WARP
+#define USE_CUTOFF
+#define METHOD_NAME(a, b) a##Cutoff##b
+#include "kCalculateObcGbsaBornSum.h"
+#define USE_OUTPUT_BUFFER_PER_WARP
+#undef METHOD_NAME
+#define METHOD_NAME(a, b) a##CutoffByWarp##b
+#include "kCalculateObcGbsaBornSum.h"
+
+// Include versions of the kernels with periodic boundary conditions.
+
+#undef METHOD_NAME
+#undef USE_OUTPUT_BUFFER_PER_WARP
+#define USE_PERIODIC
+#define METHOD_NAME(a, b) a##Periodic##b
+#include "kCalculateObcGbsaBornSum.h"
+#define USE_OUTPUT_BUFFER_PER_WARP
+#undef METHOD_NAME
+#define METHOD_NAME(a, b) a##PeriodicByWarp##b
+#include "kCalculateObcGbsaBornSum.h"
+
+
+__global__ void kClearObcGbsaBornSum_kernel()
+{
+    unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
+    while (pos < cSim.stride * cSim.nonbondOutputBuffers)
+    {
+        ((float*)cSim.pBornSum)[pos] = 0.0f;
+        pos += gridDim.x * blockDim.x;
+    }
+}
+
 __global__ void kReduceObcGbsaBornSum_kernel()
 {
    unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
@@ -127,175 +167,40 @@ if( 0 ){
    LAUNCHERROR("kReduceObcGbsaBornSum");
 }

-
-__global__ void kCalculateObcGbsaBornSum_kernel()
-{
-    // Read queue of work blocks once so the remainder of
-    // kernel can run asynchronously    
-    int pos = (blockIdx.x * cSim.workUnits) / gridDim.x;
-    int end = ((blockIdx.x + 1) * cSim.workUnits) / gridDim.x;
-    if (threadIdx.x < end - pos)
-    {
-        sWorkUnit[threadIdx.x] = cSim.pWorkUnit[pos + threadIdx.x];
-    }
-    if (threadIdx.x < GRID)
-    {
-        sNext[threadIdx.x] = (threadIdx.x - 1) & (GRID - 1);
-    }
-    __syncthreads();
-
-    // Now change pos and end to reflect work queue just read
-    // into shared memory
-    end = end - pos; 
-    pos = end - (threadIdx.x >> GRIDBITS) - 1;
-       
-    while (pos >= 0)
-    {  
-        // Extract cell coordinates from appropriate work unit
-        unsigned int x = sWorkUnit[pos];
-        unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
-        x = (x >> 17) << GRIDBITS;
-        float       dx; 
-        float       dy; 
-        float       dz; 
-        float       r2; 
-        float       r;
-
-        unsigned int tgx = threadIdx.x & (GRID - 1);
-        unsigned int tbx = threadIdx.x - tgx;
-        int tj = tgx; 
-        Atom* psA = &sA[tbx];
-     
-        if (x == y) // Handle diagonals uniquely at 50% efficiency
-        { 
-            // Read fixed atom data into registers and GRF       
-            unsigned int i = x + tgx;
-            float4 apos = cSim.pPosq[i];    // Local atom x, y, z, sum
-            float2 ar = cSim.pObcData[i];   // Local atom vr, sr
-            sA[threadIdx.x].x           = apos.x;
-            sA[threadIdx.x].y           = apos.y;
-            sA[threadIdx.x].z           = apos.z;
-            sA[threadIdx.x].r           = ar.x;
-            sA[threadIdx.x].sr          = ar.y;
-            apos.w                      = 0.0f;
-
-            for (unsigned int j = 0; j < GRID; j++)
-            {
-                dx                      = psA[j].x - apos.x;
-                dy                      = psA[j].y - apos.y;
-                dz                      = psA[j].z - apos.z;
-                r2                      = dx * dx + dy * dy + dz * dz; 
-                r                       = sqrt(r2);
-                float rInverse          = 1.0f / r; 
-                float rScaledRadiusJ    = r + psA[j].sr;
-                if ((j != tgx) && (ar.x < rScaledRadiusJ))
-                {
-                    float l_ij     = 1.0f / max(ar.x, fabs(r - psA[j].sr));
-                    float u_ij     = 1.0f / rScaledRadiusJ;
-                    float l_ij2    = l_ij * l_ij;
-                    float u_ij2    = u_ij * u_ij;
-                    float ratio    = log(u_ij / l_ij);
-                    apos.w        += l_ij - 
-                                     u_ij + 
-                                     0.25f * r * (u_ij2 - l_ij2) + 
-                                     (0.50f * rInverse * ratio) + 
-                                     (0.25f * psA[j].sr * psA[j].sr * rInverse) *
-                                     (l_ij2 - u_ij2);
-                                                                                                              
-                    if (ar.x < (psA[j].r - r))
-                    {
-                        apos.w += 2.0f * ((1.0f / ar.x) - l_ij);
-                    }
-                }
-            }             
-
-            // Write results
-            int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
-            cSim.pBornSum[offset] = apos.w;
-        }         
-        else        // 100% utilization
-        {
-            // Read fixed atom data into registers and GRF
-            int j                           = y + tgx;
-            unsigned int i                  = x + tgx;      
-            
-            float4 temp                     = cSim.pPosq[j];
-            float2 temp1                    = cSim.pObcData[j];
-            float4 apos                     = cSim.pPosq[i];        // Local atom x, y, z, sum
-            float2 ar                       = cSim.pObcData[i];    // Local atom vr, sr
-            sA[threadIdx.x].x               = temp.x;
-            sA[threadIdx.x].y               = temp.y;
-            sA[threadIdx.x].z               = temp.z;
-            sA[threadIdx.x].r               = temp1.x;
-            sA[threadIdx.x].sr              = temp1.y;
-            sA[threadIdx.x].sum = apos.w    = 0.0f;
-
-            for (unsigned int j = 0; j < GRID; j++)
-            {
-                dx                      = psA[tj].x - apos.x; 
-                dy                      = psA[tj].y - apos.y; 
-                dz                      = psA[tj].z - apos.z; 
-                r2                      = dx * dx + dy * dy + dz * dz; 
-                r                       = sqrt(r2);
-                float rInverse          = 1.0f / r; 
-                float rScaledRadiusJ    = r + psA[tj].sr;
-                if (ar.x < rScaledRadiusJ)
-                {
-                    float l_ij     = 1.0f / max(ar.x, fabs(r - psA[tj].sr));
-                    float u_ij     = 1.0f / rScaledRadiusJ;
-                    float l_ij2    = l_ij * l_ij;
-                    float u_ij2    = u_ij * u_ij;
-                    float ratio    = log(u_ij / l_ij);
-                    float term     = l_ij - 
-                                     u_ij + 
-                                     0.25f * r * (u_ij2 - l_ij2) + 
-                                     (0.50f * rInverse * ratio) + 
-                                     (0.25f * psA[tj].sr * psA[tj].sr * rInverse) *
-                                     (l_ij2 - u_ij2);
-                    if (ar.x < (psA[tj].sr - r))
-                    {
-                        term += 2.0f * ((1.0f / ar.x) - l_ij);
-                    }
-                    apos.w        += term;
-                }
-                float rScaledRadiusI    = r + ar.y;
-                if (psA[tj].r < rScaledRadiusI)
-                {
-                    float l_ij     = 1.0f / max(psA[tj].r, fabs(r - ar.y));
-                    float u_ij     = 1.0f / rScaledRadiusI;
-                    float l_ij2    = l_ij * l_ij;
-                    float u_ij2    = u_ij * u_ij;
-                    float ratio    = log(u_ij / l_ij);
-                    float term     = l_ij - 
-                                     u_ij + 
-                                     0.25f * r * (u_ij2 - l_ij2) + 
-                                     (0.50f * rInverse * ratio) + 
-                                     (0.25f * ar.y * ar.y * rInverse) *
-                                     (l_ij2 - u_ij2);
- 
-                    if (psA[tj].r < (ar.y - r))
-                    {
-                        term += 2.0f * ((1.0f / psA[tj].r) - l_ij);
-                    }
-                    psA[tj].sum    += term;
-                }      
-                tj = sNext[tj];
-            }    
-                
-            // Write results
-            int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
-            cSim.pBornSum[offset] = apos.w;
-            offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
-            cSim.pBornSum[offset] = sA[threadIdx.x].sum;
-        }       
-       
-        pos -= cSim.nonbond_workBlock;     
-    }
-}
-
 void kCalculateObcGbsaBornSum(gpuContext gpu)
 {
  //  printf("kCalculateObcgbsaBornSum\n");
-    kCalculateObcGbsaBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
+    kClearObcGbsaBornSum_kernel<<<gpu->sim.blocks, 384>>>();
+    LAUNCHERROR("kClearBornSum");
+    size_t numWithInteractions;
+    switch (gpu->sim.nonbondedMethod)
+    {
+        case NO_CUTOFF:
+            if (gpu->bOutputBufferPerWarp)
+                kCalculateObcGbsaN2ByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits);
+            else
+                kCalculateObcGbsaN2BornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits);
+            break;
+        case CUTOFF:
+            numWithInteractions = gpu->psInteractionCount->_pSysData[0];
+            if (gpu->bOutputBufferPerWarp)
+                kCalculateObcGbsaCutoffByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
+            else
+                kCalculateObcGbsaCutoffBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
+            break;
+        case PERIODIC:
+            numWithInteractions = gpu->psInteractionCount->_pSysData[0];
+            if (gpu->bOutputBufferPerWarp)
+                kCalculateObcGbsaPeriodicByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
+            else
+                kCalculateObcGbsaPeriodicBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
+            break;
+    }
    LAUNCHERROR("kCalculateBornSum");
 }
--- a/platforms/cuda/src/kernels/kCalculateObcGbsaForces2.cu
+++ b/platforms/cuda/src/kernels/kCalculateObcGbsaForces2.cu
@@ -52,18 +52,9 @@ struct Atom {
    float fy;
    float fz;
    float fb;
-//    float sum;
-//    float oneOverR;
-    int pos;
-    int wx;
-    int wy;
 };


-__shared__ Atom sA[G8X_BORNFORCE2_THREADS_PER_BLOCK];
-__shared__ unsigned int sWorkUnit[G8X_NONBOND_WORKUNITS_PER_SM];
-__shared__ unsigned int sNext[GRID];
-
 static __constant__ cudaGmxSimulation cSim;

 void SetCalculateObcGbsaForces2Sim(gpuContext gpu)
@@ -80,283 +71,72 @@ void GetCalculateObcGbsaForces2Sim(gpuContext gpu)
    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
 }

-__global__ void kCalculateObcGbsaForces2_kernel()
-{
-    // Read queue of work blocks once so the remainder of
-    // kernel can run asynchronously    
-    int pos = cSim.bf2WorkUnitsPerBlock * blockIdx.x + min(blockIdx.x, cSim.bf2WorkUnitsPerBlockRemainder);
-    int end = cSim.bf2WorkUnitsPerBlock * (blockIdx.x + 1) + min((blockIdx.x + 1), cSim.bf2WorkUnitsPerBlockRemainder);    
-    if (threadIdx.x < end - pos)
-    {
-        sWorkUnit[threadIdx.x] = cSim.pWorkUnit[pos + threadIdx.x];
-    }
-    if (threadIdx.x < GRID)
-    {
-        sNext[threadIdx.x] = (threadIdx.x + 1) & (GRID - 1);
-    }
-    __syncthreads();

-    // Now change pos and end to reflect work queue just read
-    // into shared memory
-    end = end - pos; 
-    sA[threadIdx.x].pos = end - (threadIdx.x >> GRIDBITS) - 1;
-       
-    while (sA[threadIdx.x].pos >= 0)
-    {  
-    
-        // Extract cell coordinates from appropriate work unit
-        unsigned int x = sWorkUnit[sA[threadIdx.x].pos];
-        unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
-        x = (x >> 17) << GRIDBITS;
-        unsigned int tgx                = threadIdx.x & (GRID - 1);
-        unsigned int i                  = x + tgx;
-        float4 apos                     = cSim.pPosq[i];
-        float2 a                        = cSim.pObcData[i];
-        float fb                        = cSim.pBornForce[i];
-        unsigned int tbx                = threadIdx.x - tgx;
-        int tj                          = tgx; 
-        Atom* psA                       = &sA[tbx];
-        sA[threadIdx.x].wx              = x;
-        sA[threadIdx.x].wy              = y; 
-        if (x == y) // Handle diagonals uniquely at 50% efficiency
-        { 
-            // Read fixed atom data into registers and GRF
-            float3 af;
-            sA[threadIdx.x].fx = af.x   = 0.0f;
-            sA[threadIdx.x].fy = af.y   = 0.0f;
-            sA[threadIdx.x].fz = af.z   = 0.0f;
-//            float sum                   = 0.0f;
-            sA[threadIdx.x].x           = apos.x;
-            sA[threadIdx.x].y           = apos.y;
-            sA[threadIdx.x].z           = apos.z;
-//            float oneOverR              = 1.0f / a.x;
-            sA[threadIdx.x].r           = a.x;
-            sA[threadIdx.x].sr          = a.y;
-            sA[threadIdx.x].sr2         = a.y * a.y;
-            sA[threadIdx.x].fb          = fb;
+// Include versions of the kernels for N^2 calculations.

-            for (unsigned int j = sNext[tgx]; j != tgx; j = sNext[j])
-            {
-                float dx                = psA[j].x - apos.x; 
-                float dy                = psA[j].y - apos.y; 
-                float dz                = psA[j].z - apos.z; 
-                float r2                = dx * dx + dy * dy + dz * dz;
-                float r                 = sqrt(r2);
-                
-                // Atom I Born forces and sum
-                float rScaledRadiusJ    = r + psA[j].sr; 
-                float l_ij              = 1.0f / max(a.x, fabs(r - psA[j].sr));
-                float u_ij              = 1.0f / rScaledRadiusJ;
-                float rInverse          = 1.0f / r;
-                float l_ij2             = l_ij * l_ij;
-                float u_ij2             = u_ij * u_ij; 
-                float r2Inverse         = rInverse * rInverse;                   
-                float t1                = log (u_ij / l_ij);
-                float t2                = (l_ij2 - u_ij2);
-                float t3                = t2 * rInverse;
-                t1                     *= rInverse;
-                
-                // Born Forces term
-                float term              =  0.125f * 
-                                          (1.000f + psA[j].sr2 * r2Inverse) * t3 + 
-                                           0.250f * t1 * r2Inverse;
-                float dE                = fb * term;
-                
-                // Born sum term
-//                term                    =   l_ij - u_ij  +
-//                                           -0.25f * r * t2 +
-//                                            0.50f * t1 +
-//                                           (0.25f * psA[j].sr2) * t3;
-//                if (a.x < (psA[j].sr - r))
-//                {
-//                    term               += 2.0f * (oneOverR - l_ij);
-//                }
-                if (a.x >= rScaledRadiusJ)
-                {
-                    dE                  = /*term =*/ 0.0f;
-                }
-                
-                float d                 = dx * dE;
-                af.x                   -= d;
-                psA[j].fx              += d;
-                d                       = dy * dE;  
-                af.y                   -= d;
-                psA[j].fy              += d;
-                d                       = dz * dE;
-                af.z                   -= d;
-                psA[j].fz              += d; 
-//                sum                    += term;
-            }
-            
-            // Write results
-            int offset                  = x + tgx + (x >> GRIDBITS) * cSim.stride;
-            float4 of;
-            of.x                        = af.x + sA[threadIdx.x].fx;
-            of.y                        = af.y + sA[threadIdx.x].fy;
-            of.z                        = af.z + sA[threadIdx.x].fz;
-            of.w                        = 0.0f;
-            cSim.pForce4b[offset]       = of;
-//            cSim.pBornSum[offset]       = sum;
-        }         
-        else 
-        {        
-            // Read fixed atom data into registers and GRF
-            int j                       = y + tgx;
-            float4 temp                 = cSim.pPosq[j];
-            float2 temp1                = cSim.pObcData[j];
-            sA[threadIdx.x].fb          = cSim.pBornForce[j];
-            float3 af;
-            sA[threadIdx.x].fx = af.x   = 0.0f;
-            sA[threadIdx.x].fy = af.y   = 0.0f;
-            sA[threadIdx.x].fz = af.z   = 0.0f;
-//            sA[threadIdx.x].sum         = 0.0f;
-//            float sum                   = 0.0f;
-            float sr2                   = a.y * a.y;
-            sA[threadIdx.x].x           = temp.x;
-            sA[threadIdx.x].y           = temp.y;
-            sA[threadIdx.x].z           = temp.z;
-            sA[threadIdx.x].r           = temp1.x;
-            sA[threadIdx.x].sr          = temp1.y;
-            sA[threadIdx.x].sr2         = temp1.y * temp1.y;
-//            sA[threadIdx.x].oneOverR    = 1.0f / temp1.x;
+#define METHOD_NAME(a, b) a##N2##b
+#include "kCalculateObcGbsaForces2.h"
+#define USE_OUTPUT_BUFFER_PER_WARP
+#undef METHOD_NAME
+#define METHOD_NAME(a, b) a##N2ByWarp##b
+#include "kCalculateObcGbsaForces2.h"

-            for (j = 0; j < GRID; j++)
-            {
-                float dx                = psA[tj].x - apos.x; 
-                float dy                = psA[tj].y - apos.y; 
-                float dz                = psA[tj].z - apos.z; 
-                float r2                = dx * dx + dy * dy + dz * dz; 
-                float r                 = sqrt(r2);
-                
-                // Atom I Born Forces and sum
-                float r2Inverse         = 1.0f / r2;
-                float rScaledRadiusJ    = r + psA[tj].sr;
-                float rInverse          = 1.0f / r;
-                
-                
-                float l_ij              = 1.0f / max(a.x, fabs(r - psA[tj].sr));
-                float u_ij              = 1.0f / rScaledRadiusJ;
-                float l_ij2             = l_ij * l_ij;
-                float u_ij2             = u_ij * u_ij;
-                float t1                = log (u_ij / l_ij);
-                float t2                = (l_ij2 - u_ij2);
-                float t3                = t2 * rInverse;
-                t1                     *= rInverse;
-                   
-                // Born Forces term
-                float term              =  0.125f * 
-                                          (1.000f + psA[tj].sr2 * r2Inverse) * t3 + 
-                                           0.250f * t1 * r2Inverse;
-                float dE                = fb * term;
-                 // Born sum term
-//                term                    =   l_ij - u_ij  +
-//                                           -0.25f * r * t2 +
-//                                            0.50f * t1 +
-//                                           (0.25f * psA[tj].sr2) * t3;
-//                if (a.x < (psA[tj].sr - r))
-//                {
-//                    term               += 2.0f * ((1.0f / a.x) - l_ij);
-//                }
-                if (a.x >= rScaledRadiusJ) 
-                {
-                    dE                  = /*term =*/ 0.0f;
-                }
-                
-                
-                float d                 = dx * dE;
-                af.x                   -= d;
-                psA[tj].fx             += d;
-                d                       = dy * dE;  
-                af.y                   -= d;
-                psA[tj].fy             += d;
-                d                       = dz * dE;
-                af.z                   -= d;
-                psA[tj].fz             += d;    
-//                sum                    += term;
-                
-                // Atom J Born Forces and sum               
-                float rScaledRadiusI    = r + a.y;
-                l_ij                    = 1.0f / max(psA[tj].r, fabs(r - a.y));
-                u_ij                    = 1.0f / rScaledRadiusI;
-                l_ij2                   = l_ij * l_ij;
-                u_ij2                   = u_ij * u_ij;
-                t1                      = log (u_ij / l_ij);
-                t2                      = (l_ij2 - u_ij2);
-                t3                      = t2 * rInverse;
-                t1                     *= rInverse;
-                  
-                // Born Forces term
-                term                    =  0.125f * 
-                                          (1.000f + sr2 * r2Inverse) * t3 + 
-                                           0.250f * t1 * r2Inverse;
-                dE                      = psA[tj].fb * term;  
-                
-                // Born sum term
-//                term                    =   l_ij - u_ij  +
-//                                           -0.25f * r * t2 +
-//                                            0.50f * t1 +
-//                                           (0.25f * sr2) * t3;
-//
-//                if (psA[tj].r < (a.y - r))
-//                {
-//                    term               +=  2.0f * (psA[tj].oneOverR - l_ij);
-//                }
-                if (psA[tj].r >= rScaledRadiusI) 
-                {
-                    dE                  = /*term =*/ 0.0f;
-                }                                    
-                dx                     *= dE;
-                dy                     *= dE;
-                dz                     *= dE;
-                psA[tj].fx             += dx; 
-                psA[tj].fy             += dy;
-                psA[tj].fz             += dz; 
-                af.x                   -= dx;
-                af.y                   -= dy;
-                af.z                   -= dz;
-//                psA[tj].sum            +=  term;
-                                       
-                tj                      = sNext[tj]; 
-            }
-                
-            // Write results
-            int offset                  = sA[threadIdx.x].wx + tgx + (sA[threadIdx.x].wy >> GRIDBITS) * cSim.stride;
-            float4 of;
-            of.x                        = af.x;
-            of.y                        = af.y;
-            of.z                        = af.z;
-            of.w                        = 0.0f;
-            cSim.pForce4b[offset]       = of;
-//            cSim.pBornSum[offset]       = sum;
-            offset                      = sA[threadIdx.x].wy + tgx + (sA[threadIdx.x].wx >> GRIDBITS) * cSim.stride;
-            of.x                        = sA[threadIdx.x].fx;
-            of.y                        = sA[threadIdx.x].fy;
-            of.z                        = sA[threadIdx.x].fz;
-            cSim.pForce4b[offset]       = of;
-//            cSim.pBornSum[offset]       = sA[threadIdx.x].sum;
-        }
-        sA[threadIdx.x].pos            -= cSim.bornForce2_workBlock;     
-    }
-}
+// Include versions of the kernels with cutoffs.
+
+#undef METHOD_NAME
+#undef USE_OUTPUT_BUFFER_PER_WARP
+#define USE_CUTOFF
+#define METHOD_NAME(a, b) a##Cutoff##b
+#include "kCalculateObcGbsaForces2.h"
+#define USE_OUTPUT_BUFFER_PER_WARP
+#undef METHOD_NAME
+#define METHOD_NAME(a, b) a##CutoffByWarp##b
+#include "kCalculateObcGbsaForces2.h"

-__global__ extern void kCalculateObcGbsaForces2_12_kernel();
+// Include versions of the kernels with periodic boundary conditions.
+
+#undef METHOD_NAME
+#undef USE_OUTPUT_BUFFER_PER_WARP
+#define USE_PERIODIC
+#define METHOD_NAME(a, b) a##Periodic##b
+#include "kCalculateObcGbsaForces2.h"
+#define USE_OUTPUT_BUFFER_PER_WARP
+#undef METHOD_NAME
+#define METHOD_NAME(a, b) a##PeriodicByWarp##b
+#include "kCalculateObcGbsaForces2.h"

 void kCalculateObcGbsaForces2(gpuContext gpu)
 {
    //printf("kCalculateObcGbsaForces2\n");
-    if (gpu->sm_version < SM_12)
-        kCalculateObcGbsaForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block>>>();
-    else
-        kCalculateObcGbsaForces2_12_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block>>>();
-if( 0 ){
-   static int step = 0;
-	//int numPrint    = -1;
-	step++;
-	//WriteArrayToFile1( gpu, "ObcGbsaBornBRad", step, gpu->psBornRadii, numPrint );
-	//gpuDumpCoordinates( gpu );
-	kReduceBornSumAndForces( gpu );
-	gpuDumpObcLoop1( gpu );
-}
-
+    size_t numWithInteractions;
+    switch (gpu->sim.nonbondedMethod)
+    {
+        case NO_CUTOFF:
+            if (gpu->bOutputBufferPerWarp)
+                kCalculateObcGbsaN2ByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
+                        sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits);
+            else
+                kCalculateObcGbsaN2Forces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
+                        sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits);
+            break;
+        case CUTOFF:
+            numWithInteractions = gpu->psInteractionCount->_pSysData[0];
+            if (gpu->bOutputBufferPerWarp)
+                kCalculateObcGbsaCutoffByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
+                        sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
+            else
+                kCalculateObcGbsaCutoffForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
+                        sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
+            break;
+        case PERIODIC:
+            numWithInteractions = gpu->psInteractionCount->_pSysData[0];
+            if (gpu->bOutputBufferPerWarp)
+                kCalculateObcGbsaPeriodicByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
+                        sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
+            else
+                kCalculateObcGbsaPeriodicForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
+                        sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
+            break;
+    }
    LAUNCHERROR("kCalculateObcGbsaForces2");
 }
--- a/platforms/cuda/src/kernels/kForces.cu
+++ b/platforms/cuda/src/kernels/kForces.cu
@@ -61,9 +61,9 @@ void GetForcesSim(gpuContext gpu)
 __global__ void kClearForces_kernel()
 {
    unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
-    while (pos < cSim.stride4 * cSim.outputBuffers)
+    while (pos < cSim.stride * cSim.outputBuffers)
    {
-        ((float*)cSim.pForce4)[pos] = 0.0f;
+        cSim.pForce4[pos] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
        pos += gridDim.x * blockDim.x;
    }
 }

--- a/platforms/cuda/src/kernels/kVerletUpdate.cu
+++ b/platforms/cuda/src/kernels/kVerletUpdate.cu
@@ -61,7 +61,6 @@ void GetVerletUpdateSim(gpuContext gpu)
 __global__ void kVerletUpdatePart1_kernel()
 {
    unsigned int pos    = threadIdx.x + blockIdx.x * blockDim.x;
-    __syncthreads();
    
    while (pos < cSim.atoms)
    {
@@ -175,7 +174,6 @@ void kVerletUpdatePart1(gpuContext gpu)
 __global__ void kVerletUpdatePart2_kernel()
 {
    unsigned int pos            = threadIdx.x + blockIdx.x * blockDim.x;
-    __syncthreads();
    
    while (pos < cSim.atoms)
    {
@@ -208,7 +206,6 @@ __global__ void kVerletUpdatePart2CM_kernel()
    extern __shared__ float3 sCM[];
    unsigned int pos            = threadIdx.x + blockIdx.x * blockDim.x;
    float3 CM                   = {0.0f, 0.0f, 0.0f};
-    __syncthreads();
    
    while (pos < cSim.atoms)
    {

--- a/platforms/reference/src/ReferenceKernels.cpp
+++ b/platforms/reference/src/ReferenceKernels.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2008 Stanford University and the Authors.           *
+ * Portions copyright (c) 2008-2009 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -419,6 +419,26 @@ void ReferenceCalcGBSAOBCForceKernel::initialize(const System& system, const GBS
    obcParameters->setScaledRadiusFactors(scaleFactors);
    obcParameters->setSolventDielectric( static_cast<RealOpenMM>(force.getSolventDielectric()) );
    obcParameters->setSoluteDielectric( static_cast<RealOpenMM>(force.getSoluteDielectric()) );
+
+    // If there is a NonbondedForce in this system, use it to initialize cutoffs and periodic boundary conditions.
+
+    for (int i = 0; i < system.getNumForces(); i++) {
+        const NonbondedForce* nonbonded = dynamic_cast<const NonbondedForce*>(&system.getForce(i));
+        if (nonbonded != NULL) {
+            if (nonbonded->getNonbondedMethod() != NonbondedForce::NoCutoff)
+                obcParameters->setUseCutoff(nonbonded->getCutoffDistance());
+            if (nonbonded->getNonbondedMethod() == NonbondedForce::CutoffPeriodic) {
+                Vec3 boxVectors[3];
+                nonbonded->getPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
+                RealOpenMM periodicBoxSize[3];
+                periodicBoxSize[0] = (RealOpenMM) boxVectors[0][0];
+                periodicBoxSize[1] = (RealOpenMM) boxVectors[1][1];
+                periodicBoxSize[2] = (RealOpenMM) boxVectors[2][2];
+                obcParameters->setPeriodic(periodicBoxSize);
+            }
+            break;
+        }
+    }
    obc = new CpuObc(obcParameters);
    obc->setIncludeAceApproximation(true);
 }