Checking in Cuda implementation of explicit solvent

1010df33 · Peter Eastman · df4b64cb · 1010df33 · 1010df33 · 1010df33
Commit 1010df33 authored Feb 19, 2009 by Peter Eastman
20 changed files
--- a/platforms/cuda/CMakeLists.txt
+++ b/platforms/cuda/CMakeLists.txt
@@ -84,4 +84,14 @@ INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/src)

 SET(FINDCUDA_DIR ${CMAKE_CURRENT_SOURCE_DIR}/cuda-cmake)

+IF (APPLE)  # Add the platform-specific directory of prebuilt cudpp libraries to the linker search path.
+    LINK_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/cudpp/mac)  # this commit adds libcudpp.a and libcutil.a under cudpp/mac
+ELSE (APPLE)
+    IF (WIN32)
+        LINK_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/cudpp/win)  # this commit adds cudpp.lib and cutil.lib under cudpp/win
+        INSTALL_FILES(/lib FILES ${CMAKE_CURRENT_SOURCE_DIR}/cudpp/win/cudpp32.dll)  # the Windows DLL is also installed into /lib
+    ELSE (WIN32)
+        LINK_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/cudpp/linux)  # NOTE(review): no cudpp/linux files appear among the changed files visible here - confirm the directory is populated
+    ENDIF (WIN32)
+ENDIF(APPLE)
 SUBDIRS (sharedTarget staticTarget)
--- a/platforms/cuda/cudpp/mac/libcudpp.a
+++ b/platforms/cuda/cudpp/mac/libcudpp.a
--- a/platforms/cuda/cudpp/mac/libcutil.a
+++ b/platforms/cuda/cudpp/mac/libcutil.a
--- a/platforms/cuda/cudpp/win/cudpp.lib
+++ b/platforms/cuda/cudpp/win/cudpp.lib
--- a/platforms/cuda/cudpp/win/cudpp32.dll
+++ b/platforms/cuda/cudpp/win/cudpp32.dll
--- a/platforms/cuda/cudpp/win/cutil.lib
+++ b/platforms/cuda/cudpp/win/cutil.lib
--- a/platforms/cuda/src/CudaStreamFactory.cpp
+++ b/platforms/cuda/src/CudaStreamFactory.cpp
@@ -39,42 +39,40 @@
 using namespace OpenMM;

 StreamImpl* CudaStreamFactory::createStreamImpl(std::string name, int size, Stream::DataType type, const Platform& platform, OpenMMContextImpl& context) const { // Creates the CUDA-backed StreamImpl for the stream called `name`; throws OpenMMException when `type` is not a handled DataType.
-    if (name == "particlePositions") {
    CudaPlatform::PlatformData& data = *static_cast<CudaPlatform::PlatformData*>(context.getPlatformData()); // Fetched once up front so every branch below, including the generic switch, can pass data.gpu.
+    if (name == "particlePositions") {
        float padding[] = {100000.0f, 100000.0f, 100000.0f, 0.2f}; // Four values handed to the stream as `padding`; how they are applied is not visible in this view.
        return new CudaStreamImpl<float4>(name, size, type, platform, data.gpu->psPosq4, 4, padding, data.gpu); // Passes the context's existing psPosq4 stream to the implementation.
    }
    if (name == "particleVelocities") {
-        CudaPlatform::PlatformData& data = *static_cast<CudaPlatform::PlatformData*>(context.getPlatformData());
        float padding[] = {0.0f, 0.0f, 0.0f, 0.0f};
        return new CudaStreamImpl<float4>(name, size, type, platform, data.gpu->psVelm4, 4, padding, data.gpu); // Passes the context's existing psVelm4 stream.
    }
    if (name == "particleForces") {
-        CudaPlatform::PlatformData& data = *static_cast<CudaPlatform::PlatformData*>(context.getPlatformData());
        float padding[] = {0.0f, 0.0f, 0.0f, 0.0f};
        return new CudaStreamImpl<float4>(name, size, type, platform, data.gpu->psForce4, 4, padding, data.gpu); // Passes the context's existing psForce4 stream.
    }
    switch (type) { // Any other name: a generic stream chosen by element type. Float and Double variants both map to single-precision vector types.
    case Stream::Float:
    case Stream::Double:
-        return new CudaStreamImpl<float1>(name, size, type, platform, 1, NULL);
+        return new CudaStreamImpl<float1>(name, size, type, platform, 1, data.gpu); // Generic streams now receive the gpu context instead of NULL.
    case Stream::Float2:
    case Stream::Double2:
-        return new CudaStreamImpl<float2>(name, size, type, platform, 1, NULL);
+        return new CudaStreamImpl<float2>(name, size, type, platform, 1, data.gpu);
    case Stream::Float3:
    case Stream::Double3:
-        return new CudaStreamImpl<float3>(name, size, type, platform, 1, NULL);
+        return new CudaStreamImpl<float3>(name, size, type, platform, 1, data.gpu);
    case Stream::Float4:
    case Stream::Double4:
-        return new CudaStreamImpl<float4>(name, size, type, platform, 1, NULL);
+        return new CudaStreamImpl<float4>(name, size, type, platform, 1, data.gpu);
    case Stream::Integer:
-        return new CudaStreamImpl<int1>(name, size, type, platform, 1, NULL);
+        return new CudaStreamImpl<int1>(name, size, type, platform, 1, data.gpu);
    case Stream::Integer2:
-        return new CudaStreamImpl<int2>(name, size, type, platform, 1, NULL);
+        return new CudaStreamImpl<int2>(name, size, type, platform, 1, data.gpu);
    case Stream::Integer3:
-        return new CudaStreamImpl<int3>(name, size, type, platform, 1, NULL);
+        return new CudaStreamImpl<int3>(name, size, type, platform, 1, data.gpu);
    case Stream::Integer4:
-        return new CudaStreamImpl<int4>(name, size, type, platform, 1, NULL);
+        return new CudaStreamImpl<int4>(name, size, type, platform, 1, data.gpu);
    }
    throw OpenMMException("Tried to create a Stream with an illegal DataType."); // Reached only when `type` matched none of the cases above.
 }
--- a/platforms/cuda/src/CudaStreamImpl.h
+++ b/platforms/cuda/src/CudaStreamImpl.h
@@ -132,23 +132,24 @@ CudaStreamImpl<T>::~CudaStreamImpl() {
 template <class T>
 void CudaStreamImpl<T>::loadFromArray(const void* array) {
    float* data = reinterpret_cast<float*>(stream->_pSysData);
+    int* order = gpu->psAtomIndex->_pSysData;
    if (baseType == Stream::Float) {
        float* arrayData = (float*) array;
        for (int i = 0; i < getSize(); ++i)
            for (int j = 0; j < width; ++j)
-                data[i*rowOffset+j] = arrayData[i*width+j];
+                data[i*rowOffset+j] = arrayData[order[i]*width+j];
    }
    else if (baseType == Stream::Double) {
        double* arrayData = (double*) array;
        for (int i = 0; i < getSize(); ++i)
            for (int j = 0; j < width; ++j)
-                data[i*rowOffset+j] = (float) arrayData[i*width+j];
+                data[i*rowOffset+j] = (float) arrayData[order[i]*width+j];
    }
    else {
        int* arrayData = (int*) array;
        for (int i = 0; i < getSize(); ++i)
            for (int j = 0; j < width; ++j)
-                data[i*rowOffset+j] = (float) arrayData[i*width+j];
+                data[i*rowOffset+j] = (float) arrayData[order[i]*width+j];
    }
    for (int i = getSize(); i < (int) stream->_length; ++i)
        for (int j = 0; j < rowOffset; ++j)
@@ -167,23 +168,24 @@ template <class T>
 void CudaStreamImpl<T>::saveToArray(void* array) { // Copies the stream's contents into the caller's array, converting each element according to baseType.
    stream->Download(); // presumably refreshes the host-side buffer (_pSysData) from the device - confirm in CUDAStream
    float* data = reinterpret_cast<float*>(stream->_pSysData); // The host buffer is read as floats for every base type, integers included.
+    int* order = gpu->psAtomIndex->_pSysData; // order[i] is the original index of the atom held in row i. NOTE(review): used for every stream, so gpu must be non-NULL and the table must hold at least getSize() entries - confirm for non-particle streams.
    if (baseType == Stream::Float) {
        float* arrayData = (float*) array;
        for (int i = 0; i < getSize(); ++i)
            for (int j = 0; j < width; ++j)
-                arrayData[i*width+j] = data[i*rowOffset+j];
+                arrayData[order[i]*width+j] = data[i*rowOffset+j]; // Stream rows have stride rowOffset; the caller's array has stride width and is written in original atom order.
    }
    else if (baseType == Stream::Double) {
        double* arrayData = (double*) array;
        for (int i = 0; i < getSize(); ++i)
            for (int j = 0; j < width; ++j)
-                arrayData[i*width+j] = data[i*rowOffset+j];
+                arrayData[order[i]*width+j] = data[i*rowOffset+j]; // Implicit float -> double widening on assignment.
    }
    else {
        int* arrayData = (int*) array;
        for (int i = 0; i < getSize(); ++i)
            for (int j = 0; j < width; ++j)
-                arrayData[i*width+j] = (int) data[i*rowOffset+j];
+                arrayData[order[i]*width+j] = (int) data[i*rowOffset+j]; // Integer streams are stored as floats and cast back here.
    }
 }


--- a/platforms/cuda/src/kernels/cudaKernels.h
+++ b/platforms/cuda/src/kernels/cudaKernels.h
@@ -41,19 +41,19 @@ extern void kGenerateRandoms(gpuContext gpu);
 extern void kCalculateCDLJObcGbsaForces1(gpuContext gpu);
 extern void kCalculateCDLJObcGbsaForces1_12(gpuContext gpu);
 extern void kCalculateCDLJForces(gpuContext gpu);
-extern void kCalculateCDLJForces_12(gpuContext gpu);
 extern void kCalculateObcGbsaForces1(gpuContext gpu);
 extern void kCalculateObcGbsaForces1_12(gpuContext gpu);
 extern void kReduceObcGbsaBornForces(gpuContext gpu);
 extern void kCalculateObcGbsaForces2(gpuContext gpu);
-extern void kCalculateObcGbsaForces2_12(gpuContext gpu);
 extern void kCalculateLocalForces(gpuContext gpu);
 extern void kCalculateAndersenThermostat(gpuContext gpu);
 extern void kReduceBornSumAndForces(gpuContext gpu);
 extern void kUpdatePart1(gpuContext gpu);
 extern void kApplyFirstShake(gpuContext gpu);
+extern void kApplyFirstSettle(gpuContext gpu);
 extern void kUpdatePart2(gpuContext gpu);
 extern void kApplySecondShake(gpuContext gpu);
+extern void kApplySecondSettle(gpuContext gpu);
 extern void kVerletUpdatePart1(gpuContext gpu);
 extern void kVerletUpdatePart2(gpuContext gpu);
 extern void kBrownianUpdatePart1(gpuContext gpu);
@@ -66,12 +66,8 @@ extern void kClearBornForces(gpuContext gpu);
 // Initializers
 extern void SetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu);
 extern void GetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu);
-extern void SetCalculateCDLJObcGbsaForces1_12Sim(gpuContext gpu);
-extern void GetCalculateCDLJObcGbsaForces1_12Sim(gpuContext gpu);
 extern void SetCalculateCDLJForcesSim(gpuContext gpu);
 extern void GetCalculateCDLJForcesSim(gpuContext gpu);
-extern void SetCalculateCDLJForces_12Sim(gpuContext gpu);
-extern void GetCalculateCDLJForces_12Sim(gpuContext gpu);
 extern void SetCalculateLocalForcesSim(gpuContext gpu);
 extern void GetCalculateLocalForcesSim(gpuContext gpu);
 extern void SetCalculateObcGbsaBornSumSim(gpuContext gpu);
@@ -82,14 +78,14 @@ extern void SetCalculateObcGbsaForces1_12Sim(gpuContext gpu);
 extern void GetCalculateObcGbsaForces1_12Sim(gpuContext gpu);
 extern void SetCalculateObcGbsaForces2Sim(gpuContext gpu);
 extern void GetCalculateObcGbsaForces2Sim(gpuContext gpu);
-extern void SetCalculateObcGbsaForces2_12Sim(gpuContext gpu);
-extern void GetCalculateObcGbsaForces2_12Sim(gpuContext gpu);
 extern void SetCalculateAndersenThermostatSim(gpuContext gpu);
 extern void GetCalculateAndersenThermostatSim(gpuContext gpu);
 extern void SetForcesSim(gpuContext gpu);
 extern void GetForcesSim(gpuContext gpu);
 extern void SetUpdateShakeHSim(gpuContext gpu);
 extern void GetUpdateShakeHSim(gpuContext gpu);
+extern void SetSettleSim(gpuContext gpu);
+extern void GetSettleSim(gpuContext gpu);
 extern void SetVerletUpdateSim(gpuContext gpu);
 extern void GetVerletUpdateSim(gpuContext gpu);
 extern void SetBrownianUpdateSim(gpuContext gpu);

--- a/platforms/cuda/src/kernels/cudatypes.h
+++ b/platforms/cuda/src/kernels/cudatypes.h
@@ -36,11 +36,12 @@
 #include <limits>
 #include <iostream>
 #include <stdio.h>
+#include <stdlib.h>
+#include <string>
 #include <cuda.h>
 #include <cuda_runtime_api.h>
 #include <builtin_types.h>
 #include <vector_functions.h>
-using namespace std;

 #define RTERROR(status, s) \
    if (status != cudaSuccess) { \
@@ -228,6 +229,12 @@ static const int GT2XX_RANDOM_THREADS_PER_BLOCK         = 384;
 static const int G8X_NONBOND_WORKUNITS_PER_SM           = 220;
 static const int GT2XX_NONBOND_WORKUNITS_PER_SM         = 256;

+enum CudaNonbondedMethod  // Selects how nonbonded interactions are handled; stored in cudaGmxSimulation::nonbondedMethod.
+{
+    NO_CUTOFF,  // kCalculateCDLJForces launches the N2 kernel variants for this value
+    CUTOFF,     // kCalculateCDLJForces first identifies interacting blocks and compacts the work-unit list
+    PERIODIC    // presumably selects the Periodic kernel variants - the dispatch for this value is not visible here
+};

 struct cudaGmxSimulation {
    // Constants
@@ -236,6 +243,7 @@ struct cudaGmxSimulation {
    unsigned int    blocks;                         // Number of blocks to launch across linear kernels
    unsigned int    nonbond_blocks;                 // Number of blocks to launch across CDLJ and Born Force Part1
    unsigned int    bornForce2_blocks;              // Number of blocks to launch across Born Force 2
+    unsigned int    interaction_blocks;             // Number of blocks to launch when identifying interacting tiles
    unsigned int    threads_per_block;              // Threads per block to launch
    unsigned int    nonbond_threads_per_block;      // Threads per block in nonbond kernel calls
    unsigned int    bornForce2_threads_per_block;   // Threads per block in Born Force 2 kernel calls
@@ -245,12 +253,17 @@ struct cudaGmxSimulation {
    unsigned int    bsf_reduce_threads_per_block;   // Threads per block in Born Sum And Forces reduction calls
    unsigned int    max_shake_threads_per_block;    // Maximum threads per block in shake kernel calls
    unsigned int    shake_threads_per_block;        // Threads per block in shake kernel calls
+    unsigned int    settle_threads_per_block;       // Threads per block in SETTLE kernel calls
    unsigned int    nonshake_threads_per_block;     // Threads per block in nonshaking kernel call
    unsigned int    max_localForces_threads_per_block;  // Maximum threads per block in local forces kernel calls
    unsigned int    localForces_threads_per_block;  // Threads per block in local forces kernel calls
    unsigned int    random_threads_per_block;       // Threads per block in RNG kernel calls
+    unsigned int    interaction_threads_per_block;  // Threads per block when identifying interacting tiles
    unsigned int    workUnits;                      // Number of work units
    unsigned int*   pWorkUnit;                      // Pointer to work units
+    unsigned int*   pInteractingWorkUnit;           // Pointer to work units that have interactions
+    unsigned int*   pInteractionFlag;               // Flags for which work units have interactions
+    size_t*         pInteractionCount;              // A count of the number of work units which have interactions
    unsigned int    nonbond_workBlock;              // Number of work units running simultaneously per block in CDLJ and Born Force Part 1
    unsigned int    bornForce2_workBlock;           // Number of work units running second half of Born Forces calculation
    unsigned int    workUnitsPerSM;                 // Number of workblocks per SM
@@ -270,6 +283,12 @@ struct cudaGmxSimulation {
    unsigned int    outputBuffers;                  // Number of output buffers
    float           bigFloat;                       // Floating point value used as a flag for Shaken atoms 
    float           epsfac;                         // Epsilon factor for CDLJ calculations
+    CudaNonbondedMethod nonbondedMethod;            // How to handle nonbonded interactions
+    float           nonbondedCutoffSqr;             // Cutoff distance for CDLJ calculations
+    float           periodicBoxSizeX;               // The X dimension of the periodic box
+    float           periodicBoxSizeY;               // The Y dimension of the periodic box
+    float           periodicBoxSizeZ;               // The Z dimension of the periodic box
+    float           reactionFieldK;                 // Constant for reaction field correction
    float           probeRadius;                    // SASA probe radius
    float           surfaceAreaFactor;              // ACE approximation surface area factor
    float           electricConstant;               // ACE approximation electric constant
@@ -326,6 +345,7 @@ struct cudaGmxSimulation {
    float4*         pLJ14Parameter;                 // Lennard Jones 1-4 parameters
    float           inverseTotalMass;               // Used in linear momentum removal
    unsigned int    ShakeConstraints;               // Total number of Shake constraints
+    unsigned int    settleConstraints;              // Total number of Settle constraints
    unsigned int    NonShakeConstraints;            // Total number of NonShake atoms
    unsigned int    maxShakeIterations;             // Maximum shake iterations
    unsigned int    degreesOfFreedom;               // Number of degrees of freedom in system
@@ -334,12 +354,17 @@ struct cudaGmxSimulation {
    int*            pNonShakeID;                    // Not Shaking atoms
    int4*           pShakeID;                       // Shake atoms and phase
    float4*         pShakeParameter;                // Shake parameters
+    int4*           pSettleID;                      // Settle atoms
+    float2*         pSettleParameter;               // Settle parameters
    unsigned int*   pExclusion;                     // Nonbond exclusion data
    unsigned int    bond_offset;                    // Offset to end of bonds
    unsigned int    bond_angle_offset;              // Offset to end of bond angles
    unsigned int    dihedral_offset;                // Offset to end of dihedrals
    unsigned int    rb_dihedral_offset;             // Offset to end of Ryckaert Bellemans dihedrals
    unsigned int    LJ14_offset;                    // Offset to end of Lennard Jones 1-4 parameters
+    int*            pAtomIndex;                     // The original index of each atom
+    float4*         pGridBoundingBox;               // The size of each grid cell
+    float4*         pGridCenter;                    // The center of each grid cell

    // Mutable stuff
    float4*         pPosq;                          // Pointer to atom positions and charges

--- a/platforms/cuda/src/kernels/gpu.cpp
+++ b/platforms/cuda/src/kernels/gpu.cpp
--- a/platforms/cuda/src/kernels/gputypes.h
+++ b/platforms/cuda/src/kernels/gputypes.h
@@ -33,14 +33,20 @@
 * -------------------------------------------------------------------------- */

 #include "cudatypes.h"
+#include "cudpp.h"
 #include <vector>

 struct gpuAtomType { // Describes one atom type: a name, a one-character symbol, and a float parameter r.
-    string name;
+    std::string name; // Qualified explicitly because this commit removes `using namespace std;` from cudatypes.h.
    char symbol;
    float r; // presumably the atomic radius (cf. gpuGetAtomicRadius) - TODO confirm
 };

+struct gpuMoleculeGroup { // Element type of _gpuContext::moleculeGroups.
+    std::vector<int> atoms;      // presumably the atom indices that make up one molecule of the group - TODO confirm
+    std::vector<int> instances;  // presumably one entry per occurrence of that molecule in the system - TODO confirm
+};
+
 enum SM_VERSION
 {
    SM_10,
@@ -61,8 +67,9 @@ struct _gpuContext {
    int gAtomTypes;
    cudaGmxSimulation sim;
    unsigned int* pOutputBufferCounter;
-    unsigned int* pExclusion;
+    std::vector<std::vector<int> > exclusions;
    unsigned char* pAtomSymbol;
+    std::vector<gpuMoleculeGroup> moleculeGroups;
    float iterations;
    float epsfac;
    float solventDielectric;
@@ -71,8 +78,11 @@ struct _gpuContext {
    bool bCalculateCM;
    bool bRemoveCM;
    bool bRecalculateBornRadii;
+    bool bOutputBufferPerWarp;
+    bool bIncludeGBSA;
    unsigned long seed;
    SM_VERSION sm_version;
+    CUDPPHandle cudpp;
    CUDAStream<float4>* psPosq4;
    CUDAStream<float4>* psPosqP4;
    CUDAStream<float4>* psOldPosq4;
@@ -103,15 +113,21 @@ struct _gpuContext {
    CUDAStream<int>* psNonShakeID;
    CUDAStream<int4>* psShakeID;
    CUDAStream<float4>* psShakeParameter;
+    CUDAStream<int4>* psSettleID;
+    CUDAStream<float2>* psSettleParameter;
    CUDAStream<unsigned int>* psExclusion;
    CUDAStream<unsigned int>* psWorkUnit;
+    CUDAStream<unsigned int>* psInteractingWorkUnit;
+    CUDAStream<unsigned int>* psInteractionFlag;
+    CUDAStream<size_t>* psInteractionCount;
    CUDAStream<float4>* psRandom4;          // Pointer to sets of 4 random numbers for MD integration
    CUDAStream<float2>* psRandom2;          // Pointer to sets of 2 random numbers for MD integration
    CUDAStream<uint4>* psRandomSeed;        // Pointer to each random seed
    CUDAStream<int>* psRandomPosition;      // Pointer to random number positions
    CUDAStream<float4>* psLinearMomentum;   // Pointer to total linear momentum per CTA
-   
-
+    CUDAStream<int>* psAtomIndex;           // The original index of each atom
+    CUDAStream<float4>* psGridBoundingBox;  // The size of each grid cell
+    CUDAStream<float4>* psGridCenter;       // The center and radius for each grid cell
 };

 typedef struct _gpuContext *gpuContext;
@@ -156,10 +172,10 @@ void gpuSetLJ14Parameters(gpuContext gpu, float epsfac, float fudge, const std::
        const std::vector<float>& c6, const std::vector<float>& c12, const std::vector<float>& q1, const std::vector<float>& q2);

 extern "C"
-float gpuGetAtomicRadius(gpuContext gpu, string s);
+float gpuGetAtomicRadius(gpuContext gpu, std::string s);

 extern "C"
-unsigned char gpuGetAtomicSymbol(gpuContext gpu, string s);
+unsigned char gpuGetAtomicSymbol(gpuContext gpu, std::string s);

 extern "C"
 int gpuReadAtomicParameters(gpuContext gpu, char* fname);
@@ -169,7 +185,13 @@ int gpuReadCoulombParameters(gpuContext gpu, char* fname);

 extern "C"
 void gpuSetCoulombParameters(gpuContext gpu, float epsfac, const std::vector<int>& atom, const std::vector<float>& c6, const std::vector<float>& c12, const std::vector<float>& q,
-        const std::vector<char>& symbol, const std::vector<vector<int> >& exclusions);
+        const std::vector<char>& symbol, const std::vector<std::vector<int> >& exclusions, CudaNonbondedMethod method);
+
+extern "C"
+void gpuSetNonbondedCutoff(gpuContext gpu, float cutoffDistance, float solventDielectric);
+
+extern "C"
+void gpuSetPeriodicBoxSize(gpuContext gpu, float xsize, float ysize, float zsize);

 extern "C"
 void gpuSetObcParameters(gpuContext gpu, float innerDielectric, float solventDielectric, const std::vector<int>& atom, const std::vector<float>& radius, const std::vector<float>& scale);
@@ -227,7 +249,7 @@ extern "C"
 int gpuBuildThreadBlockWorkList(gpuContext gpu);

 extern "C"
-int gpuBuildExclusionList(gpuContext gpu);
+void gpuBuildExclusionList(gpuContext gpu);

 extern "C"
 int gpuSetConstants(gpuContext gpu);
@@ -274,4 +296,7 @@ void gpuDumpObcInfo(gpuContext gpu);
 extern "C"
 void gpuDumpObcLoop1(gpuContext gpu); 

+extern "C"
+void gpuReorderAtoms(gpuContext gpu);
+
 #endif //__GPUTYPES_H__
--- a/platforms/cuda/src/kernels/kCalculateCDLJForces.cu
+++ b/platforms/cuda/src/kernels/kCalculateCDLJForces.cu
@@ -54,15 +54,8 @@ struct Atom {
    float fx;
    float fy;
    float fz;
-    float eps2;
-    float sig2;
 };

-
-__shared__ Atom sA[G8X_NONBOND_THREADS_PER_BLOCK];
-__shared__ unsigned int sWorkUnit[G8X_NONBOND_WORKUNITS_PER_SM];
-__shared__ unsigned int sNext[GRID];
-
 static __constant__ cudaGmxSimulation cSim;

 void SetCalculateCDLJForcesSim(gpuContext gpu)
@@ -79,310 +72,102 @@ void GetCalculateCDLJForcesSim(gpuContext gpu)
    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
 }

-__global__ void kCalculateCDLJForces_kernel()
-{
-    // Read queue of work blocks once so the remainder of
-    // kernel can run asynchronously    
-    int pos = cSim.nbWorkUnitsPerBlock * blockIdx.x + min(blockIdx.x, cSim.nbWorkUnitsPerBlockRemainder);
-    int end = cSim.nbWorkUnitsPerBlock * (blockIdx.x + 1) + min((blockIdx.x + 1), cSim.nbWorkUnitsPerBlockRemainder);    
-    if (threadIdx.x < end - pos)
-    {
-        sWorkUnit[threadIdx.x] = cSim.pWorkUnit[pos + threadIdx.x];
-    }
-    if (threadIdx.x < GRID)
-    {
-        sNext[threadIdx.x] = (threadIdx.x + 1) & (GRID - 1);
-    }
-    __syncthreads();
+// Include versions of the kernels for N^2 calculations.

-    // Now change pos and end to reflect work queue just read
-    // into shared memory
-    end = end - pos; 
-    pos = end - (threadIdx.x >> GRIDBITS) - 1;
+#define METHOD_NAME(a, b) a##N2##b
+#include "kCalculateCDLJForces.h"
+#define USE_OUTPUT_BUFFER_PER_WARP
+#undef METHOD_NAME
+#define METHOD_NAME(a, b) a##N2ByWarp##b
+#include "kCalculateCDLJForces.h"

-    while (pos >= 0)
-    {  
+// Include versions of the kernels with cutoffs.

-        // Extract cell coordinates from appropriate work unit
-        unsigned int x = sWorkUnit[pos];
-        unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
-        bool bExclusionFlag = (x & 0x1);
-        x = (x >> 17) << GRIDBITS;
-        float4      apos;   // Local atom x, y, z, q
-        float3      af;     // Local atom fx, fy, fz
-        float dx; 
-        float dy; 
-        float dz; 
-        float r2; 
-        float invR; 
-        float sig; 
-        float sig2; 
-        float sig6; 
-        float eps; 
-        float dEdR;  
-        unsigned int tgx = threadIdx.x & (GRID - 1);
-        unsigned int tbx = threadIdx.x - tgx;
-        int tj = tgx; 
-        Atom* psA = &sA[tbx];
-        if (!bExclusionFlag)
-        {
-            if (x == y) // Handle diagonals uniquely at 50% efficiency
-            { 
-                // Read fixed atom data into registers and GRF
-                unsigned int i      = x + tgx;
-                apos                = cSim.pPosq[i];
-                float2 a            = cSim.pAttr[i];
-                sA[threadIdx.x].x   = apos.x;
-                sA[threadIdx.x].y   = apos.y;
-                sA[threadIdx.x].z   = apos.z;
-                sA[threadIdx.x].q   = apos.w;
-                sA[threadIdx.x].sig = a.x;
-                sA[threadIdx.x].eps = a.y;
-                af.x                = 0.0f;
-                af.y                = 0.0f;
-                af.z                = 0.0f;
-                apos.w             *= cSim.epsfac;
-                for (unsigned int j = 0; j < GRID; j++)
-                {
-                    dx              = psA[j].x - apos.x; 
-                    dy              = psA[j].y - apos.y; 
-                    dz              = psA[j].z - apos.z; 
-                    r2              = dx * dx + dy * dy + dz * dz; 
-                    invR            = 1.0f / sqrt(r2);
-                    sig             = a.x + psA[j].sig; 
-                    sig2            = invR * sig; 
-                    sig2           *= sig2;
-                    sig6            = sig2 * sig2 * sig2; 
-                    eps             = a.y * psA[j].eps; 
-                    dEdR            = eps * (12.0f * sig6 - 6.0f) * sig6; 
-                    dEdR           += apos.w * psA[j].q * invR; 
-                    dEdR           *= invR * invR; 
-                    dx             *= dEdR; 
-                    dy             *= dEdR; 
-                    dz             *= dEdR; 
-                    af.x           -= dx; 
-                    af.y           -= dy; 
-                    af.z           -= dz; 
-                }
+#undef METHOD_NAME
+#undef USE_OUTPUT_BUFFER_PER_WARP
+#define USE_CUTOFF
+#define METHOD_NAME(a, b) a##Cutoff##b
+#include "kCalculateCDLJForces.h"
+#include "kFindInteractingBlocks.h"
+#define USE_OUTPUT_BUFFER_PER_WARP
+#undef METHOD_NAME
+#define METHOD_NAME(a, b) a##CutoffByWarp##b
+#include "kCalculateCDLJForces.h"

-                // Write results
-                float4 of;
-                of.x                                = af.x;
-                of.y                                = af.y;
-                of.z                                = af.z;
-                of.w                                = 0.0f;
-                int offset                          = x + tgx + (x >> GRIDBITS) * cSim.stride;
-                cSim.pForce4a[offset]               = of;
-            }         
-            else        // 100% utilization
-            {
-                // Read fixed atom data into registers and GRF
-                int j                   = y + tgx;
-                unsigned int i          = x + tgx;
-                float4 temp             = cSim.pPosq[j];
-                float2 temp1            = cSim.pAttr[j];
-                apos                    = cSim.pPosq[i];
-                float2 a                = cSim.pAttr[i];
-                sA[threadIdx.x].x       = temp.x;
-                sA[threadIdx.x].y       = temp.y;
-                sA[threadIdx.x].z       = temp.z;
-                sA[threadIdx.x].q       = temp.w;
-                sA[threadIdx.x].sig     = temp1.x;
-                sA[threadIdx.x].eps     = temp1.y;
-                sA[threadIdx.x].fx      = af.x = 0.0f;
-                sA[threadIdx.x].fy      = af.y = 0.0f;
-                sA[threadIdx.x].fz      = af.z = 0.0f;
-                sA[threadIdx.x].sig2    = a.x;
-                sA[threadIdx.x].eps2    = a.y;
-                apos.w                 *= cSim.epsfac;
+// Include versions of the kernels with periodic boundary conditions.

-                for (j = 0; j < GRID; j++)
-                {
-                    dx              = psA[tj].x - apos.x; 
-                    dy              = psA[tj].y - apos.y; 
-                    dz              = psA[tj].z - apos.z; 
-                    r2              = dx * dx + dy * dy + dz * dz; 
-                    invR            = 1.0f / sqrt(r2);
-                    sig             = a.x + psA[tj].sig; 
-                    sig2            = invR * sig; 
-                    sig2           *= sig2;
-                    sig6            = sig2 * sig2 * sig2; 
-                    eps             = a.y * psA[tj].eps; 
-                    dEdR            = eps * (12.0f * sig6 - 6.0f) * sig6; 
-                    dEdR           += apos.w * psA[tj].q * invR; 
-                    dEdR           *= invR * invR; 
-                    dx             *= dEdR; 
-                    dy             *= dEdR; 
-                    dz             *= dEdR; 
-                    af.x           -= dx; 
-                    af.y           -= dy; 
-                    af.z           -= dz; 
-                    psA[tj].fx     += dx; 
-                    psA[tj].fy     += dy; 
-                    psA[tj].fz     += dz;
-                    tj              = sNext[tj]; 
-                }
+#undef METHOD_NAME
+#undef USE_OUTPUT_BUFFER_PER_WARP
+#define USE_PERIODIC
+#define METHOD_NAME(a, b) a##Periodic##b
+#include "kCalculateCDLJForces.h"
+#include "kFindInteractingBlocks.h"
+#define USE_OUTPUT_BUFFER_PER_WARP
+#undef METHOD_NAME
+#define METHOD_NAME(a, b) a##PeriodicByWarp##b
+#include "kCalculateCDLJForces.h"

-                // Write results
-                float4 of;
-                of.x                                = af.x;
-                of.y                                = af.y;
-                of.z                                = af.z;
-                of.w                                = 0.0f;
-                int offset                          = x + tgx + (y >> GRIDBITS) * cSim.stride;
-                cSim.pForce4a[offset]               = of;
-                of.x                                = sA[threadIdx.x].fx;
-                of.y                                = sA[threadIdx.x].fy;
-                of.z                                = sA[threadIdx.x].fz;
-                offset                              = y + tgx + (x >> GRIDBITS) * cSim.stride;
-                cSim.pForce4a[offset]               = of;
-            }
-        }
-        else  // bExclusion
-        {
-            // Read exclusion data

-            if (x == y) // Handle diagonals uniquely at 50% efficiency
-            { 
-                // Read fixed atom data into registers and GRF
-                unsigned int excl       = cSim.pExclusion[x * cSim.exclusionStride + y + tgx];                          
-                unsigned int i          = x + tgx;
-                apos                    = cSim.pPosq[i];
-                float2 a                = cSim.pAttr[i];
-                sA[threadIdx.x].x       = apos.x;
-                sA[threadIdx.x].y       = apos.y;
-                sA[threadIdx.x].z       = apos.z;
-                sA[threadIdx.x].q       = apos.w;
-                sA[threadIdx.x].sig     = a.x;
-                sA[threadIdx.x].eps     = a.y;
-                af.x                    = 0.0f;
-                af.y                    = 0.0f;
-                af.z                    = 0.0f;
-                sA[threadIdx.x].sig2    = a.x;
-                sA[threadIdx.x].eps2    = a.y;
-                apos.w                 *= cSim.epsfac;
+__global__ extern void kCalculateCDLJCutoffForces_12_kernel();

-                for (unsigned int j = 0; j < GRID; j++)
+void kCalculateCDLJForces(gpuContext gpu)
+{
+//    printf("kCalculateCDLJCutoffForces\n");
+    CUDPPResult result;
+    size_t numWithInteractions;
+    switch (gpu->sim.nonbondedMethod)
    {
-                    dx              = psA[j].x - apos.x; 
-                    dy              = psA[j].y - apos.y; 
-                    dz              = psA[j].z - apos.z; 
-                    r2              = dx * dx + dy * dy + dz * dz; 
-                    invR            = 1.0f / sqrt(r2);
-                    sig             = psA[tgx].sig2 + psA[j].sig; 
-                    sig2            = invR * sig; 
-                    sig2           *= sig2;
-                    sig6            = sig2 * sig2 * sig2; 
-                    eps             = psA[tgx].eps2 * psA[j].eps; 
-                    dEdR            = eps * (12.0f * sig6 - 6.0f) * sig6; 
-                    dEdR           += apos.w * psA[j].q * invR; 
-                    dEdR           *= invR * invR; 
-                    if (!(excl & 0x1))
+        case NO_CUTOFF:
+            if (gpu->bOutputBufferPerWarp)
+                kCalculateCDLJN2ByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits);
+            else
+                kCalculateCDLJN2Forces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits);
+            LAUNCHERROR("kCalculateCDLJN2Forces");
+            break;
+        case CUTOFF:
+            kFindBlockBoundsCutoff_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
+            LAUNCHERROR("kFindBlockBoundsCutoff");
+            kFindBlocksWithInteractionsCutoff_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
+            LAUNCHERROR("kFindBlocksWithInteractionsCutoff");
+            result = cudppCompact(gpu->cudpp, gpu->sim.pInteractingWorkUnit, gpu->sim.pInteractionCount,
+                    gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits);
+            if (result != CUDPP_SUCCESS)
            {
-                        dEdR = 0.0f;
-                    }
-                    dx             *= dEdR; 
-                    dy             *= dEdR; 
-                    dz             *= dEdR; 
-                    af.x           -= dx; 
-                    af.y           -= dy; 
-                    af.z           -= dz;
-                    excl          >>= 1;               
-                }
-                
-                // Write results
-                float4 of;
-                of.x                                = af.x;
-                of.y                                = af.y;
-                of.z                                = af.z;
-                of.w                                = 0.0f;
-                int offset                          = x + tgx + (x >> GRIDBITS) * cSim.stride;
-                cSim.pForce4a[offset]               = of;
+                printf("Error in cudppCompact: %d\n", result);
+                exit(-1);
            }
-            else        // 100% utilization
-            {
-                // Read fixed atom data into registers and GRF        
-                unsigned int excl       = cSim.pExclusion[x * cSim.exclusionStride + y + tgx];
-                excl                    = (excl >> tgx) | (excl << (GRID - tgx));
-                int j                   = y + tgx;
-                unsigned int i          = x + tgx;
-                float4 temp             = cSim.pPosq[j];
-                float2 temp1            = cSim.pAttr[j];
-                apos                    = cSim.pPosq[i];
-                float2 a                = cSim.pAttr[i];
-                sA[threadIdx.x].x       = temp.x;
-                sA[threadIdx.x].y       = temp.y;
-                sA[threadIdx.x].z       = temp.z;
-                sA[threadIdx.x].q       = temp.w;
-                sA[threadIdx.x].sig     = temp1.x;
-                sA[threadIdx.x].eps     = temp1.y;
-                sA[threadIdx.x].fx      = af.x = 0.0f;
-                sA[threadIdx.x].fy      = af.y = 0.0f;
-                sA[threadIdx.x].fz      = af.z = 0.0f;
-                sA[threadIdx.x].sig2    = a.x;
-                sA[threadIdx.x].eps2    = a.y;
-                apos.w                 *= cSim.epsfac;
-                
-                for (j = 0; j < GRID; j++)
-                {
-                    dx              = psA[tj].x - apos.x; 
-                    dy              = psA[tj].y - apos.y; 
-                    dz              = psA[tj].z - apos.z; 
-                    r2              = dx * dx + dy * dy + dz * dz; 
-                    invR            = 1.0f / sqrt(r2);
-                    sig             = psA[tgx].sig2 + psA[tj].sig; 
-                    sig2            = invR * sig; 
-                    sig2           *= sig2;
-                    sig6            = sig2 * sig2 * sig2; 
-                    eps             = psA[tgx].eps2 * psA[tj].eps; 
-                    dEdR            = eps * (12.0f * sig6 - 6.0f) * sig6; 
-                    dEdR           += apos.w * psA[tj].q * invR; 
-                    dEdR           *= invR * invR; 
-                    if (!(excl & 0x1))
+            gpu->psInteractionCount->Download();
+            numWithInteractions = gpu->psInteractionCount->_pSysData[0];
+            if (gpu->bOutputBufferPerWarp)
+                kCalculateCDLJCutoffByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
+            else
+                kCalculateCDLJCutoffForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
+            LAUNCHERROR("kCalculateCDLJCutoffForces");
+            break;
+        case PERIODIC:
+            kFindBlockBoundsPeriodic_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
+            LAUNCHERROR("kFindBlockBoundsPeriodic");
+            kFindBlocksWithInteractionsPeriodic_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
+            LAUNCHERROR("kFindBlocksWithInteractionsPeriodic");
+            result = cudppCompact(gpu->cudpp, gpu->sim.pInteractingWorkUnit, gpu->sim.pInteractionCount,
+                    gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits);
+            if (result != CUDPP_SUCCESS)
            {
-                        dEdR = 0.0f;
-                    }
-                    dx             *= dEdR; 
-                    dy             *= dEdR; 
-                    dz             *= dEdR; 
-                    af.x           -= dx; 
-                    af.y           -= dy; 
-                    af.z           -= dz; 
-                    psA[tj].fx     += dx; 
-                    psA[tj].fy     += dy; 
-                    psA[tj].fz     += dz;
-                    excl          >>= 1;
-                    tj              = sNext[tj]; 
+                printf("Error in cudppCompact: %d\n", result);
+                exit(-1);
            }
-                
-                // Write results
-                float4 of;
-                of.x                                = af.x;
-                of.y                                = af.y;
-                of.z                                = af.z;
-                of.w                                = 0.0f;
-                int offset                          = x + tgx + (y >> GRIDBITS) * cSim.stride;
-                cSim.pForce4a[offset]               = of;
-                of.x                                = sA[threadIdx.x].fx;
-                of.y                                = sA[threadIdx.x].fy;
-                of.z                                = sA[threadIdx.x].fz;
-                offset                              = y + tgx + (x >> GRIDBITS) * cSim.stride;
-                cSim.pForce4a[offset]               = of;
-            }
-        }
-
-        pos -= cSim.nonbond_workBlock;     
-    }
-}
-
-__global__ extern void kCalculateCDLJForces_12_kernel();
-
-void kCalculateCDLJForces(gpuContext gpu)
-{
-//    printf("kCalculateCDLJForces\n");
-    if (gpu->sm_version < SM_12)
-        kCalculateCDLJForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
+            gpu->psInteractionCount->Download();
+            numWithInteractions = gpu->psInteractionCount->_pSysData[0];
+            if (gpu->bOutputBufferPerWarp)
+                kCalculateCDLJPeriodicByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
            else
-        kCalculateCDLJForces_12_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
-    LAUNCHERROR("kCalculateCDLJForces");
+                kCalculateCDLJPeriodicForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
+            LAUNCHERROR("kCalculateCDLJPeriodicForces");
+    }
 }
\ No newline at end of file
--- a/platforms/cuda/src/kernels/kCalculateCDLJObcGbsaForces1.cu
+++ b/platforms/cuda/src/kernels/kCalculateCDLJObcGbsaForces1.cu
--- a/platforms/cuda/src/kernels/kCalculateLocalForces.cu
+++ b/platforms/cuda/src/kernels/kCalculateLocalForces.cu
@@ -440,6 +440,8 @@ __global__ void kCalculateLocalForces_kernel()
        pos += blockDim.x * gridDim.x;
    }   

+    if (cSim.nonbondedMethod == NO_CUTOFF)
+    {
        while (pos < cSim.LJ14_offset)
        {
            unsigned int pos1       = pos - cSim.rb_dihedral_offset;
@@ -483,6 +485,110 @@ __global__ void kCalculateLocalForces_kernel()
            }
            pos                    += blockDim.x * gridDim.x;
        }
+    }
+    else if (cSim.nonbondedMethod == CUTOFF)   // LJ14 pair loop used when the nonbonded method is CUTOFF
+    {
+        while (pos < cSim.LJ14_offset)
+        {
+            unsigned int pos1       = pos - cSim.rb_dihedral_offset;   // index into the LJ14 tables
+            if (pos1 < cSim.LJ14s)
+            {
+                int4 atom               = cSim.pLJ14ID[pos1];   // x, y: atom indices; z, w: output buffer indices (see offsetA/offsetB)
+                float4 LJ14             = cSim.pLJ14Parameter[pos1];
+                float4 a1               = cSim.pPosq[atom.x];
+                float4 a2               = cSim.pPosq[atom.y];
+                float3 d;   // displacement a1 - a2
+                d.x                     = a1.x - a2.x;
+                d.y                     = a1.y - a2.y;
+                d.z                     = a1.z - a2.z;
+                float r2                = DOT3(d, d);
+                float inverseR          = 1.0f / sqrt(r2);
+                float sig2              = inverseR * LJ14.y;
+                sig2                   *= sig2;
+                float sig6              = sig2 * sig2 * sig2;
+                float dEdR              = LJ14.x * (12.0f * sig6 - 6.0f) * sig6;   // 12-6 term scaled by LJ14.x
+                dEdR                   += LJ14.z * (inverseR - 2.0f * cSim.reactionFieldK * r2);   // NOTE(review): presumably Coulomb plus reaction-field correction - confirm
+                dEdR                   *= inverseR * inverseR;
+                if (r2 > cSim.nonbondedCutoffSqr)   // pairs beyond the cutoff contribute no force
+                {
+                    dEdR = 0.0f;
+                }
+                unsigned int offsetA    = atom.x + atom.z * cSim.stride;
+                unsigned int offsetB    = atom.y + atom.w * cSim.stride;
+                float4 forceA           = {0.0f, 0.0f, 0.0f, 0.0f};
+                if (atom.z < cSim.totalNonbondOutputBuffers)   // existing value is read only for buffer indices below totalNonbondOutputBuffers; otherwise the slot is overwritten starting from zero
+                    forceA              = cSim.pForce4[offsetA];
+                float4 forceB           = {0.0f, 0.0f, 0.0f, 0.0f};
+                if (atom.w < cSim.totalNonbondOutputBuffers)   // same rule for the second atom's slot
+                    forceB              = cSim.pForce4[offsetB];
+                d.x                    *= dEdR;
+                d.y                    *= dEdR;
+                d.z                    *= dEdR;
+                forceA.x               += d.x;   // equal and opposite contributions to the two atoms
+                forceA.y               += d.y;
+                forceA.z               += d.z;
+                forceB.x               -= d.x;
+                forceB.y               -= d.y;
+                forceB.z               -= d.z;
+                cSim.pForce4[offsetA]   = forceA;
+                cSim.pForce4[offsetB]   = forceB;
+            }
+            pos                    += blockDim.x * gridDim.x;   // advance by the total number of threads in the launch
+        }
+    }
+    else if (cSim.nonbondedMethod == PERIODIC)   // LJ14 pair loop used when the nonbonded method is PERIODIC
+    {
+        while (pos < cSim.LJ14_offset)
+        {
+            unsigned int pos1       = pos - cSim.rb_dihedral_offset;   // index into the LJ14 tables
+            if (pos1 < cSim.LJ14s)
+            {
+                int4 atom               = cSim.pLJ14ID[pos1];   // x, y: atom indices; z, w: output buffer indices (see offsetA/offsetB)
+                float4 LJ14             = cSim.pLJ14Parameter[pos1];
+                float4 a1               = cSim.pPosq[atom.x];
+                float4 a2               = cSim.pPosq[atom.y];
+                float3 d;   // displacement a1 - a2, wrapped into the periodic box below
+                d.x                     = a1.x - a2.x;
+                d.y                     = a1.y - a2.y;
+                d.z                     = a1.z - a2.z;
+                d.x                     -= floor(d.x/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;   // minimum-image wrap: each component uses its own value and box length
+                d.y                     -= floor(d.y/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
+                d.z                     -= floor(d.z/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
+                float r2                = DOT3(d, d);
+                float inverseR          = 1.0f / sqrt(r2);
+                float sig2              = inverseR * LJ14.y;
+                sig2                   *= sig2;
+                float sig6              = sig2 * sig2 * sig2;
+                float dEdR              = LJ14.x * (12.0f * sig6 - 6.0f) * sig6;   // 12-6 term scaled by LJ14.x
+                dEdR                   += LJ14.z * (inverseR - 2.0f * cSim.reactionFieldK * r2);   // NOTE(review): presumably Coulomb plus reaction-field correction - confirm
+                dEdR                   *= inverseR * inverseR;
+                if (r2 > cSim.nonbondedCutoffSqr)   // pairs beyond the cutoff contribute no force
+                {
+                    dEdR = 0.0f;
+                }
+                unsigned int offsetA    = atom.x + atom.z * cSim.stride;
+                unsigned int offsetB    = atom.y + atom.w * cSim.stride;
+                float4 forceA           = {0.0f, 0.0f, 0.0f, 0.0f};
+                if (atom.z < cSim.totalNonbondOutputBuffers)   // existing value is read only for buffer indices below totalNonbondOutputBuffers; otherwise the slot is overwritten starting from zero
+                    forceA              = cSim.pForce4[offsetA];
+                float4 forceB           = {0.0f, 0.0f, 0.0f, 0.0f};
+                if (atom.w < cSim.totalNonbondOutputBuffers)   // same rule for the second atom's slot
+                    forceB              = cSim.pForce4[offsetB];
+                d.x                    *= dEdR;
+                d.y                    *= dEdR;
+                d.z                    *= dEdR;
+                forceA.x               += d.x;   // equal and opposite contributions to the two atoms
+                forceA.y               += d.y;
+                forceA.z               += d.z;
+                forceB.x               -= d.x;
+                forceB.y               -= d.y;
+                forceB.z               -= d.z;
+                cSim.pForce4[offsetA]   = forceA;
+                cSim.pForce4[offsetB]   = forceB;
+            }
+            pos                    += blockDim.x * gridDim.x;   // advance by the total number of threads in the launch
+        }
+    }

 }


--- a/platforms/cuda/src/kernels/kCalculateObcGbsaBornSum.cu
+++ b/platforms/cuda/src/kernels/kCalculateObcGbsaBornSum.cu
@@ -53,10 +53,6 @@ struct Atom {
    float junk;
 };

-__shared__ Atom sA[GT2XX_NONBOND_THREADS_PER_BLOCK];
-__shared__ unsigned int sWorkUnit[GT2XX_NONBOND_WORKUNITS_PER_SM];
-__shared__ unsigned int sNext[GRID];
-
 static __constant__ cudaGmxSimulation cSim;

 void SetCalculateObcGbsaBornSumSim(gpuContext gpu)
@@ -73,6 +69,50 @@ void GetCalculateObcGbsaBornSumSim(gpuContext gpu)
    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
 }

+// Include versions of the kernels for N^2 calculations.
+
+#define METHOD_NAME(a, b) a##N2##b
+#include "kCalculateObcGbsaBornSum.h"
+#define USE_OUTPUT_BUFFER_PER_WARP
+#undef METHOD_NAME
+#define METHOD_NAME(a, b) a##N2ByWarp##b
+#include "kCalculateObcGbsaBornSum.h"
+
+// Include versions of the kernels with cutoffs.
+
+#undef METHOD_NAME
+#undef USE_OUTPUT_BUFFER_PER_WARP
+#define USE_CUTOFF
+#define METHOD_NAME(a, b) a##Cutoff##b
+#include "kCalculateObcGbsaBornSum.h"
+#define USE_OUTPUT_BUFFER_PER_WARP
+#undef METHOD_NAME
+#define METHOD_NAME(a, b) a##CutoffByWarp##b
+#include "kCalculateObcGbsaBornSum.h"
+
+// Include versions of the kernels with periodic boundary conditions (USE_CUTOFF from the section above is not #undef'd here).
+
+#undef METHOD_NAME
+#undef USE_OUTPUT_BUFFER_PER_WARP
+#define USE_PERIODIC
+#define METHOD_NAME(a, b) a##Periodic##b
+#include "kCalculateObcGbsaBornSum.h"
+#define USE_OUTPUT_BUFFER_PER_WARP
+#undef METHOD_NAME
+#define METHOD_NAME(a, b) a##PeriodicByWarp##b
+#include "kCalculateObcGbsaBornSum.h"
+
+
+// Writes 0.0f to the first cSim.stride * cSim.nonbondOutputBuffers floats of cSim.pBornSum.
+__global__ void kClearObcGbsaBornSum_kernel()
+{
+    float* const bornSum        = (float*)cSim.pBornSum;
+    const unsigned int step     = gridDim.x * blockDim.x;   // total number of threads in the launch
+    for (unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
+         idx < cSim.stride * cSim.nonbondOutputBuffers; idx += step)
+        bornSum[idx] = 0.0f;
+}
+
 __global__ void kReduceObcGbsaBornSum_kernel()
 {
    unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
@@ -127,175 +167,40 @@ if( 0 ){
    LAUNCHERROR("kReduceObcGbsaBornSum");
 }

-
-__global__ void kCalculateObcGbsaBornSum_kernel()
-{
-    // Read queue of work blocks once so the remainder of
-    // kernel can run asynchronously    
-    int pos = (blockIdx.x * cSim.workUnits) / gridDim.x;
-    int end = ((blockIdx.x + 1) * cSim.workUnits) / gridDim.x;
-    if (threadIdx.x < end - pos)
-    {
-        sWorkUnit[threadIdx.x] = cSim.pWorkUnit[pos + threadIdx.x];
-    }
-    if (threadIdx.x < GRID)
-    {
-        sNext[threadIdx.x] = (threadIdx.x - 1) & (GRID - 1);
-    }
-    __syncthreads();
-
-    // Now change pos and end to reflect work queue just read
-    // into shared memory
-    end = end - pos; 
-    pos = end - (threadIdx.x >> GRIDBITS) - 1;
-       
-    while (pos >= 0)
-    {  
-        // Extract cell coordinates from appropriate work unit
-        unsigned int x = sWorkUnit[pos];
-        unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
-        x = (x >> 17) << GRIDBITS;
-        float       dx; 
-        float       dy; 
-        float       dz; 
-        float       r2; 
-        float       r;
-
-        unsigned int tgx = threadIdx.x & (GRID - 1);
-        unsigned int tbx = threadIdx.x - tgx;
-        int tj = tgx; 
-        Atom* psA = &sA[tbx];
-     
-        if (x == y) // Handle diagonals uniquely at 50% efficiency
-        { 
-            // Read fixed atom data into registers and GRF       
-            unsigned int i = x + tgx;
-            float4 apos = cSim.pPosq[i];    // Local atom x, y, z, sum
-            float2 ar = cSim.pObcData[i];   // Local atom vr, sr
-            sA[threadIdx.x].x           = apos.x;
-            sA[threadIdx.x].y           = apos.y;
-            sA[threadIdx.x].z           = apos.z;
-            sA[threadIdx.x].r           = ar.x;
-            sA[threadIdx.x].sr          = ar.y;
-            apos.w                      = 0.0f;
-
-            for (unsigned int j = 0; j < GRID; j++)
-            {
-                dx                      = psA[j].x - apos.x;
-                dy                      = psA[j].y - apos.y;
-                dz                      = psA[j].z - apos.z;
-                r2                      = dx * dx + dy * dy + dz * dz; 
-                r                       = sqrt(r2);
-                float rInverse          = 1.0f / r; 
-                float rScaledRadiusJ    = r + psA[j].sr;
-                if ((j != tgx) && (ar.x < rScaledRadiusJ))
-                {
-                    float l_ij     = 1.0f / max(ar.x, fabs(r - psA[j].sr));
-                    float u_ij     = 1.0f / rScaledRadiusJ;
-                    float l_ij2    = l_ij * l_ij;
-                    float u_ij2    = u_ij * u_ij;
-                    float ratio    = log(u_ij / l_ij);
-                    apos.w        += l_ij - 
-                                     u_ij + 
-                                     0.25f * r * (u_ij2 - l_ij2) + 
-                                     (0.50f * rInverse * ratio) + 
-                                     (0.25f * psA[j].sr * psA[j].sr * rInverse) *
-                                     (l_ij2 - u_ij2);
-                                                                                                              
-                    if (ar.x < (psA[j].r - r))
-                    {
-                        apos.w += 2.0f * ((1.0f / ar.x) - l_ij);
-                    }
-                }
-            }             
-
-            // Write results
-            int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
-            cSim.pBornSum[offset] = apos.w;
-        }         
-        else        // 100% utilization
-        {
-            // Read fixed atom data into registers and GRF
-            int j                           = y + tgx;
-            unsigned int i                  = x + tgx;      
-            
-            float4 temp                     = cSim.pPosq[j];
-            float2 temp1                    = cSim.pObcData[j];
-            float4 apos                     = cSim.pPosq[i];        // Local atom x, y, z, sum
-            float2 ar                       = cSim.pObcData[i];    // Local atom vr, sr
-            sA[threadIdx.x].x               = temp.x;
-            sA[threadIdx.x].y               = temp.y;
-            sA[threadIdx.x].z               = temp.z;
-            sA[threadIdx.x].r               = temp1.x;
-            sA[threadIdx.x].sr              = temp1.y;
-            sA[threadIdx.x].sum = apos.w    = 0.0f;
-
-            for (unsigned int j = 0; j < GRID; j++)
-            {
-                dx                      = psA[tj].x - apos.x; 
-                dy                      = psA[tj].y - apos.y; 
-                dz                      = psA[tj].z - apos.z; 
-                r2                      = dx * dx + dy * dy + dz * dz; 
-                r                       = sqrt(r2);
-                float rInverse          = 1.0f / r; 
-                float rScaledRadiusJ    = r + psA[tj].sr;
-                if (ar.x < rScaledRadiusJ)
-                {
-                    float l_ij     = 1.0f / max(ar.x, fabs(r - psA[tj].sr));
-                    float u_ij     = 1.0f / rScaledRadiusJ;
-                    float l_ij2    = l_ij * l_ij;
-                    float u_ij2    = u_ij * u_ij;
-                    float ratio    = log(u_ij / l_ij);
-                    float term     = l_ij - 
-                                     u_ij + 
-                                     0.25f * r * (u_ij2 - l_ij2) + 
-                                     (0.50f * rInverse * ratio) + 
-                                     (0.25f * psA[tj].sr * psA[tj].sr * rInverse) *
-                                     (l_ij2 - u_ij2);
-                    if (ar.x < (psA[tj].sr - r))
-                    {
-                        term += 2.0f * ((1.0f / ar.x) - l_ij);
-                    }
-                    apos.w        += term;
-                }
-                float rScaledRadiusI    = r + ar.y;
-                if (psA[tj].r < rScaledRadiusI)
-                {
-                    float l_ij     = 1.0f / max(psA[tj].r, fabs(r - ar.y));
-                    float u_ij     = 1.0f / rScaledRadiusI;
-                    float l_ij2    = l_ij * l_ij;
-                    float u_ij2    = u_ij * u_ij;
-                    float ratio    = log(u_ij / l_ij);
-                    float term     = l_ij - 
-                                     u_ij + 
-                                     0.25f * r * (u_ij2 - l_ij2) + 
-                                     (0.50f * rInverse * ratio) + 
-                                     (0.25f * ar.y * ar.y * rInverse) *
-                                     (l_ij2 - u_ij2);
- 
-                    if (psA[tj].r < (ar.y - r))
-                    {
-                        term += 2.0f * ((1.0f / psA[tj].r) - l_ij);
-                    }
-                    psA[tj].sum    += term;
-                }      
-                tj = sNext[tj];
-            }    
-                
-            // Write results
-            int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
-            cSim.pBornSum[offset] = apos.w;
-            offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
-            cSim.pBornSum[offset] = sA[threadIdx.x].sum;
-        }       
-       
-        pos -= cSim.nonbond_workBlock;     
-    }
-}
-
 void kCalculateObcGbsaBornSum(gpuContext gpu)
 {
   //  printf("kCalculateObcgbsaBornSum\n");
-    kCalculateObcGbsaBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
+    kClearObcGbsaBornSum_kernel<<<gpu->sim.blocks, 384>>>();   // clear kernel is launched first: gpu->sim.blocks blocks of 384 threads
+    LAUNCHERROR("kClearBornSum");
+    size_t numWithInteractions;   // only assigned in the CUTOFF and PERIODIC cases
+    switch (gpu->sim.nonbondedMethod)   // pick the Born sum kernel variant; every launch requests sizeof(Atom) bytes of dynamic shared memory per thread
+    {
+        case NO_CUTOFF:   // full work-unit list: gpu->sim.pWorkUnit with gpu->sim.workUnits entries
+            if (gpu->bOutputBufferPerWarp)   // ByWarp variant when bOutputBufferPerWarp is set
+                kCalculateObcGbsaN2ByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits);
+            else
+                kCalculateObcGbsaN2BornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits);
+            break;
+        case CUTOFF:   // uses gpu->sim.pInteractingWorkUnit and the count read below
+            numWithInteractions = gpu->psInteractionCount->_pSysData[0];   // NOTE(review): host copy is read without a Download() in this function - presumably refreshed by an earlier step; confirm
+            if (gpu->bOutputBufferPerWarp)
+                kCalculateObcGbsaCutoffByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
+            else
+                kCalculateObcGbsaCutoffBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
+            break;
+        case PERIODIC:   // same structure as CUTOFF, with the Periodic kernel variants
+            numWithInteractions = gpu->psInteractionCount->_pSysData[0];   // NOTE(review): same un-downloaded host read as in the CUTOFF case; confirm
+            if (gpu->bOutputBufferPerWarp)
+                kCalculateObcGbsaPeriodicByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
+            else
+                kCalculateObcGbsaPeriodicBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
+                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions);
+            break;
+    }
    LAUNCHERROR("kCalculateBornSum");
 }
--- a/platforms/cuda/src/kernels/kCalculateObcGbsaForces2.cu
+++ b/platforms/cuda/src/kernels/kCalculateObcGbsaForces2.cu
@@ -52,18 +52,9 @@ struct Atom {
    float fy;
    float fz;
    float fb;
-//    float sum;
-//    float oneOverR;
-    int pos;
-    int wx;
-    int wy;
 };


-__shared__ Atom sA[G8X_BORNFORCE2_THREADS_PER_BLOCK];
-__shared__ unsigned int sWorkUnit[G8X_NONBOND_WORKUNITS_PER_SM];
-__shared__ unsigned int sNext[GRID];
-
 static __constant__ cudaGmxSimulation cSim;

 void SetCalculateObcGbsaForces2Sim(gpuContext gpu)
@@ -80,283 +71,72 @@ void GetCalculateObcGbsaForces2Sim(gpuContext gpu)
    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
 }

-__global__ void kCalculateObcGbsaForces2_kernel()
-{
-    // Read queue of work blocks once so the remainder of
-    // kernel can run asynchronously    
-    int pos = cSim.bf2WorkUnitsPerBlock * blockIdx.x + min(blockIdx.x, cSim.bf2WorkUnitsPerBlockRemainder);
-    int end = cSim.bf2WorkUnitsPerBlock * (blockIdx.x + 1) + min((blockIdx.x + 1), cSim.bf2WorkUnitsPerBlockRemainder);    
-    if (threadIdx.x < end - pos)
-    {
-        sWorkUnit[threadIdx.x] = cSim.pWorkUnit[pos + threadIdx.x];
-    }
-    if (threadIdx.x < GRID)
-    {
-        sNext[threadIdx.x] = (threadIdx.x + 1) & (GRID - 1);
-    }
-    __syncthreads();
-
-    // Now change pos and end to reflect work queue just read
-    // into shared memory
-    end = end - pos; 
-    sA[threadIdx.x].pos = end - (threadIdx.x >> GRIDBITS) - 1;
-       
-    while (sA[threadIdx.x].pos >= 0)
-    {  
-    
-        // Extract cell coordinates from appropriate work unit
-        unsigned int x = sWorkUnit[sA[threadIdx.x].pos];
-        unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
-        x = (x >> 17) << GRIDBITS;
-        unsigned int tgx                = threadIdx.x & (GRID - 1);
-        unsigned int i                  = x + tgx;
-        float4 apos                     = cSim.pPosq[i];
-        float2 a                        = cSim.pObcData[i];
-        float fb                        = cSim.pBornForce[i];
-        unsigned int tbx                = threadIdx.x - tgx;
-        int tj                          = tgx; 
-        Atom* psA                       = &sA[tbx];
-        sA[threadIdx.x].wx              = x;
-        sA[threadIdx.x].wy              = y; 
-        if (x == y) // Handle diagonals uniquely at 50% efficiency
-        { 
-            // Read fixed atom data into registers and GRF
-            float3 af;
-            sA[threadIdx.x].fx = af.x   = 0.0f;
-            sA[threadIdx.x].fy = af.y   = 0.0f;
-            sA[threadIdx.x].fz = af.z   = 0.0f;
-//            float sum                   = 0.0f;
-            sA[threadIdx.x].x           = apos.x;
-            sA[threadIdx.x].y           = apos.y;
-            sA[threadIdx.x].z           = apos.z;
-//            float oneOverR              = 1.0f / a.x;
-            sA[threadIdx.x].r           = a.x;
-            sA[threadIdx.x].sr          = a.y;
-            sA[threadIdx.x].sr2         = a.y * a.y;
-            sA[threadIdx.x].fb          = fb;
-
-            for (unsigned int j = sNext[tgx]; j != tgx; j = sNext[j])
-            {
-                float dx                = psA[j].x - apos.x; 
-                float dy                = psA[j].y - apos.y; 
-                float dz                = psA[j].z - apos.z; 
-                float r2                = dx * dx + dy * dy + dz * dz;
-                float r                 = sqrt(r2);
-                
-                // Atom I Born forces and sum
-                float rScaledRadiusJ    = r + psA[j].sr; 
-                float l_ij              = 1.0f / max(a.x, fabs(r - psA[j].sr));
-                float u_ij              = 1.0f / rScaledRadiusJ;
-                float rInverse          = 1.0f / r;
-                float l_ij2             = l_ij * l_ij;
-                float u_ij2             = u_ij * u_ij; 
-                float r2Inverse         = rInverse * rInverse;                   
-                float t1                = log (u_ij / l_ij);
-                float t2                = (l_ij2 - u_ij2);
-                float t3                = t2 * rInverse;
-                t1                     *= rInverse;
-                
-                // Born Forces term
-                float term              =  0.125f * 
-                                          (1.000f + psA[j].sr2 * r2Inverse) * t3 + 
-                                           0.250f * t1 * r2Inverse;
-                float dE                = fb * term;
-                
-                // Born sum term
-//                term                    =   l_ij - u_ij  +
-//                                           -0.25f * r * t2 +
-//                                            0.50f * t1 +
-//                                           (0.25f * psA[j].sr2) * t3;
-//                if (a.x < (psA[j].sr - r))
-//                {
-//                    term               += 2.0f * (oneOverR - l_ij);
-//                }
-                if (a.x >= rScaledRadiusJ)
-                {
-                    dE                  = /*term =*/ 0.0f;
-                }
-                
-                float d                 = dx * dE;
-                af.x                   -= d;
-                psA[j].fx              += d;
-                d                       = dy * dE;  
-                af.y                   -= d;
-                psA[j].fy              += d;
-                d                       = dz * dE;
-                af.z                   -= d;
-                psA[j].fz              += d; 
-//                sum                    += term;
-            }
-            
-            // Write results
-            int offset                  = x + tgx + (x >> GRIDBITS) * cSim.stride;
-            float4 of;
-            of.x                        = af.x + sA[threadIdx.x].fx;
-            of.y                        = af.y + sA[threadIdx.x].fy;
-            of.z                        = af.z + sA[threadIdx.x].fz;
-            of.w                        = 0.0f;
-            cSim.pForce4b[offset]       = of;
-//            cSim.pBornSum[offset]       = sum;
-        }         
-        else 
-        {        
-            // Read fixed atom data into registers and GRF
-            int j                       = y + tgx;
-            float4 temp                 = cSim.pPosq[j];
-            float2 temp1                = cSim.pObcData[j];
-            sA[threadIdx.x].fb          = cSim.pBornForce[j];
-            float3 af;
-            sA[threadIdx.x].fx = af.x   = 0.0f;
-            sA[threadIdx.x].fy = af.y   = 0.0f;
-            sA[threadIdx.x].fz = af.z   = 0.0f;
-//            sA[threadIdx.x].sum         = 0.0f;
-//            float sum                   = 0.0f;
-            float sr2                   = a.y * a.y;
-            sA[threadIdx.x].x           = temp.x;
-            sA[threadIdx.x].y           = temp.y;
-            sA[threadIdx.x].z           = temp.z;
-            sA[threadIdx.x].r           = temp1.x;
-            sA[threadIdx.x].sr          = temp1.y;
-            sA[threadIdx.x].sr2         = temp1.y * temp1.y;
-//            sA[threadIdx.x].oneOverR    = 1.0f / temp1.x;
-
-            for (j = 0; j < GRID; j++)
-            {
-                float dx                = psA[tj].x - apos.x; 
-                float dy                = psA[tj].y - apos.y; 
-                float dz                = psA[tj].z - apos.z; 
-                float r2                = dx * dx + dy * dy + dz * dz; 
-                float r                 = sqrt(r2);
-                
-                // Atom I Born Forces and sum
-                float r2Inverse         = 1.0f / r2;
-                float rScaledRadiusJ    = r + psA[tj].sr;
-                float rInverse          = 1.0f / r;
-                
-                
-                float l_ij              = 1.0f / max(a.x, fabs(r - psA[tj].sr));
-                float u_ij              = 1.0f / rScaledRadiusJ;
-                float l_ij2             = l_ij * l_ij;
-                float u_ij2             = u_ij * u_ij;
-                float t1                = log (u_ij / l_ij);
-                float t2                = (l_ij2 - u_ij2);
-                float t3                = t2 * rInverse;
-                t1                     *= rInverse;
-                   
-                // Born Forces term
-                float term              =  0.125f * 
-                                          (1.000f + psA[tj].sr2 * r2Inverse) * t3 + 
-                                           0.250f * t1 * r2Inverse;
-                float dE                = fb * term;
-                 // Born sum term
-//                term                    =   l_ij - u_ij  +
-//                                           -0.25f * r * t2 +
-//                                            0.50f * t1 +
-//                                           (0.25f * psA[tj].sr2) * t3;
-//                if (a.x < (psA[tj].sr - r))
-//                {
-//                    term               += 2.0f * ((1.0f / a.x) - l_ij);
-//                }
-                if (a.x >= rScaledRadiusJ) 
-                {
-                    dE                  = /*term =*/ 0.0f;
-                }

+// Include versions of the kernels for N^2 calculations.

-                float d                 = dx * dE;
-                af.x                   -= d;
-                psA[tj].fx             += d;
-                d                       = dy * dE;  
-                af.y                   -= d;
-                psA[tj].fy             += d;
-                d                       = dz * dE;
-                af.z                   -= d;
-                psA[tj].fz             += d;    
-//                sum                    += term;
+#define METHOD_NAME(a, b) a##N2##b // pastes a, "N2", b into one identifier; presumably how the header names kCalculateObcGbsaN2Forces2_kernel (launched below) -- confirm in the .h
+#include "kCalculateObcGbsaForces2.h" // first inclusion from this file; NOTE(review): repeated inclusion only yields new kernels if the header has no include guard -- confirm
+#define USE_OUTPUT_BUFFER_PER_WARP // presumably tested inside the header; stays defined until the #undef in the cutoff section
+#undef METHOD_NAME // drop the previous definition so the macro can be redefined with a new infix
+#define METHOD_NAME(a, b) a##N2ByWarp##b // same pasting with the "N2ByWarp" infix
+#include "kCalculateObcGbsaForces2.h" // second inclusion of the same header, now with USE_OUTPUT_BUFFER_PER_WARP defined

-                // Atom J Born Forces and sum               
-                float rScaledRadiusI    = r + a.y;
-                l_ij                    = 1.0f / max(psA[tj].r, fabs(r - a.y));
-                u_ij                    = 1.0f / rScaledRadiusI;
-                l_ij2                   = l_ij * l_ij;
-                u_ij2                   = u_ij * u_ij;
-                t1                      = log (u_ij / l_ij);
-                t2                      = (l_ij2 - u_ij2);
-                t3                      = t2 * rInverse;
-                t1                     *= rInverse;
+// Include versions of the kernels with cutoffs.

-                // Born Forces term
-                term                    =  0.125f * 
-                                          (1.000f + sr2 * r2Inverse) * t3 + 
-                                           0.250f * t1 * r2Inverse;
-                dE                      = psA[tj].fb * term;  
+#undef METHOD_NAME // clear the N2ByWarp naming macro
+#undef USE_OUTPUT_BUFFER_PER_WARP // back to the non-ByWarp configuration for the next inclusion
+#define USE_CUTOFF // presumably tested inside the header; this file never #undefs it in the lines that follow
+#define METHOD_NAME(a, b) a##Cutoff##b // pastes a, "Cutoff", b into one identifier
+#include "kCalculateObcGbsaForces2.h" // inclusion with USE_CUTOFF defined and USE_OUTPUT_BUFFER_PER_WARP undefined
+#define USE_OUTPUT_BUFFER_PER_WARP // re-enable the ByWarp switch for the following inclusion
+#undef METHOD_NAME // drop the previous definition before redefining
+#define METHOD_NAME(a, b) a##CutoffByWarp##b // same pasting with the "CutoffByWarp" infix
+#include "kCalculateObcGbsaForces2.h" // inclusion with both USE_CUTOFF and USE_OUTPUT_BUFFER_PER_WARP defined

-                // Born sum term
-//                term                    =   l_ij - u_ij  +
-//                                           -0.25f * r * t2 +
-//                                            0.50f * t1 +
-//                                           (0.25f * sr2) * t3;
-//
-//                if (psA[tj].r < (a.y - r))
-//                {
-//                    term               +=  2.0f * (psA[tj].oneOverR - l_ij);
-//                }
-                if (psA[tj].r >= rScaledRadiusI) 
-                {
-                    dE                  = /*term =*/ 0.0f;
-                }                                    
-                dx                     *= dE;
-                dy                     *= dE;
-                dz                     *= dE;
-                psA[tj].fx             += dx; 
-                psA[tj].fy             += dy;
-                psA[tj].fz             += dz; 
-                af.x                   -= dx;
-                af.y                   -= dy;
-                af.z                   -= dz;
-//                psA[tj].sum            +=  term;
+// Include versions of the kernels with periodic boundary conditions.

-                tj                      = sNext[tj]; 
-            }
-                
-            // Write results
-            int offset                  = sA[threadIdx.x].wx + tgx + (sA[threadIdx.x].wy >> GRIDBITS) * cSim.stride;
-            float4 of;
-            of.x                        = af.x;
-            of.y                        = af.y;
-            of.z                        = af.z;
-            of.w                        = 0.0f;
-            cSim.pForce4b[offset]       = of;
-//            cSim.pBornSum[offset]       = sum;
-            offset                      = sA[threadIdx.x].wy + tgx + (sA[threadIdx.x].wx >> GRIDBITS) * cSim.stride;
-            of.x                        = sA[threadIdx.x].fx;
-            of.y                        = sA[threadIdx.x].fy;
-            of.z                        = sA[threadIdx.x].fz;
-            cSim.pForce4b[offset]       = of;
-//            cSim.pBornSum[offset]       = sA[threadIdx.x].sum;
-        }
-        sA[threadIdx.x].pos            -= cSim.bornForce2_workBlock;     
-    }
-}
-
-__global__ extern void kCalculateObcGbsaForces2_12_kernel();
+#undef METHOD_NAME // clear the CutoffByWarp naming macro
+#undef USE_OUTPUT_BUFFER_PER_WARP // back to the non-ByWarp configuration for the next inclusion
+#define USE_PERIODIC // USE_CUTOFF is not #undef'd by this file before this point, so the periodic inclusions see both switches -- NOTE(review): presumably intentional, confirm against the header
+#define METHOD_NAME(a, b) a##Periodic##b // pastes a, "Periodic", b into one identifier
+#include "kCalculateObcGbsaForces2.h" // inclusion with USE_PERIODIC defined and USE_OUTPUT_BUFFER_PER_WARP undefined
+#define USE_OUTPUT_BUFFER_PER_WARP // re-enable the ByWarp switch for the following inclusion
+#undef METHOD_NAME // drop the previous definition before redefining
+#define METHOD_NAME(a, b) a##PeriodicByWarp##b // same pasting with the "PeriodicByWarp" infix
+#include "kCalculateObcGbsaForces2.h" // last inclusion: USE_PERIODIC and USE_OUTPUT_BUFFER_PER_WARP both defined

 void kCalculateObcGbsaForces2(gpuContext gpu)
 {
    //printf("kCalculateObcGbsaForces2\n");
-    if (gpu->sm_version < SM_12)
-        kCalculateObcGbsaForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block>>>();
+    size_t numWithInteractions; // assigned only in the CUTOFF and PERIODIC cases below
+    switch (gpu->sim.nonbondedMethod) // one pair of kernels per method; there is no default case, so any other value launches nothing
+    {
+        case NO_CUTOFF: // N2 kernels, given pWorkUnit and workUnits
+            if (gpu->bOutputBufferPerWarp) // flag selects the ByWarp kernel variant
+                kCalculateObcGbsaN2ByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block, // grid size, block size
+                        sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits); // third launch parameter: dynamic shared memory, sizeof(Atom) bytes per thread of the block
            else
-        kCalculateObcGbsaForces2_12_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block>>>();
-if( 0 ){
-   static int step = 0;
-	//int numPrint    = -1;
-	step++;
-	//WriteArrayToFile1( gpu, "ObcGbsaBornBRad", step, gpu->psBornRadii, numPrint );
-	//gpuDumpCoordinates( gpu );
-	kReduceBornSumAndForces( gpu );
-	gpuDumpObcLoop1( gpu );
-}
-
+                kCalculateObcGbsaN2Forces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block, // same launch configuration as the ByWarp variant
+                        sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pWorkUnit, gpu->sim.workUnits); // same dynamic shared memory size and arguments
+            break;
+        case CUTOFF: // Cutoff kernels, given pInteractingWorkUnit and the count read below
+            numWithInteractions = gpu->psInteractionCount->_pSysData[0]; // presumably the host-side copy of the number of interacting work units -- TODO confirm it is current when this runs
+            if (gpu->bOutputBufferPerWarp) // flag selects the ByWarp kernel variant
+                kCalculateObcGbsaCutoffByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block, // grid size, block size
+                        sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions); // dynamic shared memory: sizeof(Atom) bytes per thread
+            else
+                kCalculateObcGbsaCutoffForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block, // same launch configuration as the ByWarp variant
+                        sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions); // same dynamic shared memory size and arguments
+            break;
+        case PERIODIC: // Periodic kernels, same arguments as the CUTOFF case
+            numWithInteractions = gpu->psInteractionCount->_pSysData[0]; // same read as in the CUTOFF case
+            if (gpu->bOutputBufferPerWarp) // flag selects the ByWarp kernel variant
+                kCalculateObcGbsaPeriodicByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block, // grid size, block size
+                        sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions); // dynamic shared memory: sizeof(Atom) bytes per thread
+            else
+                kCalculateObcGbsaPeriodicForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block, // same launch configuration as the ByWarp variant
+                        sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit, numWithInteractions); // same dynamic shared memory size and arguments
+            break;
+    }
    LAUNCHERROR("kCalculateObcGbsaForces2");
 }
--- a/platforms/cuda/src/kernels/kForces.cu
+++ b/platforms/cuda/src/kernels/kForces.cu
@@ -61,9 +61,9 @@ void GetForcesSim(gpuContext gpu)
 __global__ void kClearForces_kernel()
 {
    unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
-    while (pos < cSim.stride4 * cSim.outputBuffers)
+    while (pos < cSim.stride * cSim.outputBuffers) // pos now counts pForce4 entries; each thread starts at its global index and advances by gridDim.x * blockDim.x. Covers the same memory as the old float-wise bound only if stride4 == 4 * stride -- TODO confirm
    {
-        ((float*)cSim.pForce4)[pos] = 0.0f;
+        cSim.pForce4[pos] = make_float4(0.0f, 0.0f, 0.0f, 0.0f); // write a zero float4 to entry pos instead of zeroing one float at a time
        pos += gridDim.x * blockDim.x;
    }
 }

--- a/platforms/cuda/src/kernels/kVerletUpdate.cu
+++ b/platforms/cuda/src/kernels/kVerletUpdate.cu
@@ -61,7 +61,6 @@ void GetVerletUpdateSim(gpuContext gpu)
 __global__ void kVerletUpdatePart1_kernel()
 {
    unsigned int pos    = threadIdx.x + blockIdx.x * blockDim.x;
-    __syncthreads();
    
    while (pos < cSim.atoms)
    {
@@ -175,7 +174,6 @@ void kVerletUpdatePart1(gpuContext gpu)
 __global__ void kVerletUpdatePart2_kernel()
 {
    unsigned int pos            = threadIdx.x + blockIdx.x * blockDim.x;
-    __syncthreads();
    
    while (pos < cSim.atoms)
    {
@@ -208,7 +206,6 @@ __global__ void kVerletUpdatePart2CM_kernel()
    extern __shared__ float3 sCM[];
    unsigned int pos            = threadIdx.x + blockIdx.x * blockDim.x;
    float3 CM                   = {0.0f, 0.0f, 0.0f};
-    __syncthreads();
    
    while (pos < cSim.atoms)
    {

--- a/platforms/reference/src/ReferenceKernels.cpp
+++ b/platforms/reference/src/ReferenceKernels.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2008 Stanford University and the Authors.           *
+ * Portions copyright (c) 2008-2009 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -419,6 +419,26 @@ void ReferenceCalcGBSAOBCForceKernel::initialize(const System& system, const GBS
    obcParameters->setScaledRadiusFactors(scaleFactors);
    obcParameters->setSolventDielectric( static_cast<RealOpenMM>(force.getSolventDielectric()) );
    obcParameters->setSoluteDielectric( static_cast<RealOpenMM>(force.getSoluteDielectric()) );
+
+    // If there is a NonbondedForce in this system, use it to initialize cutoffs and periodic boundary conditions.
+
+    for (int i = 0; i < system.getNumForces(); i++) {
+        const NonbondedForce* nonbonded = dynamic_cast<const NonbondedForce*>(&system.getForce(i));
+        if (nonbonded != NULL) {
+            if (nonbonded->getNonbondedMethod() != NonbondedForce::NoCutoff)
+                obcParameters->setUseCutoff(nonbonded->getCutoffDistance());
+            if (nonbonded->getNonbondedMethod() == NonbondedForce::CutoffPeriodic) {
+                Vec3 boxVectors[3];
+                nonbonded->getPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
+                RealOpenMM periodicBoxSize[3];
+                periodicBoxSize[0] = (RealOpenMM) boxVectors[0][0];
+                periodicBoxSize[1] = (RealOpenMM) boxVectors[1][1];
+                periodicBoxSize[2] = (RealOpenMM) boxVectors[2][2];
+                obcParameters->setPeriodic(periodicBoxSize);
+            }
+            break;
+        }
+    }
    obc = new CpuObc(obcParameters);
    obc->setIncludeAceApproximation(true);
 }