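Reduced memory use for exclusions: exclusion masks are now stored only for work units that contain exclusions, located through a new per-work-unit index (pExclusionIndex), instead of for every block of atom pairs. Also deleted some obsolete files, including kCalculateCDLJForces_12.cu and kCalculateCDLJObcGbsaForces1_12.cu.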

8055a541 · Peter Eastman · b98859ec · 8055a541 · 8055a541 · 8055a541
Commit 8055a541 authored Feb 27, 2009 by Peter Eastman
8 changed files
--- a/platforms/cuda/src/kernels/cudatypes.h
+++ b/platforms/cuda/src/kernels/cudatypes.h
@@ -277,7 +277,6 @@ struct cudaGmxSimulation {
    unsigned int    stride2;                        // Atomic attributes stride x 2
    unsigned int    stride3;                        // Atomic attributes stride x 3
    unsigned int    stride4;                        // Atomic attributes stride x 4
-    unsigned int    exclusionStride;                // Exclusion list stride = stride / GRID
    unsigned int	nonbondOutputBuffers;           // Nonbond output buffers per nonbond call
    unsigned int    totalNonbondOutputBuffers;      // Total nonbond output buffers
    unsigned int    outputBuffers;                  // Number of output buffers
@@ -357,6 +356,7 @@ struct cudaGmxSimulation {
    int4*           pSettleID;                      // Settle atoms
    float2*         pSettleParameter;               // Settle parameters
    unsigned int*   pExclusion;                     // Nonbond exclusion data
+    unsigned int*   pExclusionIndex;                // Index of exclusion data for each work unit
    unsigned int    bond_offset;                    // Offset to end of bonds
    unsigned int    bond_angle_offset;              // Offset to end of bond angles
    unsigned int    dihedral_offset;                // Offset to end of dihedrals

--- a/platforms/cuda/src/kernels/gpu.cpp
+++ b/platforms/cuda/src/kernels/gpu.cpp
@@ -1172,7 +1172,6 @@ int gpuAllocateInitialBuffers(gpuContext gpu)
    gpu->sim.stride2                    = 2 * gpu->sim.stride;
    gpu->sim.stride3                    = 3 * gpu->sim.stride;
    gpu->sim.stride4                    = 4 * gpu->sim.stride;
-    gpu->sim.exclusionStride            = gpu->sim.stride / GRID;
    gpu->psPosqP4                       = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, 1);
    gpu->sim.pPosqP                     = gpu->psPosqP4->_pDevStream[0];
    gpu->psOldPosq4                     = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, 1);
@@ -1533,6 +1532,7 @@ void* gpuInit(int numAtoms)
    gpu->psSettleID                 = NULL;
    gpu->psSettleParameter          = NULL;
    gpu->psExclusion                = NULL;
+    gpu->psExclusionIndex           = NULL;
    gpu->psWorkUnit                 = NULL;
    gpu->psInteractingWorkUnit      = NULL;
    gpu->psInteractionFlag          = NULL;
@@ -1665,6 +1665,7 @@ void gpuShutDown(gpuContext gpu)
    delete gpu->psSettleID;
    delete gpu->psSettleParameter;
    delete gpu->psExclusion;
+    delete gpu->psExclusionIndex;
    delete gpu->psWorkUnit;
    delete gpu->psInteractingWorkUnit;
    delete gpu->psInteractionFlag;
@@ -1871,31 +1872,74 @@ void gpuBuildExclusionList(gpuContext gpu)
 {
    const unsigned int atoms = gpu->sim.paddedNumberOfAtoms;
    const unsigned int grid = gpu->grid;
-    const unsigned int dim = (atoms+(grid-1))/grid;
-    CUDAStream<unsigned int>* psExclusion = new CUDAStream<unsigned int>((atoms*atoms+grid-1) / grid, 1u);
-    gpu->psExclusion = psExclusion;
-    gpu->sim.pExclusion = psExclusion->_pDevStream[0];
-    unsigned int* pExList = psExclusion->_pSysStream[0];
+    const unsigned int dim = atoms/grid;
    unsigned int* pWorkList = gpu->psWorkUnit->_pSysStream[0];
-    for (int i = 0; i < psExclusion->_length; ++i)
-        pExList[i] = 0xFFFFFFFF;

-    // Fill in the exclusions.
+    // Mark which work units have exclusions.

    for (int atom1 = 0; atom1 < gpu->exclusions.size(); ++atom1)
    {
        int x = atom1/grid;
-        int offset = atom1-x*grid;
        for (int j = 0; j < gpu->exclusions[atom1].size(); ++j)
        {
            int atom2 = gpu->exclusions[atom1][j];
            int y = atom2/grid;
-            int index = x*atoms+y*grid+offset;
-            pExList[index] &= 0xFFFFFFFF-(1<<(atom2-y*grid));
            int cell = (x > y ? x+y*dim-y*(y+1)/2 : y+x*dim-x*(x+1)/2);
            pWorkList[cell] |= 1;
        }
    }
+    if (gpu->sim.paddedNumberOfAtoms > gpu->natoms)
+    {
+        int lastBlock = gpu->natoms/grid;
+        for (int i = 0; i < gpu->sim.workUnits; ++i)
+        {
+            int x = pWorkList[i]>>17;
+            int y = (pWorkList[i]>>2)&0x7FFF;
+            if (x == lastBlock || y == lastBlock)
+                pWorkList[i] |= 1;
+        }
+    }
+
+    // Build a list of indexes for the work units with exclusions.
+
+    CUDAStream<unsigned int>* psExclusionIndex = new CUDAStream<unsigned int>(gpu->sim.workUnits, 1u);
+    gpu->psExclusionIndex = psExclusionIndex;
+    unsigned int* pExclusionIndex = psExclusionIndex->_pSysData;
+    gpu->sim.pExclusionIndex = psExclusionIndex->_pDevData;
+    int numWithExclusions = 0;
+    for (int i = 0; i < psExclusionIndex->_length; ++i)
+        if ((pWorkList[i]&1) == 1)
+            pExclusionIndex[i] = (numWithExclusions++)*grid;
+
+    // Record the exclusion data.
+
+    CUDAStream<unsigned int>* psExclusion = new CUDAStream<unsigned int>(numWithExclusions*grid, 1u);
+    gpu->psExclusion = psExclusion;
+    unsigned int* pExclusion = psExclusion->_pSysData;
+    gpu->sim.pExclusion = psExclusion->_pDevData;
+    for (int i = 0; i < psExclusion->_length; ++i)
+        pExclusion[i] = 0xFFFFFFFF;
+    for (int atom1 = 0; atom1 < gpu->exclusions.size(); ++atom1)
+    {
+        int x = atom1/grid;
+        int offset1 = atom1-x*grid;
+        for (int j = 0; j < gpu->exclusions[atom1].size(); ++j)
+        {
+            int atom2 = gpu->exclusions[atom1][j];
+            int y = atom2/grid;
+            int offset2 = atom2-y*grid;
+            if (x > y)
+            {
+                int cell = x+y*dim-y*(y+1)/2;
+                pExclusion[pExclusionIndex[cell]+offset1] &= 0xFFFFFFFF-(1<<offset2);
+            }
+            else
+            {
+                int cell = y+x*dim-x*(x+1)/2;
+                pExclusion[pExclusionIndex[cell]+offset2] &= 0xFFFFFFFF-(1<<offset1);
+            }
+        }
+    }

    // Mark all interactions that involve a padding atom as being excluded.

@@ -1907,16 +1951,22 @@ void gpuBuildExclusionList(gpuContext gpu)
        {
            int y = atom2/grid;
            int index = x*atoms+y*grid+offset1;
-            pExList[index] &= 0xFFFFFFFF-(1<<(atom2-y*grid));
            int offset2 = atom2-y*grid;
-            index = y*atoms+x*grid+offset2;
-            pExList[index] &= 0xFFFFFFFF-(1<<(atom1-x*grid));
-            int cell = (x > y ? x+y*dim-y*(y+1)/2 : y+x*dim-x*(x+1)/2);
-            pWorkList[cell] |= 1;
+            if (x >= y)
+            {
+                int cell = x+y*dim-y*(y+1)/2;
+                pExclusion[pExclusionIndex[cell]+offset1] &= 0xFFFFFFFF-(1<<offset2);
+            }
+            if (y >= x)
+            {
+                int cell = y+x*dim-x*(x+1)/2;
+                pExclusion[pExclusionIndex[cell]+offset2] &= 0xFFFFFFFF-(1<<offset1);
+            }
        }
    }
    
    psExclusion->Upload();
+    psExclusionIndex->Upload();
    gpu->psWorkUnit->Upload();
    gpuSetConstants(gpu);
 }

--- a/platforms/cuda/src/kernels/gputypes.h
+++ b/platforms/cuda/src/kernels/gputypes.h
@@ -116,6 +116,7 @@ struct _gpuContext {
    CUDAStream<int4>* psSettleID;
    CUDAStream<float2>* psSettleParameter;
    CUDAStream<unsigned int>* psExclusion;
+    CUDAStream<unsigned int>* psExclusionIndex;
    CUDAStream<unsigned int>* psWorkUnit;
    CUDAStream<unsigned int>* psInteractingWorkUnit;
    CUDAStream<unsigned int>* psInteractionFlag;

--- a/platforms/cuda/src/kernels/kCalculateCDLJForces.h
+++ b/platforms/cuda/src/kernels/kCalculateCDLJForces.h
@@ -126,7 +126,9 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
            }
            else  // bExclusion
            {
-                unsigned int excl = cSim.pExclusion[x * cSim.exclusionStride + y + tgx];
+                unsigned int xi   = x>>GRIDBITS;
+                int cell          = xi+xi*cSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
+                unsigned int excl = cSim.pExclusion[cSim.pExclusionIndex[cell]+tgx];
                for (unsigned int j = 0; j < GRID; j++)
                {
                    dx              = psA[j].x - apos.x;
@@ -253,7 +255,10 @@ __global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUni
            else  // bExclusion
            {
                // Read fixed atom data into registers and GRF
-                unsigned int excl       = cSim.pExclusion[x * cSim.exclusionStride + y + tgx];
+                unsigned int xi   = x>>GRIDBITS;
+                unsigned int yi   = y>>GRIDBITS;
+                int cell          = xi+yi*cSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
+                unsigned int excl = cSim.pExclusion[cSim.pExclusionIndex[cell]+tgx];
                excl              = (excl >> tgx) | (excl << (GRID - tgx));
                for (unsigned int j = 0; j < GRID; j++)
                {

--- a/platforms/cuda/src/kernels/kCalculateCDLJForces_12.cu
+++ b/platforms/cuda/src/kernels/kCalculateCDLJForces_12.cu
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * Permission is hereby granted, free of charge, to any person obtaining a    *
- * copy of this software and associated documentation files (the "Software"), *
- * to deal in the Software without restriction, including without limitation  *
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
- * and/or sell copies of the Software, and to permit persons to whom the      *
- * Software is furnished to do so, subject to the following conditions:       *
- *                                                                            *
- * The above copyright notice and this permission notice shall be included in *
- * all copies or substantial portions of the Software.                        *
- *                                                                            *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
- * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
- * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
- * -------------------------------------------------------------------------- */
-
-#include <stdio.h>
-#include <cuda.h>
-#include <vector_functions.h>
-#include <cstdlib>
-#include <string>
-#include <iostream>
-#include <fstream>
-using namespace std;
-
-#include "gputypes.h"
-#include "cudatypes.h"
-
-#define UNROLLXX 0
-#define UNROLLXY 0
-
-struct Atom {
-    float x;
-    float y;
-    float z;
-    float q;
-    float sig;
-    float eps;
-    float fx;
-    float fy;
-    float fz;
-};
-
-
-__shared__ Atom sA[GT2XX_NONBOND_THREADS_PER_BLOCK];
-__shared__ unsigned int sWorkUnit[GT2XX_NONBOND_WORKUNITS_PER_SM];
-__shared__ unsigned int sNext[GRID];
-
-static __constant__ cudaGmxSimulation cSim;
-
-void SetCalculateCDLJForces_12Sim(gpuContext gpu)
-{
-    cudaError_t status;
-    status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));     
-    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
-}
-
-void GetCalculateCDLJForces_12Sim(gpuContext gpu)
-{
-    cudaError_t status;
-    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));     
-    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
-}
-
-__global__ void kCalculateCDLJForces_12_kernel()
-{
-    // Read queue of work blocks once so the remainder of
-    // kernel can run asynchronously    
-    int pos = cSim.nbWorkUnitsPerBlock * blockIdx.x + min(blockIdx.x, cSim.nbWorkUnitsPerBlockRemainder);
-    int end = cSim.nbWorkUnitsPerBlock * (blockIdx.x + 1) + min((blockIdx.x + 1), cSim.nbWorkUnitsPerBlockRemainder);    
-    if (threadIdx.x < end - pos)
-    {
-        sWorkUnit[threadIdx.x] = cSim.pWorkUnit[pos + threadIdx.x];
-    }
-    if (threadIdx.x < GRID)
-    {
-        sNext[threadIdx.x] = (threadIdx.x + 1) & (GRID - 1);
-    }
-    __syncthreads();
-
-    // Now change pos and end to reflect work queue just read
-    // into shared memory
-    end = end - pos; 
-    pos = end - (threadIdx.x >> GRIDBITS) - 1;
-       
-    while (pos >= 0)
-    {  
-    
-        // Extract cell coordinates from appropriate work unit
-        unsigned int x = sWorkUnit[pos];
-        unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
-        bool bExclusionFlag = (x & 0x1);
-        x = (x >> 17) << GRIDBITS;
-        float4      apos;   // Local atom x, y, z, q
-        float3      af;     // Local atom fx, fy, fz
-        float dx; 
-        float dy; 
-        float dz; 
-        float r2; 
-        float invR; 
-        float sig; 
-        float sig2; 
-        float sig6; 
-        float eps; 
-        float dEdR;  
-        unsigned int tgx = threadIdx.x & (GRID - 1);
-        unsigned int tbx = threadIdx.x - tgx;
-        int tj = tgx; 
-        Atom* psA = &sA[tbx];
-        if (!bExclusionFlag)
-        {
-            if (x == y) // Handle diagonals uniquely at 50% efficiency
-            { 
-                // Read fixed atom data into registers and GRF
-                unsigned int i      = x + tgx;
-                apos                = cSim.pPosq[i];
-                float2 a            = cSim.pAttr[i];
-                sA[threadIdx.x].x   = apos.x;
-                sA[threadIdx.x].y   = apos.y;
-                sA[threadIdx.x].z   = apos.z;
-                sA[threadIdx.x].q   = apos.w;
-                sA[threadIdx.x].sig = a.x;
-                sA[threadIdx.x].eps = a.y;
-                af.x                = 0.0f;
-                af.y                = 0.0f;
-                af.z                = 0.0f;
-                apos.w             *= cSim.epsfac;
-                for (unsigned int j = 0; j < GRID; j++)
-                {
-                    dx              = psA[j].x - apos.x; 
-                    dy              = psA[j].y - apos.y; 
-                    dz              = psA[j].z - apos.z; 
-                    r2              = dx * dx + dy * dy + dz * dz; 
-                    invR            = 1.0f / sqrt(r2);
-                    sig             = a.x + psA[j].sig; 
-                    sig2            = invR * sig; 
-                    sig2           *= sig2;
-                    sig6            = sig2 * sig2 * sig2; 
-                    eps             = a.y * psA[j].eps; 
-                    dEdR            = eps * (12.0f * sig6 - 6.0f) * sig6; 
-                    dEdR           += apos.w * psA[j].q * invR; 
-                    dEdR           *= invR * invR; 
-                    dx             *= dEdR; 
-                    dy             *= dEdR; 
-                    dz             *= dEdR; 
-                    af.x           -= dx; 
-                    af.y           -= dy; 
-                    af.z           -= dz; 
-                }
-                
-                // Write results
-                float4 of;
-                of.x                                = af.x;
-                of.y                                = af.y;
-                of.z                                = af.z;
-                of.w                                = 0.0f;
-                int offset                          = x + tgx + (x >> GRIDBITS) * cSim.stride;
-                cSim.pForce4a[offset]               = of;
-            }         
-            else        // 100% utilization
-            {
-                // Read fixed atom data into registers and GRF
-                int j                   = y + tgx;
-                unsigned int i          = x + tgx;
-                float4 temp             = cSim.pPosq[j];
-                float2 temp1            = cSim.pAttr[j];
-                apos                    = cSim.pPosq[i];
-                float2 a                = cSim.pAttr[i];
-                sA[threadIdx.x].x       = temp.x;
-                sA[threadIdx.x].y       = temp.y;
-                sA[threadIdx.x].z       = temp.z;
-                sA[threadIdx.x].q       = temp.w;
-                sA[threadIdx.x].sig     = temp1.x;
-                sA[threadIdx.x].eps     = temp1.y;
-                sA[threadIdx.x].fx      = af.x = 0.0f;
-                sA[threadIdx.x].fy      = af.y = 0.0f;
-                sA[threadIdx.x].fz      = af.z = 0.0f;
-                apos.w                 *= cSim.epsfac;
-                
-                for (j = 0; j < GRID; j++)
-                {
-                    dx              = psA[tj].x - apos.x; 
-                    dy              = psA[tj].y - apos.y; 
-                    dz              = psA[tj].z - apos.z; 
-                    r2              = dx * dx + dy * dy + dz * dz; 
-                    invR            = 1.0f / sqrt(r2);
-                    sig             = a.x + psA[tj].sig; 
-                    sig2            = invR * sig; 
-                    sig2           *= sig2;
-                    sig6            = sig2 * sig2 * sig2; 
-                    eps             = a.y * psA[tj].eps; 
-                    dEdR            = eps * (12.0f * sig6 - 6.0f) * sig6; 
-                    dEdR           += apos.w * psA[tj].q * invR; 
-                    dEdR           *= invR * invR; 
-                    dx             *= dEdR; 
-                    dy             *= dEdR; 
-                    dz             *= dEdR; 
-                    af.x           -= dx; 
-                    af.y           -= dy; 
-                    af.z           -= dz; 
-                    psA[tj].fx     += dx; 
-                    psA[tj].fy     += dy; 
-                    psA[tj].fz     += dz;
-                    tj              = sNext[tj]; 
-                }
-                
-                // Write results
-                float4 of;
-                of.x                                = af.x;
-                of.y                                = af.y;
-                of.z                                = af.z;
-                of.w                                = 0.0f;
-                int offset                          = x + tgx + (y >> GRIDBITS) * cSim.stride;
-                cSim.pForce4a[offset]               = of;
-                of.x                                = sA[threadIdx.x].fx;
-                of.y                                = sA[threadIdx.x].fy;
-                of.z                                = sA[threadIdx.x].fz;
-                offset                              = y + tgx + (x >> GRIDBITS) * cSim.stride;
-                cSim.pForce4a[offset]               = of;
-            }
-        }
-        else  // bExclusion
-        {
-            // Read exclusion data
-            
-            if (x == y) // Handle diagonals uniquely at 50% efficiency
-            { 
-                // Read fixed atom data into registers and GRF
-                unsigned int excl       = cSim.pExclusion[x * cSim.exclusionStride + y + tgx];                          
-                unsigned int i          = x + tgx;
-                apos                    = cSim.pPosq[i];
-                float2 a                = cSim.pAttr[i];
-                sA[threadIdx.x].x       = apos.x;
-                sA[threadIdx.x].y       = apos.y;
-                sA[threadIdx.x].z       = apos.z;
-                sA[threadIdx.x].q       = apos.w;
-                sA[threadIdx.x].sig     = a.x;
-                sA[threadIdx.x].eps     = a.y;
-                af.x                    = 0.0f;
-                af.y                    = 0.0f;
-                af.z                    = 0.0f;
-                apos.w                 *= cSim.epsfac;
-                
-                for (unsigned int j = 0; j < GRID; j++)
-                {
-                    dx              = psA[j].x - apos.x; 
-                    dy              = psA[j].y - apos.y; 
-                    dz              = psA[j].z - apos.z; 
-                    r2              = dx * dx + dy * dy + dz * dz; 
-                    invR            = 1.0f / sqrt(r2);
-                    sig             = a.x + psA[j].sig; 
-                    sig2            = invR * sig; 
-                    sig2           *= sig2;
-                    sig6            = sig2 * sig2 * sig2; 
-                    eps             = a.y * psA[j].eps; 
-                    dEdR            = eps * (12.0f * sig6 - 6.0f) * sig6; 
-                    dEdR           += apos.w * psA[j].q * invR; 
-                    dEdR           *= invR * invR; 
-                    if (!(excl & 0x1))
-                    {
-                        dEdR = 0.0f;
-                    }
-                    dx             *= dEdR; 
-                    dy             *= dEdR; 
-                    dz             *= dEdR; 
-                    af.x           -= dx; 
-                    af.y           -= dy; 
-                    af.z           -= dz;
-                    excl          >>= 1;               
-                }
-                
-                // Write results
-                float4 of;
-                of.x                                = af.x;
-                of.y                                = af.y;
-                of.z                                = af.z;
-                of.w                                = 0.0f;
-                int offset                          = x + tgx + (x >> GRIDBITS) * cSim.stride;
-                cSim.pForce4a[offset]               = of;
-            }         
-            else        // 100% utilization
-            {
-                // Read fixed atom data into registers and GRF        
-                unsigned int excl       = cSim.pExclusion[x * cSim.exclusionStride + y + tgx];
-                excl                    = (excl >> tgx) | (excl << (GRID - tgx));
-                int j                   = y + tgx;
-                unsigned int i          = x + tgx;
-                float4 temp             = cSim.pPosq[j];
-                float2 temp1            = cSim.pAttr[j];
-                apos                    = cSim.pPosq[i];
-                float2 a                = cSim.pAttr[i];
-                sA[threadIdx.x].x       = temp.x;
-                sA[threadIdx.x].y       = temp.y;
-                sA[threadIdx.x].z       = temp.z;
-                sA[threadIdx.x].q       = temp.w;
-                sA[threadIdx.x].sig     = temp1.x;
-                sA[threadIdx.x].eps     = temp1.y;
-                sA[threadIdx.x].fx      = af.x = 0.0f;
-                sA[threadIdx.x].fy      = af.y = 0.0f;
-                sA[threadIdx.x].fz      = af.z = 0.0f;
-                apos.w                 *= cSim.epsfac;
-                
-                for (j = 0; j < GRID; j++)
-                {
-                    dx              = psA[tj].x - apos.x; 
-                    dy              = psA[tj].y - apos.y; 
-                    dz              = psA[tj].z - apos.z; 
-                    r2              = dx * dx + dy * dy + dz * dz; 
-                    invR            = 1.0f / sqrt(r2);
-                    sig             = a.x + psA[tj].sig; 
-                    sig2            = invR * sig; 
-                    sig2           *= sig2;
-                    sig6            = sig2 * sig2 * sig2; 
-                    eps             = a.y * psA[tj].eps; 
-                    dEdR            = eps * (12.0f * sig6 - 6.0f) * sig6; 
-                    dEdR           += apos.w * psA[tj].q * invR; 
-                    dEdR           *= invR * invR; 
-                    if (!(excl & 0x1))
-                    {
-                        dEdR = 0.0f;
-                    }
-                    dx             *= dEdR; 
-                    dy             *= dEdR; 
-                    dz             *= dEdR; 
-                    af.x           -= dx; 
-                    af.y           -= dy; 
-                    af.z           -= dz; 
-                    psA[tj].fx     += dx; 
-                    psA[tj].fy     += dy; 
-                    psA[tj].fz     += dz;
-                    excl          >>= 1;
-                    tj              = sNext[tj]; 
-                }
-                
-                // Write results
-                float4 of;
-                of.x                                = af.x;
-                of.y                                = af.y;
-                of.z                                = af.z;
-                of.w                                = 0.0f;
-                int offset                          = x + tgx + (y >> GRIDBITS) * cSim.stride;
-                cSim.pForce4a[offset]               = of;
-                of.x                                = sA[threadIdx.x].fx;
-                of.y                                = sA[threadIdx.x].fy;
-                of.z                                = sA[threadIdx.x].fz;
-                offset                              = y + tgx + (x >> GRIDBITS) * cSim.stride;
-                cSim.pForce4a[offset]               = of;
-            }
-        }
-
-        pos -= cSim.nonbond_workBlock;     
-    }
-}
-
-void kCalculateCDLJForces_12(gpuContext gpu)
-{
-//    printf("kCalculateCDLJForces_12\n");
-    kCalculateCDLJForces_12_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
-    LAUNCHERROR("kCalculateCDLJForces_12");
-}
--- a/platforms/cuda/src/kernels/kCalculateCDLJObcGbsaForces1.h
+++ b/platforms/cuda/src/kernels/kCalculateCDLJObcGbsaForces1.h
@@ -134,7 +134,9 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int*
            }
            else  // bExclusion
            {
-                unsigned int excl           = cSim.pExclusion[x * cSim.exclusionStride + y + tgx];
+                unsigned int xi   = x>>GRIDBITS;
+                int cell          = xi+xi*cSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
+                unsigned int excl = cSim.pExclusion[cSim.pExclusionIndex[cell]+tgx];
                for (unsigned int j = 0; j < GRID; j++)
                {
                    float dx                = psA[j].x - apos.x;
@@ -299,7 +301,10 @@ __global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int*
            }
            else  // bExclusion
            {
-                unsigned int excl           = cSim.pExclusion[x * cSim.exclusionStride + y + tgx];
+                unsigned int xi   = x>>GRIDBITS;
+                unsigned int yi   = y>>GRIDBITS;
+                int cell          = xi+yi*cSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
+                unsigned int excl = cSim.pExclusion[cSim.pExclusionIndex[cell]+tgx];
                excl              = (excl >> tgx) | (excl << (GRID - tgx));
                for (int j = 0; j < GRID; j++)
                {

--- a/platforms/cuda/src/kernels/kCalculateCDLJObcGbsaForces1_12.cu
+++ b/platforms/cuda/src/kernels/kCalculateCDLJObcGbsaForces1_12.cu
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * Permission is hereby granted, free of charge, to any person obtaining a    *
- * copy of this software and associated documentation files (the "Software"), *
- * to deal in the Software without restriction, including without limitation  *
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
- * and/or sell copies of the Software, and to permit persons to whom the      *
- * Software is furnished to do so, subject to the following conditions:       *
- *                                                                            *
- * The above copyright notice and this permission notice shall be included in *
- * all copies or substantial portions of the Software.                        *
- *                                                                            *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
- * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
- * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
- * -------------------------------------------------------------------------- */
-
-#include <stdio.h>
-#include <cuda.h>
-#include <vector_functions.h>
-#include <cstdlib>
-#include <string>
-#include <iostream>
-#include <fstream>
-using namespace std;
-
-#include "gputypes.h"
-#include "cudatypes.h"
-
-#define UNROLLXX 0    // not referenced anywhere in the remainder of this file
-#define UNROLLXY 0    // not referenced anywhere in the remainder of this file
-
-struct Atom {    // One shared-memory slot per thread (see sA); filled and read by kCalculateCDLJObcGbsaForces1_12_kernel.
-    float x;      // copied from cSim.pPosq[].x
-    float y;      // copied from cSim.pPosq[].y
-    float z;      // copied from cSim.pPosq[].z
-    float q;      // copied from cSim.pPosq[].w
-    float sig;    // copied from cSim.pAttr[].x; summed pairwise in the "CDLJ part"
-    float eps;    // copied from cSim.pAttr[].y; multiplied pairwise in the "CDLJ part"
-    float br;     // copied from cSim.pBornRadii[]
-    float fx;     // x force accumulated on this slot's atom (off-diagonal cells only)
-    float fy;     // y force accumulated on this slot's atom (off-diagonal cells only)
-    float fz;     // z force accumulated on this slot's atom (off-diagonal cells only)
-    float fb;     // accumulates dGpol_dalpha2_ij * br; stored to cSim.pBornForce (off-diagonal cells only)
-};
-
-
-__shared__ Atom sA[GT2XX_NONBOND_THREADS_PER_BLOCK];    // indexed by threadIdx.x in the kernel below: one Atom slot per thread
-__shared__ unsigned int sWorkUnit[GT2XX_NONBOND_WORKUNITS_PER_SM];    // this block's slice of cSim.pWorkUnit, loaded once at kernel start
-__shared__ unsigned int sNext[GRID];    // sNext[k] = (k + 1) & (GRID - 1), filled at kernel start
-
-static __constant__ cudaGmxSimulation cSim;    // file-local copy of gpu->sim; written by SetCalculateCDLJObcGbsaForces1_12Sim
-
-void SetCalculateCDLJObcGbsaForces1_12Sim(gpuContext gpu)    // Host-side: copies gpu->sim into this file's __constant__ cSim.
-{
-    cudaError_t status;
-    status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));     
-    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");    // RTERROR is defined outside this view; presumably reports a non-success status - TODO confirm
-}
-
-void GetCalculateCDLJObcGbsaForces1_12Sim(gpuContext gpu)    // Host-side: copies this file's __constant__ cSim back into gpu->sim.
-{
-    cudaError_t status;
-    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));     
-    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");    // NOTE(review): the message says "SetSim" although this is the Get path
-}
-
-__global__ void kCalculateCDLJObcGbsaForces1_12_kernel()    // For each work unit: accumulates the "CDLJ part" plus the "ObcGbsaForce1 part" and stores the result in cSim.pForce4a / cSim.pBornForce.
-{
-    // Read queue of work blocks once so the remainder of
-    // kernel can run asynchronously    
-    int pos = cSim.nbWorkUnitsPerBlock * blockIdx.x + min(blockIdx.x, cSim.nbWorkUnitsPerBlockRemainder);    // first work unit of this block; the first nbWorkUnitsPerBlockRemainder blocks take one extra unit
-    int end = cSim.nbWorkUnitsPerBlock * (blockIdx.x + 1) + min((blockIdx.x + 1), cSim.nbWorkUnitsPerBlockRemainder);    
-    if (threadIdx.x < end - pos)    // one entry per thread, so the slice is assumed to be no longer than the block size - TODO confirm
-    {
-        sWorkUnit[threadIdx.x]              = cSim.pWorkUnit[pos + threadIdx.x];
-    }
-    if (threadIdx.x < GRID)
-    {
-        sNext[threadIdx.x]                  = (threadIdx.x + 1) & (GRID - 1);    // successor index with wrap-around; the mask assumes GRID is a power of two - TODO confirm
-    }
-    __syncthreads();    // sWorkUnit and sNext are read by other threads from here on
-
-    // Now change pos and end to reflect work queue just read
-    // into shared memory
-    end                                     = end - pos; 
-    pos                                     = end - (threadIdx.x >> GRIDBITS) - 1;    // threads sharing threadIdx.x >> GRIDBITS take the same unit, counted from the back of the queue
-       
-    while (pos >= 0)
-    {  
-    
-        // Extract cell coordinates from appropriate work unit
-        unsigned int x                      = sWorkUnit[pos];
-        unsigned int y                      = ((x >> 2) & 0x7fff) << GRIDBITS;    // bits 2..16 of the packed word, scaled by GRID
-        bool bExclusionFlag                 = (x & 0x1);    // bit 0 of the packed word; bit 1 is not read in this kernel
-        x                                   = (x >> 17) << GRIDBITS;    // bits 17 and up of the packed word, scaled by GRID
-        unsigned int tgx                    = threadIdx.x & (GRID - 1);    // this thread's index within its group of GRID threads
-        unsigned int i                      = x + tgx;
-        float4 apos                         = cSim.pPosq[i];
-        float2 a                            = cSim.pAttr[i];
-        float br                            = cSim.pBornRadii[i];        
-        unsigned int tbx                    = threadIdx.x - tgx;    // threadIdx.x of the first thread in this group
-        int tj                              = tgx; 
-        Atom* psA                           = &sA[tbx];    // this group's GRID slots of sA
-        if (!bExclusionFlag)
-        {
-            if (x == y) // Handle diagonals uniquely at 50% efficiency
-            { 
-                // Read fixed atom data into registers and GRF
-                sA[threadIdx.x].x           = apos.x;
-                sA[threadIdx.x].y           = apos.y;
-                sA[threadIdx.x].z           = apos.z;
-                sA[threadIdx.x].q           = apos.w;
-                float q2                    = cSim.preFactor * apos.w;
-                apos.w                     *= cSim.epsfac;
-                sA[threadIdx.x].sig         = a.x;
-                sA[threadIdx.x].eps         = a.y;
-                sA[threadIdx.x].br          = br; 
-                float4 af;
-                af.x                        = 0.0f;
-                af.y                        = 0.0f;
-                af.z                        = 0.0f;
-                af.w                        = 0.0f;
-                for (unsigned int j = 0; j < GRID; j++)    // NOTE(review): no barrier between the sA writes above and these cross-thread reads; presumably relies on the group executing together - TODO confirm
-                {
-                    float dx                = psA[j].x - apos.x; 
-                    float dy                = psA[j].y - apos.y; 
-                    float dz                = psA[j].z - apos.z; 
-                    float r2                = dx * dx + dy * dy + dz * dz;    // NOTE(review): for j == tgx this is the thread's own atom, so r2 == 0 and invR below is 1/0; nothing in this branch masks it - TODO confirm diagonal cells never reach this path
-                                     
-                    // CDLJ part
-                    float invR              = 1.0f / sqrt(r2);
-                    float sig               = a.x + psA[j].sig; 
-                    float sig2              = invR * sig; 
-                    sig2                   *= sig2;
-                    float sig6              = sig2 * sig2 * sig2; 
-                    float eps               = a.y * psA[j].eps; 
-                    float dEdR              = eps * (12.0f * sig6 - 6.0f) * sig6; 
-                    dEdR                   += apos.w * psA[j].q * invR; 
-                    dEdR                   *= invR * invR; 
-
-                    // ObcGbsaForce1 part
-                    float alpha2_ij         = br * psA[j].br;
-                    float D_ij              = r2 / (4.0f * alpha2_ij);
-                    float expTerm           = exp(-D_ij);
-                    float denominator2      = r2 + alpha2_ij * expTerm;
-                    float denominator       = sqrt(denominator2);
-                    float Gpol              = (q2 * psA[j].q) / (denominator * denominator2);
-                    float dGpol_dalpha2_ij  = -0.5f * Gpol * expTerm * (1.0f + D_ij); 
-                    af.w                   += dGpol_dalpha2_ij * psA[j].br;   
-                    dEdR                   += Gpol * (1.0f - 0.25f * expTerm); 
-                    
-                    // Add Forces
-                    dx                     *= dEdR; 
-                    dy                     *= dEdR; 
-                    dz                     *= dEdR; 
-                    af.x                   -= dx; 
-                    af.y                   -= dy; 
-                    af.z                   -= dz; 
-                }
-                
-                // Write results
-                int offset                  = x + tgx + (x >> GRIDBITS) * cSim.stride;    // slot for atom x + tgx in the output section selected by (x >> GRIDBITS)
-                cSim.pForce4a[offset]       = af;
-                cSim.pBornForce[offset]     = af.w;
-            }         
-            else        // 100% utilization
-            {
-                // Read fixed atom data into registers and GRF
-                int j                       = y + tgx;
-                float4 temp                 = cSim.pPosq[j];
-                float2 temp1                = cSim.pAttr[j];
-                sA[threadIdx.x].br          = cSim.pBornRadii[j];
-                float4 af;
-                sA[threadIdx.x].fx          = af.x = 0.0f;
-                sA[threadIdx.x].fy          = af.y = 0.0f;
-                sA[threadIdx.x].fz          = af.z = 0.0f;
-                sA[threadIdx.x].fb          = af.w = 0.0f;
-                float q2                    = apos.w * cSim.preFactor;
-                apos.w                     *= cSim.epsfac;                
-                sA[threadIdx.x].x           = temp.x;
-                sA[threadIdx.x].y           = temp.y;
-                sA[threadIdx.x].z           = temp.z;
-                sA[threadIdx.x].q           = temp.w;
-                sA[threadIdx.x].sig         = temp1.x;
-                sA[threadIdx.x].eps         = temp1.y;
-   
-                for (j = 0; j < GRID; j++)    // tj starts at tgx and advances through sNext, so each thread begins on a different slot of psA
-                {
-                    float dx                = psA[tj].x - apos.x; 
-                    float dy                = psA[tj].y - apos.y; 
-                    float dz                = psA[tj].z - apos.z; 
-                    float r2                = dx * dx + dy * dy + dz * dz; 
-                    
-                    // CDLJ part
-                    float invR              = 1.0f / sqrt(r2);
-                    float sig               = a.x + psA[tj].sig; 
-                    float sig2              = invR * sig; 
-                    sig2                   *= sig2;
-                    float sig6              = sig2 * sig2 * sig2; 
-                    float eps               = a.y * psA[tj].eps; 
-                    float dEdR              = eps * (12.0f * sig6 - 6.0f) * sig6; 
-                    dEdR                   += apos.w * psA[tj].q * invR; 
-                    dEdR                   *= invR * invR; 
-                  
-                    // ObcGbsaForce1 part
-                    float alpha2_ij         = br * psA[tj].br;
-                    float D_ij              = r2 / (4.0f * alpha2_ij);
-                    float expTerm           = exp(-D_ij);
-                    float denominator2      = r2 + alpha2_ij * expTerm;
-                    float denominator       = sqrt(denominator2);
-                    float Gpol              = (q2 * psA[tj].q) / (denominator * denominator2);
-                    float dGpol_dalpha2_ij  = -0.5f * Gpol * expTerm * (1.0f + D_ij); 
-                    af.w                   += dGpol_dalpha2_ij * psA[tj].br;  
-                    psA[tj].fb             += dGpol_dalpha2_ij * br;      // non-atomic update of another thread's slot; presumably safe only while the group runs in lockstep - TODO confirm
-                    dEdR                   += Gpol * (1.0f - 0.25f * expTerm); 
-                    
-                    // Add forces
-                    dx                     *= dEdR; 
-                    dy                     *= dEdR; 
-                    dz                     *= dEdR; 
-                    af.x                   -= dx; 
-                    af.y                   -= dy; 
-                    af.z                   -= dz;    
-                    psA[tj].fx             += dx; 
-                    psA[tj].fy             += dy; 
-                    psA[tj].fz             += dz;
-                    tj                      = sNext[tj]; 
-                }
-                
-                // Write results
-                int offset                  = x + tgx + (y >> GRIDBITS) * cSim.stride;    // x-side atom, output section selected by (y >> GRIDBITS)
-                cSim.pForce4a[offset]       = af;
-                cSim.pBornForce[offset]     = af.w;
-                af.x                        = sA[threadIdx.x].fx;
-                af.y                        = sA[threadIdx.x].fy;
-                af.z                        = sA[threadIdx.x].fz;
-                offset                      = y + tgx + (x >> GRIDBITS) * cSim.stride;    // y-side atom, output section selected by (x >> GRIDBITS)
-                cSim.pForce4a[offset]       = af;    // af.w is not reset, so this float4 still carries the x-side w value
-                cSim.pBornForce[offset]     = sA[threadIdx.x].fb;
-            }
-        }
-        else  // bExclusion
-        {
-            // Read exclusion data
-            
-            if (x == y) // Handle diagonals uniquely at 50% efficiency
-            { 
-                // Read fixed atom data into registers and GRF
-                unsigned int excl           = cSim.pExclusion[x * cSim.exclusionStride + y + tgx];    // one exclusion word per thread; bit j is tested for partner j in the loop below
-                float4 af;
-                af.x                        = 0.0f;
-                af.y                        = 0.0f;
-                af.z                        = 0.0f;
-                af.w                        = 0.0f;                                      
-                sA[threadIdx.x].x           = apos.x;
-                sA[threadIdx.x].y           = apos.y;
-                sA[threadIdx.x].z           = apos.z;
-                sA[threadIdx.x].q           = apos.w;
-                float q2                    = cSim.preFactor * apos.w;
-                apos.w                     *= cSim.epsfac;
-                sA[threadIdx.x].sig         = a.x;
-                sA[threadIdx.x].eps         = a.y;
-                sA[threadIdx.x].br          = br;
-
-                
-                for (unsigned int j = 0; j < GRID; j++)
-                {
-                    float dx                = psA[j].x - apos.x; 
-                    float dy                = psA[j].y - apos.y; 
-                    float dz                = psA[j].z - apos.z; 
-                    float r2                = dx * dx + dy * dy + dz * dz; 
-                   
-                    // CDLJ part
-                    float invR              = 1.0f / sqrt(r2);
-                    float sig               = a.x + psA[j].sig; 
-                    float sig2              = invR * sig; 
-                    sig2                   *= sig2;
-                    float sig6              = sig2 * sig2 * sig2; 
-                    float eps               = a.y * psA[j].eps; 
-                    float dEdR              = eps * (12.0f * sig6 - 6.0f) * sig6; 
-                    dEdR                   += apos.w * psA[j].q * invR; 
-                    dEdR                   *= invR * invR;
-                    if (!(excl & 0x1))    // low bit clear: drop the CDLJ term for this pair; the ObcGbsa term below is still added
-                    {
-                        dEdR = 0.0f;
-                    } 
-                  
-                    // ObcGbsaForce1 part
-                    float alpha2_ij         = br * psA[j].br;
-                    float D_ij              = r2 / (4.0f * alpha2_ij);
-                    float expTerm           = exp(-D_ij);
-                    float denominator2      = r2 + alpha2_ij * expTerm;
-                    float denominator       = sqrt(denominator2);
-                    float Gpol              = (q2 * psA[j].q) / (denominator * denominator2);
-                    float dGpol_dalpha2_ij  = -0.5f * Gpol * expTerm * (1.0f + D_ij); 
-                    af.w                   += dGpol_dalpha2_ij * psA[j].br;   
-                    dEdR                   += Gpol * (1.0f - 0.25f * expTerm); 
-
-                    // Add Forces
-                    dx                     *= dEdR; 
-                    dy                     *= dEdR; 
-                    dz                     *= dEdR; 
-                    af.x                   -= dx; 
-                    af.y                   -= dy; 
-                    af.z                   -= dz; 
-                    excl                  >>= 1;               // move the next partner's bit into position 0
-                }
-                
-                // Write results
-                int offset                  = x + tgx + (x >> GRIDBITS) * cSim.stride;
-                cSim.pForce4a[offset]       = af;
-                cSim.pBornForce[offset]     = af.w;
-            }         
-            else        // 100% utilization
-            {
-                // Read fixed atom data into registers and GRF        
-                unsigned int excl           = cSim.pExclusion[x * cSim.exclusionStride + y + tgx];
-                float4 af;
-                sA[threadIdx.x].fx          = af.x = 0.0f;
-                sA[threadIdx.x].fy          = af.y = 0.0f;
-                sA[threadIdx.x].fz          = af.z = 0.0f;
-                sA[threadIdx.x].fb          = af.w = 0.0f;
-                int j                       = y + tgx;
-                float q2                    = cSim.preFactor * apos.w;
-                apos.w                     *= cSim.epsfac;
-                float4 temp                 = cSim.pPosq[j];
-                float2 temp1                = cSim.pAttr[j];
-                sA[threadIdx.x].br          = cSim.pBornRadii[j];
-                excl                        = (excl >> tgx) | (excl << (GRID - tgx));                // rotate so bit 0 matches this thread's first partner (tj == tgx); NOTE(review): for tgx == 0 the left shift is by GRID, which is undefined if GRID equals the word width - TODO confirm
-                sA[threadIdx.x].x           = temp.x;
-                sA[threadIdx.x].y           = temp.y;
-                sA[threadIdx.x].z           = temp.z;
-                sA[threadIdx.x].q           = temp.w;
-                sA[threadIdx.x].sig         = temp1.x;
-                sA[threadIdx.x].eps         = temp1.y;
-                
-                for (j = 0; j < GRID; j++)
-                {
-                    float dx                = psA[tj].x - apos.x; 
-                    float dy                = psA[tj].y - apos.y; 
-                    float dz                = psA[tj].z - apos.z; 
-                    float r2                = dx * dx + dy * dy + dz * dz; 
-                    
-                    // CDLJ part
-                    float invR              = 1.0f / sqrt(r2);
-                    float sig               = a.x + psA[tj].sig; 
-                    float sig2              = invR * sig; 
-                    sig2                   *= sig2;
-                    float sig6              = sig2 * sig2 * sig2; 
-                    float eps               = a.y * psA[tj].eps; 
-                    float dEdR              = eps * (12.0f * sig6 - 6.0f) * sig6; 
-                    dEdR                   += apos.w * psA[tj].q * invR; 
-                    dEdR                   *= invR * invR;
-                    if (!(excl & 0x1))    // low bit clear: drop the CDLJ term for this pair; the ObcGbsa term below is still added
-                    {
-                        dEdR = 0.0f;
-                    } 
-   
-                    // ObcGbsaForce1 part
-                    float alpha2_ij         = br * psA[tj].br;
-                    float D_ij              = r2 / (4.0f * alpha2_ij);
-                    float expTerm           = exp(-D_ij);
-                    float denominator2      = r2 + alpha2_ij * expTerm;
-                    float denominator       = sqrt(denominator2);
-                    float Gpol              = (q2 * psA[tj].q) / (denominator * denominator2);
-                    float dGpol_dalpha2_ij  = -0.5f * Gpol * expTerm * (1.0f + D_ij); 
-                    af.w                   += dGpol_dalpha2_ij * psA[tj].br;  
-                    psA[tj].fb             += dGpol_dalpha2_ij * br;      
-                    dEdR                   += Gpol * (1.0f - 0.25f * expTerm); 
-                   
-                    // Add forces
-                    dx                     *= dEdR; 
-                    dy                     *= dEdR; 
-                    dz                     *= dEdR; 
-                    af.x                   -= dx; 
-                    af.y                   -= dy; 
-                    af.z                   -= dz;    
-                    psA[tj].fx             += dx; 
-                    psA[tj].fy             += dy; 
-                    psA[tj].fz             += dz;
-                    excl                  >>= 1;
-                    tj                      = sNext[tj]; 
-                }
-                
-                // Write results
-                int offset                  = x + tgx + (y >> GRIDBITS) * cSim.stride;
-                cSim.pForce4a[offset]       = af;
-                cSim.pBornForce[offset]     = af.w;
-                offset                      = y + tgx + (x >> GRIDBITS) * cSim.stride;
-                af.x                        = sA[threadIdx.x].fx;
-                af.y                        = sA[threadIdx.x].fy;
-                af.z                        = sA[threadIdx.x].fz;
-                cSim.pForce4a[offset]       = af;    // af.w is not reset, so this float4 still carries the x-side w value
-                cSim.pBornForce[offset]     = sA[threadIdx.x].fb;
-            }
-        }
-
-        pos -= cSim.nonbond_workBlock;     // step to this group's next work unit
-    }
-}
-
-void kCalculateCDLJObcGbsaForces1_12(gpuContext gpu)    // Host-side launcher for kCalculateCDLJObcGbsaForces1_12_kernel; gpu supplies the launch configuration.
-{
-//    printf("kCalculateCDLJObcGbsaForces1_12\n");    (debug trace, left disabled)
-    kCalculateCDLJObcGbsaForces1_12_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();    // grid = nonbond_blocks, block = nonbond_threads_per_block; the kernel takes no arguments
-    LAUNCHERROR("kCalculateCDLJObcGbsaForces1_12");    // LAUNCHERROR is defined outside this view; presumably reports a failed launch under this label - TODO confirm
-}
--- a/platforms/cuda/src/kernels/kCalculateObcGbsaForces2_12.cu
+++ b/platforms/cuda/src/kernels/kCalculateObcGbsaForces2_12.cu
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * Permission is hereby granted, free of charge, to any person obtaining a    *
- * copy of this software and associated documentation files (the "Software"), *
- * to deal in the Software without restriction, including without limitation  *
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
- * and/or sell copies of the Software, and to permit persons to whom the      *
- * Software is furnished to do so, subject to the following conditions:       *
- *                                                                            *
- * The above copyright notice and this permission notice shall be included in *
- * all copies or substantial portions of the Software.                        *
- *                                                                            *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
- * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
- * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
- * -------------------------------------------------------------------------- */
-
-#include <stdio.h>
-#include <cuda.h>
-#include <vector_functions.h>
-#include <cstdlib>
-#include <string>
-#include <iostream>
-#include <fstream>
-using namespace std;
-
-#include "gputypes.h"
-
-struct Atom {    // One shared-memory slot per thread (see sA); filled and read by kCalculateObcGbsaForces2_12_kernel.
-    float x;      // copied from cSim.pPosq[].x
-    float y;      // copied from cSim.pPosq[].y
-    float z;      // copied from cSim.pPosq[].z
-    float r;      // copied from cSim.pObcData[].x
-    float sr;     // copied from cSim.pObcData[].y
-    float sr2;    // sr * sr, stored once when the slot is filled
-    float fx;     // x force accumulated on this slot's atom
-    float fy;     // y force accumulated on this slot's atom
-    float fz;     // z force accumulated on this slot's atom
-    float fb;     // copied from cSim.pBornForce[]
-//    float sum;    (disabled, like the other "sum" lines in the kernel)
-};
-
-
-__shared__ Atom sA[GT2XX_BORNFORCE2_THREADS_PER_BLOCK];    // indexed by threadIdx.x in the kernel below: one Atom slot per thread
-__shared__ unsigned int sWorkUnit[GT2XX_NONBOND_WORKUNITS_PER_SM];    // this block's slice of cSim.pWorkUnit, loaded once at kernel start
-__shared__ unsigned int sNext[GRID];    // sNext[k] = (k + 1) & (GRID - 1), filled at kernel start
-
-static __constant__ cudaGmxSimulation cSim;    // file-local copy of gpu->sim; written by SetCalculateObcGbsaForces2_12Sim
-
-void SetCalculateObcGbsaForces2_12Sim(gpuContext gpu)    // Host-side: copies gpu->sim into this file's __constant__ cSim.
-{
-    cudaError_t status;
-    status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));     
-    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");    // RTERROR is defined outside this view; presumably reports a non-success status - TODO confirm
-}
-
-void GetCalculateObcGbsaForces2_12Sim(gpuContext gpu)    // Host-side: copies this file's __constant__ cSim back into gpu->sim.
-{
-    cudaError_t status;
-    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));     
-    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");    // NOTE(review): the message says "SetSim" although this is the Get path
-}
-
-__global__ void kCalculateObcGbsaForces2_12_kernel()
-{
-    // Read queue of work blocks once so the remainder of
-    // kernel can run asynchronously    
-    int pos = cSim.bf2WorkUnitsPerBlock * blockIdx.x + min(blockIdx.x, cSim.bf2WorkUnitsPerBlockRemainder);
-    int end = cSim.bf2WorkUnitsPerBlock * (blockIdx.x + 1) + min((blockIdx.x + 1), cSim.bf2WorkUnitsPerBlockRemainder);    
-    if (threadIdx.x < end - pos)
-    {
-        sWorkUnit[threadIdx.x]          = cSim.pWorkUnit[pos + threadIdx.x];
-    }
-    if (threadIdx.x < GRID)
-    {
-        sNext[threadIdx.x]              = (threadIdx.x + 1) & (GRID - 1);
-    }
-    __syncthreads();
-
-    // Now change pos and end to reflect work queue just read
-    // into shared memory
-    end                                 = end - pos; 
-    pos                                 = end - (threadIdx.x >> GRIDBITS) - 1;
-       
-    while (pos >= 0)
-    {  
-    
-        // Extract cell coordinates from appropriate work unit
-        unsigned int x                  = sWorkUnit[pos];
-        unsigned int y                  = ((x >> 2) & 0x7fff) << GRIDBITS;
-        x                               = (x >> 17) << GRIDBITS;
-        unsigned int tgx                = threadIdx.x & (GRID - 1);
-        unsigned int i                  = x + tgx;
-        float4 apos                     = cSim.pPosq[i];
-        float2 a                        = cSim.pObcData[i];
-        float fb                        = cSim.pBornForce[i];
-        unsigned int tbx                = threadIdx.x - tgx;
-        int tj                          = tgx; 
-        Atom* psA                       = &sA[tbx];
-        if (x == y) // Handle diagonals uniquely at 50% efficiency
-        { 
-            // Read fixed atom data into registers and GRF
-            float3 af;
-            sA[threadIdx.x].fx = af.x   = 0.0f;
-            sA[threadIdx.x].fy = af.y   = 0.0f;
-            sA[threadIdx.x].fz = af.z   = 0.0f;
-//            float sum                   = 0.0f;
-            sA[threadIdx.x].x           = apos.x;
-            sA[threadIdx.x].y           = apos.y;
-            sA[threadIdx.x].z           = apos.z;
-//            float oneOverR              = 1.0f / a.x;
-            sA[threadIdx.x].r           = a.x;
-            sA[threadIdx.x].sr          = a.y;
-            sA[threadIdx.x].sr2         = a.y * a.y;
-            sA[threadIdx.x].fb          = fb;
-            
-            for (unsigned int j = sNext[tgx]; j != tgx; j = sNext[j])
-            {
-                float dx                = psA[j].x - apos.x; 
-                float dy                = psA[j].y - apos.y; 
-                float dz                = psA[j].z - apos.z; 
-                float r2                = dx * dx + dy * dy + dz * dz;
-                float r                 = sqrt(r2);
-                
-                
-                // Atom I Born forces and sum
-                float rScaledRadiusJ    = r + psA[j].sr;
-                
-                float l_ij          = 1.0f / max(a.x, fabs(r - psA[j].sr));
-                float u_ij          = 1.0f / rScaledRadiusJ;
-                float rInverse      = 1.0f / r;
-                float l_ij2         = l_ij * l_ij;
-                float u_ij2         = u_ij * u_ij;
-                float r2Inverse     = rInverse * rInverse;
-                float t1            = log (u_ij / l_ij);
-                float t2            = (l_ij2 - u_ij2);
-                float t3            = t2 * rInverse;
-                t1                 *= rInverse;
-                    
-                // Born Forces term
-                float term          =  0.125f * 
-                                      (1.000f + psA[j].sr2 * r2Inverse) * t3 + 
-                                       0.250f * t1 * r2Inverse;
-                float dE            = fb * term;
-                    
-                // Born sum term
-//                term                =  l_ij - u_ij  +
-//                                      -0.25f * r * t2 +
-//                                       0.50f * t1 +
-//                                      (0.25f * psA[j].sr2) * t3;
-//                if (a.x < (psA[j].sr - r))
-//                {
-//                    term           += 2.0f * (oneOverR - l_ij);
-//                }
-                    
-                if (a.x >= rScaledRadiusJ) 
-                {
-                    dE              = /*term =*/ 0.0f;
-                }
-                float d             = dx * dE;
-                af.x               -= d;
-                psA[j].fx          += d;
-                d                   = dy * dE;  
-                af.y               -= d;
-                psA[j].fy          += d;
-                d                   = dz * dE;
-                af.z               -= d;
-                psA[j].fz          += d;                                          
-//                sum                += term;
-            }
-            
-            // Write results
-            int offset                  = x + tgx + (x >> GRIDBITS) * cSim.stride;
-            float4 of;
-            of.x                        = af.x + sA[threadIdx.x].fx;
-            of.y                        = af.y + sA[threadIdx.x].fy;
-            of.z                        = af.z + sA[threadIdx.x].fz;
-            of.w                        = 0.0f;
-            cSim.pForce4b[offset]       = of;
-//            cSim.pBornSum[offset]       = sum;
-        }         
-        else 
-        {        
-            // Read fixed atom data into registers and GRF
-            int j                       = y + tgx;
-            float4 temp                 = cSim.pPosq[j];
-            float2 temp1                = cSim.pObcData[j];
-            sA[threadIdx.x].fb          = cSim.pBornForce[j];
-            float3 af;
-            sA[threadIdx.x].fx = af.x   = 0.0f;
-            sA[threadIdx.x].fy = af.y   = 0.0f;
-            sA[threadIdx.x].fz = af.z   = 0.0f;
-//            sA[threadIdx.x].sum         = 0.0f;
-//            float sum                   = 0.0f;
-            float sr2                   = a.y * a.y;
-//            float oneOverR              = 1.0f / a.x;
-            sA[threadIdx.x].x           = temp.x;
-            sA[threadIdx.x].y           = temp.y;
-            sA[threadIdx.x].z           = temp.z;
-            sA[threadIdx.x].r           = temp1.x;
-            sA[threadIdx.x].sr          = temp1.y;
-            sA[threadIdx.x].sr2         = temp1.y * temp1.y;
-            for (j = 0; j < GRID; j++)
-            {
-                float dx                = psA[tj].x - apos.x; 
-                float dy                = psA[tj].y - apos.y; 
-                float dz                = psA[tj].z - apos.z; 
-                float r2                = dx * dx + dy * dy + dz * dz; 
-                float r                 = sqrt(r2);
-                
-                // Interleaved Atom I and J Born Forces and sum components
-                float r2Inverse         = 1.0f / r2;
-                float rScaledRadiusJ    = r + psA[tj].sr;
-                float rScaledRadiusI    = r + a.y;
-                float rInverse          = 1.0f / r;
-                float l_ijJ             = 1.0f / max(a.x, fabs(r - psA[tj].sr));
-                float l_ijI             = 1.0f / max(psA[tj].r, fabs(r - a.y));
-                float u_ijJ             = 1.0f / rScaledRadiusJ;
-                float u_ijI             = 1.0f / rScaledRadiusI;
-                float l_ij2J            = l_ijJ * l_ijJ;
-                float l_ij2I            = l_ijI * l_ijI;
-                float u_ij2J            = u_ijJ * u_ijJ;
-                float u_ij2I            = u_ijI * u_ijI;
-                float t1J               = log (u_ijJ / l_ijJ);
-                float t1I               = log (u_ijI / l_ijI);
-                float t2J               = (l_ij2J - u_ij2J);
-                float t2I               = (l_ij2I - u_ij2I);
-                float t3J               = t2J * rInverse;
-                float t3I               = t2I * rInverse;
-                t1J                    *= rInverse;
-                t1I                    *= rInverse;
-                   
-                // Born Forces term
-                float term              =  0.125f * 
-                                          (1.000f + psA[tj].sr2 * r2Inverse) * t3J + 
-                                           0.250f * t1J * r2Inverse;
-                float dE                = fb * term;
-                    
-                // Atom I Born sum term
-//                term                    =   l_ijJ - u_ijJ +
-//                                           -0.25f * r * t2J +
-//                                            0.50f * t1J +
-//                                           (0.25f * psA[tj].sr2) * t3J;
-//                if (a.x < (psA[tj].sr - r))
-//                {
-//                    term               += 2.0f * (oneOverR - l_ijJ);
-//                }
-                
-                if (a.x >= rScaledRadiusJ) 
-                {
-                    dE                  = /*term =*/ 0.0f;
-                }
-                
-                float d                 = dx * dE;
-                af.x                   -= d;
-                psA[tj].fx             += d;
-                d                       = dy * dE;  
-                af.y                   -= d;
-                psA[tj].fy             += d;
-                d                       = dz * dE;
-                af.z                   -= d;
-                psA[tj].fz             += d;                                          
-//                sum                    += term;
-               
-                // Atom J Born sum term               
-                term                    =  0.125f * 
-                                          (1.000f + sr2 * r2Inverse) * t3I + 
-                                           0.250f * t1I * r2Inverse;
-                dE                      = psA[tj].fb * term;  
-                
-//                term                    =  l_ijI - u_ijI +
-//                                          -0.25f * r * t2I +
-//                                           0.50f * t1I +
-//                                          (0.25f * sr2) * t3I;
-//                if (psA[tj].r < (a.y - r))
-//                {
-//                    term               += 2.0f * ((1.0f / psA[tj].r) - l_ijI);
-//                }
-                
-                if (psA[tj].r >= rScaledRadiusI) 
-                {           
-                    dE                  = /*term =*/ 0.0f;
-                }                             
-                dx                     *= dE;
-                dy                     *= dE;
-                dz                     *= dE;
-                psA[tj].fx             += dx; 
-                psA[tj].fy             += dy;
-                psA[tj].fz             += dz; 
-                af.x                   -= dx;
-                af.y                   -= dy;
-                af.z                   -= dz;    
-//                psA[tj].sum            += term;
-                tj                      = sNext[tj]; 
-            }
-                
-            // Write results
-            int offset                  = x + tgx + (y >> GRIDBITS) * cSim.stride;
-            float4 of;
-            of.x                        = af.x;
-            of.y                        = af.y;
-            of.z                        = af.z;
-            of.w                        = 0.0f;
-            cSim.pForce4b[offset]       = of;
-//            cSim.pBornSum[offset]       = sum;
-            offset                      = y + tgx + (x >> GRIDBITS) * cSim.stride;
-            of.x                        = sA[threadIdx.x].fx;
-            of.y                        = sA[threadIdx.x].fy;
-            of.z                        = sA[threadIdx.x].fz;
-            cSim.pForce4b[offset]       = of;
-//            cSim.pBornSum[offset]       = sA[threadIdx.x].sum;
-        }
-        pos                            -= cSim.bornForce2_workBlock;     
-    }
-}
-
-void kCalculateObcGbsaForces2_12(gpuContext gpu) // Host-side launcher for kCalculateObcGbsaForces2_12_kernel; gpu supplies the launch configuration via gpu->sim.
-{
-  //  printf("kCalculateObcGbsaForces2_12\n");  (debug trace of the entry point, left disabled)
-    kCalculateObcGbsaForces2_12_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block>>>(); // Grid size is bornForce2_blocks, block size is bornForce2_threads_per_block; the kernel is launched with no arguments.
-    LAUNCHERROR("kCalculateObcGbsaForces2_12"); // NOTE(review): LAUNCHERROR is defined outside this view; presumably it reports a failed launch using this label -- confirm against its definition.
-}