Added other files that somehow got ommitted from earlier checkin

630bdd40 · Peter Eastman · 9b7a9f65 · 630bdd40 · 630bdd40 · 630bdd40
Commit 630bdd40 authored Feb 20, 2009 by Peter Eastman
6 changed files
--- a/platforms/cuda/src/kernels/kCalculateCDLJForces.h
+++ b/platforms/cuda/src/kernels/kCalculateCDLJForces.h
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+/**
+ * This file contains the kernels for evalauating nonbonded forces.  It is included
+ * several times in kCalculateCDLJForces.cu with different #defines to generate
+ * different versions of the kernels.
+ */
+
+__global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUnit, int numWorkUnits)
+{
+    extern __shared__ Atom sA[];
+    unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
+    unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
+    int pos = warp*numWorkUnits/totalWarps;
+    int end = (warp+1)*numWorkUnits/totalWarps;
+
+    int lasty = -1;
+    while (pos < end)
+    {
+
+        // Extract cell coordinates from appropriate work unit
+        unsigned int x = workUnit[pos];
+        unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
+        bool bExclusionFlag = (x & 0x1);
+        x = (x >> 17) << GRIDBITS;
+        float4      apos;   // Local atom x, y, z, q
+        float3      af;     // Local atom fx, fy, fz
+        float dx;
+        float dy;
+        float dz;
+        float r2;
+        float invR;
+        float sig;
+        float sig2;
+        float sig6;
+        float eps;
+        float dEdR;
+        unsigned int tgx = threadIdx.x & (GRID - 1);
+        unsigned int tbx = threadIdx.x - tgx;
+        int tj = tgx;
+        Atom* psA = &sA[tbx];
+        unsigned int i      = x + tgx;
+        apos                = cSim.pPosq[i];
+        float2 a            = cSim.pAttr[i];
+        af.x                = 0.0f;
+        af.y                = 0.0f;
+        af.z                = 0.0f;
+        if (x == y) // Handle diagonals uniquely at 50% efficiency
+        {
+            // Read fixed atom data into registers and GRF
+            sA[threadIdx.x].x   = apos.x;
+            sA[threadIdx.x].y   = apos.y;
+            sA[threadIdx.x].z   = apos.z;
+            sA[threadIdx.x].q   = apos.w;
+            sA[threadIdx.x].sig = a.x;
+            sA[threadIdx.x].eps = a.y;
+            apos.w             *= cSim.epsfac;
+            if (!bExclusionFlag)
+            {
+                for (unsigned int j = 0; j < GRID; j++)
+                {
+                    dx              = psA[j].x - apos.x;
+                    dy              = psA[j].y - apos.y;
+                    dz              = psA[j].z - apos.z;
+#ifdef USE_PERIODIC
+                    dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
+                    dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
+                    dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
+#endif
+                    r2              = dx * dx + dy * dy + dz * dz;
+                    invR            = 1.0f / sqrt(r2);
+                    sig             = a.x + psA[j].sig;
+                    sig2            = invR * sig;
+                    sig2           *= sig2;
+                    sig6            = sig2 * sig2 * sig2;
+                    eps             = a.y * psA[j].eps;
+                    dEdR            = eps * (12.0f * sig6 - 6.0f) * sig6;
+#ifdef USE_CUTOFF
+                    dEdR           += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2);
+#else
+                    dEdR           += apos.w * psA[j].q * invR;
+#endif
+                    dEdR           *= invR * invR;
+#ifdef USE_CUTOFF
+                    if (r2 > cSim.nonbondedCutoffSqr)
+                    {
+                        dEdR = 0.0f;
+                    }
+#endif
+                    dx             *= dEdR;
+                    dy             *= dEdR;
+                    dz             *= dEdR;
+                    af.x           -= dx;
+                    af.y           -= dy;
+                    af.z           -= dz;
+                }
+            }
+            else  // bExclusion
+            {
+                unsigned int excl = cSim.pExclusion[x * cSim.exclusionStride + y + tgx];
+                for (unsigned int j = 0; j < GRID; j++)
+                {
+                    dx              = psA[j].x - apos.x;
+                    dy              = psA[j].y - apos.y;
+                    dz              = psA[j].z - apos.z;
+#ifdef USE_PERIODIC
+                    dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
+                    dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
+                    dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
+#endif
+                    r2              = dx * dx + dy * dy + dz * dz;
+                    invR            = 1.0f / sqrt(r2);
+                    sig             = a.x + psA[j].sig;
+                    sig2            = invR * sig;
+                    sig2           *= sig2;
+                    sig6            = sig2 * sig2 * sig2;
+                    eps             = a.y * psA[j].eps;
+                    dEdR            = eps * (12.0f * sig6 - 6.0f) * sig6;
+#ifdef USE_CUTOFF
+                    dEdR           += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2);
+#else
+                    dEdR           += apos.w * psA[j].q * invR;
+#endif
+                    dEdR           *= invR * invR;
+#ifdef USE_CUTOFF
+                    if (!(excl & 0x1) || r2 > cSim.nonbondedCutoffSqr)
+#else
+                    if (!(excl & 0x1))
+#endif
+                    {
+                        dEdR = 0.0f;
+                    }
+                    dx             *= dEdR;
+                    dy             *= dEdR;
+                    dz             *= dEdR;
+                    af.x           -= dx;
+                    af.y           -= dy;
+                    af.z           -= dz;
+                    excl          >>= 1;
+                }
+            }
+
+            // Write results
+            float4 of;
+#ifdef USE_OUTPUT_BUFFER_PER_WARP
+            int offset                          = x + tgx + warp*cSim.stride;
+            of                                  = cSim.pForce4a[offset];
+            of.x                               += af.x;
+            of.y                               += af.y;
+            of.z                               += af.z;
+            cSim.pForce4a[offset]               = of;
+#else
+            of.x                                = af.x;
+            of.y                                = af.y;
+            of.z                                = af.z;
+            of.w                                = 0.0f;
+            int offset                          = x + tgx + (x >> GRIDBITS) * cSim.stride;
+            cSim.pForce4a[offset]               = of;
+#endif
+        }
+        else        // 100% utilization
+        {
+            // Read fixed atom data into registers and GRF
+            if (lasty != y)
+            {
+                int j                   = y + tgx;
+                float4 temp             = cSim.pPosq[j];
+                float2 temp1            = cSim.pAttr[j];
+                sA[threadIdx.x].x       = temp.x;
+                sA[threadIdx.x].y       = temp.y;
+                sA[threadIdx.x].z       = temp.z;
+                sA[threadIdx.x].q       = temp.w;
+                sA[threadIdx.x].sig     = temp1.x;
+                sA[threadIdx.x].eps     = temp1.y;
+            }
+            sA[threadIdx.x].fx      = 0.0f;
+            sA[threadIdx.x].fy      = 0.0f;
+            sA[threadIdx.x].fz      = 0.0f;
+            apos.w                 *= cSim.epsfac;
+            if (!bExclusionFlag)
+            {
+                for (unsigned int j = 0; j < GRID; j++)
+                {
+                    dx              = psA[tj].x - apos.x;
+                    dy              = psA[tj].y - apos.y;
+                    dz              = psA[tj].z - apos.z;
+#ifdef USE_PERIODIC
+                    dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
+                    dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
+                    dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
+#endif
+                    r2              = dx * dx + dy * dy + dz * dz;
+                    invR            = 1.0f / sqrt(r2);
+                    sig             = a.x + psA[tj].sig;
+                    sig2            = invR * sig;
+                    sig2           *= sig2;
+                    sig6            = sig2 * sig2 * sig2;
+                    eps             = a.y * psA[tj].eps;
+                    dEdR            = eps * (12.0f * sig6 - 6.0f) * sig6;
+#ifdef USE_CUTOFF
+                    dEdR           += apos.w * psA[tj].q * (invR - 2.0f * cSim.reactionFieldK * r2);
+#else
+                    dEdR           += apos.w * psA[tj].q * invR;
+#endif
+                    dEdR           *= invR * invR;
+#ifdef USE_CUTOFF
+                    if (r2 > cSim.nonbondedCutoffSqr)
+                    {
+                        dEdR = 0.0f;
+                    }
+#endif
+                    dx             *= dEdR;
+                    dy             *= dEdR;
+                    dz             *= dEdR;
+                    af.x           -= dx;
+                    af.y           -= dy;
+                    af.z           -= dz;
+                    psA[tj].fx     += dx;
+                    psA[tj].fy     += dy;
+                    psA[tj].fz     += dz;
+                    tj              = (tj + 1) & (GRID - 1);
+                }
+            }
+            else  // bExclusion
+            {
+                // Read fixed atom data into registers and GRF
+                unsigned int excl       = cSim.pExclusion[x * cSim.exclusionStride + y + tgx];
+                excl                    = (excl >> tgx) | (excl << (GRID - tgx));
+                for (unsigned int j = 0; j < GRID; j++)
+                {
+                    dx              = psA[tj].x - apos.x;
+                    dy              = psA[tj].y - apos.y;
+                    dz              = psA[tj].z - apos.z;
+#ifdef USE_PERIODIC
+                    dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
+                    dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
+                    dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
+#endif
+                    r2              = dx * dx + dy * dy + dz * dz;
+                    invR            = 1.0f / sqrt(r2);
+                    sig             = a.x + psA[tj].sig;
+                    sig2            = invR * sig;
+                    sig2           *= sig2;
+                    sig6            = sig2 * sig2 * sig2;
+                    eps             = a.y * psA[tj].eps;
+                    dEdR            = eps * (12.0f * sig6 - 6.0f) * sig6;
+#ifdef USE_CUTOFF
+                    dEdR           += apos.w * psA[tj].q * (invR - 2.0f * cSim.reactionFieldK * r2);
+#else
+                    dEdR           += apos.w * psA[tj].q * invR;
+#endif
+                    dEdR           *= invR * invR;
+#ifdef USE_CUTOFF
+                    if (!(excl & 0x1) || r2 > cSim.nonbondedCutoffSqr)
+#else
+                    if (!(excl & 0x1))
+#endif
+                    {
+                        dEdR = 0.0f;
+                    }
+                    dx             *= dEdR;
+                    dy             *= dEdR;
+                    dz             *= dEdR;
+                    af.x           -= dx;
+                    af.y           -= dy;
+                    af.z           -= dz;
+                    psA[tj].fx     += dx;
+                    psA[tj].fy     += dy;
+                    psA[tj].fz     += dz;
+                    excl          >>= 1;
+                    tj              = (tj + 1) & (GRID - 1);
+                }
+            }
+
+            // Write results
+            float4 of;
+#ifdef USE_OUTPUT_BUFFER_PER_WARP
+            int offset                          = x + tgx + warp*cSim.stride;
+            of                                  = cSim.pForce4a[offset];
+            of.x                               += af.x;
+            of.y                               += af.y;
+            of.z                               += af.z;
+            cSim.pForce4a[offset]               = of;
+            offset                              = y + tgx + warp*cSim.stride;
+            of                                  = cSim.pForce4a[offset];
+            of.x                               += sA[threadIdx.x].fx;
+            of.y                               += sA[threadIdx.x].fy;
+            of.z                               += sA[threadIdx.x].fz;
+            cSim.pForce4a[offset]               = of;
+#else
+            of.x                                = af.x;
+            of.y                                = af.y;
+            of.z                                = af.z;
+            of.w                                = 0.0f;
+            int offset                          = x + tgx + (y >> GRIDBITS) * cSim.stride;
+            cSim.pForce4a[offset]               = of;
+            of.x                                = sA[threadIdx.x].fx;
+            of.y                                = sA[threadIdx.x].fy;
+            of.z                                = sA[threadIdx.x].fz;
+            offset                              = y + tgx + (x >> GRIDBITS) * cSim.stride;
+            cSim.pForce4a[offset]               = of;
+#endif
+            lasty = y;
+        }
+
+        pos++;
+    }
+}
--- a/platforms/cuda/src/kernels/kCalculateCDLJObcGbsaForces1.h
+++ b/platforms/cuda/src/kernels/kCalculateCDLJObcGbsaForces1.h
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+/**
+ * This file contains the kernel for evalauating nonbonded forces and the first stage of GBSA.
+ * It is included several times in kCalculateCDLJObcGbsaForces1.cu with different #defines to generate
+ * different versions of the kernels.
+ */
+
+__global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit, int numWorkUnits)
+{
+    extern __shared__ Atom sA[];
+    unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
+    unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
+    int pos = warp*numWorkUnits/totalWarps;
+    int end = (warp+1)*numWorkUnits/totalWarps;
+
+    int lasty = -1;
+    while (pos < end)
+    {
+
+        // Extract cell coordinates from appropriate work unit
+        unsigned int x                      = workUnit[pos];
+        unsigned int y                      = ((x >> 2) & 0x7fff) << GRIDBITS;
+        bool bExclusionFlag                 = (x & 0x1);
+        x                                   = (x >> 17) << GRIDBITS;
+        unsigned int tgx                    = threadIdx.x & (GRID - 1);
+        unsigned int i                      = x + tgx;
+        float4 apos                         = cSim.pPosq[i];
+        float2 a                            = cSim.pAttr[i];
+        float br                            = cSim.pBornRadii[i];
+        unsigned int tbx                    = threadIdx.x - tgx;
+        int tj                              = tgx;
+        Atom* psA                           = &sA[tbx];
+        float4 af;
+        af.x                        = 0.0f;
+        af.y                        = 0.0f;
+        af.z                        = 0.0f;
+        af.w                        = 0.0f;
+        if (x == y) // Handle diagonals uniquely at 50% efficiency
+        {
+            // Read fixed atom data into registers and GRF
+            sA[threadIdx.x].x           = apos.x;
+            sA[threadIdx.x].y           = apos.y;
+            sA[threadIdx.x].z           = apos.z;
+            sA[threadIdx.x].q           = apos.w;
+            float q2                    = cSim.preFactor * apos.w;
+            apos.w                     *= cSim.epsfac;
+            sA[threadIdx.x].sig         = a.x;
+            sA[threadIdx.x].eps         = a.y;
+            sA[threadIdx.x].br          = br;
+            if (!bExclusionFlag)
+            {
+                for (unsigned int j = 0; j < GRID; j++)
+                {
+                    float dx                = psA[j].x - apos.x;
+                    float dy                = psA[j].y - apos.y;
+                    float dz                = psA[j].z - apos.z;
+#ifdef USE_PERIODIC
+                    dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
+                    dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
+                    dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
+#endif
+                    float r2                = dx * dx + dy * dy + dz * dz;
+
+                    // CDLJ part
+                    float invR              = 1.0f / sqrt(r2);
+                    float sig               = a.x + psA[j].sig;
+                    float sig2              = invR * sig;
+                    sig2                   *= sig2;
+                    float sig6              = sig2 * sig2 * sig2;
+                    float eps               = a.y * psA[j].eps;
+                    float dEdR              = eps * (12.0f * sig6 - 6.0f) * sig6;
+#ifdef USE_CUTOFF
+                    dEdR                   += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2);
+#else
+                    dEdR                   += apos.w * psA[j].q * invR;
+#endif
+                    dEdR                   *= invR * invR;
+
+                    // ObcGbsaForce1 part
+                    float alpha2_ij         = br * psA[j].br;
+                    float D_ij              = r2 / (4.0f * alpha2_ij);
+                    float expTerm           = exp(-D_ij);
+                    float denominator2      = r2 + alpha2_ij * expTerm;
+                    float denominator       = sqrt(denominator2);
+                    float Gpol              = (q2 * psA[j].q) / (denominator * denominator2);
+                    float dGpol_dalpha2_ij  = -0.5f * Gpol * expTerm * (1.0f + D_ij);
+                    af.w                   += dGpol_dalpha2_ij * psA[j].br;
+                    dEdR                   += Gpol * (1.0f - 0.25f * expTerm);
+#ifdef USE_CUTOFF
+                    if (r2 > cSim.nonbondedCutoffSqr)
+                    {
+                        dEdR = 0.0f;
+                    }
+#endif
+
+                    // Add Forces
+                    dx                     *= dEdR;
+                    dy                     *= dEdR;
+                    dz                     *= dEdR;
+                    af.x                   -= dx;
+                    af.y                   -= dy;
+                    af.z                   -= dz;
+                }
+            }
+            else  // bExclusion
+            {
+                unsigned int excl           = cSim.pExclusion[x * cSim.exclusionStride + y + tgx];
+                for (unsigned int j = 0; j < GRID; j++)
+                {
+                    float dx                = psA[j].x - apos.x;
+                    float dy                = psA[j].y - apos.y;
+                    float dz                = psA[j].z - apos.z;
+#ifdef USE_PERIODIC
+                    dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
+                    dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
+                    dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
+#endif
+                        float r2                = dx * dx + dy * dy + dz * dz;
+
+                    // CDLJ part
+                    float invR              = 1.0f / sqrt(r2);
+                    float sig               = a.x + psA[j].sig;
+                    float sig2              = invR * sig;
+                    sig2                   *= sig2;
+                    float sig6              = sig2 * sig2 * sig2;
+                    float eps               = a.y * psA[j].eps;
+                    float dEdR              = eps * (12.0f * sig6 - 6.0f) * sig6;
+#ifdef USE_CUTOFF
+                    dEdR                   += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2);
+#else
+                    dEdR                   += apos.w * psA[j].q * invR;
+#endif
+                    dEdR                   *= invR * invR;
+                    if (!(excl & 0x1))
+                    {
+                        dEdR = 0.0f;
+                    }
+
+                    // ObcGbsaForce1 part
+                    float alpha2_ij         = br * psA[j].br;
+                    float D_ij              = r2 / (4.0f * alpha2_ij);
+                    float expTerm           = exp(-D_ij);
+                    float denominator2      = r2 + alpha2_ij * expTerm;
+                    float denominator       = sqrt(denominator2);
+                    float Gpol              = (q2 * psA[j].q) / (denominator * denominator2);
+                    float dGpol_dalpha2_ij  = -0.5f * Gpol * expTerm * (1.0f + D_ij);
+                    af.w                   += dGpol_dalpha2_ij * psA[j].br;
+                    dEdR                   += Gpol * (1.0f - 0.25f * expTerm);
+#if defined USE_PERIODIC
+                    if (i >= cSim.atoms || x+j >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
+                    {
+                        dEdR = 0.0f;
+                    }
+#elif defined USE_CUTOFF
+                    if (r2 > cSim.nonbondedCutoffSqr)
+                    {
+                        dEdR = 0.0f;
+                    }
+#endif
+
+                    // Add Forces
+                    dx                     *= dEdR;
+                    dy                     *= dEdR;
+                    dz                     *= dEdR;
+                    af.x                   -= dx;
+                    af.y                   -= dy;
+                    af.z                   -= dz;
+                    excl                  >>= 1;
+                }
+            }
+
+            // Write results
+#ifdef USE_OUTPUT_BUFFER_PER_WARP
+            int offset                  = x + tgx + warp*cSim.stride;
+            float4 of                   = cSim.pForce4a[offset];
+            of.x                       += af.x;
+            of.y                       += af.y;
+            of.z                       += af.z;
+            of.w                       += af.w;
+            cSim.pForce4a[offset]       = of;
+            cSim.pBornForce[offset]     = af.w;
+#else
+            int offset                  = x + tgx + (x >> GRIDBITS) * cSim.stride;
+            cSim.pForce4a[offset]       = af;
+            cSim.pBornForce[offset]     = af.w;
+#endif
+        }
+        else        // 100% utilization
+        {
+            // Read fixed atom data into registers and GRF
+            if (lasty != y)
+            {
+                int j                       = y + tgx;
+                float4 temp                 = cSim.pPosq[j];
+                float2 temp1                = cSim.pAttr[j];
+                sA[threadIdx.x].br          = cSim.pBornRadii[j];
+                sA[threadIdx.x].x           = temp.x;
+                sA[threadIdx.x].y           = temp.y;
+                sA[threadIdx.x].z           = temp.z;
+                sA[threadIdx.x].q           = temp.w;
+                sA[threadIdx.x].sig         = temp1.x;
+                sA[threadIdx.x].eps         = temp1.y;
+            }
+            sA[threadIdx.x].fx          = 0.0f;
+            sA[threadIdx.x].fy          = 0.0f;
+            sA[threadIdx.x].fz          = 0.0f;
+            sA[threadIdx.x].fb          = 0.0f;
+            float q2                    = apos.w * cSim.preFactor;
+            apos.w                     *= cSim.epsfac;
+            if (!bExclusionFlag)
+            {
+                for (int j = 0; j < GRID; j++)
+                {
+                    float dx                = psA[tj].x - apos.x;
+                    float dy                = psA[tj].y - apos.y;
+                    float dz                = psA[tj].z - apos.z;
+#ifdef USE_PERIODIC
+                    dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
+                    dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
+                    dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
+#endif
+                    float r2                = dx * dx + dy * dy + dz * dz;
+
+                    // CDLJ part
+                    float invR              = 1.0f / sqrt(r2);
+                    float sig               = a.x + psA[tj].sig;
+                    float sig2              = invR * sig;
+                    sig2                   *= sig2;
+                    float sig6              = sig2 * sig2 * sig2;
+                    float eps               = a.y * psA[tj].eps;
+                    float dEdR              = eps * (12.0f * sig6 - 6.0f) * sig6;
+#ifdef USE_CUTOFF
+                    dEdR                   += apos.w * psA[tj].q * (invR - 2.0f * cSim.reactionFieldK * r2);
+#else
+                    dEdR                   += apos.w * psA[tj].q * invR;
+#endif
+                    dEdR                   *= invR * invR;
+
+                    // ObcGbsaForce1 part
+                    float alpha2_ij         = br * psA[tj].br;
+                    float D_ij              = r2 / (4.0f * alpha2_ij);
+                    float expTerm           = exp(-D_ij);
+                    float denominator2      = r2 + alpha2_ij * expTerm;
+                    float denominator       = sqrt(denominator2);
+                    float Gpol              = (q2 * psA[tj].q) / (denominator * denominator2);
+                    float dGpol_dalpha2_ij  = -0.5f * Gpol * expTerm * (1.0f + D_ij);
+                    af.w                   += dGpol_dalpha2_ij * psA[tj].br;
+                    psA[tj].fb             += dGpol_dalpha2_ij * br;
+                    dEdR                   += Gpol * (1.0f - 0.25f * expTerm);
+#ifdef USE_CUTOFF
+                    if (r2 > cSim.nonbondedCutoffSqr)
+                    {
+                        dEdR = 0.0f;
+                    }
+#endif
+
+                    // Add forces
+                    dx                     *= dEdR;
+                    dy                     *= dEdR;
+                    dz                     *= dEdR;
+                    af.x                   -= dx;
+                    af.y                   -= dy;
+                    af.z                   -= dz;
+                    psA[tj].fx             += dx;
+                    psA[tj].fy             += dy;
+                    psA[tj].fz             += dz;
+                    tj                      = (tj + 1) & (GRID - 1);
+                }
+            }
+            else  // bExclusion
+            {
+                unsigned int excl           = cSim.pExclusion[x * cSim.exclusionStride + y + tgx];
+                excl                        = (excl >> tgx) | (excl << (GRID - tgx));
+                for (int j = 0; j < GRID; j++)
+                {
+                    float dx                = psA[tj].x - apos.x;
+                    float dy                = psA[tj].y - apos.y;
+                    float dz                = psA[tj].z - apos.z;
+#ifdef USE_PERIODIC
+                    dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
+                    dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
+                    dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
+#endif
+                    float r2                = dx * dx + dy * dy + dz * dz;
+
+                    // CDLJ part
+                    float invR              = 1.0f / sqrt(r2);
+                    float sig               = a.x + psA[tj].sig;
+                    float sig2              = invR * sig;
+                    sig2                   *= sig2;
+                    float sig6              = sig2 * sig2 * sig2;
+                    float eps               = a.y * psA[tj].eps;
+                    float dEdR              = eps * (12.0f * sig6 - 6.0f) * sig6;
+#ifdef USE_CUTOFF
+                    dEdR                   += apos.w * psA[tj].q * (invR - 2.0f * cSim.reactionFieldK * r2);
+#else
+                    dEdR                   += apos.w * psA[tj].q * invR;
+#endif
+                    dEdR                   *= invR * invR;
+                    if (!(excl & 0x1))
+                    {
+                        dEdR = 0.0f;
+                    }
+
+                    // ObcGbsaForce1 part
+                    float alpha2_ij         = br * psA[tj].br;
+                    float D_ij              = r2 / (4.0f * alpha2_ij);
+                    float expTerm           = exp(-D_ij);
+                    float denominator2      = r2 + alpha2_ij * expTerm;
+                    float denominator       = sqrt(denominator2);
+                    float Gpol              = (q2 * psA[tj].q) / (denominator * denominator2);
+                    float dGpol_dalpha2_ij  = -0.5f * Gpol * expTerm * (1.0f + D_ij);
+                    af.w                   += dGpol_dalpha2_ij * psA[tj].br;
+                    psA[tj].fb             += dGpol_dalpha2_ij * br;
+                    dEdR                   += Gpol * (1.0f - 0.25f * expTerm);
+#if defined USE_PERIODIC
+                    if (i >= cSim.atoms || y+tj >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
+                    {
+                        dEdR = 0.0f;
+                    }
+#elif defined USE_CUTOFF
+                    if (r2 > cSim.nonbondedCutoffSqr)
+                    {
+                        dEdR = 0.0f;
+                    }
+#endif
+
+                    // Add forces
+                    dx                     *= dEdR;
+                    dy                     *= dEdR;
+                    dz                     *= dEdR;
+                    af.x                   -= dx;
+                    af.y                   -= dy;
+                    af.z                   -= dz;
+                    psA[tj].fx             += dx;
+                    psA[tj].fy             += dy;
+                    psA[tj].fz             += dz;
+                    excl                  >>= 1;
+                    tj                      = (tj + 1) & (GRID - 1);
+                }
+            }
+
+            // Write results
+#ifdef USE_OUTPUT_BUFFER_PER_WARP
+            int offset                  = x + tgx + warp*cSim.stride;
+            float4 of                   = cSim.pForce4a[offset];
+            of.x                       += af.x;
+            of.y                       += af.y;
+            of.z                       += af.z;
+            of.w                       += af.w;
+            cSim.pForce4a[offset]       = of;
+            cSim.pBornForce[offset]     = af.w;
+            offset                      = y + tgx + warp*cSim.stride;
+            of                          = cSim.pForce4a[offset];
+            of.x                       += sA[threadIdx.x].fx;
+            of.y                       += sA[threadIdx.x].fy;
+            of.z                       += sA[threadIdx.x].fz;
+            of.w                       += sA[threadIdx.x].fb;
+            cSim.pForce4a[offset]       = of;
+            cSim.pBornForce[offset]     = af.w;
+#else
+            int offset                  = x + tgx + (y >> GRIDBITS) * cSim.stride;
+            cSim.pForce4a[offset]       = af;
+            cSim.pBornForce[offset]     = af.w;
+            af.x                        = sA[threadIdx.x].fx;
+            af.y                        = sA[threadIdx.x].fy;
+            af.z                        = sA[threadIdx.x].fz;
+            af.w                        = sA[threadIdx.x].fb;
+            offset                      = y + tgx + (x >> GRIDBITS) * cSim.stride;
+            cSim.pForce4a[offset]       = af;
+            cSim.pBornForce[offset]     = af.w;
+#endif
+            lasty = y;
+        }
+        pos++;
+    }
+}
--- a/platforms/cuda/src/kernels/kCalculateObcGbsaBornSum.h
+++ b/platforms/cuda/src/kernels/kCalculateObcGbsaBornSum.h
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+/**
+ * This file contains the kernel for calculating Born sums.  It is included
+ * several times in kCalculateObcGbsaBornSum.cu with different #defines to generate
+ * different versions of the kernels.
+ */
+
+__global__ void METHOD_NAME(kCalculateObcGbsa, BornSum_kernel)(unsigned int* workUnit, int workUnits)
+{
+    extern __shared__ Atom sA[];
+    int end = workUnits / gridDim.x;
+    int pos = end - (threadIdx.x >> GRIDBITS) - 1;
+#ifdef USE_OUTPUT_BUFFER_PER_WARP
+    unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
+#endif
+
+    while (pos >= 0)
+    {
+        // Extract cell coordinates from appropriate work unit
+        unsigned int x = workUnit[pos + (blockIdx.x*workUnits)/gridDim.x];
+        unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
+        x = (x >> 17) << GRIDBITS;
+        float       dx;
+        float       dy;
+        float       dz;
+        float       r2;
+        float       r;
+
+        unsigned int tgx = threadIdx.x & (GRID - 1);
+        unsigned int tbx = threadIdx.x - tgx;
+        int tj = tgx;
+        Atom* psA = &sA[tbx];
+
+        if (x == y) // Handle diagonals uniquely at 50% efficiency
+        {
+            // Read fixed atom data into registers and GRF
+            unsigned int i = x + tgx;
+            float4 apos = cSim.pPosq[i];    // Local atom x, y, z, sum
+            float2 ar = cSim.pObcData[i];   // Local atom vr, sr
+            sA[threadIdx.x].x           = apos.x;
+            sA[threadIdx.x].y           = apos.y;
+            sA[threadIdx.x].z           = apos.z;
+            sA[threadIdx.x].r           = ar.x;
+            sA[threadIdx.x].sr          = ar.y;
+            apos.w                      = 0.0f;
+
+            for (unsigned int j = 0; j < GRID; j++)
+            {
+                dx                      = psA[j].x - apos.x;
+                dy                      = psA[j].y - apos.y;
+                dz                      = psA[j].z - apos.z;
+#ifdef USE_PERIODIC
+                dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
+                dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
+                dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
+#endif
+                r2                      = dx * dx + dy * dy + dz * dz;
+#if defined USE_PERIODIC
+                if (i < cSim.atoms && x+j < cSim.atoms && r2 < cSim.nonbondedCutoffSqr)
+#elif defined USE_CUTOFF
+                if (r2 < cSim.nonbondedCutoffSqr)
+#endif
+                {
+                    r                       = sqrt(r2);
+                    float rInverse          = 1.0f / r;
+                    float rScaledRadiusJ    = r + psA[j].sr;
+                    if ((j != tgx) && (ar.x < rScaledRadiusJ))
+                    {
+                        float l_ij     = 1.0f / max(ar.x, fabs(r - psA[j].sr));
+                        float u_ij     = 1.0f / rScaledRadiusJ;
+                        float l_ij2    = l_ij * l_ij;
+                        float u_ij2    = u_ij * u_ij;
+                        float ratio    = log(u_ij / l_ij);
+                        apos.w        += l_ij -
+                                         u_ij +
+                                         0.25f * r * (u_ij2 - l_ij2) +
+                                         (0.50f * rInverse * ratio) +
+                                         (0.25f * psA[j].sr * psA[j].sr * rInverse) *
+                                         (l_ij2 - u_ij2);
+
+                        if (ar.x < (psA[j].r - r))
+                        {
+                            apos.w += 2.0f * ((1.0f / ar.x) - l_ij);
+                        }
+                    }
+                }
+            }
+
+            // Write results
+#ifdef USE_OUTPUT_BUFFER_PER_WARP
+            int offset = x + tgx + warp*cSim.stride;
+            cSim.pBornSum[offset] += apos.w;
+#else
+            int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
+            cSim.pBornSum[offset] = apos.w;
+#endif
+        }
+        else        // 100% utilization
+        {
+            // Read fixed atom data into registers and GRF
+            int j                           = y + tgx;
+            unsigned int i                  = x + tgx;
+
+            float4 temp                     = cSim.pPosq[j];
+            float2 temp1                    = cSim.pObcData[j];
+            float4 apos                     = cSim.pPosq[i];        // Local atom x, y, z, sum
+            float2 ar                       = cSim.pObcData[i];    // Local atom vr, sr
+            sA[threadIdx.x].x               = temp.x;
+            sA[threadIdx.x].y               = temp.y;
+            sA[threadIdx.x].z               = temp.z;
+            sA[threadIdx.x].r               = temp1.x;
+            sA[threadIdx.x].sr              = temp1.y;
+            sA[threadIdx.x].sum = apos.w    = 0.0f;
+
+            for (unsigned int j = 0; j < GRID; j++)
+            {
+                dx                      = psA[tj].x - apos.x;
+                dy                      = psA[tj].y - apos.y;
+                dz                      = psA[tj].z - apos.z;
+#ifdef USE_PERIODIC
+                dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
+                dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
+                dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
+#endif
+                r2                      = dx * dx + dy * dy + dz * dz;
+#ifdef USE_PERIODIC
+                if (i < cSim.atoms && y+tj < cSim.atoms && r2 < cSim.nonbondedCutoffSqr)
+#elif defined USE_CUTOFF
+                if (r2 < cSim.nonbondedCutoffSqr)
+#endif
+                {
+                    r                       = sqrt(r2);
+                    float rInverse          = 1.0f / r;
+                    float rScaledRadiusJ    = r + psA[tj].sr;
+                    if (ar.x < rScaledRadiusJ)
+                    {
+                        float l_ij     = 1.0f / max(ar.x, fabs(r - psA[tj].sr));
+                        float u_ij     = 1.0f / rScaledRadiusJ;
+                        float l_ij2    = l_ij * l_ij;
+                        float u_ij2    = u_ij * u_ij;
+                        float ratio    = log(u_ij / l_ij);
+                        float term     = l_ij -
+                                         u_ij +
+                                         0.25f * r * (u_ij2 - l_ij2) +
+                                         (0.50f * rInverse * ratio) +
+                                         (0.25f * psA[tj].sr * psA[tj].sr * rInverse) *
+                                         (l_ij2 - u_ij2);
+                        if (ar.x < (psA[tj].sr - r))
+                        {
+                            term += 2.0f * ((1.0f / ar.x) - l_ij);
+                        }
+                        apos.w        += term;
+                    }
+                    float rScaledRadiusI    = r + ar.y;
+                    if (psA[tj].r < rScaledRadiusI)
+                    {
+                        float l_ij     = 1.0f / max(psA[tj].r, fabs(r - ar.y));
+                        float u_ij     = 1.0f / rScaledRadiusI;
+                        float l_ij2    = l_ij * l_ij;
+                        float u_ij2    = u_ij * u_ij;
+                        float ratio    = log(u_ij / l_ij);
+                        float term     = l_ij -
+                                         u_ij +
+                                         0.25f * r * (u_ij2 - l_ij2) +
+                                         (0.50f * rInverse * ratio) +
+                                         (0.25f * ar.y * ar.y * rInverse) *
+                                         (l_ij2 - u_ij2);
+
+                        if (psA[tj].r < (ar.y - r))
+                        {
+                            term += 2.0f * ((1.0f / psA[tj].r) - l_ij);
+                        }
+                        psA[tj].sum    += term;
+                    }
+                }
+                tj = (tj - 1) & (GRID - 1);
+            }
+
+            // Write results
+#ifdef USE_OUTPUT_BUFFER_PER_WARP
+            int offset = x + tgx + warp*cSim.stride;
+            cSim.pBornSum[offset] += apos.w;
+            offset = y + tgx + warp*cSim.stride;
+            cSim.pBornSum[offset] += sA[threadIdx.x].sum;
+#else
+            int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
+            cSim.pBornSum[offset] = apos.w;
+            offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
+            cSim.pBornSum[offset] = sA[threadIdx.x].sum;
+#endif
+        }
+
+        pos -= cSim.nonbond_workBlock;
+    }
+}
--- a/platforms/cuda/src/kernels/kCalculateObcGbsaForces2.h
+++ b/platforms/cuda/src/kernels/kCalculateObcGbsaForces2.h
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+/**
+ * This file contains the kernel for evalauating the second stage of GBSA.  It is included
+ * several times in kCalculateObcGbsaForces2.cu with different #defines to generate
+ * different versions of the kernels.
+ */
+
+__global__ void METHOD_NAME(kCalculateObcGbsa, Forces2_kernel)(unsigned int* workUnit, int numWorkUnits)
+{
+    extern __shared__ Atom sA[];
+    unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
+    unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
+    int pos = warp*numWorkUnits/totalWarps;
+    int end = (warp+1)*numWorkUnits/totalWarps;
+
+    int lasty = -1;
+    while (pos < end)
+    {
+
+        // Extract cell coordinates from appropriate work unit
+        unsigned int x                  = workUnit[pos];
+        unsigned int y                  = ((x >> 2) & 0x7fff) << GRIDBITS;
+        x                               = (x >> 17) << GRIDBITS;
+        unsigned int tgx                = threadIdx.x & (GRID - 1);
+        unsigned int i                  = x + tgx;
+        float4 apos                     = cSim.pPosq[i];
+        float2 a                        = cSim.pObcData[i];
+        float fb                        = cSim.pBornForce[i];
+        unsigned int tbx                = threadIdx.x - tgx;
+        int tj                          = tgx;
+        Atom* psA                       = &sA[tbx];
+        float3 af;
+        sA[threadIdx.x].fx = af.x   = 0.0f;
+        sA[threadIdx.x].fy = af.y   = 0.0f;
+        sA[threadIdx.x].fz = af.z   = 0.0f;
+        if (x == y) // Handle diagonals uniquely at 50% efficiency
+        {
+            // Read fixed atom data into registers and GRF
+
+            sA[threadIdx.x].x           = apos.x;
+            sA[threadIdx.x].y           = apos.y;
+            sA[threadIdx.x].z           = apos.z;
+            sA[threadIdx.x].r           = a.x;
+            sA[threadIdx.x].sr          = a.y;
+            sA[threadIdx.x].sr2         = a.y * a.y;
+            sA[threadIdx.x].fb          = fb;
+
+            for (unsigned int j = (tgx+1)&(GRID-1); j != tgx; j = (j+1)&(GRID-1))
+            {
+                float dx                = psA[j].x - apos.x;
+                float dy                = psA[j].y - apos.y;
+                float dz                = psA[j].z - apos.z;
+#ifdef USE_PERIODIC
+                dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
+                dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
+                dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
+#endif
+                float r2                = dx * dx + dy * dy + dz * dz;
+                float r                 = sqrt(r2);
+
+
+                // Atom I Born forces and sum
+                float rScaledRadiusJ    = r + psA[j].sr;
+
+                float l_ij          = 1.0f / max(a.x, fabs(r - psA[j].sr));
+                float u_ij          = 1.0f / rScaledRadiusJ;
+                float rInverse      = 1.0f / r;
+                float l_ij2         = l_ij * l_ij;
+                float u_ij2         = u_ij * u_ij;
+                float r2Inverse     = rInverse * rInverse;
+                float t1            = log (u_ij / l_ij);
+                float t2            = (l_ij2 - u_ij2);
+                float t3            = t2 * rInverse;
+                t1                 *= rInverse;
+
+                // Born Forces term
+                float term          =  0.125f *
+                                      (1.000f + psA[j].sr2 * r2Inverse) * t3 +
+                                       0.250f * t1 * r2Inverse;
+                float dE            = fb * term;
+
+#if defined USE_PERIODIC
+                if (a.x >= rScaledRadiusJ || i >= cSim.atoms || x+j >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
+#elif defined USE_CUTOFF
+                if (a.x >= rScaledRadiusJ || r2 > cSim.nonbondedCutoffSqr)
+#else
+                if (a.x >= rScaledRadiusJ)
+#endif
+                {
+                    dE              = 0.0f;
+                }
+                float d             = dx * dE;
+                af.x               -= d;
+                psA[j].fx          += d;
+                d                   = dy * dE;
+                af.y               -= d;
+                psA[j].fy          += d;
+                d                   = dz * dE;
+                af.z               -= d;
+                psA[j].fz          += d;
+            }
+
+            // Write results
+            float4 of;
+#ifdef USE_OUTPUT_BUFFER_PER_WARP
+            int offset                  = x + tgx + warp*cSim.stride;
+            of                          = cSim.pForce4b[offset];
+            of.x                       += af.x + sA[threadIdx.x].fx;
+            of.y                       += af.y + sA[threadIdx.x].fy;
+            of.z                       += af.z + sA[threadIdx.x].fz;
+            cSim.pForce4b[offset]       = of;
+#else
+            int offset                  = x + tgx + (x >> GRIDBITS) * cSim.stride;
+            of.x                        = af.x + sA[threadIdx.x].fx;
+            of.y                        = af.y + sA[threadIdx.x].fy;
+            of.z                        = af.z + sA[threadIdx.x].fz;
+            of.w                        = 0.0f;
+            cSim.pForce4b[offset]       = of;
+#endif
+        }
+        else
+        {
+            // Read fixed atom data into registers and GRF
+            if (lasty != y)
+            {
+                int j                       = y + tgx;
+                float4 temp                 = cSim.pPosq[j];
+                float2 temp1                = cSim.pObcData[j];
+                sA[threadIdx.x].fb          = cSim.pBornForce[j];
+                sA[threadIdx.x].x           = temp.x;
+                sA[threadIdx.x].y           = temp.y;
+                sA[threadIdx.x].z           = temp.z;
+                sA[threadIdx.x].r           = temp1.x;
+                sA[threadIdx.x].sr          = temp1.y;
+                sA[threadIdx.x].sr2         = temp1.y * temp1.y;
+            }
+            float sr2                   = a.y * a.y;
+            for (int j = 0; j < GRID; j++)
+            {
+                float dx                = psA[tj].x - apos.x;
+                float dy                = psA[tj].y - apos.y;
+                float dz                = psA[tj].z - apos.z;
+#ifdef USE_PERIODIC
+                dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
+                dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
+                dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
+#endif
+                float r2                = dx * dx + dy * dy + dz * dz;
+                float r                 = sqrt(r2);
+
+                // Interleaved Atom I and J Born Forces and sum components
+                float r2Inverse         = 1.0f / r2;
+                float rScaledRadiusJ    = r + psA[tj].sr;
+                float rScaledRadiusI    = r + a.y;
+                float rInverse          = 1.0f / r;
+                float l_ijJ             = 1.0f / max(a.x, fabs(r - psA[tj].sr));
+                float l_ijI             = 1.0f / max(psA[tj].r, fabs(r - a.y));
+                float u_ijJ             = 1.0f / rScaledRadiusJ;
+                float u_ijI             = 1.0f / rScaledRadiusI;
+                float l_ij2J            = l_ijJ * l_ijJ;
+                float l_ij2I            = l_ijI * l_ijI;
+                float u_ij2J            = u_ijJ * u_ijJ;
+                float u_ij2I            = u_ijI * u_ijI;
+                float t1J               = log (u_ijJ / l_ijJ);
+                float t1I               = log (u_ijI / l_ijI);
+                float t2J               = (l_ij2J - u_ij2J);
+                float t2I               = (l_ij2I - u_ij2I);
+                float t3J               = t2J * rInverse;
+                float t3I               = t2I * rInverse;
+                t1J                    *= rInverse;
+                t1I                    *= rInverse;
+
+                // Born Forces term
+                float term              =  0.125f *
+                                          (1.000f + psA[tj].sr2 * r2Inverse) * t3J +
+                                           0.250f * t1J * r2Inverse;
+                float dE                = fb * term;
+
+#if defined USE_PERIODIC
+                if (a.x >= rScaledRadiusJ || i >= cSim.atoms || y+tj >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
+#elif defined USE_CUTOFF
+                if (a.x >= rScaledRadiusJ || r2 > cSim.nonbondedCutoffSqr)
+#else
+                if (a.x >= rScaledRadiusJ)
+#endif
+                {
+                    dE                  = 0.0f;
+                }
+
+                float d                 = dx * dE;
+                af.x                   -= d;
+                psA[tj].fx             += d;
+                d                       = dy * dE;
+                af.y                   -= d;
+                psA[tj].fy             += d;
+                d                       = dz * dE;
+                af.z                   -= d;
+                psA[tj].fz             += d;
+
+                // Atom J Born sum term
+                term                    =  0.125f *
+                                          (1.000f + sr2 * r2Inverse) * t3I +
+                                           0.250f * t1I * r2Inverse;
+                dE                      = psA[tj].fb * term;
+
+#ifdef USE_PERIODIC
+                if (psA[tj].r >= rScaledRadiusI || i >= cSim.atoms || y+tj >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
+#elif defined USE_CUTOFF
+                if (psA[tj].r >= rScaledRadiusI || r2 > cSim.nonbondedCutoffSqr)
+#else
+                if (psA[tj].r >= rScaledRadiusI)
+#endif
+                {
+                    dE                  = 0.0f;
+                }
+                dx                     *= dE;
+                dy                     *= dE;
+                dz                     *= dE;
+                psA[tj].fx             += dx;
+                psA[tj].fy             += dy;
+                psA[tj].fz             += dz;
+                af.x                   -= dx;
+                af.y                   -= dy;
+                af.z                   -= dz;
+                tj                      = (tj + 1) & (GRID - 1);
+            }
+
+            // Write results
+            float4 of;
+#ifdef USE_OUTPUT_BUFFER_PER_WARP
+            int offset                  = x + tgx + warp*cSim.stride;
+            of                          = cSim.pForce4b[offset];
+            of.x                       += af.x;
+            of.y                       += af.y;
+            of.z                       += af.z;
+            cSim.pForce4b[offset]       = of;
+            offset                      = y + tgx + warp*cSim.stride;
+            of                          = cSim.pForce4b[offset];
+            of.x                       += sA[threadIdx.x].fx;
+            of.y                       += sA[threadIdx.x].fy;
+            of.z                       += sA[threadIdx.x].fz;
+            cSim.pForce4b[offset]       = of;
+#else
+            int offset                  = x + tgx + (y >> GRIDBITS) * cSim.stride;
+            of.x                        = af.x;
+            of.y                        = af.y;
+            of.z                        = af.z;
+            of.w                        = 0.0f;
+            cSim.pForce4b[offset]       = of;
+            offset                      = y + tgx + (x >> GRIDBITS) * cSim.stride;
+            of.x                        = sA[threadIdx.x].fx;
+            of.y                        = sA[threadIdx.x].fy;
+            of.z                        = sA[threadIdx.x].fz;
+            cSim.pForce4b[offset]       = of;
+#endif
+        }
+        lasty = y;
+        pos++;
+    }
+}
--- a/platforms/cuda/src/kernels/kFindInteractingBlocks.h
+++ b/platforms/cuda/src/kernels/kFindInteractingBlocks.h
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+/**
+ * This file contains the kernels for identifying interacting blocks.  It is included
+ * several times in kCalculateCDLJForces.cu with different #defines to generate
+ * different versions of the kernels.
+ */
+
+/**
+ * Find a bounding box for the atoms in each block.
+ */
+__global__ void METHOD_NAME(kFindBlockBounds, _kernel)()
+{
+    unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int base = pos << GRIDBITS;
+    if (base < cSim.atoms)
+    {
+        float4 apos = cSim.pPosq[base];
+#ifdef USE_PERIODIC
+        apos.x -= floor(apos.x/cSim.periodicBoxSizeX)*cSim.periodicBoxSizeX;
+        apos.y -= floor(apos.y/cSim.periodicBoxSizeY)*cSim.periodicBoxSizeY;
+        apos.z -= floor(apos.z/cSim.periodicBoxSizeZ)*cSim.periodicBoxSizeZ;
+        float4 firstPoint = apos;
+#endif
+        float minx = apos.x;
+        float maxx = apos.x;
+        float miny = apos.y;
+        float maxy = apos.y;
+        float minz = apos.z;
+        float maxz = apos.z;
+        for (unsigned int i = 1; i < GRID; i++)
+        {
+            apos = cSim.pPosq[base+i];
+#ifdef USE_PERIODIC
+            apos.x -= floor((apos.x-firstPoint.x)/cSim.periodicBoxSizeX+0.5)*cSim.periodicBoxSizeX;
+            apos.y -= floor((apos.y-firstPoint.y)/cSim.periodicBoxSizeY+0.5)*cSim.periodicBoxSizeY;
+            apos.z -= floor((apos.z-firstPoint.z)/cSim.periodicBoxSizeZ+0.5)*cSim.periodicBoxSizeZ;
+#endif
+            minx = min(minx, apos.x);
+            maxx = max(maxx, apos.x);
+            miny = min(miny, apos.y);
+            maxy = max(maxy, apos.y);
+            minz = min(minz, apos.z);
+            maxz = max(maxz, apos.z);
+        }
+        cSim.pGridBoundingBox[pos] = make_float4(0.5f*(maxx-minx), 0.5f*(maxy-miny), 0.5f*(maxz-minz), 0);
+        cSim.pGridCenter[pos] = make_float4(0.5f*(maxx+minx), 0.5f*(maxy+miny), 0.5f*(maxz+minz), 0);
+    }
+}
+
+/**
+ * Compare the bounding boxes for each pair of blocks.  If they are sufficiently far apart,
+ * mark them as non-interacting.
+ */
+__global__ void METHOD_NAME(kFindBlocksWithInteractions, _kernel)()
+{
+    unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
+    if (pos < cSim.workUnits)
+    {
+        // Extract cell coordinates from appropriate work unit
+        
+        unsigned int x = cSim.pWorkUnit[pos];
+        unsigned int y = ((x >> 2) & 0x7fff);
+        x = (x >> 17);
+
+        // Find the distance between the bounding boxes of the two cells.
+
+        float4 centera = cSim.pGridCenter[x];
+        float4 centerb = cSim.pGridCenter[y];
+        float dx = centera.x-centerb.x;
+        float dy = centera.y-centerb.y;
+        float dz = centera.z-centerb.z;
+#ifdef USE_PERIODIC
+        dx -= floor(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
+        dy -= floor(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
+        dz -= floor(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
+#endif
+        float4 boxSizea = cSim.pGridBoundingBox[x];
+        float4 boxSizeb = cSim.pGridBoundingBox[y];
+        dx = max(0.0f, abs(dx)-boxSizea.x-boxSizeb.x);
+        dy = max(0.0f, abs(dy)-boxSizea.y-boxSizeb.y);
+        dz = max(0.0f, abs(dz)-boxSizea.z-boxSizeb.z);
+        cSim.pInteractionFlag[pos] = (dx*dx+dy*dy+dz*dz > cSim.nonbondedCutoffSqr ? 0 : 1);
+    }
+}
--- a/platforms/cuda/src/kernels/kSettle.cu
+++ b/platforms/cuda/src/kernels/kSettle.cu
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Authors: Scott Le Grand, Peter Eastman                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#include <cuda.h>
+#include <vector_functions.h>
+#include <cstdlib>
+#include <string>
+#include <iostream>
+//#include <fstream>
+using namespace std;
+
+#define DeltaShake
+
+#include "gputypes.h"
+
+
+static __constant__ cudaGmxSimulation cSim;
+
+void SetSettleSim(gpuContext gpu)
+{
+    cudaError_t status;
+    status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
+    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
+}
+
+void GetSettleSim(gpuContext gpu)
+{
+    cudaError_t status;
+    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
+    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
+}
+
+/**
+ * This is based on the setlep FORTRAN routine by Shuichi Miyamoto.  See
+ * S. Miyamoto and P. Kollman, J. Comp. Chem., vol 13, no. 8, pp. 952-962 (1992).
+ */
+
+__global__ void kApplyFirstSettle_kernel()
+{
+    unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
+    while (pos < cSim.settleConstraints)
+    {
+        int4 atomID         = cSim.pSettleID[pos];
+        float2 params       = cSim.pSettleParameter[pos];
+        float4 apos0        = cSim.pOldPosq[atomID.x];
+        float4 xp0          = cSim.pPosqP[atomID.x];
+        float4 apos1        = cSim.pOldPosq[atomID.y];
+        float4 xp1          = cSim.pPosqP[atomID.y];
+        float4 apos2        = cSim.pOldPosq[atomID.z];
+        float4 xp2          = cSim.pPosqP[atomID.z];
+        float m0            = 1.0/cSim.pVelm4[atomID.x].w;
+        float m1            = 1.0/cSim.pVelm4[atomID.y].w;
+        float m2            = 1.0/cSim.pVelm4[atomID.z].w;
+
+
+        float xb0 = apos1.x-apos0.x;
+        float yb0 = apos1.y-apos0.y;
+        float zb0 = apos1.z-apos0.z;
+        float xc0 = apos2.x-apos0.x;
+        float yc0 = apos2.y-apos0.y;
+        float zc0 = apos2.z-apos0.z;
+        float dist1 = sqrt(xb0*xb0 + yb0*yb0 + zb0*zb0);
+        float dist2 = sqrt(xc0*xc0 + yc0*yc0 + zc0*zc0);
+
+        float totalMass = m0+m1+m2;
+        float xcom = ((apos0.x+xp0.x)*m0 + (apos1.x+xp1.x)*m1 + (apos2.x+xp2.x)*m2) / totalMass;
+        float ycom = ((apos0.y+xp0.y)*m0 + (apos1.y+xp1.y)*m1 + (apos2.y+xp2.y)*m2) / totalMass;
+        float zcom = ((apos0.z+xp0.z)*m0 + (apos1.z+xp1.z)*m1 + (apos2.z+xp2.z)*m2) / totalMass;
+
+        float xa1 = apos0.x + xp0.x - xcom;
+        float ya1 = apos0.y + xp0.y - ycom;
+        float za1 = apos0.z + xp0.z - zcom;
+        float xb1 = apos1.x + xp1.x - xcom;
+        float yb1 = apos1.y + xp1.y - ycom;
+        float zb1 = apos1.z + xp1.z - zcom;
+        float xc1 = apos2.x + xp2.x - xcom;
+        float yc1 = apos2.y + xp2.y - ycom;
+        float zc1 = apos2.z + xp2.z - zcom;
+
+        float xaksZd = yb0*zc0 - zb0*yc0;
+        float yaksZd = zb0*xc0 - xb0*zc0;
+        float zaksZd = xb0*yc0 - yb0*xc0;
+        float xaksXd = ya1*zaksZd - za1*yaksZd;
+        float yaksXd = za1*xaksZd - xa1*zaksZd;
+        float zaksXd = xa1*yaksZd - ya1*xaksZd;
+        float xaksYd = yaksZd*zaksXd - zaksZd*yaksXd;
+        float yaksYd = zaksZd*xaksXd - xaksZd*zaksXd;
+        float zaksYd = xaksZd*yaksXd - yaksZd*xaksXd;
+
+        float axlng = sqrt(xaksXd * xaksXd + yaksXd * yaksXd + zaksXd * zaksXd);
+        float aylng = sqrt(xaksYd * xaksYd + yaksYd * yaksYd + zaksYd * zaksYd);
+        float azlng = sqrt(xaksZd * xaksZd + yaksZd * yaksZd + zaksZd * zaksZd);
+        float trns11 = xaksXd / axlng;
+        float trns21 = yaksXd / axlng;
+        float trns31 = zaksXd / axlng;
+        float trns12 = xaksYd / aylng;
+        float trns22 = yaksYd / aylng;
+        float trns32 = zaksYd / aylng;
+        float trns13 = xaksZd / azlng;
+        float trns23 = yaksZd / azlng;
+        float trns33 = zaksZd / azlng;
+
+        float xb0d = trns11*xb0 + trns21*yb0 + trns31*zb0;
+        float yb0d = trns12*xb0 + trns22*yb0 + trns32*zb0;
+        float xc0d = trns11*xc0 + trns21*yc0 + trns31*zc0;
+        float yc0d = trns12*xc0 + trns22*yc0 + trns32*zc0;
+//        float xa1d = trns11*xa1 + trns21*ya1 + trns31*za1;
+//        float ya1d = trns12*xa1 + trns22*ya1 + trns32*za1;
+        float za1d = trns13*xa1 + trns23*ya1 + trns33*za1;
+        float  xb1d = trns11*xb1 + trns21*yb1 + trns31*zb1;
+        float  yb1d = trns12*xb1 + trns22*yb1 + trns32*zb1;
+        float  zb1d = trns13*xb1 + trns23*yb1 + trns33*zb1;
+        float  xc1d = trns11*xc1 + trns21*yc1 + trns31*zc1;
+        float  yc1d = trns12*xc1 + trns22*yc1 + trns32*zc1;
+        float  zc1d = trns13*xc1 + trns23*yc1 + trns33*zc1;
+
+        //                                        --- Step2  A2' ---
+
+        float rc = 0.5*params.y;
+        float rb = sqrt(params.x*params.x-rc*rc);
+        float ra = rb*(m1+m2)/totalMass;
+        rb -= ra;
+        float sinphi = za1d / ra;
+        float cosphi = sqrt(1.0 - sinphi*sinphi);
+        float sinpsi = ( zb1d - zc1d ) / (2*rc*cosphi);
+        float cospsi = sqrt(1.0 - sinpsi*sinpsi);
+
+        float ya2d =   ra * cosphi;
+        float xb2d = - rc * cospsi;
+        float yb2d = - rb * cosphi - rc *sinpsi * sinphi;
+        float yc2d = - rb * cosphi + rc *sinpsi * sinphi;
+        float xb2d2 = xb2d * xb2d;
+        float hh2 = 4.0 * xb2d2 + (yb2d-yc2d) * (yb2d-yc2d) + (zb1d-zc1d) * (zb1d-zc1d);
+        float deltx = 2.0 * xb2d + sqrt ( 4.0 * xb2d2 - hh2 + params.y*params.y );
+        xb2d -= deltx * 0.5;
+
+        //                                        --- Step3  al,be,ga ---
+
+        float alpa = ( xb2d * (xb0d-xc0d) + yb0d * yb2d + yc0d * yc2d );
+        float beta = ( xb2d * (yc0d-yb0d) + xb0d * yb2d + xc0d * yc2d );
+        float gama = xb0d * yb1d - xb1d * yb0d + xc0d * yc1d - xc1d * yc0d;
+
+        float al2be2 = alpa * alpa + beta * beta;
+        float sinthe = ( alpa*gama - beta * sqrt ( al2be2 - gama * gama ) ) / al2be2;
+
+        //                                        --- Step4  A3' ---
+
+        float costhe = sqrt (1.0 - sinthe * sinthe );
+        float xa3d = - ya2d * sinthe;
+        float ya3d =   ya2d * costhe;
+        float za3d = za1d;
+        float xb3d =   xb2d * costhe - yb2d * sinthe;
+        float yb3d =   xb2d * sinthe + yb2d * costhe;
+        float zb3d = zb1d;
+        float xc3d = - xb2d * costhe - yc2d * sinthe;
+        float yc3d = - xb2d * sinthe + yc2d * costhe;
+        float zc3d = zc1d;
+
+        //                                        --- Step5  A3 ---
+
+        float xa3 = trns11*xa3d + trns12*ya3d + trns13*za3d;
+        float ya3 = trns21*xa3d + trns22*ya3d + trns23*za3d;
+        float za3 = trns31*xa3d + trns32*ya3d + trns33*za3d;
+        float xb3 = trns11*xb3d + trns12*yb3d + trns13*zb3d;
+        float yb3 = trns21*xb3d + trns22*yb3d + trns23*zb3d;
+        float zb3 = trns31*xb3d + trns32*yb3d + trns33*zb3d;
+        float xc3 = trns11*xc3d + trns12*yc3d + trns13*zc3d;
+        float yc3 = trns21*xc3d + trns22*yc3d + trns23*zc3d;
+        float zc3 = trns31*xc3d + trns32*yc3d + trns33*zc3d;
+
+        xp0.x = xcom + xa3 - apos0.x;
+        xp0.y = ycom + ya3 - apos0.y;
+        xp0.z = zcom + za3 - apos0.z;
+        xp1.x = xcom + xb3 - apos1.x;
+        xp1.y = ycom + yb3 - apos1.y;
+        xp1.z = zcom + zb3 - apos1.z;
+        xp2.x = xcom + xc3 - apos2.x;
+        xp2.y = ycom + yc3 - apos2.y;
+        xp2.z = zcom + zc3 - apos2.z;
+
+
+        cSim.pPosqP[atomID.x] = xp0;
+        cSim.pPosqP[atomID.y] = xp1;
+        cSim.pPosqP[atomID.z] = xp2;
+
+        pos += blockDim.x * gridDim.x;
+    }
+}
+
+void kApplyFirstSettle(gpuContext gpu)
+{
+//    printf("kApplyFirstSettle\n");
+    if (gpu->sim.settleConstraints > 0)
+    {
+        kApplyFirstSettle_kernel<<<gpu->sim.blocks, gpu->sim.settle_threads_per_block>>>();
+        LAUNCHERROR("kApplyFirstSettle");
+    }
+}
+
+__global__ void kApplySecondSettle_kernel()
+{
+    unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
+    while (pos < cSim.settleConstraints)
+    {
+        int4 atomID         = cSim.pSettleID[pos];
+        float2 params       = cSim.pSettleParameter[pos];
+        float4 apos0        = cSim.pOldPosq[atomID.x];
+        float4 xp0          = cSim.pPosq[atomID.x];
+        float4 apos1        = cSim.pOldPosq[atomID.y];
+        float4 xp1          = cSim.pPosq[atomID.y];
+        float4 apos2        = cSim.pOldPosq[atomID.z];
+        float4 xp2          = cSim.pPosq[atomID.z];
+        float m0            = 1.0/cSim.pVelm4[atomID.x].w;
+        float m1            = 1.0/cSim.pVelm4[atomID.y].w;
+        float m2            = 1.0/cSim.pVelm4[atomID.z].w;
+
+
+        float xb0 = apos1.x-apos0.x;
+        float yb0 = apos1.y-apos0.y;
+        float zb0 = apos1.z-apos0.z;
+        float xc0 = apos2.x-apos0.x;
+        float yc0 = apos2.y-apos0.y;
+        float zc0 = apos2.z-apos0.z;
+        float dist1 = sqrt(xb0*xb0 + yb0*yb0 + zb0*zb0);
+        float dist2 = sqrt(xc0*xc0 + yc0*yc0 + zc0*zc0);
+
+        float totalMass = m0+m1+m2;
+        float xcom = ((apos0.x+xp0.x)*m0 + (apos1.x+xp1.x)*m1 + (apos2.x+xp2.x)*m2) / totalMass;
+        float ycom = ((apos0.y+xp0.y)*m0 + (apos1.y+xp1.y)*m1 + (apos2.y+xp2.y)*m2) / totalMass;
+        float zcom = ((apos0.z+xp0.z)*m0 + (apos1.z+xp1.z)*m1 + (apos2.z+xp2.z)*m2) / totalMass;
+
+        float xa1 = apos0.x + xp0.x - xcom;
+        float ya1 = apos0.y + xp0.y - ycom;
+        float za1 = apos0.z + xp0.z - zcom;
+        float xb1 = apos1.x + xp1.x - xcom;
+        float yb1 = apos1.y + xp1.y - ycom;
+        float zb1 = apos1.z + xp1.z - zcom;
+        float xc1 = apos2.x + xp2.x - xcom;
+        float yc1 = apos2.y + xp2.y - ycom;
+        float zc1 = apos2.z + xp2.z - zcom;
+
+        float xaksZd = yb0*zc0 - zb0*yc0;
+        float yaksZd = zb0*xc0 - xb0*zc0;
+        float zaksZd = xb0*yc0 - yb0*xc0;
+        float xaksXd = ya1*zaksZd - za1*yaksZd;
+        float yaksXd = za1*xaksZd - xa1*zaksZd;
+        float zaksXd = xa1*yaksZd - ya1*xaksZd;
+        float xaksYd = yaksZd*zaksXd - zaksZd*yaksXd;
+        float yaksYd = zaksZd*xaksXd - xaksZd*zaksXd;
+        float zaksYd = xaksZd*yaksXd - yaksZd*xaksXd;
+
+        float axlng = sqrt(xaksXd * xaksXd + yaksXd * yaksXd + zaksXd * zaksXd);
+        float aylng = sqrt(xaksYd * xaksYd + yaksYd * yaksYd + zaksYd * zaksYd);
+        float azlng = sqrt(xaksZd * xaksZd + yaksZd * yaksZd + zaksZd * zaksZd);
+        float trns11 = xaksXd / axlng;
+        float trns21 = yaksXd / axlng;
+        float trns31 = zaksXd / axlng;
+        float trns12 = xaksYd / aylng;
+        float trns22 = yaksYd / aylng;
+        float trns32 = zaksYd / aylng;
+        float trns13 = xaksZd / azlng;
+        float trns23 = yaksZd / azlng;
+        float trns33 = zaksZd / azlng;
+
+        float xb0d = trns11*xb0 + trns21*yb0 + trns31*zb0;
+        float yb0d = trns12*xb0 + trns22*yb0 + trns32*zb0;
+        float xc0d = trns11*xc0 + trns21*yc0 + trns31*zc0;
+        float yc0d = trns12*xc0 + trns22*yc0 + trns32*zc0;
+//        float xa1d = trns11*xa1 + trns21*ya1 + trns31*za1;
+//        float ya1d = trns12*xa1 + trns22*ya1 + trns32*za1;
+        float za1d = trns13*xa1 + trns23*ya1 + trns33*za1;
+        float  xb1d = trns11*xb1 + trns21*yb1 + trns31*zb1;
+        float  yb1d = trns12*xb1 + trns22*yb1 + trns32*zb1;
+        float  zb1d = trns13*xb1 + trns23*yb1 + trns33*zb1;
+        float  xc1d = trns11*xc1 + trns21*yc1 + trns31*zc1;
+        float  yc1d = trns12*xc1 + trns22*yc1 + trns32*zc1;
+        float  zc1d = trns13*xc1 + trns23*yc1 + trns33*zc1;
+
+        //                                        --- Step2  A2' ---
+
+        float rc = 0.5*params.y;
+        float rb = sqrt(params.x*params.x-rc*rc);
+        float ra = rb*(m1+m2)/totalMass;
+        rb -= ra;
+        float sinphi = za1d / ra;
+        float cosphi = sqrt(1.0 - sinphi*sinphi);
+        float sinpsi = ( zb1d - zc1d ) / (2*rc*cosphi);
+        float cospsi = sqrt(1.0 - sinpsi*sinpsi);
+
+        float ya2d =   ra * cosphi;
+        float xb2d = - rc * cospsi;
+        float yb2d = - rb * cosphi - rc *sinpsi * sinphi;
+        float yc2d = - rb * cosphi + rc *sinpsi * sinphi;
+        float xb2d2 = xb2d * xb2d;
+        float hh2 = 4.0 * xb2d2 + (yb2d-yc2d) * (yb2d-yc2d) + (zb1d-zc1d) * (zb1d-zc1d);
+        float deltx = 2.0 * xb2d + sqrt ( 4.0 * xb2d2 - hh2 + params.y*params.y );
+        xb2d -= deltx * 0.5;
+
+        //                                        --- Step3  al,be,ga ---
+
+        float alpa = ( xb2d * (xb0d-xc0d) + yb0d * yb2d + yc0d * yc2d );
+        float beta = ( xb2d * (yc0d-yb0d) + xb0d * yb2d + xc0d * yc2d );
+        float gama = xb0d * yb1d - xb1d * yb0d + xc0d * yc1d - xc1d * yc0d;
+
+        float al2be2 = alpa * alpa + beta * beta;
+        float sinthe = ( alpa*gama - beta * sqrt ( al2be2 - gama * gama ) ) / al2be2;
+
+        //                                        --- Step4  A3' ---
+
+        float costhe = sqrt (1.0 - sinthe * sinthe );
+        float xa3d = - ya2d * sinthe;
+        float ya3d =   ya2d * costhe;
+        float za3d = za1d;
+        float xb3d =   xb2d * costhe - yb2d * sinthe;
+        float yb3d =   xb2d * sinthe + yb2d * costhe;
+        float zb3d = zb1d;
+        float xc3d = - xb2d * costhe - yc2d * sinthe;
+        float yc3d = - xb2d * sinthe + yc2d * costhe;
+        float zc3d = zc1d;
+
+        //                                        --- Step5  A3 ---
+
+        float xa3 = trns11*xa3d + trns12*ya3d + trns13*za3d;
+        float ya3 = trns21*xa3d + trns22*ya3d + trns23*za3d;
+        float za3 = trns31*xa3d + trns32*ya3d + trns33*za3d;
+        float xb3 = trns11*xb3d + trns12*yb3d + trns13*zb3d;
+        float yb3 = trns21*xb3d + trns22*yb3d + trns23*zb3d;
+        float zb3 = trns31*xb3d + trns32*yb3d + trns33*zb3d;
+        float xc3 = trns11*xc3d + trns12*yc3d + trns13*zc3d;
+        float yc3 = trns21*xc3d + trns22*yc3d + trns23*zc3d;
+        float zc3 = trns31*xc3d + trns32*yc3d + trns33*zc3d;
+
+        xp0.x = xcom + xa3;
+        xp0.y = ycom + ya3;
+        xp0.z = zcom + za3;
+        xp1.x = xcom + xb3;
+        xp1.y = ycom + yb3;
+        xp1.z = zcom + zb3;
+        xp2.x = xcom + xc3;
+        xp2.y = ycom + yc3;
+        xp2.z = zcom + zc3;
+
+
+        cSim.pPosq[atomID.x] = xp0;
+        cSim.pPosq[atomID.y] = xp1;
+        cSim.pPosq[atomID.z] = xp2;
+
+        pos += blockDim.x * gridDim.x;
+    }
+}
+
+void kApplySecondSettle(gpuContext gpu)
+{
+//    printf("kApplySecondSettle\n");
+    if (gpu->sim.settleConstraints > 0)
+    {
+        kApplySecondSettle_kernel<<<gpu->sim.blocks, gpu->sim.settle_threads_per_block>>>();
+        LAUNCHERROR("kApplySecondSettle");
+    }
+}