Deleted the old CUDA platform

2e451b9d · Peter Eastman · 352e2fc7 · 352e2fc7 · 352e2fc7 · 352e2fc7
Commit 2e451b9d authored Dec 13, 2012 by Peter Eastman
20 changed files
--- a/platforms/cuda-old/src/kernels/kCalculateGBVIForces2.cu
+++ b/platforms/cuda-old/src/kernels/kCalculateGBVIForces2.cu
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * Permission is hereby granted, free of charge, to any person obtaining a    *
- * copy of this software and associated documentation files (the "Software"), *
- * to deal in the Software without restriction, including without limitation  *
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
- * and/or sell copies of the Software, and to permit persons to whom the      *
- * Software is furnished to do so, subject to the following conditions:       *
- *                                                                            *
- * The above copyright notice and this permission notice shall be included in *
- * all copies or substantial portions of the Software.                        *
- *                                                                            *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
- * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
- * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
- * -------------------------------------------------------------------------- */
-#include <stdio.h>
-#include <cuda.h>
-#include <vector_functions.h>
-#include <cstdlib>
-#include <string>
-#include <iostream>
-#include <fstream>
-#include <sstream>
-using namespace std;
-#include "gputypes.h"
-#include "cudaKernels.h"
-struct Atom {
-    float x;
-    float y;
-    float z;
-    float r;
-    float sr;
-    float fx;
-    float fy;
-    float fz;
-    float fb;
-};
-static __constant__ cudaGmxSimulation cSim;
-void SetCalculateGBVIForces2Sim(gpuContext gpu)
-{
-    cudaError_t status;
-    status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));     
-    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
-}
-void GetCalculateGBVIForces2Sim(gpuContext gpu)
-{
-    cudaError_t status;
-    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));     
-    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
-}
-#include "kCalculateGBVIAux.h"
-// Include versions of the kernels for N^2 calculations.
-#define METHOD_NAME(a, b) a##N2##b
-#include "kCalculateGBVIForces2.h"
-#define USE_OUTPUT_BUFFER_PER_WARP
-#undef METHOD_NAME
-#define METHOD_NAME(a, b) a##N2ByWarp##b
-#include "kCalculateGBVIForces2.h"
-// Include versions of the kernels with cutoffs.
-#undef METHOD_NAME
-#undef USE_OUTPUT_BUFFER_PER_WARP
-#define USE_CUTOFF
-#define METHOD_NAME(a, b) a##Cutoff##b
-#include "kCalculateGBVIForces2.h"
-#define USE_OUTPUT_BUFFER_PER_WARP
-#undef METHOD_NAME
-#define METHOD_NAME(a, b) a##CutoffByWarp##b
-#include "kCalculateGBVIForces2.h"
-// Include versions of the kernels with periodic boundary conditions.
-#undef METHOD_NAME
-#undef USE_OUTPUT_BUFFER_PER_WARP
-#define USE_PERIODIC
-#define METHOD_NAME(a, b) a##Periodic##b
-#include "kCalculateGBVIForces2.h"
-#define USE_OUTPUT_BUFFER_PER_WARP
-#undef METHOD_NAME
-#define METHOD_NAME(a, b) a##PeriodicByWarp##b
-#include "kCalculateGBVIForces2.h"
-void kCalculateGBVIForces2(gpuContext gpu)
-{
-    switch (gpu->sim.nonbondedMethod)
-    {
-        case NO_CUTOFF:
-            if (gpu->bOutputBufferPerWarp)
-                kCalculateGBVIN2ByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
-                        sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pWorkUnit );
-            else
-                kCalculateGBVIN2Forces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
-                        sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pWorkUnit );
-            break;
-        case CUTOFF:
-            if (gpu->bOutputBufferPerWarp)
-                kCalculateGBVICutoffByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
-                        (sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit );
-            else
-                kCalculateGBVICutoffForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
-                        (sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
-            break;
-        case PERIODIC:
-            if (gpu->bOutputBufferPerWarp)
-                kCalculateGBVIPeriodicByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
-                        (sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit );
-            else
-                kCalculateGBVIPeriodicForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
-                        (sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit );
-            break;
-    }
-    LAUNCHERROR("kCalculateGBVIForces2");
-    //kPrintGBVI( gpu, "kCalculateGBVIForces2", 0, stderr);
-}
--- a/platforms/cuda-old/src/kernels/kCalculateGBVIForces2.h
+++ b/platforms/cuda-old/src/kernels/kCalculateGBVIForces2.h
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * Permission is hereby granted, free of charge, to any person obtaining a    *
- * copy of this software and associated documentation files (the "Software"), *
- * to deal in the Software without restriction, including without limitation  *
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
- * and/or sell copies of the Software, and to permit persons to whom the      *
- * Software is furnished to do so, subject to the following conditions:       *
- *                                                                            *
- * The above copyright notice and this permission notice shall be included in *
- * all copies or substantial portions of the Software.                        *
- *                                                                            *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
- * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
- * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
- * -------------------------------------------------------------------------- */
-#include "kCalculateGBVIAux.h"
-/**
- * This file contains the kernel for evalauating the second stage of GBSA.  It is included
- * several times in kCalculateGBVIForces2.cu with different #defines to generate
- * different versions of the kernels.
- */
-__global__ void 
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(GF1XX_BORNFORCE2_THREADS_PER_BLOCK, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(GT2XX_BORNFORCE2_THREADS_PER_BLOCK, 1)
-#else
-__launch_bounds__(G8X_BORNFORCE2_THREADS_PER_BLOCK, 1)
-#endif
-METHOD_NAME(kCalculateGBVI, Forces2_kernel)(unsigned int* workUnit )
-{
-    extern __shared__ volatile Atom sA[];
-    unsigned int totalWarps   = cSim.bornForce2_blocks*cSim.bornForce2_threads_per_block/GRID;
-    unsigned int warp         = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
-    unsigned int numWorkUnits = cSim.pInteractionCount[0];
-    unsigned int pos          = warp*numWorkUnits/totalWarps;
-    unsigned int end          = (warp+1)*numWorkUnits/totalWarps;
-#ifdef USE_CUTOFF
-    volatile float3* tempBuffer = (float3*) &sA[cSim.bornForce2_threads_per_block];
-#endif
-    unsigned int lasty = -0xFFFFFFFF;
-    while (pos < end)
-    {
-        // Extract cell coordinates from appropriate work unit
-        unsigned int x                  = workUnit[pos];
-        unsigned int y                  = ((x >> 2) & 0x7fff) << GRIDBITS;
-        x                               = (x >> 17) << GRIDBITS;
-        unsigned int tgx                = threadIdx.x & (GRID - 1);
-        unsigned int i                  = x + tgx;
-        float4 apos                     = cSim.pPosq[i];
-        float4 ar                       = cSim.pGBVIData[i];
-        float fb                        = cSim.pBornForce[i];
-        unsigned int tbx                = threadIdx.x - tgx;
-        unsigned int tj                 = tgx;
-        volatile Atom* psA              = &sA[tbx];
-        float3 af;
-        sA[threadIdx.x].fx = af.x   = 0.0f;
-        sA[threadIdx.x].fy = af.y   = 0.0f;
-        sA[threadIdx.x].fz = af.z   = 0.0f;
-        if (x == y) // Handle diagonals uniquely at 50% efficiency
-        {
-            // Read fixed atom data into registers and GRF
-            sA[threadIdx.x].x           = apos.x;
-            sA[threadIdx.x].y           = apos.y;
-            sA[threadIdx.x].z           = apos.z;
-            sA[threadIdx.x].r           = ar.x;
-            sA[threadIdx.x].sr          = ar.y;
-            sA[threadIdx.x].fb          = fb;
-            for (unsigned int j = (tgx+1)&(GRID-1); j != tgx; j = (j+1)&(GRID-1))
-            {
-                float dx                = psA[j].x - apos.x;
-                float dy                = psA[j].y - apos.y;
-                float dz                = psA[j].z - apos.z;
-#ifdef USE_PERIODIC
-                dx                     -= floorf(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
-                dy                     -= floorf(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
-                dz                     -= floorf(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
-#endif
-                float r2                = dx * dx + dy * dy + dz * dz;
-                float r                 = sqrtf(r2);
-                // Atom I Born forces and sum
-                float dE                = getGBVI_dE2( r, ar.x, psA[j].sr, fb );
-#if defined USE_CUTOFF
-                if (i >= cSim.atoms || x+j >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
-#else
-                if (i >= cSim.atoms || x+j >= cSim.atoms )
-#endif
-                {
-                    dE              = 0.0f;
-                }
-                float d             = dx * dE;
-                af.x               -= d;
-                psA[j].fx          += d;
-                d                   = dy * dE;
-                af.y               -= d;
-                psA[j].fy          += d;
-                d                   = dz * dE;
-                af.z               -= d;
-                psA[j].fz          += d;
-            }
-            // Write results
-            float4 of;
-#ifdef USE_OUTPUT_BUFFER_PER_WARP
-            unsigned int offset         = x + tgx + warp*cSim.stride;
-#else
-            unsigned int offset         = x + tgx + (x >> GRIDBITS) * cSim.stride;
-#endif
-            of                          = cSim.pForce4[offset];
-            of.x                       += af.x + sA[threadIdx.x].fx;
-            of.y                       += af.y + sA[threadIdx.x].fy;
-            of.z                       += af.z + sA[threadIdx.x].fz;
-            cSim.pForce4[offset]       = of;
-        } else {
-            // Read fixed atom data into registers and GRF
-            if (lasty != y)
-            {
-                unsigned int j              = y + tgx;
-                float4 temp                 = cSim.pPosq[j];
-                float4 temp1                = cSim.pGBVIData[j];
-                float fb                    = cSim.pBornForce[j];
-                sA[threadIdx.x].fb          = fb;
-                sA[threadIdx.x].x           = temp.x;
-                sA[threadIdx.x].y           = temp.y;
-                sA[threadIdx.x].z           = temp.z;
-                sA[threadIdx.x].r           = temp1.x;
-                sA[threadIdx.x].sr          = temp1.y;
-            }
-#ifdef USE_CUTOFF
-            unsigned int flags              = cSim.pInteractionFlag[pos];
-            if (flags == 0)
-            {
-                // No interactions in this block.
-            }
-            //else if (flags)
-            else if (flags == 0xFFFFFFFF)
-#endif
-            {
-                // Compute all interactions within this block.
-                for (unsigned int j = 0; j < GRID; j++)
-                {
-                    float dx                = psA[tj].x - apos.x;
-                    float dy                = psA[tj].y - apos.y;
-                    float dz                = psA[tj].z - apos.z;
-#ifdef USE_PERIODIC
-                    dx                     -= floorf(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
-                    dy                     -= floorf(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
-                    dz                     -= floorf(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
-#endif
-                    float r2                = dx * dx + dy * dy + dz * dz;
-                    float r                 = sqrtf(r2);
-                    float dE                = getGBVI_dE2( r, ar.x, psA[tj].sr, fb );
-#if defined USE_CUTOFF
-                    if (i >= cSim.atoms || y+tj >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
-#else
-                    if (i >= cSim.atoms || y+tj >= cSim.atoms )
-#endif
-                    {
-                        dE                  = 0.0f;
-                    }
-                    float d                 = dx * dE;
-                    af.x                   -= d;
-                    psA[tj].fx             += d;
-                    d                       = dy * dE;
-                    af.y                   -= d;
-                    psA[tj].fy             += d;
-                    d                       = dz * dE;
-                    af.z                   -= d;
-                    psA[tj].fz             += d;
-                    // Atom J Born sum term
-                    dE                      = getGBVI_dE2( r, psA[tj].r, ar.y, psA[tj].fb );
-#if defined USE_CUTOFF
-                    if (i >= cSim.atoms || y+tj >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
-#else
-                    if (i >= cSim.atoms || y+tj >= cSim.atoms )
-#endif
-                    { 
-                        dE                  = 0.0f;
-                    }
-                    dx                     *= dE;
-                    dy                     *= dE;
-                    dz                     *= dE;
-                    psA[tj].fx             += dx;
-                    psA[tj].fy             += dy;
-                    psA[tj].fz             += dz;
-                    af.x                   -= dx;
-                    af.y                   -= dy;
-                    af.z                   -= dz;
-                    tj                      = (tj + 1) & (GRID - 1);
-                }
-            }
-#ifdef USE_CUTOFF
-            else
-            {
-                // Compute only a subset of the interactions in this block.
-                for (unsigned int j = 0; j < GRID; j++)
-                {
-                    if ((flags&(1<<j)) != 0)
-                    {
-                        float dx                = psA[j].x - apos.x;
-                        float dy                = psA[j].y - apos.y;
-                        float dz                = psA[j].z - apos.z;
-#ifdef USE_PERIODIC
-                        dx                     -= floorf(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
-                        dy                     -= floorf(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
-                        dz                     -= floorf(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
-#endif
-                        float r2                = dx * dx + dy * dy + dz * dz;
-                        float r                 = sqrtf(r2);
-                        // Interleaved Atom I and J Born Forces and sum components
-                        float dE                = getGBVI_dE2( r, ar.x, psA[j].sr, fb );
-#if defined USE_CUTOFF
-                        if (i >= cSim.atoms || y+j >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
-#else
-                        if (i >= cSim.atoms || y+j >= cSim.atoms )
-#endif
-                        {
-                            dE                  = 0.0f;
-                        }
-                        float d                 = dx * dE;
-                        af.x                   -= d;
-                        tempBuffer[threadIdx.x].x = d;
-                        d                       = dy * dE;
-                        af.y                   -= d;
-                        tempBuffer[threadIdx.x].y = d;
-                        d                       = dz * dE;
-                        af.z                   -= d;
-                        tempBuffer[threadIdx.x].z = d;
-                        // Atom J Born sum term
-                        dE                      = getGBVI_dE2( r, psA[j].r, ar.y, psA[j].fb );
-#if defined USE_CUTOFF
-                        if (i >= cSim.atoms || y+j >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
-#else
-                        if (i >= cSim.atoms || y+j >= cSim.atoms )
-#endif
-                        {
-                            dE                  = 0.0f;
-                        }
-                        dx                     *= dE;
-                        dy                     *= dE;
-                        dz                     *= dE;
-                        tempBuffer[threadIdx.x].x += dx;
-                        tempBuffer[threadIdx.x].y += dy;
-                        tempBuffer[threadIdx.x].z += dz;
-                        af.x                   -= dx;
-                        af.y                   -= dy;
-                        af.z                   -= dz;
-                        // Sum the forces on atom j.
-                        if (tgx % 2 == 0)
-                        {
-                            tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+1].x;
-                            tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+1].y;
-                            tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+1].z;
-                        }
-                        if (tgx % 4 == 0)
-                        {
-                            tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+2].x;
-                            tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+2].y;
-                            tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+2].z;
-                        }
-                        if (tgx % 8 == 0)
-                        {
-                            tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+4].x;
-                            tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+4].y;
-                            tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+4].z;
-                        }
-                        if (tgx % 16 == 0)
-                        {
-                            tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+8].x;
-                            tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+8].y;
-                            tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+8].z;
-                        }
-                        if (tgx == 0)
-                        {
-                            psA[j].fx += tempBuffer[threadIdx.x].x + tempBuffer[threadIdx.x+16].x;
-                            psA[j].fy += tempBuffer[threadIdx.x].y + tempBuffer[threadIdx.x+16].y;
-                            psA[j].fz += tempBuffer[threadIdx.x].z + tempBuffer[threadIdx.x+16].z;
-                        }
-                    }
-                }
-            }
-#endif
-            // Write results
-            float4 of;
-#ifdef USE_OUTPUT_BUFFER_PER_WARP
-            unsigned int offset         = x + tgx + warp*cSim.stride;
-#else
-            unsigned int offset         = x + tgx + (y >> GRIDBITS) * cSim.stride;
-#endif
-            of                          = cSim.pForce4[offset];
-            of.x                       += af.x;
-            of.y                       += af.y;
-            of.z                       += af.z;
-            cSim.pForce4[offset]       = of;
-#ifdef USE_OUTPUT_BUFFER_PER_WARP
-            offset                      = y + tgx + warp*cSim.stride;
-#else
-            offset                      = y + tgx + (x >> GRIDBITS) * cSim.stride;
-#endif
-            of                          = cSim.pForce4[offset];
-            of.x                       += sA[threadIdx.x].fx;
-            of.y                       += sA[threadIdx.x].fy;
-            of.z                       += sA[threadIdx.x].fz;
-            cSim.pForce4[offset]       = of;
-        }
-        lasty = y;
-        pos++;
-    }
-}
--- a/platforms/cuda-old/src/kernels/kCalculateLocalForces.cu
+++ b/platforms/cuda-old/src/kernels/kCalculateLocalForces.cu
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-#include <stdio.h>
-#include <cuda.h>
-#include <vector_functions.h>
-#include <cstdlib>
-#include <string>
-#include <iostream>
-#include <fstream>
-using namespace std;
-#include "gputypes.h"
-extern __shared__ Vectors sV[];
-static __constant__ cudaGmxSimulation cSim;
-/* Cuda compiler on Windows does not recognized "static const float" values */
-#define LOCAL_HACK_PI 3.1415926535897932384626433832795
-#define DOT3(v1, v2) (v1.x * v2.x + v1.y * v2.y + v1.z * v2.z)
-#define GETNORMEDDOTPRODUCT(v1, v2, dp) \
-{ \
-    dp          = DOT3(v1, v2); \
-    float norm1 = DOT3(v1, v1); \
-    float norm2 = DOT3(v2, v2); \
-    dp /= sqrtf(norm1 * norm2); \
-    dp = min(dp, 1.0f); \
-    dp = max(dp, -1.0f); \
-}
-#define CROSS_PRODUCT(v1, v2, c) \
-    c.x = v1.y * v2.z - v1.z * v2.y; \
-    c.y = v1.z * v2.x - v1.x * v2.z; \
-    c.z = v1.x * v2.y - v1.y * v2.x;
-#define GETPREFACTORSGIVENANGLECOSINE(cosine, param, dEdR) \
-{ \
-   float angle          = acosf(cosine); \
-   float deltaIdeal     = angle - (param.x * (LOCAL_HACK_PI / 180.0f)); \
-   dEdR                 = param.y * deltaIdeal; \
-}
-#define GETENERGYGIVENANGLECOSINE(cosine, param, dEdR) \
-{ \
-   float angle          = acosf(cosine); \
-   float deltaIdeal     = angle - (param.x * (LOCAL_HACK_PI / 180.0f)); \
-   dEdR                 = param.y * deltaIdeal * deltaIdeal; \
-}
-#define GETANGLEBETWEENTWOVECTORS(v1, v2, angle) \
-{ \
-    float dp; \
-    GETNORMEDDOTPRODUCT(v1, v2, dp); \
-    if (dp > 0.99f || dp < -0.99f) { \
-        float4 cross; \
-        CROSS_PRODUCT(v1, v2, cross); \
-        float scale = DOT3(v1, v1)*DOT3(v2, v2); \
-        angle = asinf(sqrtf(DOT3(cross, cross)/scale)); \
-        if (dp < 0.0f) \
-            angle = LOCAL_HACK_PI-angle; \
-    } \
-    else { \
-        angle = acosf(dp); \
-    } \
-}
-#define GETDIHEDRALANGLEBETWEENTHREEVECTORS(vector1, vector2, vector3, signVector, cp0, cp1, angle) \
-{ \
-    CROSS_PRODUCT(vector1, vector2, cp0); \
-    CROSS_PRODUCT(vector2, vector3, cp1); \
-    GETANGLEBETWEENTWOVECTORS(cp0, cp1, angle); \
-    float dp = DOT3(signVector, cp1); \
-    angle = (dp >= 0) ? angle : -angle; \
-}                                                          
-#define GETDIHEDRALANGLECOSINEBETWEENTHREEVECTORS(vector1, vector2, vector3, signVector, cp0, cp1, angle, cosine) \
-{ \
-    CROSS_PRODUCT(vector1, vector2, cp0); \
-    CROSS_PRODUCT(vector2, vector3, cp1); \
-    GETANGLEBETWEENTWOVECTORS(cp0, cp1, angle); \
-    float dp = DOT3(signVector, cp1); \
-    angle = (dp >= 0) ? angle : -angle; \
-    cosine = cosf(angle); \
-}
-void SetCalculateLocalForcesSim(gpuContext gpu)
-{
-    cudaError_t status;
-    status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));     
-    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
-}
-void GetCalculateLocalForcesSim(gpuContext gpu)
-{
-    cudaError_t status;
-    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));     
-    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
-}
-__global__ 
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(GF1XX_LOCALFORCES_THREADS_PER_BLOCK, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(GT2XX_LOCALFORCES_THREADS_PER_BLOCK, 1)
-#else
-__launch_bounds__(G8X_LOCALFORCES_THREADS_PER_BLOCK, 1)
-#endif
-void kCalculateLocalForces_kernel()
-{
-    unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
-    Vectors* A = &sV[threadIdx.x];
-    float energy = 0.0f;
-    while (pos < cSim.bond_offset)
-    {
-        if (pos < cSim.bonds)
-        {
-            int4   atom         = cSim.pBondID[pos];
-            float4 atomA        = cSim.pPosq[atom.x];
-            float4 atomB        = cSim.pPosq[atom.y];
-            float2 bond         = cSim.pBondParameter[pos];
-            float dx            = atomB.x - atomA.x;
-            float dy            = atomB.y - atomA.y;
-            float dz            = atomB.z - atomA.z;
-            float r2            = dx * dx + dy * dy + dz * dz;
-            float r             = sqrtf(r2);
-            float deltaIdeal    = r - bond.x;
-/* E */     energy             += 0.5f * bond.y * deltaIdeal * deltaIdeal;
-            float dEdR          = bond.y * deltaIdeal;
-            dEdR                = (r > 0.0f) ? (dEdR / r) : 0.0f;
-//            printf("D: %11.4f %11.4f %11.4f %11.4f %11.4f %11.4f\n", dx, dy, dz, r, deltaIdeal, dEdR);
-            dx                 *= dEdR;
-            dy                 *= dEdR;
-            dz                 *= dEdR;
-            unsigned int offsetA                = atom.x + atom.z * cSim.stride;
-            unsigned int offsetB                = atom.y + atom.w * cSim.stride;
-            float4 forceA                       = cSim.pForce4[offsetA];
-            float4 forceB                       = cSim.pForce4[offsetB];
-            forceA.x                           += dx;
-            forceA.y                           += dy;
-            forceA.z                           += dz;
-            forceB.x                           -= dx;
-            forceB.y                           -= dy;
-            forceB.z                           -= dz;
-            cSim.pForce4[offsetA]               = forceA;
-            cSim.pForce4[offsetB]               = forceB;    
-        }
-        pos += blockDim.x * gridDim.x;
-    }
-    while (pos < cSim.bond_angle_offset)
-    {
-        unsigned int pos1   = pos - cSim.bond_offset;
-        if (pos1 < cSim.bond_angles)
-        {
-            int4   atom1            = cSim.pBondAngleID1[pos1];  
-            float2 bond_angle       = cSim.pBondAngleParameter[pos1];
-            float4 a1               = cSim.pPosq[atom1.x];
-            float4 a2               = cSim.pPosq[atom1.y];
-            float4 a3               = cSim.pPosq[atom1.z];
-            A->v0.x                 = a2.x - a1.x;
-            A->v0.y                 = a2.y - a1.y;
-            A->v0.z                 = a2.z - a1.z;
-            A->v1.x                 = a2.x - a3.x;
-            A->v1.y                 = a2.y - a3.y;
-            A->v1.z                 = a2.z - a3.z;
-            float3 cp;
-            CROSS_PRODUCT(A->v0, A->v1, cp);
-            float rp                = DOT3(cp, cp); //cx * cx + cy * cy + cz * cz;
-            rp                      = max(sqrtf(rp), 1.0e-06f);
-            float r21               = DOT3(A->v0, A->v0); // dx1 * dx1 + dy1 * dy1 + dz1 * dz1;
-            float r23               = DOT3(A->v1, A->v1); // dx2 * dx2 + dy2 * dy2 + dz2 * dz2;
-            float dot               = DOT3(A->v0, A->v1); // dx1 * dx2 + dy1 * dy2 + dz1 * dz2;
-            float cosine            = max(-1.0f, min(1.0f, dot / sqrtf(r21 * r23)));
-            float angle_energy;
-/* E */     GETENERGYGIVENANGLECOSINE(cosine, bond_angle, angle_energy);
-            energy                 += 0.5f*angle_energy;
-            float dEdR;
-            GETPREFACTORSGIVENANGLECOSINE(cosine, bond_angle, dEdR);
-            //printf("%11.4f %11.4f\n", cosine, dEdR);
-            float termA             =  dEdR / (r21 * rp);
-            float termC             = -dEdR / (r23 * rp);
-            float3 c21;
-            float3 c23;
-            CROSS_PRODUCT(A->v0, cp, c21);
-            CROSS_PRODUCT(A->v1, cp, c23);
-            c21.x                  *= termA;
-            c21.y                  *= termA;
-            c21.z                  *= termA;
-            c23.x                  *= termC;
-            c23.y                  *= termC;
-            c23.z                  *= termC;
-            int2 atom2              = cSim.pBondAngleID2[pos1];
-            unsigned int offset     = atom1.x + atom1.w * cSim.stride;
-            float4 force            = cSim.pForce4[offset]; 
-            force.x                += c21.x;
-            force.y                += c21.y;
-            force.z                += c21.z;
-            cSim.pForce4[offset]    = force;
-            offset                  = atom1.y + atom2.x * cSim.stride;
-            force                   = cSim.pForce4[offset];
-            force.x                -= (c21.x + c23.x);
-            force.y                -= (c21.y + c23.y);
-            force.z                -= (c21.z + c23.z);
-            cSim.pForce4[offset]    = force;
-            offset                  = atom1.z + atom2.y * cSim.stride;
-            force                   = cSim.pForce4[offset];
-            force.x                += c23.x;
-            force.y                += c23.y;
-            force.z                += c23.z;
-            cSim.pForce4[offset]    = force;
-        }
-        pos += blockDim.x * gridDim.x;
-    }
-    while (pos < cSim.dihedral_offset)
-    {
-        unsigned int pos1 = pos - cSim.bond_angle_offset;
-        if (pos1 < cSim.dihedrals)
-        {
-            int4   atom1        = cSim.pDihedralID1[pos1];  
-            float4 atomA        = cSim.pPosq[atom1.x];
-            float4 atomB        = cSim.pPosq[atom1.y];
-            float4 atomC        = cSim.pPosq[atom1.z];
-            float4 atomD        = cSim.pPosq[atom1.w];            
-            A->v0.x             = atomA.x - atomB.x;
-            A->v0.y             = atomA.y - atomB.y;
-            A->v0.z             = atomA.z - atomB.z;
-            A->v1.x             = atomC.x - atomB.x;
-            A->v1.y             = atomC.y - atomB.y;
-            A->v1.z             = atomC.z - atomB.z;
-            A->v2.x             = atomC.x - atomD.x;
-            A->v2.y             = atomC.y - atomD.y;
-            A->v2.z             = atomC.z - atomD.z; 
-            float3 cp0, cp1;
-            float dihedralAngle;
-            GETDIHEDRALANGLEBETWEENTHREEVECTORS(A->v0, A->v1, A->v2, A->v0, cp0, cp1, dihedralAngle);
-            float4 dihedral         = cSim.pDihedralParameter[pos1];
-            float deltaAngle        = dihedral.z * dihedralAngle - (dihedral.y * LOCAL_HACK_PI / 180.0f);
-	    // ATTENTION: This section leads to a divergent deltaAngle values wrt
-	    // forces and energies. We separate the case dihedral.z = n = 0, which
-	    // is treated by the calculation of energies via a harmonic potential
-/* E */     if (dihedral.z) energy += dihedral.x * (1.0f + cosf(deltaAngle));
-/* E */     else
-	    {
-		float deltaAngle    = dihedralAngle - dihedral.y;
-		if (deltaAngle < -LOCAL_HACK_PI) deltaAngle += 2.0f * LOCAL_HACK_PI;
-		else if (deltaAngle > LOCAL_HACK_PI) deltaAngle -= 2.0f * LOCAL_HACK_PI;
-                energy             += dihedral.x * deltaAngle * deltaAngle;
-	    }
-            float sinDeltaAngle     = sinf(deltaAngle);
-            float dEdAngle          = -dihedral.x * dihedral.z * sinDeltaAngle;
-            float normCross1        = DOT3(cp0, cp0);
-            float normBC            = sqrtf(DOT3(A->v1, A->v1));
-            float4 ff;
-            ff.x                    = (-dEdAngle * normBC) / normCross1;
-            float normCross2        = DOT3(cp1, cp1);
-            ff.w                    = (dEdAngle * normBC) / normCross2;
-            float dp                = 1.0f / DOT3(A->v1, A->v1);
-            ff.y                    = DOT3(A->v0, A->v1) * dp;
-            ff.z                    = DOT3(A->v2, A->v1) * dp;
-            int4  atom2             = cSim.pDihedralID2[pos1];   
-            float3 internalF0;
-            float3 internalF3;
-            float3 s;
-//            printf("%4d: %9.4f %9.4f %9.4f %9.4f\n", pos1, ff.x, ff.y, ff.z, ff.w);  
-            unsigned int offset                 = atom1.x + atom2.x * cSim.stride;
-            float4 force                        = cSim.pForce4[offset]; 
-            internalF0.x                        = ff.x * cp0.x; 
-            force.x                            += internalF0.x;
-            internalF0.y                        = ff.x * cp0.y;
-            force.y                            += internalF0.y;
-            internalF0.z                        = ff.x * cp0.z;       
-            force.z                            += internalF0.z;
-            cSim.pForce4[offset]                = force;
-            //printf("%4d - 0: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
-            offset                              = atom1.w + atom2.w * cSim.stride;
-            force                               = cSim.pForce4[offset];
-            internalF3.x                        = ff.w * cp1.x;
-            force.x                            += internalF3.x;
-            internalF3.y                        = ff.w * cp1.y;
-            force.y                            += internalF3.y;
-            internalF3.z                        = ff.w * cp1.z;
-            force.z                            += internalF3.z;
-            cSim.pForce4[offset]                = force;
-           // printf("%4d - 3: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
-            s.x                                 = ff.y * internalF0.x - ff.z * internalF3.x;   
-            s.y                                 = ff.y * internalF0.y - ff.z * internalF3.y;  
-            s.z                                 = ff.y * internalF0.z - ff.z * internalF3.z;        
-            offset                              = atom1.y + atom2.y * cSim.stride;
-            force                               = cSim.pForce4[offset];
-            force.x                            += -internalF0.x + s.x;
-            force.y                            += -internalF0.y + s.y;
-            force.z                            += -internalF0.z + s.z;
-            cSim.pForce4[offset]                = force;
-            //printf("%4d - 1: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
-            offset                              = atom1.z + atom2.z * cSim.stride;
-            force                               = cSim.pForce4[offset];
-            force.x                            += -internalF3.x - s.x;
-            force.y                            += -internalF3.y - s.y;
-            force.z                            += -internalF3.z - s.z;
-            cSim.pForce4[offset]                = force;
-            //printf("%4d - 2: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
-        }
-        pos += blockDim.x * gridDim.x;
-    }
-    // Ryckaert Bellemans dihedrals
-    while (pos < cSim.rb_dihedral_offset)
-    {
-        unsigned int pos1 = pos - cSim.dihedral_offset;
-        if (pos1 < cSim.rb_dihedrals)
-        {
-            int4   atom1        = cSim.pRbDihedralID1[pos1];
-            float4 atomA        = cSim.pPosq[atom1.x];
-            float4 atomB        = cSim.pPosq[atom1.y];
-            float4 atomC        = cSim.pPosq[atom1.z];
-            float4 atomD        = cSim.pPosq[atom1.w];
-            A->v0.x             = atomA.x - atomB.x;
-            A->v0.y             = atomA.y - atomB.y;
-            A->v0.z             = atomA.z - atomB.z;
-            A->v1.x             = atomC.x - atomB.x;
-            A->v1.y             = atomC.y - atomB.y;
-            A->v1.z             = atomC.z - atomB.z;
-            A->v2.x             = atomC.x - atomD.x;
-            A->v2.y             = atomC.y - atomD.y;
-            A->v2.z             = atomC.z - atomD.z;
-            float3 cp0, cp1;
-            float dihedralAngle, cosPhi;
-      //      printf("%4d - 0 : %9.4f %9.4f %9.4f\n", pos1, A->v0.x, A->v0.y, A->v0.z);
-      //      printf("%4d - 1 : %9.4f %9.4f %9.4f\n", pos1, A->v1.x, A->v1.y, A->v1.z);
-      //      printf("%4d - 2 : %9.4f %9.4f %9.4f\n", pos1, A->v2.x, A->v2.y, A->v2.z);
-            GETDIHEDRALANGLECOSINEBETWEENTHREEVECTORS(A->v0, A->v1, A->v2, A->v0, cp0, cp1, dihedralAngle, cosPhi);
-            if (dihedralAngle < 0.0f )
-            {
-                dihedralAngle += LOCAL_HACK_PI;
-            }
-            else
-            {
-                dihedralAngle -= LOCAL_HACK_PI;
-            }
-            cosPhi                  = -cosPhi;
-         //   printf("%4d: %9.4f %9.4f\n", pos1, dihedralAngle, cosPhi);
-            float4 dihedral1        = cSim.pRbDihedralParameter1[pos1];
-            float2 dihedral2        = cSim.pRbDihedralParameter2[pos1];
-            float cosFactor         = cosPhi;
-            float dEdAngle          = -dihedral1.y;
-/* E */     float rb_energy         = dihedral1.x;
-            rb_energy              += dihedral1.y * cosFactor;
-        //    printf("%4d - 1: %9.4f %9.4f\n", pos1, dEdAngle, 1.0f);
-            dEdAngle               -= 2.0f * dihedral1.z * cosFactor;
-       //     printf("%4d - 2: %9.4f %9.4f\n", pos1, dEdAngle, cosFactor);
-            cosFactor              *= cosPhi;
-            dEdAngle               -= 3.0f * dihedral1.w * cosFactor;
-            rb_energy              += dihedral1.z * cosFactor;
-    //       printf("%4d - 3: %9.4f %9.4f\n", pos1, dEdAngle, cosFactor);
-            cosFactor              *= cosPhi;
-            dEdAngle               -= 4.0f * dihedral2.x * cosFactor;
-            rb_energy              += dihedral1.w * cosFactor;
-  //         printf("%4d - 4: %9.4f %9.4f\n", pos1, dEdAngle, cosFactor);
-            cosFactor              *= cosPhi;
-            dEdAngle               -= 5.0f * dihedral2.y * cosFactor;
-            rb_energy              += dihedral2.x * cosFactor;
-            rb_energy              += dihedral2.y * cosFactor * cosPhi;
-/* E */     energy                 += rb_energy;
- //           printf("%4d - 5: %9.4f %9.4f\n", pos1, dEdAngle, cosFactor);
-            dEdAngle               *= sinf(dihedralAngle);
-//            printf("%4d - f: %9.4f\n", pos1, dEdAngle);
-            float normCross1        = DOT3(cp0, cp0);
-            float normBC            = sqrtf(DOT3(A->v1, A->v1));
-            float4 ff;
-            ff.x                    = (-dEdAngle * normBC) / normCross1;
-            float normCross2        = DOT3(cp1, cp1);
-            ff.w                    = (dEdAngle * normBC) / normCross2;
-            float dp                = 1.0f / DOT3(A->v1, A->v1);
-            ff.y                    = DOT3(A->v0, A->v1) * dp;
-            ff.z                    = DOT3(A->v2, A->v1) * dp;
-            int4  atom2             = cSim.pRbDihedralID2[pos1];
-            float3 internalF0;
-            float3 internalF3;
-            float3 s;
-//            printf("%4d: %9.4f %9.4f %9.4f %9.4f\n", pos1, ff.x, ff.y, ff.z, ff.w);
-            unsigned int offset                 = atom1.x + atom2.x * cSim.stride;
-            float4 force                        = cSim.pForce4[offset];
-            internalF0.x                        = ff.x * cp0.x;
-            force.x                            += internalF0.x;
-            internalF0.y                        = ff.x * cp0.y;
-            force.y                            += internalF0.y;
-            internalF0.z                        = ff.x * cp0.z;
-            force.z                            += internalF0.z;
-            cSim.pForce4[offset]                = force;
- //           printf("%4d - 0: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
-            offset                              = atom1.w + atom2.w * cSim.stride;
-            force                               = cSim.pForce4[offset];
-            internalF3.x                        = ff.w * cp1.x;
-            force.x                            += internalF3.x;
-            internalF3.y                        = ff.w * cp1.y;
-            force.y                            += internalF3.y;
-            internalF3.z                        = ff.w * cp1.z;
-            force.z                            += internalF3.z;
-            cSim.pForce4[offset]                = force;
-   //         printf("%4d - 3: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
-            s.x                                 = ff.y * internalF0.x - ff.z * internalF3.x;
-            s.y                                 = ff.y * internalF0.y - ff.z * internalF3.y;
-            s.z                                 = ff.y * internalF0.z - ff.z * internalF3.z;
-            offset                              = atom1.y + atom2.y * cSim.stride;
-            force                               = cSim.pForce4[offset];
-            force.x                            += -internalF0.x + s.x;
-            force.y                            += -internalF0.y + s.y;
-            force.z                            += -internalF0.z + s.z;
-            cSim.pForce4[offset]                = force;
-     //       printf("%4d - 1: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
-            offset                              = atom1.z + atom2.z * cSim.stride;
-            force                               = cSim.pForce4[offset];
-            force.x                            += -internalF3.x - s.x;
-            force.y                            += -internalF3.y - s.y;
-            force.z                            += -internalF3.z - s.z;
-            cSim.pForce4[offset]                = force;
-     //       printf("%4d - 2: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
-        }         
-        pos += blockDim.x * gridDim.x;
-    }   
-    while (pos < cSim.LJ14_offset)
-    {
-        unsigned int pos1       = pos - cSim.rb_dihedral_offset;
-        if (pos1 < cSim.LJ14s)
-        {
-            int4 atom               = cSim.pLJ14ID[pos1];
-            float4 LJ14             = cSim.pLJ14Parameter[pos1];
-            float4 a1               = cSim.pPosq[atom.x];
-            float4 a2               = cSim.pPosq[atom.y];
-            float3 d;
-            d.x                     = a1.x - a2.x;
-            d.y                     = a1.y - a2.y;
-            d.z                     = a1.z - a2.z;
-            float r2                = DOT3(d, d);
-            float inverseR          = 1.0f / sqrtf(r2);
-            float sig2              = inverseR * LJ14.y;
-            sig2                   *= sig2;
-            float sig6              = sig2 * sig2 * sig2;
-            float dEdR              = LJ14.x * (12.0f * sig6 - 6.0f) * sig6;
-            /* E */
-            energy                 += LJ14.x * (sig6 - 1.0f) * sig6;
-            energy                 += LJ14.z * inverseR;
-            dEdR                   += LJ14.z * inverseR;
-            dEdR                   *= inverseR * inverseR;
-            unsigned int offsetA    = atom.x + atom.z * cSim.stride;
-            unsigned int offsetB    = atom.y + atom.w * cSim.stride;
-            float4 forceA           = cSim.pForce4[offsetA];
-            float4 forceB           = cSim.pForce4[offsetB];
-            d.x                    *= dEdR;
-            d.y                    *= dEdR;
-            d.z                    *= dEdR;
-            forceA.x               += d.x;
-            forceA.y               += d.y;
-            forceA.z               += d.z;
-            forceB.x               -= d.x;
-            forceB.y               -= d.y;
-            forceB.z               -= d.z;
-            cSim.pForce4[offsetA]   = forceA;
-            cSim.pForce4[offsetB]   = forceB;
-        }
-        pos                    += blockDim.x * gridDim.x;
-    }
-    cSim.pEnergy[blockIdx.x * blockDim.x + threadIdx.x] += energy;
-}
-void kCalculateLocalForces(gpuContext gpu)
-{
-  //  printf("kCalculateLocalForces\n");
-    kCalculateLocalForces_kernel<<<gpu->sim.blocks, gpu->sim.localForces_threads_per_block, gpu->sim.localForces_threads_per_block * sizeof(Vectors)>>>();
-    LAUNCHERROR("kCalculateLocalForces");
-}
--- a/platforms/cuda-old/src/kernels/kCalculateObcGbsaBornSum.cu
+++ b/platforms/cuda-old/src/kernels/kCalculateObcGbsaBornSum.cu
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-#include <stdio.h>
-#include <cuda.h>
-#include <vector_functions.h>
-#include <cstdlib>
-#include <string>
-#include <iostream>
-#include <fstream>
-using namespace std;
-#include "gputypes.h"
-struct Atom {
-    float x;
-    float y;
-    float z;
-    float r;
-    float sr;
-    float sum;
-    float padding;
-};
-static __constant__ cudaGmxSimulation cSim;
-void SetCalculateObcGbsaBornSumSim(gpuContext gpu)
-{
-    cudaError_t status;
-    status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));     
-    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
-}
-void GetCalculateObcGbsaBornSumSim(gpuContext gpu)
-{
-    cudaError_t status;
-    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));     
-    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
-}
-// Include versions of the kernels for N^2 calculations.
-#define METHOD_NAME(a, b) a##N2##b
-#include "kCalculateObcGbsaBornSum.h"
-#define USE_OUTPUT_BUFFER_PER_WARP
-#undef METHOD_NAME
-#define METHOD_NAME(a, b) a##N2ByWarp##b
-#include "kCalculateObcGbsaBornSum.h"
-// Include versions of the kernels with cutoffs.
-#undef METHOD_NAME
-#undef USE_OUTPUT_BUFFER_PER_WARP
-#define USE_CUTOFF
-#define METHOD_NAME(a, b) a##Cutoff##b
-#include "kCalculateObcGbsaBornSum.h"
-#define USE_OUTPUT_BUFFER_PER_WARP
-#undef METHOD_NAME
-#define METHOD_NAME(a, b) a##CutoffByWarp##b
-#include "kCalculateObcGbsaBornSum.h"
-// Include versions of the kernels with periodic boundary conditions.
-#undef METHOD_NAME
-#undef USE_OUTPUT_BUFFER_PER_WARP
-#define USE_PERIODIC
-#define METHOD_NAME(a, b) a##Periodic##b
-#include "kCalculateObcGbsaBornSum.h"
-#define USE_OUTPUT_BUFFER_PER_WARP
-#undef METHOD_NAME
-#define METHOD_NAME(a, b) a##PeriodicByWarp##b
-#include "kCalculateObcGbsaBornSum.h"
-__global__ 
-__launch_bounds__(384, 1)
-void kReduceObcGbsaBornSum_kernel()
-{
-    unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
-    while (pos < cSim.atoms)
-    {
-        float sum   = 0.0f;
-        float* pSt  = cSim.pBornSum + pos;
-        float2 atom = cSim.pObcData[pos];
-        // Get summed Born data
-        for (int i = 0; i < cSim.nonbondOutputBuffers; i++)
-        {
-            sum += *pSt;
-            pSt += cSim.stride;
-        }
-        // Now calculate Born radius and OBC term.
-        sum                    *= 0.5f * atom.x;
-        float sum2              = sum * sum;
-        float sum3              = sum * sum2;
-        float tanhSum           = tanh(cSim.alphaOBC * sum - cSim.betaOBC * sum2 + cSim.gammaOBC * sum3);
-        float nonOffsetRadii    = atom.x + cSim.dielectricOffset;
-        float bornRadius        = 1.0f / (1.0f / atom.x - tanhSum / nonOffsetRadii); 
-        float obcChain          = atom.x * (cSim.alphaOBC - 2.0f * cSim.betaOBC * sum + 3.0f * cSim.gammaOBC * sum2);
-        obcChain                = (1.0f - tanhSum * tanhSum) * obcChain / nonOffsetRadii;        
-        cSim.pBornRadii[pos]    = bornRadius;
-        cSim.pObcChain[pos]     = obcChain;
-        pos                    += gridDim.x * blockDim.x;
-    }   
-}
-void OPENMMCUDA_EXPORT kReduceObcGbsaBornSum(gpuContext gpu)
-{
-    kReduceObcGbsaBornSum_kernel<<<gpu->sim.blocks, 384>>>();
-    gpu->bRecalculateBornRadii = false;
-    LAUNCHERROR("kReduceObcGbsaBornSum");
-}
-void kPrintObc( gpuContext gpu, std::string callId, int call, FILE* log)
-{
-    gpu->psObcData->Download();
-    gpu->psBornRadii->Download();
-    gpu->psObcChain->Download();
-    gpu->psBornForce->Download();
-    gpu->psPosq4->Download();
-    gpu->psSigEps2->Download();
-    (void) fprintf( log, "kPrintObc Cuda bCh bR bF prm[2]   sigeps[2]\n" );
-    (void) fprintf( stderr, "bOutputWarp=%u blks=%u th/blk=%u wu=%u %u shrd=%u\n", gpu->bOutputBufferPerWarp,
-                    gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, gpu->sim.workUnits, gpu->psWorkUnit->_pSysStream[0][0],
-                    static_cast<unsigned int>(sizeof(Atom)*gpu->sim.nonbond_threads_per_block) );
-    for( unsigned int ii = 0; ii < gpu->sim.paddedNumberOfAtoms; ii++ ){
-        (void) fprintf( log, "%6d %15.7e %15.7e %15.7e    %15.7e %15.7e   %15.7e %15.7e \n", ii, 
-                        gpu->psObcChain->_pSysData[ii],
-                        gpu->psBornRadii->_pSysData[ii],
-                        gpu->psBornForce->_pSysData[ii],
-                        gpu->psObcData->_pSysData[ii].x,
-                        gpu->psObcData->_pSysData[ii].y,
-                        gpu->psSigEps2->_pSysData[ii].x,
-                        gpu->psSigEps2->_pSysData[ii].y );
-    }   
-}
-void OPENMMCUDA_EXPORT kCalculateObcGbsaBornSum(gpuContext gpu)
-{
-  //  printf("kCalculateObcgbsaBornSum\n");
-    switch (gpu->sim.nonbondedMethod)
-    {
-        case NO_CUTOFF:
-            if (gpu->bOutputBufferPerWarp)
-                kCalculateObcGbsaN2ByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit);
-            else
-                kCalculateObcGbsaN2BornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit);
-            break;
-        case CUTOFF:
-            if (gpu->bOutputBufferPerWarp)
-                kCalculateObcGbsaCutoffByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
-            else
-                kCalculateObcGbsaCutoffBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
-            break;
-        case PERIODIC:
-            if (gpu->bOutputBufferPerWarp)
-                kCalculateObcGbsaPeriodicByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
-            else
-                kCalculateObcGbsaPeriodicBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
-            break;
-    }
-    LAUNCHERROR("kCalculateBornSum");
-}
--- a/platforms/cuda-old/src/kernels/kCalculateObcGbsaBornSum.h
+++ b/platforms/cuda-old/src/kernels/kCalculateObcGbsaBornSum.h
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-/**
- * This file contains the kernel for calculating Born sums.  It is included
- * several times in kCalculateObcGbsaBornSum.cu with different #defines to generate
- * different versions of the kernels.
- */
-__global__ 
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(GF1XX_NONBOND_THREADS_PER_BLOCK, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1)
-#else
-__launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
-#endif
-void METHOD_NAME(kCalculateObcGbsa, BornSum_kernel)(unsigned int* workUnit)
-{
-    extern __shared__ volatile Atom sA[];
-    unsigned int totalWarps   = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
-    unsigned int warp         = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
-    unsigned int numWorkUnits = cSim.pInteractionCount[0];
-    unsigned int pos          = warp*numWorkUnits/totalWarps;
-    unsigned int end          = (warp+1)*numWorkUnits/totalWarps;
-#ifdef USE_CUTOFF
-    volatile float* tempBuffer = (volatile float*) &sA[cSim.nonbond_threads_per_block];
-#endif
-    while (pos < end)
-    {
-        // Extract cell coordinates from appropriate work unit
-        unsigned int x = workUnit[pos];
-        unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
-        x = (x >> 17) << GRIDBITS;
-        float       dx;
-        float       dy;
-        float       dz;
-        float       r2;
-        float       r;
-        unsigned int tgx = threadIdx.x & (GRID - 1);
-        unsigned int tbx = threadIdx.x - tgx;
-        unsigned int tj  = tgx;
-        volatile Atom* psA = &sA[tbx];
-        if (x == y) // Handle diagonals uniquely at 50% efficiency
-        {
-            // Read fixed atom data into registers and GRF
-            unsigned int i = x + tgx;
-            float4 apos = cSim.pPosq[i];    // Local atom x, y, z, sum
-            float2 ar = cSim.pObcData[i];   // Local atom vr, sr
-            sA[threadIdx.x].x           = apos.x;
-            sA[threadIdx.x].y           = apos.y;
-            sA[threadIdx.x].z           = apos.z;
-            sA[threadIdx.x].r           = ar.x;
-            sA[threadIdx.x].sr          = ar.y;
-            apos.w                      = 0.0f;
-            for (unsigned int j = 0; j < GRID; j++)
-            {
-                dx                      = psA[j].x - apos.x;
-                dy                      = psA[j].y - apos.y;
-                dz                      = psA[j].z - apos.z;
-#ifdef USE_PERIODIC
-                dx                     -= floorf(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
-                dy                     -= floorf(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
-                dz                     -= floorf(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
-#endif
-                r2                      = dx * dx + dy * dy + dz * dz;
-#if defined USE_CUTOFF
-                if (i < cSim.atoms && x+j < cSim.atoms && r2 < cSim.nonbondedCutoffSqr)
-#else
-                if (i < cSim.atoms && x+j < cSim.atoms )
-#endif
-                {
-                    r                       = sqrtf(r2);
-                    float rInverse          = 1.0f / r;
-                    float rScaledRadiusJ    = r + psA[j].sr;
-                    if ((j != tgx) && (ar.x < rScaledRadiusJ))
-                    {
-                        float l_ij     = 1.0f / max(ar.x, fabs(r - psA[j].sr));
-                        float u_ij     = 1.0f / rScaledRadiusJ;
-                        float l_ij2    = l_ij * l_ij;
-                        float u_ij2    = u_ij * u_ij;
-                        float ratio    = logf(u_ij / l_ij);
-                        apos.w        += l_ij -
-                                         u_ij +
-                                         0.25f * r * (u_ij2 - l_ij2) +
-                                         (0.50f * rInverse * ratio) +
-                                         (0.25f * psA[j].sr * psA[j].sr * rInverse) *
-                                         (l_ij2 - u_ij2);
-                        float rj = psA[j].sr;
-                        if (ar.x < (rj - r))
-                        {
-                            apos.w += 2.0f * ((1.0f / ar.x) - l_ij);
-                        }
-                    }
-                }
-            }
-            // Write results
-#ifdef USE_OUTPUT_BUFFER_PER_WARP
-            unsigned int offset = x + tgx + warp*cSim.stride;
-            cSim.pBornSum[offset] += apos.w;
-#else
-            unsigned int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
-            cSim.pBornSum[offset] = apos.w;
-#endif
-        }
-        else        // 100% utilization
-        {
-            // Read fixed atom data into registers and GRF
-            unsigned int j                           = y + tgx;
-            unsigned int i                  = x + tgx;
-            float4 temp                     = cSim.pPosq[j];
-            float2 temp1                    = cSim.pObcData[j];
-            float4 apos                     = cSim.pPosq[i];        // Local atom x, y, z, sum
-            float2 ar                       = cSim.pObcData[i];    // Local atom vr, sr
-            sA[threadIdx.x].x               = temp.x;
-            sA[threadIdx.x].y               = temp.y;
-            sA[threadIdx.x].z               = temp.z;
-            sA[threadIdx.x].r               = temp1.x;
-            sA[threadIdx.x].sr              = temp1.y;
-            sA[threadIdx.x].sum = apos.w    = 0.0f;
-#ifdef USE_CUTOFF
-            unsigned int flags = cSim.pInteractionFlag[pos];
-            if (flags == 0)
-            {
-                // No interactions in this block.
-            }
-            else if (flags == 0xFFFFFFFF)
-#endif
-            {
-                // Compute all interactions within this block.
-                for (unsigned int j = 0; j < GRID; j++)
-                {
-                    dx                      = psA[tj].x - apos.x;
-                    dy                      = psA[tj].y - apos.y;
-                    dz                      = psA[tj].z - apos.z;
-#ifdef USE_PERIODIC
-                    dx                     -= floorf(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
-                    dy                     -= floorf(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
-                    dz                     -= floorf(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
-#endif
-                    r2                      = dx * dx + dy * dy + dz * dz;
-#ifdef USE_CUTOFF
-                    if (i < cSim.atoms && y+tj < cSim.atoms && r2 < cSim.nonbondedCutoffSqr)
-#else
-                    if (i < cSim.atoms && y+tj < cSim.atoms)
-#endif
-                    {
-                        r                       = sqrtf(r2);
-                        float rInverse          = 1.0f / r;
-                        float rScaledRadiusJ    = r + psA[tj].sr;
-                        if (ar.x < rScaledRadiusJ)
-                        {
-                            float l_ij     = 1.0f / max(ar.x, fabs(r - psA[tj].sr));
-                            float u_ij     = 1.0f / rScaledRadiusJ;
-                            float l_ij2    = l_ij * l_ij;
-                            float u_ij2    = u_ij * u_ij;
-                            float ratio    = logf(u_ij / l_ij);
-                            float term     = l_ij -
-                                             u_ij +
-                                             0.25f * r * (u_ij2 - l_ij2) +
-                                             (0.50f * rInverse * ratio) +
-                                             (0.25f * psA[tj].sr * psA[tj].sr * rInverse) *
-                                             (l_ij2 - u_ij2);
-                            float srj = psA[tj].sr;
-                            if (ar.x < (srj - r))
-                            {
-                                term += 2.0f * ((1.0f / ar.x) - l_ij);
-                            }
-                            apos.w        += term;
-                        }
-                        float rScaledRadiusI    = r + ar.y;
-                        if (psA[tj].r < rScaledRadiusI)
-                        {
-                            float l_ij     = 1.0f / max(psA[tj].r, fabs(r - ar.y));
-                            float u_ij     = 1.0f / rScaledRadiusI;
-                            float l_ij2    = l_ij * l_ij;
-                            float u_ij2    = u_ij * u_ij;
-                            float ratio    = logf(u_ij / l_ij);
-                            float term     = l_ij -
-                                             u_ij +
-                                             0.25f * r * (u_ij2 - l_ij2) +
-                                             (0.50f * rInverse * ratio) +
-                                             (0.25f * ar.y * ar.y * rInverse) *
-                                             (l_ij2 - u_ij2);
-                            float rj = psA[tj].r;
-                            if (rj < (ar.y - r))
-                            {
-                                term += 2.0f * ((1.0f / psA[tj].r) - l_ij);
-                            }
-                            psA[tj].sum    += term;
-                        }
-                    }
-                    tj = (tj - 1) & (GRID - 1);
-                }
-            }
-#ifdef USE_CUTOFF
-            else
-            {
-                // Compute only a subset of the interactions in this block.
-                for (unsigned int j = 0; j < GRID; j++)
-                {
-                    if ((flags&(1<<j)) != 0)
-                    {
-                        tempBuffer[threadIdx.x] = 0.0f;
-                        dx                      = psA[j].x - apos.x;
-                        dy                      = psA[j].y - apos.y;
-                        dz                      = psA[j].z - apos.z;
-#ifdef USE_PERIODIC
-                        dx                     -= floorf(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
-                        dy                     -= floorf(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
-                        dz                     -= floorf(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
-#endif
-                        r2                      = dx * dx + dy * dy + dz * dz;
-#ifdef USE_CUTOFF
-                        if (i < cSim.atoms && y+j < cSim.atoms && r2 < cSim.nonbondedCutoffSqr)
-#else
-                        if (i < cSim.atoms && y+j < cSim.atoms)
-#endif
-                        {
-                            r                       = sqrtf(r2);
-                            float rInverse          = 1.0f / r;
-                            float rScaledRadiusJ    = r + psA[j].sr;
-                            if (ar.x < rScaledRadiusJ)
-                            {
-                                float l_ij     = 1.0f / max(ar.x, fabs(r - psA[j].sr));
-                                float u_ij     = 1.0f / rScaledRadiusJ;
-                                float l_ij2    = l_ij * l_ij;
-                                float u_ij2    = u_ij * u_ij;
-                                float ratio    = logf(u_ij / l_ij);
-                                float term     = l_ij -
-                                                 u_ij +
-                                                 0.25f * r * (u_ij2 - l_ij2) +
-                                                 (0.50f * rInverse * ratio) +
-                                                 (0.25f * psA[j].sr * psA[j].sr * rInverse) *
-                                                 (l_ij2 - u_ij2);
-                                float srj = psA[j].sr;
-                                if (ar.x < (srj - r))
-                                {
-                                    term += 2.0f * ((1.0f / ar.x) - l_ij);
-                                }
-                                apos.w        += term;
-                            }
-                            float rScaledRadiusI    = r + ar.y;
-                            if (psA[j].r < rScaledRadiusI)
-                            {
-                                float l_ij     = 1.0f / max(psA[j].r, fabs(r - ar.y));
-                                float u_ij     = 1.0f / rScaledRadiusI;
-                                float l_ij2    = l_ij * l_ij;
-                                float u_ij2    = u_ij * u_ij;
-                                float ratio    = logf(u_ij / l_ij);
-                                float term     = l_ij -
-                                                 u_ij +
-                                                 0.25f * r * (u_ij2 - l_ij2) +
-                                                 (0.50f * rInverse * ratio) +
-                                                 (0.25f * ar.y * ar.y * rInverse) *
-                                                 (l_ij2 - u_ij2);
-                                float rj = psA[j].r;
-                                if (rj < (ar.y - r))
-                                {
-                                    term += 2.0f * ((1.0f / psA[j].r) - l_ij);
-                                }
-                                tempBuffer[threadIdx.x] = term;
-                            }
-                        }
-                        // Sum the terms.
-                        if (tgx % 2 == 0)
-                            tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+1];
-                        if (tgx % 4 == 0)
-                            tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+2];
-                        if (tgx % 8 == 0)
-                            tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+4];
-                        if (tgx % 16 == 0)
-                            tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+8];
-                        if (tgx == 0)
-                            psA[j].sum += tempBuffer[threadIdx.x] + tempBuffer[threadIdx.x+16];
-                    }
-                }
-            }
-#endif
-            // Write results
-#ifdef USE_OUTPUT_BUFFER_PER_WARP
-            unsigned int offset = x + tgx + warp*cSim.stride;
-            cSim.pBornSum[offset] += apos.w;
-            offset = y + tgx + warp*cSim.stride;
-            cSim.pBornSum[offset] += sA[threadIdx.x].sum;
-#else
-            unsigned int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
-            cSim.pBornSum[offset] = apos.w;
-            offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
-            cSim.pBornSum[offset] = sA[threadIdx.x].sum;
-#endif
-        }
-        pos++;
-    }
-}
--- a/platforms/cuda-old/src/kernels/kCalculateObcGbsaForces2.cu
+++ b/platforms/cuda-old/src/kernels/kCalculateObcGbsaForces2.cu
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-#include <stdio.h>
-#include <cuda.h>
-#include <vector_functions.h>
-#include <cstdlib>
-#include <string>
-#include <iostream>
-#include <fstream>
-using namespace std;
-#include "gputypes.h"
-#include "cudaKernels.h"
-struct Atom {
-    float x;
-    float y;
-    float z;
-    float r;
-    float sr;
-    float fx;
-    float fy;
-    float fz;
-    float fb;
-};
-static __constant__ cudaGmxSimulation cSim;
-void SetCalculateObcGbsaForces2Sim(gpuContext gpu)
-{
-    cudaError_t status;
-    status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));     
-    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
-}
-void GetCalculateObcGbsaForces2Sim(gpuContext gpu)
-{
-    cudaError_t status;
-    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));     
-    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
-}
-// Include versions of the kernels for N^2 calculations.
-#define METHOD_NAME(a, b) a##N2##b
-#include "kCalculateObcGbsaForces2.h"
-#define USE_OUTPUT_BUFFER_PER_WARP
-#undef METHOD_NAME
-#define METHOD_NAME(a, b) a##N2ByWarp##b
-#include "kCalculateObcGbsaForces2.h"
-// Include versions of the kernels with cutoffs.
-#undef METHOD_NAME
-#undef USE_OUTPUT_BUFFER_PER_WARP
-#define USE_CUTOFF
-#define METHOD_NAME(a, b) a##Cutoff##b
-#include "kCalculateObcGbsaForces2.h"
-#define USE_OUTPUT_BUFFER_PER_WARP
-#undef METHOD_NAME
-#define METHOD_NAME(a, b) a##CutoffByWarp##b
-#include "kCalculateObcGbsaForces2.h"
-// Include versions of the kernels with periodic boundary conditions.
-#undef METHOD_NAME
-#undef USE_OUTPUT_BUFFER_PER_WARP
-#define USE_PERIODIC
-#define METHOD_NAME(a, b) a##Periodic##b
-#include "kCalculateObcGbsaForces2.h"
-#define USE_OUTPUT_BUFFER_PER_WARP
-#undef METHOD_NAME
-#define METHOD_NAME(a, b) a##PeriodicByWarp##b
-#include "kCalculateObcGbsaForces2.h"
-void OPENMMCUDA_EXPORT kCalculateObcGbsaForces2(gpuContext gpu)
-{
-    //printf("kCalculateObcGbsaForces2\n");
-    switch (gpu->sim.nonbondedMethod)
-    {
-        case NO_CUTOFF:
-            if (gpu->bOutputBufferPerWarp)
-                kCalculateObcGbsaN2ByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
-                        sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pWorkUnit);
-            else
-                kCalculateObcGbsaN2Forces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
-                        sizeof(Atom)*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pWorkUnit);
-            break;
-        case CUTOFF:
-            if (gpu->bOutputBufferPerWarp)
-                kCalculateObcGbsaCutoffByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
-                        (sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
-            else
-                kCalculateObcGbsaCutoffForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
-                        (sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
-            break;
-        case PERIODIC:
-            if (gpu->bOutputBufferPerWarp)
-                kCalculateObcGbsaPeriodicByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
-                        (sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
-            else
-                kCalculateObcGbsaPeriodicForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block,
-                        (sizeof(Atom)+sizeof(float3))*gpu->sim.bornForce2_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
-            break;
-    }
-    LAUNCHERROR("kCalculateObcGbsaForces2");
-    //kPrintObc( gpu, "Post kCalculateObcGbsaForces2", 0, stderr );
-}
--- a/platforms/cuda-old/src/kernels/kCalculateObcGbsaForces2.h
+++ b/platforms/cuda-old/src/kernels/kCalculateObcGbsaForces2.h
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-/**
- * This file contains the kernel for evalauating the second stage of GBSA.  It is included
- * several times in kCalculateObcGbsaForces2.cu with different #defines to generate
- * different versions of the kernels.
- */
-__global__ 
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(GF1XX_BORNFORCE2_THREADS_PER_BLOCK, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(GT2XX_BORNFORCE2_THREADS_PER_BLOCK, 1)
-#else
-__launch_bounds__(G8X_BORNFORCE2_THREADS_PER_BLOCK, 1)
-#endif
-void METHOD_NAME(kCalculateObcGbsa, Forces2_kernel)(unsigned int* workUnit)
-{
-    extern __shared__ volatile Atom sA[];
-    unsigned int totalWarps = cSim.bornForce2_blocks*cSim.bornForce2_threads_per_block/GRID;
-    unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
-    unsigned int numWorkUnits = cSim.pInteractionCount[0];
-    unsigned int pos = warp*numWorkUnits/totalWarps;
-    unsigned int end = (warp+1)*numWorkUnits/totalWarps;
-#ifdef USE_CUTOFF
-    volatile float3* tempBuffer = (volatile float3*) &sA[cSim.bornForce2_threads_per_block];
-#endif
-    unsigned int lasty = -0xFFFFFFFF;
-    while (pos < end)
-    {
-        // Extract cell coordinates from appropriate work unit
-        unsigned int x                  = workUnit[pos];
-        unsigned int y                  = ((x >> 2) & 0x7fff) << GRIDBITS;
-        x                               = (x >> 17) << GRIDBITS;
-        unsigned int tgx                = threadIdx.x & (GRID - 1);
-        unsigned int i                  = x + tgx;
-        float4 apos                     = cSim.pPosq[i];
-        float2 a                        = cSim.pObcData[i];
-        float fb                        = cSim.pBornForce[i];
-        unsigned int tbx                = threadIdx.x - tgx;
-        unsigned int tj                 = tgx;
-        volatile Atom* psA              = &sA[tbx];
-        float3 af;
-        sA[threadIdx.x].fx = af.x   = 0.0f;
-        sA[threadIdx.x].fy = af.y   = 0.0f;
-        sA[threadIdx.x].fz = af.z   = 0.0f;
-        if (x == y) // Handle diagonals uniquely at 50% efficiency
-        {
-            // Read fixed atom data into registers and GRF
-            sA[threadIdx.x].x           = apos.x;
-            sA[threadIdx.x].y           = apos.y;
-            sA[threadIdx.x].z           = apos.z;
-            sA[threadIdx.x].r           = a.x;
-            sA[threadIdx.x].sr          = a.y;
-            sA[threadIdx.x].fb          = fb;
-            for (unsigned int j = (tgx+1)&(GRID-1); j != tgx; j = (j+1)&(GRID-1))
-            {
-                float dx                = psA[j].x - apos.x;
-                float dy                = psA[j].y - apos.y;
-                float dz                = psA[j].z - apos.z;
-#ifdef USE_PERIODIC
-                dx -= floorf(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
-                dy -= floorf(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
-                dz -= floorf(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
-#endif
-                float r2                = dx * dx + dy * dy + dz * dz;
-                float r                 = sqrtf(r2);
-                // Atom I Born forces and sum
-                float rScaledRadiusJ    = r + psA[j].sr;
-                float l_ij          = 1.0f / max(a.x, fabs(r - psA[j].sr));
-                float u_ij          = 1.0f / rScaledRadiusJ;
-                float rInverse      = 1.0f / r;
-                float l_ij2         = l_ij * l_ij;
-                float u_ij2         = u_ij * u_ij;
-                float r2Inverse     = rInverse * rInverse;
-                float t1            = logf(u_ij / l_ij);
-                float t2            = (l_ij2 - u_ij2);
-                float t3            = t2 * rInverse;
-                t1                 *= rInverse;
-                // Born Forces term
-                float term          =  0.125f *
-                                      (1.000f + psA[j].sr * psA[j].sr * r2Inverse) * t3 +
-                                       0.250f * t1 * r2Inverse;
-                float dE            = fb * term;
-#if defined USE_PERIODIC
-                if (a.x >= rScaledRadiusJ || i >= cSim.atoms || x+j >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
-#elif defined USE_CUTOFF
-                if (a.x >= rScaledRadiusJ || r2 > cSim.nonbondedCutoffSqr)
-#else
-                if (a.x >= rScaledRadiusJ || i >= cSim.atoms || x+j >= cSim.atoms )
-#endif
-                {
-                    dE              = 0.0f;
-                }
-                float d             = dx * dE;
-                af.x               -= d;
-                psA[j].fx          += d;
-                d                   = dy * dE;
-                af.y               -= d;
-                psA[j].fy          += d;
-                d                   = dz * dE;
-                af.z               -= d;
-                psA[j].fz          += d;
-            }
-            // Write results
-            float4 of;
-#ifdef USE_OUTPUT_BUFFER_PER_WARP
-            unsigned int offset         = x + tgx + warp*cSim.stride;
-#else
-            unsigned int offset         = x + tgx + (x >> GRIDBITS) * cSim.stride;
-#endif
-            of                          = cSim.pForce4[offset];
-            of.x                       += af.x + sA[threadIdx.x].fx;
-            of.y                       += af.y + sA[threadIdx.x].fy;
-            of.z                       += af.z + sA[threadIdx.x].fz;
-            cSim.pForce4[offset]       = of;
-        }
-        else
-        {
-            // Read fixed atom data into registers and GRF
-            if (lasty != y)
-            {
-                unsigned int j              = y + tgx;
-                float4 temp                 = cSim.pPosq[j];
-                float2 temp1                = cSim.pObcData[j];
-                sA[threadIdx.x].fb          = cSim.pBornForce[j];
-                sA[threadIdx.x].x           = temp.x;
-                sA[threadIdx.x].y           = temp.y;
-                sA[threadIdx.x].z           = temp.z;
-                sA[threadIdx.x].r           = temp1.x;
-                sA[threadIdx.x].sr          = temp1.y;
-            }
-            float sr2                   = a.y * a.y;
-#ifdef USE_CUTOFF
-            unsigned int flags = cSim.pInteractionFlag[pos];
-            if (flags == 0)
-            {
-                // No interactions in this block.
-            }
-            else if (flags == 0xFFFFFFFF)
-#endif
-            {
-                // Compute all interactions within this block.
-                for (unsigned int j = 0; j < GRID; j++)
-                {
-                    float dx                = psA[tj].x - apos.x;
-                    float dy                = psA[tj].y - apos.y;
-                    float dz                = psA[tj].z - apos.z;
-#ifdef USE_PERIODIC
-                    dx -= floorf(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
-                    dy -= floorf(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
-                    dz -= floorf(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
-#endif
-                    float r2                = dx * dx + dy * dy + dz * dz;
-                    float r                 = sqrtf(r2);
-                    // Interleaved Atom I and J Born Forces and sum components
-                    float r2Inverse         = 1.0f / r2;
-                    float rScaledRadiusJ    = r + psA[tj].sr;
-                    float rScaledRadiusI    = r + a.y;
-                    float rInverse          = 1.0f / r;
-                    float l_ijJ             = 1.0f / max(a.x, fabs(r - psA[tj].sr));
-                    float l_ijI             = 1.0f / max(psA[tj].r, fabs(r - a.y));
-                    float u_ijJ             = 1.0f / rScaledRadiusJ;
-                    float u_ijI             = 1.0f / rScaledRadiusI;
-                    float l_ij2J            = l_ijJ * l_ijJ;
-                    float l_ij2I            = l_ijI * l_ijI;
-                    float u_ij2J            = u_ijJ * u_ijJ;
-                    float u_ij2I            = u_ijI * u_ijI;
-                    float t1J               = logf(u_ijJ / l_ijJ);
-                    float t1I               = logf(u_ijI / l_ijI);
-                    float t2J               = (l_ij2J - u_ij2J);
-                    float t2I               = (l_ij2I - u_ij2I);
-                    float t3J               = t2J * rInverse;
-                    float t3I               = t2I * rInverse;
-                    t1J                    *= rInverse;
-                    t1I                    *= rInverse;
-                    // Born Forces term
-                    float term              =  0.125f *
-                                              (1.000f + psA[tj].sr * psA[tj].sr * r2Inverse) * t3J +
-                                               0.250f * t1J * r2Inverse;
-                    float dE                = fb * term;
-#if defined USE_PERIODIC
-                    if (a.x >= rScaledRadiusJ || i >= cSim.atoms || y+tj >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
-#elif defined USE_CUTOFF
-                    if (a.x >= rScaledRadiusJ || r2 > cSim.nonbondedCutoffSqr)
-#else
-                    if (a.x >= rScaledRadiusJ || i >= cSim.atoms || y+tj >= cSim.atoms )
-#endif
-                    {
-                        dE                  = 0.0f;
-                    }
-                    float d                 = dx * dE;
-                    af.x                   -= d;
-                    psA[tj].fx             += d;
-                    d                       = dy * dE;
-                    af.y                   -= d;
-                    psA[tj].fy             += d;
-                    d                       = dz * dE;
-                    af.z                   -= d;
-                    psA[tj].fz             += d;
-                    // Atom J Born sum term
-                    term                    =  0.125f *
-                                              (1.000f + sr2 * r2Inverse) * t3I +
-                                               0.250f * t1I * r2Inverse;
-                    dE                      = psA[tj].fb * term;
-                    float rj = psA[tj].r;
-#ifdef USE_PERIODIC
-                    if (rj >= rScaledRadiusI || i >= cSim.atoms || y+tj >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
-#elif defined USE_CUTOFF
-                    if (rj >= rScaledRadiusI || r2 > cSim.nonbondedCutoffSqr)
-#else
-                    if (rj >= rScaledRadiusI || i >= cSim.atoms || y+tj >= cSim.atoms )
-#endif
-                    {
-                        dE                  = 0.0f;
-                    }
-                    dx                     *= dE;
-                    dy                     *= dE;
-                    dz                     *= dE;
-                    psA[tj].fx             += dx;
-                    psA[tj].fy             += dy;
-                    psA[tj].fz             += dz;
-                    af.x                   -= dx;
-                    af.y                   -= dy;
-                    af.z                   -= dz;
-                    tj                      = (tj + 1) & (GRID - 1);
-                }
-            }
-#ifdef USE_CUTOFF
-            else
-            {
-                // Compute only a subset of the interactions in this block.
-                for (unsigned int j = 0; j < GRID; j++)
-                {
-                    if ((flags&(1<<j)) != 0)
-                    {
-                        float dx                = psA[j].x - apos.x;
-                        float dy                = psA[j].y - apos.y;
-                        float dz                = psA[j].z - apos.z;
-    #ifdef USE_PERIODIC
-                        dx -= floorf(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
-                        dy -= floorf(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
-                        dz -= floorf(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
-    #endif
-                        float r2                = dx * dx + dy * dy + dz * dz;
-                        float r                 = sqrtf(r2);
-                        // Interleaved Atom I and J Born Forces and sum components
-                        float r2Inverse         = 1.0f / r2;
-                        float rScaledRadiusJ    = r + psA[j].sr;
-                        float rScaledRadiusI    = r + a.y;
-                        float rInverse          = 1.0f / r;
-                        float l_ijJ             = 1.0f / max(a.x, fabs(r - psA[j].sr));
-                        float l_ijI             = 1.0f / max(psA[j].r, fabs(r - a.y));
-                        float u_ijJ             = 1.0f / rScaledRadiusJ;
-                        float u_ijI             = 1.0f / rScaledRadiusI;
-                        float l_ij2J            = l_ijJ * l_ijJ;
-                        float l_ij2I            = l_ijI * l_ijI;
-                        float u_ij2J            = u_ijJ * u_ijJ;
-                        float u_ij2I            = u_ijI * u_ijI;
-                        float t1J               = logf(u_ijJ / l_ijJ);
-                        float t1I               = logf(u_ijI / l_ijI);
-                        float t2J               = (l_ij2J - u_ij2J);
-                        float t2I               = (l_ij2I - u_ij2I);
-                        float t3J               = t2J * rInverse;
-                        float t3I               = t2I * rInverse;
-                        t1J                    *= rInverse;
-                        t1I                    *= rInverse;
-                        // Born Forces term
-                        float term              =  0.125f *
-                                                  (1.000f + psA[j].sr * psA[j].sr * r2Inverse) * t3J +
-                                                   0.250f * t1J * r2Inverse;
-                        float dE                = fb * term;
-    #if defined USE_PERIODIC
-                        if (a.x >= rScaledRadiusJ || i >= cSim.atoms || y+j >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
-    #elif defined USE_CUTOFF
-                        if (a.x >= rScaledRadiusJ || r2 > cSim.nonbondedCutoffSqr)
-    #else
-                        if (a.x >= rScaledRadiusJ || i >= cSim.atoms || y+j >= cSim.atoms )
-    #endif
-                        {
-                            dE                  = 0.0f;
-                        }
-                        float d                 = dx * dE;
-                        af.x                   -= d;
-                        tempBuffer[threadIdx.x].x = d;
-                        d                       = dy * dE;
-                        af.y                   -= d;
-                        tempBuffer[threadIdx.x].y = d;
-                        d                       = dz * dE;
-                        af.z                   -= d;
-                        tempBuffer[threadIdx.x].z = d;
-                        // Atom J Born sum term
-                        term                    =  0.125f *
-                                                  (1.000f + sr2 * r2Inverse) * t3I +
-                                                   0.250f * t1I * r2Inverse;
-                        dE                      = psA[j].fb * term;
-                        float rj = psA[j].r;
-    #ifdef USE_PERIODIC
-                        if (rj >= rScaledRadiusI || i >= cSim.atoms || y+j >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
-    #elif defined USE_CUTOFF
-                        if (rj >= rScaledRadiusI || r2 > cSim.nonbondedCutoffSqr)
-    #else
-                        if (rj >= rScaledRadiusI)
-    #endif
-                        {
-                            dE                  = 0.0f;
-                        }
-                        dx                     *= dE;
-                        dy                     *= dE;
-                        dz                     *= dE;
-                        tempBuffer[threadIdx.x].x += dx;
-                        tempBuffer[threadIdx.x].y += dy;
-                        tempBuffer[threadIdx.x].z += dz;
-                        af.x                   -= dx;
-                        af.y                   -= dy;
-                        af.z                   -= dz;
-                        // Sum the forces on atom j.
-                        if (tgx % 2 == 0)
-                        {
-                            tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+1].x;
-                            tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+1].y;
-                            tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+1].z;
-                        }
-                        if (tgx % 4 == 0)
-                        {
-                            tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+2].x;
-                            tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+2].y;
-                            tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+2].z;
-                        }
-                        if (tgx % 8 == 0)
-                        {
-                            tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+4].x;
-                            tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+4].y;
-                            tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+4].z;
-                        }
-                        if (tgx % 16 == 0)
-                        {
-                            tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+8].x;
-                            tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+8].y;
-                            tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+8].z;
-                        }
-                        if (tgx == 0)
-                        {
-                            psA[j].fx += tempBuffer[threadIdx.x].x + tempBuffer[threadIdx.x+16].x;
-                            psA[j].fy += tempBuffer[threadIdx.x].y + tempBuffer[threadIdx.x+16].y;
-                            psA[j].fz += tempBuffer[threadIdx.x].z + tempBuffer[threadIdx.x+16].z;
-                        }
-                    }
-                }
-            }
-#endif
-            // Write results
-            float4 of;
-#ifdef USE_OUTPUT_BUFFER_PER_WARP
-            unsigned int offset         = x + tgx + warp*cSim.stride;
-#else
-            unsigned int offset         = x + tgx + (y >> GRIDBITS) * cSim.stride;
-#endif
-            of                          = cSim.pForce4[offset];
-            of.x                       += af.x;
-            of.y                       += af.y;
-            of.z                       += af.z;
-            cSim.pForce4[offset]       = of;
-#ifdef USE_OUTPUT_BUFFER_PER_WARP
-            offset                      = y + tgx + warp*cSim.stride;
-#else
-            offset                      = y + tgx + (x >> GRIDBITS) * cSim.stride;
-#endif
-            of                          = cSim.pForce4[offset];
-            of.x                       += sA[threadIdx.x].fx;
-            of.y                       += sA[threadIdx.x].fy;
-            of.z                       += sA[threadIdx.x].fz;
-            cSim.pForce4[offset]       = of;
-        }
-        lasty = y;
-        pos++;
-    }
-}
--- a/platforms/cuda-old/src/kernels/kCalculatePME.cu
+++ b/platforms/cuda-old/src/kernels/kCalculatePME.cu
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009-2010 Stanford University and the Authors.      *
- * Authors: Erik Lindahl, Rossen Apostolov, Szilard Pall, Peter Eastman       *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-#include "gputypes.h"
-#include "bbsort.h"
-#include <cuda.h>
-using namespace std;
-static __constant__ cudaGmxSimulation cSim;
-/* Cuda compiler on Windows does not recognized "static const float" values */
-#define LOCAL_HACK_PI 3.1415926535897932384626433832795
-void SetCalculatePMESim(gpuContext gpu)
-{
-    cudaError_t status;
-    status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
-    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
-}
-void GetCalculatePMESim(gpuContext gpu)
-{
-    cudaError_t status;
-    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
-    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
-}
-texture<float4, 1, cudaReadModeElementType> bsplineThetaRef;
-inline __host__ __device__ int fast_mod(int a, int b)
-{
-    return (b & (b - 1)) ? a % b : a & (b - 1);
-}
-inline __host__ __device__ float4 make_float4(float s)
-{
-    return make_float4(s, s, s, s);
-}
-inline __host__ __device__ float4 operator-(float4 &a)
-{
-    return make_float4(-a.x, -a.y, -a.z, -a.w);
-}
-inline __host__ __device__ float4 operator-(float4 a, float4 b)
-{
-    return make_float4(a.x - b.x, a.y - b.y, a.z - b.z,  a.w - b.w);
-}
-inline __host__ __device__ float4 operator+(float4 a, float4 b)
-{
-    return make_float4(a.x + b.x, a.y + b.y, a.z + b.z,  a.w + b.w);
-}
-inline __host__ __device__ float4 operator+(float4 a, float b)
-{
-    return make_float4(a.x + b, a.y + b, a.z + b, a.w + b);
-}
-inline __host__ __device__ float4 operator+(float a, float4 b)
-{
-    return make_float4(a + b.x, a + b.y, a + b.z,  a + b.w);
-}
-inline __host__ __device__ float4 operator*(float s, float4 a)
-{
-    return make_float4(a.x * s, a.y * s, a.z * s, a.w * s);
-}
-inline __host__ __device__ float4 operator*(float4 a, float4 b)
-{
-    return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w + b.w);
-}
-inline __host__ __device__ float4 make_float4(int3 a)
-{
-    return make_float4((float) a.x, (float) a.y, (float) a.z, 0);
-}
-__global__ 
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(1024, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(512, 1)
-#else
-__launch_bounds__(256, 1)
-#endif
-void kUpdateBsplines_kernel()
-{
-    unsigned int    tnb = blockDim.x * gridDim.x;
-    unsigned int    tid = blockIdx.x * blockDim.x + threadIdx.x;
-    extern __shared__ float4 bsplines_cache[]; // size = 2 * block_size * pme_order
-    const float4 div_o   = make_float4(1.0f/(PME_ORDER - 1));
-    for (int i = tid; i < cSim.atoms; i += tnb)
-    {
-        float4* data    = &bsplines_cache[threadIdx.x*PME_ORDER];
-        float4* ddata   = &bsplines_cache[threadIdx.x*PME_ORDER + blockDim.x*PME_ORDER];
-        for (int j = 0; j < PME_ORDER; j++)
-        {
-	    data[j] = make_float4(0.0f);
-            ddata[j] = make_float4(0.0f);
-        }
-        float4 posq = cSim.pPosq[i];
-        posq.x -= floorf(posq.x*cSim.invPeriodicBoxSizeX)*cSim.periodicBoxSizeX;
-        posq.y -= floorf(posq.y*cSim.invPeriodicBoxSizeY)*cSim.periodicBoxSizeY;
-        posq.z -= floorf(posq.z*cSim.invPeriodicBoxSizeZ)*cSim.periodicBoxSizeZ;
-        float3 t = make_float3((posq.x*cSim.invPeriodicBoxSizeX)*cSim.pmeGridSize.x,
-                               (posq.y*cSim.invPeriodicBoxSizeY)*cSim.pmeGridSize.y,
-                               (posq.z*cSim.invPeriodicBoxSizeZ)*cSim.pmeGridSize.z);
-        float4 dr = make_float4(t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z, 0.0f);
-        int3 gridIndex = make_int3(((int) t.x) % cSim.pmeGridSize.x,
-                              ((int) t.y) % cSim.pmeGridSize.y,
-                              ((int) t.z) % cSim.pmeGridSize.z);
-        cSim.pPmeAtomGridIndex[i] = make_int2(i, gridIndex.x*cSim.pmeGridSize.y*cSim.pmeGridSize.z+gridIndex.y*cSim.pmeGridSize.z+gridIndex.z);
-        data[PME_ORDER - 1] = make_float4(0.0f);
-        data[1]            = dr;
-        data[0]            = make_float4(1.0f) - dr;
-        for (int j = 3; j < PME_ORDER; j++)
-        {
-            float div = 1.0f / ((float)j - 1.0f);
-            data[j - 1] = div * dr * data[j - 2];
-            for (int k = 1; k < (j - 1); k++)
-            {
-                data[j - k - 1] =
-                   div * (
-                           (dr + float(k))          * data[j - k - 2] +
-                           (-dr + ((float)(j - k))) * data[j - k - 1]);
-            }
-            data[0] = div * (- dr + 1) * data[0];
-        }
-        ddata[0] = -data[0];
-        for (int j = 1; j < PME_ORDER; j++)
-            ddata[j] = data[j - 1] - data[j];
-        data[PME_ORDER - 1] = div_o * dr * data[PME_ORDER - 2];
-        for (int j = 1; j < (PME_ORDER - 1); j++)
-        {
-            data[PME_ORDER - j - 1] =
-                div_o * (
-                    (dr + (float)j)                 * data[PME_ORDER - j - 2] +
-                    (-dr + ((float)(PME_ORDER - j))) * data[PME_ORDER - j - 1]
-                );
-        }
-        data[0] = div_o * (-dr + 1.0f) * data[0];
-        for (int j = 0; j < PME_ORDER; j++)
-        {
-            data[j].w = posq.w; // Storing the charge here improves cache coherency in the charge spreading kernel
-            cSim.pPmeBsplineTheta[i + j*cSim.atoms] =  data[j];
-            cSim.pPmeBsplineDtheta[i + j*cSim.atoms] = ddata[j];
-        }
-    }
-}
-/**
- * For each grid point, find the range of sorted atoms associated with that point.
- */
-__global__
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(1024, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(512, 1)
-#else
-__launch_bounds__(256, 1)
-#endif
-void kFindAtomRangeForGrid_kernel()
-{
-    int thread = blockIdx.x*blockDim.x+threadIdx.x;
-    int start = (cSim.atoms*thread)/(blockDim.x*gridDim.x);
-    int end = (cSim.atoms*(thread+1))/(blockDim.x*gridDim.x);
-    int last = (start == 0 ? -1 : cSim.pPmeAtomGridIndex[start-1].y);
-    for (int i = start; i < end; ++i)
-    {
-        int2 atomData = cSim.pPmeAtomGridIndex[i];
-        int gridIndex = atomData.y;
-        if (gridIndex != last)
-        {
-            for (int j = last+1; j <= gridIndex; ++j)
-                cSim.pPmeAtomRange[j] = i;
-            last = gridIndex;
-        }
-    }
-    // Fill in values beyond the last atom.
-    if (thread == blockDim.x*gridDim.x-1)
-    {
-        int gridSize = cSim.pmeGridSize.x*cSim.pmeGridSize.y*cSim.pmeGridSize.z;
-        for (int j = last+1; j <= gridSize; ++j)
-            cSim.pPmeAtomRange[j] = cSim.atoms;
-    }
-}
-/**
- * The grid index won't be needed again.  Reuse that component to hold the z index, thus saving
- * some work in the charge spreading kernel.
- */
-__global__
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(1024, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(512, 1)
-#else
-__launch_bounds__(256, 1)
-#endif
-void kRecordZIndex_kernel()
-{
-    int thread = blockIdx.x*blockDim.x+threadIdx.x;
-    int start = (cSim.atoms*thread)/(blockDim.x*gridDim.x);
-    int end = (cSim.atoms*(thread+1))/(blockDim.x*gridDim.x);
-    for (int i = start; i < end; ++i)
-    {
-        int2 atomData = cSim.pPmeAtomGridIndex[i];
-        float posz = cSim.pPosq[atomData.x].z;
-        posz -= floorf(posz*cSim.invPeriodicBoxSizeZ)*cSim.periodicBoxSizeZ;
-        int z = ((int) ((posz*cSim.invPeriodicBoxSizeZ)*cSim.pmeGridSize.z)) % cSim.pmeGridSize.z;
-        cSim.pPmeAtomGridIndex[i].y = z;
-    }
-}
-__global__
-void kGridSpreadCharge_kernel()
-{
-    unsigned int numGridPoints = cSim.pmeGridSize.x*cSim.pmeGridSize.y*cSim.pmeGridSize.z;
-    unsigned int numThreads = gridDim.x*blockDim.x;
-    for (int gridIndex = blockIdx.x*blockDim.x+threadIdx.x; gridIndex < numGridPoints; gridIndex += numThreads)
-    {
-        int3 gridPoint;
-        gridPoint.x = gridIndex/(cSim.pmeGridSize.y*cSim.pmeGridSize.z);
-        int remainder = gridIndex-gridPoint.x*cSim.pmeGridSize.y*cSim.pmeGridSize.z;
-        gridPoint.y = remainder/cSim.pmeGridSize.z;
-        gridPoint.z = remainder-gridPoint.y*cSim.pmeGridSize.z;
-        float result = 0.0f;
-        for (int ix = 0; ix < PME_ORDER; ++ix)
-        {
-            int x = gridPoint.x-ix+(gridPoint.x >= ix ? 0 : cSim.pmeGridSize.x);
-            for (int iy = 0; iy < PME_ORDER; ++iy)
-            {
-                int y = gridPoint.y-iy+(gridPoint.y >= iy ? 0 : cSim.pmeGridSize.y);
-                int z1 = gridPoint.z-PME_ORDER+1;
-                z1 += (z1 >= 0 ? 0 : cSim.pmeGridSize.z);
-                int z2 = (z1 < gridPoint.z ? gridPoint.z : cSim.pmeGridSize.z-1);
-                int gridIndex1 = x*cSim.pmeGridSize.y*cSim.pmeGridSize.z+y*cSim.pmeGridSize.z+z1;
-                int gridIndex2 = x*cSim.pmeGridSize.y*cSim.pmeGridSize.z+y*cSim.pmeGridSize.z+z2;
-                int firstAtom = cSim.pPmeAtomRange[gridIndex1];
-                int lastAtom = cSim.pPmeAtomRange[gridIndex2+1];
-                for (int i = firstAtom; i < lastAtom; ++i)
-                {
-                    int2 atomData = cSim.pPmeAtomGridIndex[i];
-                    int atomIndex = atomData.x;
-                    int z = atomData.y;
-                    int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : cSim.pmeGridSize.z);
-                    float atomCharge = tex1Dfetch(bsplineThetaRef, atomIndex+ix*cSim.atoms).w;
-                    result += atomCharge*tex1Dfetch(bsplineThetaRef, atomIndex+ix*cSim.atoms).x*tex1Dfetch(bsplineThetaRef, atomIndex+iy*cSim.atoms).y*tex1Dfetch(bsplineThetaRef, atomIndex+iz*cSim.atoms).z;
-                }
-                if (z1 > gridPoint.z)
-                {
-                    gridIndex1 = x*cSim.pmeGridSize.y*cSim.pmeGridSize.z+y*cSim.pmeGridSize.z;
-                    gridIndex2 = x*cSim.pmeGridSize.y*cSim.pmeGridSize.z+y*cSim.pmeGridSize.z+gridPoint.z;
-                    firstAtom = cSim.pPmeAtomRange[gridIndex1];
-                    lastAtom = cSim.pPmeAtomRange[gridIndex2+1];
-                    for (int i = firstAtom; i < lastAtom; ++i)
-                    {
-                        int2 atomData = cSim.pPmeAtomGridIndex[i];
-                        int atomIndex = atomData.x;
-                        int z = atomData.y;
-                        int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : cSim.pmeGridSize.z);
-                        float atomCharge = tex1Dfetch(bsplineThetaRef, atomIndex+ix*cSim.atoms).w;
-                        result += atomCharge*tex1Dfetch(bsplineThetaRef, atomIndex+ix*cSim.atoms).x*tex1Dfetch(bsplineThetaRef, atomIndex+iy*cSim.atoms).y*tex1Dfetch(bsplineThetaRef, atomIndex+iz*cSim.atoms).z;
-                    }
-                }
-            }
-        }
-        cSim.pPmeGrid[gridIndex] = make_cuComplex(result*sqrtf(cSim.epsfac), 0.0f);
-    }
-}
-__global__ 
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(1024, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(512, 1)
-#else
-__launch_bounds__(256, 1)
-#endif
-void kReciprocalConvolution_kernel()
-{
-    const unsigned int gridSize = cSim.pmeGridSize.x*cSim.pmeGridSize.y*cSim.pmeGridSize.z;
-    float expFactor = LOCAL_HACK_PI*LOCAL_HACK_PI/(cSim.alphaEwald*cSim.alphaEwald);
-    float scaleFactor = 1.0/(LOCAL_HACK_PI*cSim.periodicBoxSizeX*cSim.periodicBoxSizeY*cSim.periodicBoxSizeZ);
-    float energy = 0.0f;
-    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x)
-    {
-        int kx = index/(cSim.pmeGridSize.y*cSim.pmeGridSize.z);
-        int remainder = index-kx*cSim.pmeGridSize.y*cSim.pmeGridSize.z;
-        int ky = remainder/cSim.pmeGridSize.z;
-        int kz = remainder-ky*cSim.pmeGridSize.z;
-        if (kx == 0 && ky == 0 && kz == 0)
-            continue;
-        int mx = (kx < (cSim.pmeGridSize.x+1)/2) ? kx : (kx-cSim.pmeGridSize.x);
-        int my = (ky < (cSim.pmeGridSize.y+1)/2) ? ky : (ky-cSim.pmeGridSize.y);
-        int mz = (kz < (cSim.pmeGridSize.z+1)/2) ? kz : (kz-cSim.pmeGridSize.z);
-        float mhx = mx*cSim.invPeriodicBoxSizeX;
-        float mhy = my*cSim.invPeriodicBoxSizeY;
-        float mhz = mz*cSim.invPeriodicBoxSizeZ;
-        float bx = cSim.pPmeBsplineModuli[0][kx];
-        float by = cSim.pPmeBsplineModuli[1][ky];
-        float bz = cSim.pPmeBsplineModuli[2][kz];
-        cuComplex grid = cSim.pPmeGrid[index];
-        float m2 = mhx*mhx+mhy*mhy+mhz*mhz;
-        float denom = m2*bx*by*bz;
-        float eterm = scaleFactor*exp(-expFactor*m2)/denom;
-        cSim.pPmeGrid[index] = make_cuComplex(grid.x*eterm, grid.y*eterm);
-        energy += eterm*(grid.x*grid.x + grid.y*grid.y);
-    }
-    cSim.pEnergy[blockIdx.x*blockDim.x+threadIdx.x] += 0.5f*energy;
-}
-__global__ 
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(1024, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(512, 1)
-#else
-__launch_bounds__(256, 1)
-#endif
-void kGridInterpolateForce_kernel()
-{
-    for (int atom = blockIdx.x*blockDim.x+threadIdx.x; atom < cSim.atoms; atom += blockDim.x*gridDim.x)
-    {
-        float3 force = make_float3(0.0f, 0.0f, 0.0f);
-        float4 posq = cSim.pPosq[atom];
-        posq.x -= floorf(posq.x*cSim.invPeriodicBoxSizeX)*cSim.periodicBoxSizeX;
-        posq.y -= floorf(posq.y*cSim.invPeriodicBoxSizeY)*cSim.periodicBoxSizeY;
-        posq.z -= floorf(posq.z*cSim.invPeriodicBoxSizeZ)*cSim.periodicBoxSizeZ;
-        float3 t = make_float3((posq.x*cSim.invPeriodicBoxSizeX)*cSim.pmeGridSize.x,
-                               (posq.y*cSim.invPeriodicBoxSizeY)*cSim.pmeGridSize.y,
-                               (posq.z*cSim.invPeriodicBoxSizeZ)*cSim.pmeGridSize.z);
-        int3 gridIndex = make_int3(((int) t.x) % cSim.pmeGridSize.x,
-                              ((int) t.y) % cSim.pmeGridSize.y,
-                              ((int) t.z) % cSim.pmeGridSize.z);
-        for (int ix = 0; ix < PME_ORDER; ix++)
-        {
-            int xindex = gridIndex.x+ix;
-            xindex -= (xindex >= cSim.pmeGridSize.x ? cSim.pmeGridSize.x : 0);
-            float tx = cSim.pPmeBsplineTheta[atom+ix*cSim.atoms].x;
-            float dtx = cSim.pPmeBsplineDtheta[atom+ix*cSim.atoms].x;
-            for (int iy = 0; iy < PME_ORDER; iy++)
-            {
-                int yindex = gridIndex.y+iy;
-                yindex -= (yindex >= cSim.pmeGridSize.y ? cSim.pmeGridSize.y : 0);
-                float ty = cSim.pPmeBsplineTheta[atom+iy*cSim.atoms].y;
-                float dty = cSim.pPmeBsplineDtheta[atom+iy*cSim.atoms].y;
-                for (int iz = 0; iz < PME_ORDER; iz++)
-                {
-                    int zindex               = gridIndex.z+iz;
-                    zindex -= (zindex >= cSim.pmeGridSize.z ? cSim.pmeGridSize.z : 0);
-                    float tz = cSim.pPmeBsplineTheta[atom+iz*cSim.atoms].z;
-                    float dtz = cSim.pPmeBsplineDtheta[atom+iz*cSim.atoms].z;
-                    int index                = xindex*cSim.pmeGridSize.y*cSim.pmeGridSize.z + yindex*cSim.pmeGridSize.z + zindex;
-                    float gridvalue          = cSim.pPmeGrid[index].x;
-                    force.x                 += dtx*ty*tz*gridvalue;
-                    force.y                 += tx*dty*tz*gridvalue;
-                    force.z                 += tx*ty*dtz*gridvalue;
-                }
-            }
-        }
-        float4 totalForce = cSim.pForce4[atom];
-        float q = posq.w*sqrtf(cSim.epsfac);
-        totalForce.x -= q*force.x*cSim.pmeGridSize.x*cSim.invPeriodicBoxSizeX;
-        totalForce.y -= q*force.y*cSim.pmeGridSize.y*cSim.invPeriodicBoxSizeY;
-        totalForce.z -= q*force.z*cSim.pmeGridSize.z*cSim.invPeriodicBoxSizeZ;
-        cSim.pForce4[atom] = totalForce;
-    }
-}
-void kCalculatePME(gpuContext gpu)
-{
-//    printf("kCalculatePME\n");
-    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float4>();
-    cudaBindTexture(NULL, &bsplineThetaRef, gpu->psPmeBsplineTheta->_pDevData, &channelDesc, gpu->psPmeBsplineTheta->_length*sizeof(float4));
-    unsigned int threads = 16380/(2*PME_ORDER*sizeof(float4));
-    kUpdateBsplines_kernel<<<gpu->sim.blocks, threads, 2*threads*PME_ORDER*sizeof(float4)>>>();
-    LAUNCHERROR("kUpdateBsplines");
-    bbSort(gpu->psPmeAtomGridIndex->_pDevData, gpu->natoms);
-    kFindAtomRangeForGrid_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
-    LAUNCHERROR("kFindAtomRangeForGrid");
-    kRecordZIndex_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
-    LAUNCHERROR("kRecordZIndex");
-    kGridSpreadCharge_kernel<<<16*gpu->sim.blocks, 64>>>();
-    LAUNCHERROR("kGridSpreadCharge");
-    cufftExecC2C(gpu->fftplan, gpu->psPmeGrid->_pDevData, gpu->psPmeGrid->_pDevData, CUFFT_FORWARD);
-    kReciprocalConvolution_kernel<<<gpu->sim.blocks, gpu->sim.nonbond_threads_per_block>>>();
-    LAUNCHERROR("kReciprocalConvolution");
-    cufftExecC2C(gpu->fftplan, gpu->psPmeGrid->_pDevData, gpu->psPmeGrid->_pDevData, CUFFT_INVERSE);
-    kGridInterpolateForce_kernel<<<2*gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
-    LAUNCHERROR("kGridInterpolateForce");
-}
--- a/platforms/cuda-old/src/kernels/kEvaluateExpression.h
+++ b/platforms/cuda-old/src/kernels/kEvaluateExpression.h
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Peter Eastman                                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-/**
- * This file contains the routine for evaluating a custom expression.
- */
-static __constant__ float globalParams[8];
-texture<float4, 1, cudaReadModeElementType> texRef0;
-texture<float4, 1, cudaReadModeElementType> texRef1;
-texture<float4, 1, cudaReadModeElementType> texRef2;
-texture<float4, 1, cudaReadModeElementType> texRef3;
-#define STACK(y) stack[(y)*blockDim.x+threadIdx.x]
-#define VARIABLE(y) variables[(y)*blockDim.x+threadIdx.x]
-template<int SIZE>
-__device__ float kEvaluateExpression_kernel(Expression<SIZE>* expression, float* stack, float* variables)
-{
-    int stackPointer = -1;
-    for (int i = 0; i < expression->length; i++)
-    {
-        int op = expression->op[i];
-        if (op < MULTIPLY) {
-            STACK(++stackPointer) = VARIABLE(op-VARIABLE0);
-        }
-        else if (op < NEGATE) {
-            if (op < MULTIPLY_CONSTANT) {
-                if (op == MULTIPLY) {
-                    float temp = STACK(stackPointer);
-                    STACK(--stackPointer) *= temp;
-                }
-                else if (op == DIVIDE) {
-                    float temp = STACK(stackPointer);
-                    STACK(stackPointer) = temp/STACK(--stackPointer);
-                }
-                else if (op == ADD) {
-                    float temp = STACK(stackPointer);
-                    STACK(--stackPointer) += temp;
-                }
-                else if (op == SUBTRACT) {
-                    float temp = STACK(stackPointer);
-                    STACK(stackPointer) = temp-STACK(--stackPointer);
-                }
-                else /*if (op == POWER)*/ {
-                    float temp = STACK(stackPointer);
-                    STACK(stackPointer) = pow(temp, STACK(--stackPointer));
-                }
-            }
-            else if (op < GLOBAL) {
-                if (op == MULTIPLY_CONSTANT) {
-                    STACK(stackPointer) *= expression->arg[i];
-                }
-                else if (op == POWER_CONSTANT) {
-                    STACK(stackPointer) = pow(STACK(stackPointer), expression->arg[i]);
-                }
-                else /*if (op == ADD_CONSTANT)*/ {
-                    STACK(stackPointer) += expression->arg[i];
-                }
-            }
-            else {
-                if (op == GLOBAL) {
-                    STACK(++stackPointer) = globalParams[(int) expression->arg[i]];
-                }
-                else if (op == CONSTANT) {
-                    STACK(++stackPointer) = expression->arg[i];
-                }
-                else /*if (op == CUSTOM || op == CUSTOM_DERIV)*/ {
-                    int function = (int) expression->arg[i];
-                    float x = STACK(stackPointer);
-                    float4 params = cSim.pTabulatedFunctionParams[function];
-                    if (x < params.x || x > params.y)
-                        STACK(stackPointer) = 0.0f;
-                    else
-                    {
-                        x = (x-params.x)*params.z;
-                        int index = floor(x);
-                        index = min(index, (int) params.w);
-                        float4 coeff;
-                        if (function == 0)
-                            coeff = tex1Dfetch(texRef0, index);
-                        else if (function == 1)
-                            coeff = tex1Dfetch(texRef1, index);
-                        else if (function == 2)
-                            coeff = tex1Dfetch(texRef2, index);
-                        else
-                            coeff = tex1Dfetch(texRef3, index);
-                        float b = x-index;
-                        float a = 1.0f-b;
-                        if (op == CUSTOM)
-                            STACK(stackPointer) = a*coeff.x+b*coeff.y+((a*a*a-a)*coeff.z+(b*b*b-b)*coeff.w)/(params.z*params.z);
-                        else
-                            STACK(stackPointer) = (coeff.y-coeff.x)*params.z+((1.0f-3.0f*a*a)*coeff.z+(3.0f*b*b-1.0f)*coeff.w)/params.z;
-                    }
-                }
-            }
-        }
-        else {
-            if (op < SIN) {
-                if (op == NEGATE) {
-                    STACK(stackPointer) *= -1.0f;
-                }
-                else if (op == RECIPROCAL) {
-                    STACK(stackPointer) = 1.0f/STACK(stackPointer);
-                }
-                else if (op == SQRT) {
-                    STACK(stackPointer) = sqrt(STACK(stackPointer));
-                }
-                else if (op == EXP) {
-                    STACK(stackPointer) = exp(STACK(stackPointer));
-                }
-                else if (op == LOG) {
-                    STACK(stackPointer) = log(STACK(stackPointer));
-                }
-                else if (op == SQUARE) {
-                    float temp = STACK(stackPointer);
-                    STACK(stackPointer) *= temp;
-                }
-                else if (op == CUBE) {
-                    float temp = STACK(stackPointer);
-                    STACK(stackPointer) *= temp*temp;
-                }
-                else /*if (op == STEP)*/ {
-                    STACK(stackPointer) = (STACK(stackPointer) >= 0.0f ? 1.0f : 0.0f);
-                }
-            }
-            else {
-                if (op == SIN) {
-                    STACK(stackPointer) = sin(STACK(stackPointer));
-                }
-                else if (op == COS) {
-                    STACK(stackPointer) = cos(STACK(stackPointer));
-                }
-                else if (op == SEC) {
-                    STACK(stackPointer) = 1.0f/cos(STACK(stackPointer));
-                }
-                else if (op == CSC) {
-                    STACK(stackPointer) = 1.0f/sin(STACK(stackPointer));
-                }
-                else if (op == TAN) {
-                    STACK(stackPointer) = tan(STACK(stackPointer));
-                }
-                else if (op == COT) {
-                    STACK(stackPointer) = 1.0f/tan(STACK(stackPointer));
-                }
-                else if (op == ASIN) {
-                    STACK(stackPointer) = asin(STACK(stackPointer));
-                }
-                else if (op == ACOS) {
-                    STACK(stackPointer) = acos(STACK(stackPointer));
-                }
-                else if (op == ATAN) {
-                    STACK(stackPointer) = atan(STACK(stackPointer));
-                }
-                else if (op == SINH) {
-                    STACK(stackPointer) = sinh(STACK(stackPointer));
-                }
-                else if (op == COSH) {
-                    STACK(stackPointer) = cosh(STACK(stackPointer));
-                }
-                else if (op == TANH) {
-                    STACK(stackPointer) = tanh(STACK(stackPointer));
-                }
-                else if (op == ERF) {
-                    STACK(stackPointer) = erf(STACK(stackPointer));
-                }
-                else if (op == ERFC) {
-                    STACK(stackPointer) = erfc(STACK(stackPointer));
-                }
-                else if (op == MIN) {
-                    float temp = STACK(stackPointer);
-                    STACK(stackPointer) = min(temp, STACK(--stackPointer));
-                }
-                else if (op == MAX) {
-                    float temp = STACK(stackPointer);
-                    STACK(stackPointer) = max(temp, STACK(--stackPointer));
-                }
-                else /*if (op == ABS)*/ {
-                    STACK(stackPointer) = fabs(STACK(stackPointer));
-                }
-            }
-        }
-    }
-    return STACK(stackPointer);
-}
--- a/platforms/cuda-old/src/kernels/kFindInteractingBlocks.h
+++ b/platforms/cuda-old/src/kernels/kFindInteractingBlocks.h
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-/**
- * This file contains the kernels for identifying interacting blocks.  It is included
- * several times in kCalculateCDLJForces.cu with different #defines to generate
- * different versions of the kernels.
- */
-/**
- * Find a bounding box for the atoms in each block.
- */
-__global__ void METHOD_NAME(kFindBlockBounds, _kernel)()
-{
-    unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
-    unsigned int base = pos << GRIDBITS;
-    if (base < cSim.atoms)
-    {
-        float4 apos = cSim.pPosq[base];
-#ifdef USE_PERIODIC
-        apos.x -= floorf(apos.x*cSim.invPeriodicBoxSizeX)*cSim.periodicBoxSizeX;
-        apos.y -= floorf(apos.y*cSim.invPeriodicBoxSizeY)*cSim.periodicBoxSizeY;
-        apos.z -= floorf(apos.z*cSim.invPeriodicBoxSizeZ)*cSim.periodicBoxSizeZ;
-        float4 firstPoint = apos;
-#endif
-        float minx = apos.x;
-        float maxx = apos.x;
-        float miny = apos.y;
-        float maxy = apos.y;
-        float minz = apos.z;
-        float maxz = apos.z;
-        for (unsigned int i = 1; i < GRID; i++)
-        {
-            apos = cSim.pPosq[base+i];
-#ifdef USE_PERIODIC
-            apos.x -= floorf((apos.x-firstPoint.x)*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
-            apos.y -= floorf((apos.y-firstPoint.y)*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
-            apos.z -= floorf((apos.z-firstPoint.z)*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
-#endif
-            minx = min(minx, apos.x);
-            maxx = max(maxx, apos.x);
-            miny = min(miny, apos.y);
-            maxy = max(maxy, apos.y);
-            minz = min(minz, apos.z);
-            maxz = max(maxz, apos.z);
-        }
-        cSim.pGridBoundingBox[pos] = make_float4(0.5f*(maxx-minx), 0.5f*(maxy-miny), 0.5f*(maxz-minz), 0);
-        cSim.pGridCenter[pos] = make_float4(0.5f*(maxx+minx), 0.5f*(maxy+miny), 0.5f*(maxz+minz), 0);
-    }
-}
-/**
- * Compare the bounding boxes for each pair of blocks.  If they are sufficiently far apart,
- * mark them as non-interacting.
- */
-__global__ void METHOD_NAME(kFindBlocksWithInteractions, _kernel)()
-{
-    unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
-    while (pos < cSim.workUnits)
-    {
-        // Extract cell coordinates from appropriate work unit
-        unsigned int x = cSim.pWorkUnit[pos];
-        unsigned int y = ((x >> 2) & 0x7fff);
-        x = (x >> 17);
-        // Find the distance between the bounding boxes of the two cells.
-        float4 centera = cSim.pGridCenter[x];
-        float4 centerb = cSim.pGridCenter[y];
-        float dx = centera.x-centerb.x;
-        float dy = centera.y-centerb.y;
-        float dz = centera.z-centerb.z;
-#ifdef USE_PERIODIC
-        dx -= floorf(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
-        dy -= floorf(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
-        dz -= floorf(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
-#endif
-        float4 boxSizea = cSim.pGridBoundingBox[x];
-        float4 boxSizeb = cSim.pGridBoundingBox[y];
-        dx = max(0.0f, abs(dx)-boxSizea.x-boxSizeb.x);
-        dy = max(0.0f, abs(dy)-boxSizea.y-boxSizeb.y);
-        dz = max(0.0f, abs(dz)-boxSizea.z-boxSizeb.z);
-        cSim.pInteractionFlag[pos] = (dx*dx+dy*dy+dz*dz > cSim.nonbondedCutoffSqr ? 0 : 1);
-        pos += gridDim.x*blockDim.x;
-    }
-}
-/**
- * Compare each atom in one block to the bounding box of another block, and set
- * flags for which ones are interacting.
- */
-__global__ void METHOD_NAME(kFindInteractionsWithinBlocks, _kernel)(unsigned int* workUnit)
-{
-    extern __shared__ volatile unsigned int flags[];
-    unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
-    unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
-    unsigned int numWorkUnits = cSim.pInteractionCount[0];
-    unsigned int pos = warp*numWorkUnits/totalWarps;
-    unsigned int end = (warp+1)*numWorkUnits/totalWarps;
-    unsigned int index = threadIdx.x & (GRID - 1);
-    unsigned int lasty = 0xFFFFFFFF;
-    float4 apos;
-    while (pos < end)
-    {
-        // Extract cell coordinates from appropriate work unit
-        unsigned int x = workUnit[pos];
-        unsigned int y = ((x >> 2) & 0x7fff);
-        bool bExclusionFlag = (x & 0x1);
-        x = (x >> 17);
-        if (x == y || bExclusionFlag)
-        {
-            // Assume this block will be dense.
-            if (index == 0)
-                cSim.pInteractionFlag[pos] = 0xFFFFFFFF;
-        }
-        else
-        {
-            // Load the bounding box for x and the atom positions for y.
-            float4 center = cSim.pGridCenter[x];
-            float4 boxSize = cSim.pGridBoundingBox[x];
-            if (y != lasty)
-            {
-                apos = cSim.pPosq[(y<<GRIDBITS)+index];
-            }
-            // Find the distance of the atom from the bounding box.
-            float dx = apos.x-center.x;
-            float dy = apos.y-center.y;
-            float dz = apos.z-center.z;
-#ifdef USE_PERIODIC
-            dx -= floorf(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
-            dy -= floorf(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
-            dz -= floorf(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
-#endif
-            dx = max(0.0f, abs(dx)-boxSize.x);
-            dy = max(0.0f, abs(dy)-boxSize.y);
-            dz = max(0.0f, abs(dz)-boxSize.z);
-            flags[threadIdx.x] = (dx*dx+dy*dy+dz*dz > cSim.nonbondedCutoffSqr ? 0 : 1 << index);
-            // Sum the flags.
-            if (index % 2 == 0)
-                flags[threadIdx.x] += flags[threadIdx.x+1];
-            if (index % 4 == 0)
-                flags[threadIdx.x] += flags[threadIdx.x+2];
-            if (index % 8 == 0)
-                flags[threadIdx.x] += flags[threadIdx.x+4];
-            if (index % 16 == 0)
-                flags[threadIdx.x] += flags[threadIdx.x+8];
-            if (index == 0)
-            {
-                unsigned int allFlags = flags[threadIdx.x] + flags[threadIdx.x+16];
-                // Count how many flags are set, and based on that decide whether to compute all interactions
-                // or only a fraction of them.
-                unsigned int bits = (allFlags&0x55555555) + ((allFlags>>1)&0x55555555);
-                bits = (bits&0x33333333) + ((bits>>2)&0x33333333);
-                bits = (bits&0x0F0F0F0F) + ((bits>>4)&0x0F0F0F0F);
-                bits = (bits&0x00FF00FF) + ((bits>>8)&0x00FF00FF);
-                bits = (bits&0x0000FFFF) + ((bits>>16)&0x0000FFFF);
-                cSim.pInteractionFlag[pos] = (bits > 12 ? 0xFFFFFFFF : allFlags);
-            }
-            lasty = y;
-        }
-        pos++;
-    }
-}
--- a/platforms/cuda-old/src/kernels/kForces.cu
+++ b/platforms/cuda-old/src/kernels/kForces.cu
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-#include <stdio.h>
-#include <cuda.h>
-#include <vector_functions.h>
-#include <cstdlib>
-#include <string>
-#include <iostream>
-#include <fstream>
-using namespace std;
-#include "gputypes.h"
-#define FABS(a) ((a) > 0.0f ? (a) : -(a))
-static __constant__ cudaGmxSimulation cSim;
-void OPENMMCUDA_EXPORT SetForcesSim(gpuContext gpu)
-{
-    cudaError_t status;
-    status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));     
-    RTERROR(status, "cudaMemcpyToSymbol: SetForcesSim copy to cSim failed");
-}
-void GetForcesSim(gpuContext gpu)
-{
-    cudaError_t status;
-    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));     
-    RTERROR(status, "cudaMemcpyFromSymbol: GetForcesSim copy from cSim failed");
-}
-__global__ 
-__launch_bounds__(384, 1)
-void kClearForces_kernel()
-{
-    unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
-    while (pos < cSim.stride * cSim.outputBuffers)
-    {
-        cSim.pForce4[pos] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-        pos += gridDim.x * blockDim.x;
-    }
-}
-void OPENMMCUDA_EXPORT kClearForces(gpuContext gpu)
-{
-//    printf("kClearForces\n");
-    kClearForces_kernel<<<gpu->sim.blocks, 384>>>();
-    LAUNCHERROR("kClearForces");
-}
-__global__ 
-__launch_bounds__(384, 1)
-void kClearBornSumAndForces_kernel()
-{
-    unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
-    while (pos < cSim.stride * cSim.nonbondOutputBuffers)
-    {
-        cSim.pBornSum[pos] = 0.0f;
-        cSim.pBornForce[pos] = 0.0f;
-        cSim.pForce4[pos] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-        pos += gridDim.x * blockDim.x;
-    }
-    while (pos < cSim.stride * cSim.outputBuffers)
-    {
-        cSim.pForce4[pos] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-        pos += gridDim.x * blockDim.x;
-    }
-}
-void kClearBornSumAndForces(gpuContext gpu)
-{
-  //  printf("kClearBornSumAndForces\n");
-    kClearBornSumAndForces_kernel<<<gpu->sim.blocks, 384>>>();
-    LAUNCHERROR("kClearBornSumAndForces");
-}
-__global__ 
-__launch_bounds__(384, 1)
-void kClearEnergy_kernel()
-{
-    unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
-    while (pos < cSim.energyOutputBuffers)
-    {
-        ((float*)cSim.pEnergy)[pos] = 0.0f;
-        pos += gridDim.x * blockDim.x;
-    }
-}
-void kClearEnergy(gpuContext gpu)
-{
-  //  printf("kClearEnergy\n");
-    kClearEnergy_kernel<<<gpu->sim.blocks, 384>>>();
-    LAUNCHERROR("kClearEnergy");
-}
-__global__ 
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
-#else
-__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
-#endif
-void kReduceBornSumAndForces_kernel()
-{
-    unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
-    // Reduce forces
-    while (pos < cSim.stride4)
-    {
-        float totalForce = 0.0f;
-        float* pFt = (float*)cSim.pForce4 + pos;
-        int i = cSim.outputBuffers;
-        while (i >= 4)
-        {
-            float f1    = *pFt;
-            pFt        += cSim.stride4;
-            float f2    = *pFt;
-            pFt        += cSim.stride4;
-            float f3    = *pFt;
-            pFt        += cSim.stride4;
-            float f4    = *pFt;
-            pFt        += cSim.stride4;
-            totalForce += f1 + f2 + f3 + f4;
-            i -= 4;
-        }
-        if (i >= 2)
-        {
-            float f1    = *pFt;
-            pFt        += cSim.stride4;
-            float f2    = *pFt;
-            pFt        += cSim.stride4;
-            totalForce += f1 + f2;
-            i -= 2;
-        }
-        if (i > 0)
-        {
-            totalForce += *pFt;
-        }
-        pFt = (float*)cSim.pForce4 + pos;
-        *pFt = totalForce;
-        pos += gridDim.x * blockDim.x;
-    }   
-    // Reduce Born Sum
-    while (pos - cSim.stride4 < cSim.atoms)
-    {
-        float sum = 0.0f;
-        float* pSt = cSim.pBornSum + pos - cSim.stride4;
-        float2 atom = cSim.pObcData[pos - cSim.stride4];
-        // Get summed Born data
-        int i = cSim.nonbondOutputBuffers;
-        while (i >= 4)
-        {
-            float f1    = *pSt;
-            pSt        += cSim.stride;
-            float f2    = *pSt;
-            pSt        += cSim.stride;
-            float f3    = *pSt;
-            pSt        += cSim.stride;
-            float f4    = *pSt;
-            pSt        += cSim.stride;
-            sum += f1 + f2 + f3 + f4;
-            i -= 4;
-        }
-        if (i >= 2)
-        {
-            float f1    = *pSt;
-            pSt        += cSim.stride;
-            float f2    = *pSt;
-            pSt        += cSim.stride;
-            sum += f1 + f2;
-            i -= 2;
-        }
-        if (i > 0)
-        {
-            sum += *pSt;
-        }
-        // Now calculate Born radius and OBC term.
-        cSim.pBornSum[pos - cSim.stride4] = sum; 
-        sum                    *= 0.5f * atom.x;
-        float sum2              = sum * sum;
-        float sum3              = sum * sum2;
-        float tanhSum           = tanh(cSim.alphaOBC * sum - cSim.betaOBC * sum2 + cSim.gammaOBC * sum3);
-        float nonOffsetRadii    = atom.x + cSim.dielectricOffset;
-        float bornRadius        = 1.0f / (1.0f / atom.x - tanhSum / nonOffsetRadii); 
-        float obcChain          = atom.x * (cSim.alphaOBC - 2.0f * cSim.betaOBC * sum + 3.0f * cSim.gammaOBC * sum2);
-        obcChain                = (1.0f - tanhSum * tanhSum) * obcChain / nonOffsetRadii;              
-        cSim.pBornRadii[pos - cSim.stride4] = bornRadius;
-        cSim.pObcChain[pos - cSim.stride4]  = obcChain;
-        pos += gridDim.x * blockDim.x;
-    }
-}
-void kReduceBornSumAndForces(gpuContext gpu)
-{
-    fprintf( stderr, "kReduceBornSumAndForces\n");
-    kReduceBornSumAndForces_kernel<<<gpu->sim.blocks, gpu->sim.bsf_reduce_threads_per_block>>>();
-    LAUNCHERROR("kReduceBornSumAndForces");
-}
-__global__ 
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
-#else
-__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
-#endif
-void kReduceForces_kernel()
-{
-    unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
-    // Reduce forces
-    while (pos < cSim.stride4)
-    {
-        float totalForce = 0.0f;
-        float* pFt = (float*)cSim.pForce4 + pos;
-        int i = cSim.outputBuffers;
-        while (i >= 4)
-        {
-            float f1    = *pFt;
-            pFt        += cSim.stride4;
-            float f2    = *pFt;
-            pFt        += cSim.stride4;
-            float f3    = *pFt;
-            pFt        += cSim.stride4;
-            float f4    = *pFt;
-            pFt        += cSim.stride4;
-            totalForce += f1 + f2 + f3 + f4;
-            i -= 4;
-        }
-        if (i >= 2)
-        {
-            float f1    = *pFt;
-            pFt        += cSim.stride4;
-            float f2    = *pFt;
-            pFt        += cSim.stride4;
-            totalForce += f1 + f2;
-            i -= 2;
-        }
-        if (i > 0)
-        {
-            totalForce += *pFt;
-        }
-        pFt = (float*)cSim.pForce4 + pos;
-        *pFt = totalForce;
-        pos += gridDim.x * blockDim.x;
-    }   
-}
-void OPENMMCUDA_EXPORT kReduceForces(gpuContext gpu)
-{
- //   printf("kReduceForces\n");
-    kReduceForces_kernel<<<gpu->sim.blocks, gpu->sim.bsf_reduce_threads_per_block>>>();
-    LAUNCHERROR("kReduceForces");
-}
-double kReduceEnergy(gpuContext gpu)
-{
-    //printf("kReduceEnergy\n");
-    gpu->psEnergy->Download();
-    double sum = 0.0;
-    for (int i = 0; i < static_cast<int>(gpu->sim.energyOutputBuffers); i++){
-        sum += (*gpu->psEnergy)[i];
-    }
-    return sum;
-}
-__global__ 
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
-#else
-__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
-#endif
-void kReduceObcGbsaBornForces_kernel()
-{
-    unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
-    float energy = 0.0f;
-    while (pos < cSim.atoms)
-    {
-        float bornRadius = cSim.pBornRadii[pos];
-        float obcChain   = cSim.pObcChain[pos];
-        float2 obcData   = cSim.pObcData[pos];
-        float totalForce = 0.0f;
-        float* pFt = cSim.pBornForce + pos;
-        int i = cSim.nonbondOutputBuffers;
-        while (i >= 4)
-        {
-            float f1    = *pFt;
-            pFt        += cSim.stride;
-            float f2    = *pFt;
-            pFt        += cSim.stride;
-            float f3    = *pFt;
-            pFt        += cSim.stride;
-            float f4    = *pFt;
-            pFt        += cSim.stride;
-            totalForce += f1 + f2 + f3 + f4;
-            i -= 4;
-        }
-        if (i >= 2)
-        {
-            float f1    = *pFt;
-            pFt        += cSim.stride;
-            float f2    = *pFt;
-            pFt        += cSim.stride;
-            totalForce += f1 + f2;
-            i -= 2;
-        }
-        if (i > 0)
-        {
-            totalForce += *pFt;
-        }
-        float r            = (obcData.x + cSim.dielectricOffset + cSim.probeRadius);
-        float ratio6       = powf((obcData.x + cSim.dielectricOffset) / bornRadius, 6.0f);
-        float saTerm       = cSim.surfaceAreaFactor * r * r * ratio6;
-        totalForce        += saTerm / bornRadius;
-        totalForce        *= bornRadius * bornRadius * obcChain;
-        energy            += saTerm;
-        pFt                = cSim.pBornForce + pos;
-        *pFt               = totalForce;
-        pos               += gridDim.x * blockDim.x;
-    }
-    // correct for surface area factor of -6
-    cSim.pEnergy[blockIdx.x * blockDim.x + threadIdx.x] += energy / -6.0f;
-}
-__global__ 
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
-#else
-__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
-#endif
-void kReduceGBVIBornForces_kernel()
-{
-    unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
-    float energy = 0.0f;
-    while (pos < cSim.atoms)
-    {
-        float bornRadius   = cSim.pBornRadii[pos];
-        float4 gbviData    = cSim.pGBVIData[pos];
-        float  switchDeriv = cSim.pGBVISwitchDerivative[pos];
-        float totalForce   = 0.0f;
-        float* pFt         = cSim.pBornForce + pos;
-        int i = cSim.nonbondOutputBuffers;
-        while (i >= 4)
-        {
-            float f1    = *pFt;
-            pFt        += cSim.stride;
-            float f2    = *pFt;
-            pFt        += cSim.stride;
-            float f3    = *pFt;
-            pFt        += cSim.stride;
-            float f4    = *pFt;
-            pFt        += cSim.stride;
-            totalForce += f1 + f2 + f3 + f4;
-            i -= 4;
-        }
-        if (i >= 2)
-        {
-            float f1    = *pFt;
-            pFt        += cSim.stride;
-            float f2    = *pFt;
-            pFt        += cSim.stride;
-            totalForce += f1 + f2;
-            i -= 2;
-        }
-        if (i > 0)
-        {
-            totalForce += *pFt;
-        }
-        float ratio         = (gbviData.x/bornRadius);
-        float ratio3        = ratio*ratio*ratio;
-        energy             -= gbviData.z*ratio3;
-        totalForce         += (3.0f*gbviData.z*ratio3)/bornRadius; // 'cavity' term
-        float br2           = bornRadius*bornRadius;
-        totalForce         *= (1.0f/3.0f)*br2*br2*switchDeriv;
-        pFt = cSim.pBornForce + pos;
-        *pFt = totalForce;
-        pos += gridDim.x * blockDim.x;
-    }
-    cSim.pEnergy[blockIdx.x * blockDim.x + threadIdx.x] += energy;
-}
-void kReduceObcGbsaBornForces(gpuContext gpu)
-{
-    if( gpu->bIncludeGBSA ){
-       kReduceObcGbsaBornForces_kernel<<<gpu->sim.blocks, gpu->sim.bsf_reduce_threads_per_block>>>();
-       LAUNCHERROR("kReduceObcGbsaBornForces");
-    } else if( gpu->bIncludeGBVI ){
-       kReduceGBVIBornForces_kernel<<<gpu->sim.blocks, gpu->sim.bsf_reduce_threads_per_block>>>();
-       LAUNCHERROR("kReduceGBVIBornForces");
-    }   
-}
--- a/platforms/cuda-old/src/kernels/kLangevinUpdate.cu
+++ b/platforms/cuda-old/src/kernels/kLangevinUpdate.cu
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-#include <stdio.h>
-#include <cuda.h>
-#include <vector_functions.h>
-#include <cstdlib>
-#include <string>
-#include <iostream>
-using namespace std;
-#include "gputypes.h"
-enum {VelScale, ForceScale, NoiseScale, MaxParams};
-static __constant__ cudaGmxSimulation cSim;
-void SetLangevinUpdateSim(gpuContext gpu)
-{
-    cudaError_t status;
-    status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));     
-    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
-}
-void GetLangevinUpdateSim(gpuContext gpu)
-{
-    cudaError_t status;
-    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));     
-    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
-}
-// Include versions of the kernels with and with center of mass motion removal.
-#include "kLangevinUpdate.h"
-#define REMOVE_CM
-#include "kLangevinUpdate.h"
-void kLangevinUpdatePart1(gpuContext gpu)
-{
-//    printf("kLangevinUpdatePart1\n");
-    if (gpu->bRemoveCM)
-    {
-        kLangevinUpdatePart1CM_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block, gpu->sim.update_threads_per_block * sizeof(float3)>>>();
-        LAUNCHERROR("kLangevinUpdatePart1CM");
-        gpu->bRemoveCM = false;
-    }
-    else
-    {    
-        kLangevinUpdatePart1_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
-        LAUNCHERROR("kLangevinUpdatePart1");
-    }
-}
-extern void kGenerateRandoms(gpuContext gpu);
-void kLangevinUpdatePart2(gpuContext gpu)
-{
-//    printf("kLangevinUpdatePart2\n");
-    if (gpu->bCalculateCM)
-    {
-        kLangevinUpdatePart2CM_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block, gpu->sim.update_threads_per_block * sizeof(float3)>>>();
-        LAUNCHERROR("kLangevinUpdatePart2CM");
-        gpu->bCalculateCM = false;
-        gpu->bRemoveCM = true;
-    }
-    else
-    {
-        kLangevinUpdatePart2_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
-        LAUNCHERROR("kLangevinUpdatePart2");
-    }
-    // Update randoms if necessary
-    gpu->iterations++;
-    if (gpu->iterations == gpu->sim.randomIterations)
-    {
-        kGenerateRandoms(gpu);
-        gpu->iterations = 0;
-    }
-}
-__global__ 
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(GF1XX_UPDATE_THREADS_PER_BLOCK, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(GT2XX_UPDATE_THREADS_PER_BLOCK, 1)
-#else
-__launch_bounds__(G8X_UPDATE_THREADS_PER_BLOCK, 1)
-#endif
-void kSelectLangevinStepSize_kernel(float maxStepSize)
-{
-    // Calculate the error.
-    extern __shared__ float error[];
-    __shared__ float params[MaxParams];
-    error[threadIdx.x] = 0.0f;
-    unsigned int pos = threadIdx.x;
-    while (pos < cSim.atoms)
-    {
-        float4 force  = cSim.pForce4[pos];
-        float invMass = cSim.pVelm4[pos].w;
-        error[threadIdx.x] += (force.x*force.x + force.y*force.y + force.z*force.z)*invMass;
-        pos += blockDim.x * gridDim.x;
-    }
-    __syncthreads();
-    // Sum the errors from all threads.
-    for (int offset = 1; offset < blockDim.x; offset *= 2)
-    {
-        if (threadIdx.x+offset < blockDim.x && (threadIdx.x&(2*offset-1)) == 0)
-            error[threadIdx.x] += error[threadIdx.x+offset];
-        __syncthreads();
-    }
-    if (threadIdx.x == 0)
-    {
-        // Select the new step size.
-        float totalError = sqrt(error[0]/(cSim.atoms*3));
-        float newStepSize = sqrt(cSim.errorTol/totalError);
-        float oldStepSize = cSim.pStepSize[0].y;
-        if (oldStepSize > 0.0f)
-            newStepSize = min(newStepSize, oldStepSize*2.0f); // For safety, limit how quickly dt can increase.
-        if (newStepSize > oldStepSize && newStepSize < 1.1f*oldStepSize)
-            newStepSize = oldStepSize; // Keeping dt constant between steps improves the behavior of the integrator.
-        if (newStepSize > maxStepSize)
-            newStepSize = maxStepSize;
-        cSim.pStepSize[0].y = newStepSize;
-        // Recalculate the integration parameters.
-        float vscale = exp(-newStepSize/cSim.tau);
-        float fscale = (1-vscale)*cSim.tau;
-        float noisescale = sqrt(2*cSim.kT/cSim.tau)*sqrt(0.5f*(1-vscale*vscale)*cSim.tau);
-        params[VelScale] = vscale;
-        params[ForceScale] = fscale;
-        params[NoiseScale] = noisescale;
-    }
-    __syncthreads();
-    if (threadIdx.x < MaxParams)
-        cSim.pLangevinParameters[threadIdx.x] = params[threadIdx.x];
-}
-void kSelectLangevinStepSize(gpuContext gpu, float maxTimeStep)
-{
-//    printf("kSelectLangevinStepSize\n");
-    kSelectLangevinStepSize_kernel<<<1, gpu->sim.update_threads_per_block, sizeof(float)*gpu->sim.update_threads_per_block>>>(maxTimeStep);
-    LAUNCHERROR("kSelectLangevinStepSize");
-}
-__global__
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(GF1XX_UPDATE_THREADS_PER_BLOCK, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(GT2XX_UPDATE_THREADS_PER_BLOCK, 1)
-#else
-__launch_bounds__(G8X_UPDATE_THREADS_PER_BLOCK, 1)
-#endif
-void kSetVelocitiesFromPositions_kernel()
-{
-    float2 stepSize = cSim.pStepSize[0];
-    double oneOverDt = 2.0/(stepSize.x+stepSize.y);
-    unsigned int pos = threadIdx.x;
-    while (pos < cSim.atoms)
-    {
-        float4 posq = cSim.pPosq[pos];
-        float4 posqP = cSim.pPosqP[pos];
-        float4 velm = cSim.pVelm4[pos];
-        velm.x = (float) (oneOverDt*posqP.x);
-        velm.y = (float) (oneOverDt*posqP.y);
-        velm.z = (float) (oneOverDt*posqP.z);
-        cSim.pVelm4[pos] = velm;
-        posq.x += posqP.x;
-        posq.y += posqP.y;
-        posq.z += posqP.z;
-        cSim.pPosq[pos] = posq;
-        pos += blockDim.x * gridDim.x;
-    }
-}
-void kSetVelocitiesFromPositions(gpuContext gpu)
-{
-//    printf("kSetVelocitiesFromPositions\n");
-    kSetVelocitiesFromPositions_kernel<<<1, gpu->sim.update_threads_per_block, sizeof(float)*gpu->sim.update_threads_per_block>>>();
-    LAUNCHERROR("kSetVelocitiesFromPositions");
-}
--- a/platforms/cuda-old/src/kernels/kLangevinUpdate.h
+++ b/platforms/cuda-old/src/kernels/kLangevinUpdate.h
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-#include "cudatypes.h"
-/**
- * This file contains the kernels for Langevin integration.  It is included
- * several times in kLangevinUpdate.cu with different #defines to generate
- * different versions of the kernels.
- */
-__global__
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(GF1XX_UPDATE_THREADS_PER_BLOCK, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(GT2XX_UPDATE_THREADS_PER_BLOCK, 1)
-#else
-__launch_bounds__(G8X_UPDATE_THREADS_PER_BLOCK, 1)
-#endif
-#ifdef REMOVE_CM
-void kLangevinUpdatePart1CM_kernel()
-#else
-void kLangevinUpdatePart1_kernel()
-#endif
-{
-    unsigned int pos    = threadIdx.x + blockIdx.x * blockDim.x;
-    unsigned int rpos   = cSim.pRandomPosition[blockIdx.x];
-    float vscale = cSim.pLangevinParameters[VelScale];
-    float fscale = cSim.pLangevinParameters[ForceScale];
-    float noisescale = cSim.pLangevinParameters[NoiseScale];
-#ifdef REMOVE_CM
-    extern __shared__ float3 sCM[];
-    float3 CM           = { 0.0f, 0.0f, 0.0f};
-    float4 CM1          = { 0.0f, 0.0f, 0.0f, 0.0f };
-    // Read CM outputs from previous step
-    unsigned int cpos = threadIdx.x;
-    while (cpos < gridDim.x)
-    {
-        CM1             = cSim.pLinearMomentum[cpos];
-        CM.x           += CM1.x;
-        CM.y           += CM1.y;
-        CM.z           += CM1.z;
-        cpos           += blockDim.x;
-    }
-    sCM[threadIdx.x].x  = CM.x;
-    sCM[threadIdx.x].y  = CM.y;
-    sCM[threadIdx.x].z  = CM.z;
-    __syncthreads();
-    // Reduce CM
-    unsigned int offset = 1;
-    unsigned int mask   = 1;
-    while (offset < blockDim.x)
-    {
-        if (((threadIdx.x & mask) == 0) && (threadIdx.x + offset < blockDim.x))
-        {
-            sCM[threadIdx.x].x += sCM[threadIdx.x + offset].x;
-            sCM[threadIdx.x].y += sCM[threadIdx.x + offset].y;
-            sCM[threadIdx.x].z += sCM[threadIdx.x + offset].z;
-        }
-        mask = 2 * mask + 1;
-        offset *= 2;
-        __syncthreads();
-    }
-#endif
-    while (pos < cSim.atoms)
-    {
-        float4 velocity         = cSim.pVelm4[pos];
-        float4 random4a         = cSim.pRandom4[rpos + pos];
-        float4 force            = cSim.pForce4[pos];
-        float sqrtInvMass       = sqrt(velocity.w);
-        velocity.x = vscale*velocity.x + fscale*velocity.w*force.x + noisescale*sqrtInvMass*random4a.x;
-        velocity.y = vscale*velocity.y + fscale*velocity.w*force.y + noisescale*sqrtInvMass*random4a.y;
-        velocity.z = vscale*velocity.z + fscale*velocity.w*force.z + noisescale*sqrtInvMass*random4a.z;
-#ifdef REMOVE_CM
-        velocity.x             -= sCM[0].x;
-        velocity.y             -= sCM[0].y;
-        velocity.z             -= sCM[0].z;
-#endif
-        cSim.pOldPosq[pos]      = cSim.pPosq[pos];
-        cSim.pVelm4[pos]        = velocity;
-        pos                    += blockDim.x * gridDim.x;
-    }
-}
-__global__
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(GF1XX_UPDATE_THREADS_PER_BLOCK, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(GT2XX_UPDATE_THREADS_PER_BLOCK, 1)
-#else
-__launch_bounds__(G8X_UPDATE_THREADS_PER_BLOCK, 1)
-#endif
-#ifdef REMOVE_CM
-void kLangevinUpdatePart2CM_kernel()
-#else
-void kLangevinUpdatePart2_kernel()
-#endif
-{
-    unsigned int pos            = threadIdx.x + blockIdx.x * blockDim.x;
-    unsigned int rpos           = cSim.pRandomPosition[blockIdx.x];
-    __shared__ float dt;
-    if (threadIdx.x == 0)
-    {
-        dt = cSim.pStepSize[0].y;
-        if (pos == 0)
-            cSim.pStepSize[0].x = dt;
-    }
-    __syncthreads();
-#ifdef REMOVE_CM
-    extern __shared__ float3 sCM[];
-    float3 CM                   = {0.0f, 0.0f, 0.0f};
-    __syncthreads();
-#endif
-    while (pos < cSim.atoms)
-    {
-        float4 velocity         = cSim.pVelm4[pos];
-#ifdef REMOVE_CM
-        float mass              = 1.0f / velocity.w;
-        CM.x                   += mass * velocity.x;
-        CM.y                   += mass * velocity.y;
-        CM.z                   += mass * velocity.z;
-#endif
-        float4 xPrime = make_float4(dt*velocity.x, dt*velocity.y, dt*velocity.z, cSim.pPosq[pos].w);
-        cSim.pPosqP[pos]        = xPrime;
-        pos                    += blockDim.x * gridDim.x;
-    }
-    // Update random position pointer
-    if (threadIdx.x == 0)
-    {
-        rpos                   += cSim.paddedNumberOfAtoms;
-        if (rpos > cSim.randoms)
-            rpos               -= cSim.randoms;
-        cSim.pRandomPosition[blockIdx.x] = rpos;
-    }
-#ifdef REMOVE_CM
-    // Scale CM
-    CM.x *= cSim.inverseTotalMass;
-    CM.y *= cSim.inverseTotalMass;
-    CM.z *= cSim.inverseTotalMass;
-    sCM[threadIdx.x] = CM;
-    __syncthreads();
-    // Reduce CM for CTA
-    unsigned int offset = 1;
-    unsigned int mask   = 1;
-    while (offset < blockDim.x)
-    {
-        if (((threadIdx.x & mask) == 0) && (threadIdx.x + offset < blockDim.x))
-        {
-            sCM[threadIdx.x].x += sCM[threadIdx.x + offset].x;
-            sCM[threadIdx.x].y += sCM[threadIdx.x + offset].y;
-            sCM[threadIdx.x].z += sCM[threadIdx.x + offset].z;
-        }
-        mask = 2 * mask + 1;
-        offset *= 2;
-        __syncthreads();
-    }
-    if (threadIdx.x == 0)
-    {
-        float4 CM;
-        CM.x                                = sCM[0].x;
-        CM.y                                = sCM[0].y;
-        CM.z                                = sCM[0].z;
-        CM.w                                = 0.0f;
-        cSim.pLinearMomentum[blockIdx.x]    = CM;
-    }
-#endif
-}
--- a/platforms/cuda-old/src/kernels/kMonteCarloBarostat.cu
+++ b/platforms/cuda-old/src/kernels/kMonteCarloBarostat.cu
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2010 Stanford University and the Authors.           *
- * Authors: Peter Eastman                                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-#include <stdio.h>
-#include <cuda.h>
-#include <vector_functions.h>
-#include <cstdlib>
-#include <string>
-#include <iostream>
-//#include <fstream>
-using namespace std;
-#include "gputypes.h"
-__global__ void kScaleAtomCoordinates_kernel(float scale, int numMolecules, float3 periodicBoxSize, float4* posq, int* moleculeAtoms, int* moleculeStartIndex) {
-    float3 invPeriodicBoxSize = make_float3(1.0f/periodicBoxSize.x, 1.0f/periodicBoxSize.y, 1.0f/periodicBoxSize.z);
-    for (int index = threadIdx.x+blockIdx.x*blockDim.x; index < numMolecules; index += blockDim.x*gridDim.x) {
-        int first = moleculeStartIndex[index];
-        int last = moleculeStartIndex[index+1];
-        int numAtoms = last-first;
-        // Find the center of each molecule.
-        float3 center = make_float3(0, 0, 0);
-        for (int atom = first; atom < last; atom++) {
-            float4 pos = posq[moleculeAtoms[atom]];
-            center.x += pos.x;
-            center.y += pos.y;
-            center.z += pos.z;
-        }
-        center.x /= (float) numAtoms;
-        center.y /= (float) numAtoms;
-        center.z /= (float) numAtoms;
-        // Move it into the first periodic box.
-        int xcell = (int) floorf(center.x*invPeriodicBoxSize.x);
-        int ycell = (int) floorf(center.y*invPeriodicBoxSize.y);
-        int zcell = (int) floorf(center.z*invPeriodicBoxSize.z);
-        float3 delta = make_float3(xcell*periodicBoxSize.x, ycell*periodicBoxSize.y, zcell*periodicBoxSize.z);
-        center.x -= delta.x;
-        center.y -= delta.y;
-        center.z -= delta.z;
-        // Now scale the position of the molecule center.
-        delta.x = center.x*(scale-1)-delta.x;
-        delta.y = center.y*(scale-1)-delta.y;
-        delta.z = center.z*(scale-1)-delta.z;
-        for (int atom = first; atom < last; atom++) {
-            float4 pos = posq[moleculeAtoms[atom]];
-            pos.x += delta.x;
-            pos.y += delta.y;
-            pos.z += delta.z;
-            posq[moleculeAtoms[atom]] = pos;
-        }
-    }
-}
-void kScaleAtomCoordinates(gpuContext gpu, float scale, CUDAStream<int>& moleculeAtoms, CUDAStream<int>& moleculeStartIndex)
-{
-//    printf("kScaleAtomCoordinates\n");
-    kScaleAtomCoordinates_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>(scale, moleculeStartIndex._length-1,
-            make_float3(gpu->sim.periodicBoxSizeX, gpu->sim.periodicBoxSizeY, gpu->sim.periodicBoxSizeZ), gpu->sim.pPosq,
-            moleculeAtoms._pDevData, moleculeStartIndex._pDevData);
-    LAUNCHERROR("kScaleAtomCoordinates");
-}
--- a/platforms/cuda-old/src/kernels/kRandom.cu
+++ b/platforms/cuda-old/src/kernels/kRandom.cu
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-#include <stdio.h>
-#include <cuda.h>
-#include <vector_functions.h>
-#include <cstdlib>
-#include <string>
-#include <iostream>
-#include <fstream>
-using namespace std;
-#include "gputypes.h"
-static __constant__ cudaGmxSimulation cSim;
-void SetRandomSim(gpuContext gpu)
-{
-    cudaError_t status;
-    status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));     
-    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
-}
-void GetRandomSim(gpuContext gpu)
-{
-    cudaError_t status;
-    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));     
-    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
-}
-extern __shared__ float3 sRand[];
-__global__ 
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(GF1XX_RANDOM_THREADS_PER_BLOCK, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(GT2XX_RANDOM_THREADS_PER_BLOCK, 1)
-#else
-__launch_bounds__(G8X_RANDOM_THREADS_PER_BLOCK, 1)
-#endif
-void kGenerateRandoms_kernel()
-{
-    unsigned int pos            = blockIdx.x * blockDim.x + threadIdx.x;
-    unsigned int increment      = blockDim.x * gridDim.x;
-    // Read generator state
-    uint4 state                 = cSim.pRandomSeed[pos];
-    unsigned int carry          = 0;
-    float4 random4;
-    float2 random2;
-    while (pos < cSim.totalRandoms)
-    {
-        // Generate 6 randoms in GRF
-        unsigned int pos1       = threadIdx.x;
-        for (int i = 0; i < 2; i++)
-        {
-            state.x             = state.x * 69069 + 1;
-            state.y            ^= state.y << 13;
-            state.y            ^= state.y >> 17;
-            state.y            ^= state.y << 5;
-            unsigned int k      = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
-            unsigned int m      = state.w + state.w + state.z + carry;
-            state.z             = state.w;
-            state.w             = m;
-            carry               = k >> 30;
-            float x1            = (float)max(state.x + state.y + state.w, 0x00000001) / (float)0xffffffff;
-            state.x             = state.x * 69069 + 1;
-            state.y            ^= state.y << 13;
-            state.y            ^= state.y >> 17;
-            state.y            ^= state.y << 5;
-            x1                  = sqrtf(-2.0f * logf(x1));
-            k                   = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
-            m                   = state.w + state.w + state.z + carry;
-            state.z             = state.w;
-            state.w             = m;
-            carry               = k >> 30;
-            float x2            = (float)(state.x + state.y + state.w) / (float)0xffffffff;
-            state.x             = state.x * 69069 + 1;
-            state.y            ^= state.y << 13;
-            state.y            ^= state.y >> 17;
-            state.y            ^= state.y << 5;
-            sRand[pos1].x       = x1 * cosf(2.0f * 3.14159265f * x2);
-            k                   = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
-            m                   = state.w + state.w + state.z + carry;
-            state.z             = state.w;
-            state.w             = m;
-            carry               = k >> 30;
-            float x3            = (float)max(state.x + state.y + state.w, 0x00000001) / (float)0xffffffff;
-            state.x             = state.x * 69069 + 1;
-            state.y            ^= state.y << 13;
-            state.y            ^= state.y >> 17;
-            state.y            ^= state.y << 5;
-            x3                  = sqrtf(-2.0f * logf(x3));
-            k                   = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
-            m                   = state.w + state.w + state.z + carry;
-            state.z             = state.w;
-            state.w             = m;
-            carry               = k >> 30;
-            float x4            = (float)(state.x + state.y + state.w) / (float)0xffffffff;
-            state.x             = state.x * 69069 + 1;
-            state.y            ^= state.y << 13;
-            state.y            ^= state.y >> 17;
-            state.y            ^= state.y << 5;
-            sRand[pos1].y       = x3 * cosf(2.0f * 3.14159265f * x4);
-            k                   = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
-            m                   = state.w + state.w + state.z + carry;
-            state.z             = state.w;
-            state.w             = m;
-            carry               = k >> 30;
-            float x5            = (float)max(state.x + state.y + state.w, 0x00000001) / (float)0xffffffff;
-            state.x             = state.x * 69069 + 1;
-            state.y            ^= state.y << 13;
-            state.y            ^= state.y >> 17;
-            state.y            ^= state.y << 5;
-            x5                  = sqrtf(-2.0f * logf(x5));
-            k                   = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
-            m                   = state.w + state.w + state.z + carry;
-            state.z             = state.w;
-            state.w             = m;
-            carry               = k >> 30;
-            float x6            = (float)(state.x + state.y + state.w) / (float)0xffffffff;
-            sRand[pos1].z       = x5 * cosf(2.0f * 3.14159265f * x6); 
-            pos1               += blockDim.x;
-        }
-        // Output final randoms
-        random4.x               = sRand[threadIdx.x].x;
-        random4.y               = sRand[threadIdx.x].y;
-        random4.z               = sRand[threadIdx.x].z;
-        random4.w               = sRand[threadIdx.x + blockDim.x].x;
-        cSim.pRandom4[pos]     = random4;
-        random2.x               = sRand[threadIdx.x + blockDim.x].y;
-        random2.y               = sRand[threadIdx.x + blockDim.x].z;
-        cSim.pRandom2[pos]     = random2;
-        pos += increment;
-    }
-    // Write generator state
-    pos                     = blockIdx.x * blockDim.x + threadIdx.x;
-    cSim.pRandomSeed[pos]   = state;
-}
-void kGenerateRandoms(gpuContext gpu)
-{
-    kGenerateRandoms_kernel<<<gpu->sim.blocks, gpu->sim.random_threads_per_block, gpu->sim.random_threads_per_block * 2 * sizeof(float3)>>>();
-}
--- a/platforms/cuda-old/src/kernels/kSettle.cu
+++ b/platforms/cuda-old/src/kernels/kSettle.cu
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-#include <stdio.h>
-#include <cuda.h>
-#include <vector_functions.h>
-#include <cstdlib>
-#include <string>
-#include <iostream>
-//#include <fstream>
-using namespace std;
-#include "gputypes.h"
-static __constant__ cudaGmxSimulation cSim;
-void SetSettleSim(gpuContext gpu)
-{
-    cudaError_t status;
-    status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
-    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
-}
-void GetSettleSim(gpuContext gpu)
-{
-    cudaError_t status;
-    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
-    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
-}
-/**
- * This is based on the setlep FORTRAN routine by Shuichi Miyamoto.  See
- * S. Miyamoto and P. Kollman, J. Comp. Chem., vol 13, no. 8, pp. 952-962 (1992).
- */
-__global__ 
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(GF1XX_SHAKE_THREADS_PER_BLOCK, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(GT2XX_SHAKE_THREADS_PER_BLOCK, 1)
-#else
-__launch_bounds__(G8X_SHAKE_THREADS_PER_BLOCK, 1)
-#endif
-void kApplySettle_kernel()
-{
-    unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
-    while (pos < cSim.settleConstraints)
-    {
-        // Load data.
-        int4 atomID         = cSim.pSettleID[pos];
-        float2 params       = cSim.pSettleParameter[pos];
-        float4 apos0        = cSim.pOldPosq[atomID.x];
-        float4 xp0          = cSim.pPosqP[atomID.x];
-        float4 apos1        = cSim.pOldPosq[atomID.y];
-        float4 xp1          = cSim.pPosqP[atomID.y];
-        float4 apos2        = cSim.pOldPosq[atomID.z];
-        float4 xp2          = cSim.pPosqP[atomID.z];
-        float m0            = 1.0f/cSim.pVelm4[atomID.x].w;
-        float m1            = 1.0f/cSim.pVelm4[atomID.y].w;
-        float m2            = 1.0f/cSim.pVelm4[atomID.z].w;
-        // Translate the molecule to the origin to improve numerical precision.
-        float3 center = make_float3(apos0.x, apos0.y, apos0.z);
-        apos0.x -= center.x;
-        apos0.y -= center.y;
-        apos0.z -= center.z;
-        apos1.x -= center.x;
-        apos1.y -= center.y;
-        apos1.z -= center.z;
-        apos2.x -= center.x;
-        apos2.y -= center.y;
-        apos2.z -= center.z;
-        // Apply the SETTLE algorithm.
-        float xb0 = apos1.x-apos0.x;
-        float yb0 = apos1.y-apos0.y;
-        float zb0 = apos1.z-apos0.z;
-        float xc0 = apos2.x-apos0.x;
-        float yc0 = apos2.y-apos0.y;
-        float zc0 = apos2.z-apos0.z;
-        float totalMass = m0+m1+m2;
-        float xcom = ((apos0.x+xp0.x)*m0 + (apos1.x+xp1.x)*m1 + (apos2.x+xp2.x)*m2) / totalMass;
-        float ycom = ((apos0.y+xp0.y)*m0 + (apos1.y+xp1.y)*m1 + (apos2.y+xp2.y)*m2) / totalMass;
-        float zcom = ((apos0.z+xp0.z)*m0 + (apos1.z+xp1.z)*m1 + (apos2.z+xp2.z)*m2) / totalMass;
-        float xa1 = apos0.x + xp0.x - xcom;
-        float ya1 = apos0.y + xp0.y - ycom;
-        float za1 = apos0.z + xp0.z - zcom;
-        float xb1 = apos1.x + xp1.x - xcom;
-        float yb1 = apos1.y + xp1.y - ycom;
-        float zb1 = apos1.z + xp1.z - zcom;
-        float xc1 = apos2.x + xp2.x - xcom;
-        float yc1 = apos2.y + xp2.y - ycom;
-        float zc1 = apos2.z + xp2.z - zcom;
-        float xaksZd = yb0*zc0 - zb0*yc0;
-        float yaksZd = zb0*xc0 - xb0*zc0;
-        float zaksZd = xb0*yc0 - yb0*xc0;
-        float xaksXd = ya1*zaksZd - za1*yaksZd;
-        float yaksXd = za1*xaksZd - xa1*zaksZd;
-        float zaksXd = xa1*yaksZd - ya1*xaksZd;
-        float xaksYd = yaksZd*zaksXd - zaksZd*yaksXd;
-        float yaksYd = zaksZd*xaksXd - xaksZd*zaksXd;
-        float zaksYd = xaksZd*yaksXd - yaksZd*xaksXd;
-        float axlng = sqrtf(xaksXd * xaksXd + yaksXd * yaksXd + zaksXd * zaksXd);
-        float aylng = sqrtf(xaksYd * xaksYd + yaksYd * yaksYd + zaksYd * zaksYd);
-        float azlng = sqrtf(xaksZd * xaksZd + yaksZd * yaksZd + zaksZd * zaksZd);
-        float trns11 = xaksXd / axlng;
-        float trns21 = yaksXd / axlng;
-        float trns31 = zaksXd / axlng;
-        float trns12 = xaksYd / aylng;
-        float trns22 = yaksYd / aylng;
-        float trns32 = zaksYd / aylng;
-        float trns13 = xaksZd / azlng;
-        float trns23 = yaksZd / azlng;
-        float trns33 = zaksZd / azlng;
-        float xb0d = trns11*xb0 + trns21*yb0 + trns31*zb0;
-        float yb0d = trns12*xb0 + trns22*yb0 + trns32*zb0;
-        float xc0d = trns11*xc0 + trns21*yc0 + trns31*zc0;
-        float yc0d = trns12*xc0 + trns22*yc0 + trns32*zc0;
-        float za1d = trns13*xa1 + trns23*ya1 + trns33*za1;
-        float  xb1d = trns11*xb1 + trns21*yb1 + trns31*zb1;
-        float  yb1d = trns12*xb1 + trns22*yb1 + trns32*zb1;
-        float  zb1d = trns13*xb1 + trns23*yb1 + trns33*zb1;
-        float  xc1d = trns11*xc1 + trns21*yc1 + trns31*zc1;
-        float  yc1d = trns12*xc1 + trns22*yc1 + trns32*zc1;
-        float  zc1d = trns13*xc1 + trns23*yc1 + trns33*zc1;
-        //                                        --- Step2  A2' ---
-        float rc = 0.5f*params.y;
-        float rb = sqrtf(params.x*params.x-rc*rc);
-        float ra = rb*(m1+m2)/totalMass;
-        rb -= ra;
-        float sinphi = za1d / ra;
-        float cosphi = sqrtf(1.0f - sinphi*sinphi);
-        float sinpsi = ( zb1d - zc1d ) / (2*rc*cosphi);
-        float cospsi = sqrtf(1.0f - sinpsi*sinpsi);
-        float ya2d =   ra * cosphi;
-        float xb2d = - rc * cospsi;
-        float yb2d = - rb * cosphi - rc *sinpsi * sinphi;
-        float yc2d = - rb * cosphi + rc *sinpsi * sinphi;
-        float xb2d2 = xb2d * xb2d;
-        float hh2 = 4.0f * xb2d2 + (yb2d-yc2d) * (yb2d-yc2d) + (zb1d-zc1d) * (zb1d-zc1d);
-        float deltx = 2.0f * xb2d + sqrtf( 4.0f * xb2d2 - hh2 + params.y*params.y );
-        xb2d -= deltx * 0.5f;
-        //                                        --- Step3  al,be,ga ---
-        float alpa = ( xb2d * (xb0d-xc0d) + yb0d * yb2d + yc0d * yc2d );
-        float beta = ( xb2d * (yc0d-yb0d) + xb0d * yb2d + xc0d * yc2d );
-        float gama = xb0d * yb1d - xb1d * yb0d + xc0d * yc1d - xc1d * yc0d;
-        float al2be2 = alpa * alpa + beta * beta;
-        float sinthe = ( alpa*gama - beta * sqrtf( al2be2 - gama * gama ) ) / al2be2;
-        //                                        --- Step4  A3' ---
-        float costhe = sqrtf(1.0f - sinthe * sinthe );
-        float xa3d = - ya2d * sinthe;
-        float ya3d =   ya2d * costhe;
-        float za3d = za1d;
-        float xb3d =   xb2d * costhe - yb2d * sinthe;
-        float yb3d =   xb2d * sinthe + yb2d * costhe;
-        float zb3d = zb1d;
-        float xc3d = - xb2d * costhe - yc2d * sinthe;
-        float yc3d = - xb2d * sinthe + yc2d * costhe;
-        float zc3d = zc1d;
-        //                                        --- Step5  A3 ---
-        float xa3 = trns11*xa3d + trns12*ya3d + trns13*za3d;
-        float ya3 = trns21*xa3d + trns22*ya3d + trns23*za3d;
-        float za3 = trns31*xa3d + trns32*ya3d + trns33*za3d;
-        float xb3 = trns11*xb3d + trns12*yb3d + trns13*zb3d;
-        float yb3 = trns21*xb3d + trns22*yb3d + trns23*zb3d;
-        float zb3 = trns31*xb3d + trns32*yb3d + trns33*zb3d;
-        float xc3 = trns11*xc3d + trns12*yc3d + trns13*zc3d;
-        float yc3 = trns21*xc3d + trns22*yc3d + trns23*zc3d;
-        float zc3 = trns31*xc3d + trns32*yc3d + trns33*zc3d;
-        xp0.x = xcom + xa3 - apos0.x;
-        xp0.y = ycom + ya3 - apos0.y;
-        xp0.z = zcom + za3 - apos0.z;
-        xp1.x = xcom + xb3 - apos1.x;
-        xp1.y = ycom + yb3 - apos1.y;
-        xp1.z = zcom + zb3 - apos1.z;
-        xp2.x = xcom + xc3 - apos2.x;
-        xp2.y = ycom + yc3 - apos2.y;
-        xp2.z = zcom + zc3 - apos2.z;
-        cSim.pPosqP[atomID.x] = xp0;
-        cSim.pPosqP[atomID.y] = xp1;
-        cSim.pPosqP[atomID.z] = xp2;
-        pos += blockDim.x * gridDim.x;
-    }
-}
-void kApplySettle(gpuContext gpu)
-{
-//    printf("kApplySettle\n");
-    if (gpu->sim.settleConstraints > 0)
-    {
-        kApplySettle_kernel<<<gpu->sim.blocks, gpu->sim.settle_threads_per_block>>>();
-        LAUNCHERROR("kApplySettle");
-    }
-}
--- a/platforms/cuda-old/src/kernels/kShakeH.cu
+++ b/platforms/cuda-old/src/kernels/kShakeH.cu
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-#include <stdio.h>
-#include <cuda.h>
-#include <vector_functions.h>
-#include <cstdlib>
-#include <string>
-#include <iostream>
-using namespace std;
-#include "gputypes.h"
-struct Atom
-{
-    float3 rij1;
-    float3 rij2;
-    float3 rij3;
-    float  M;
-    float  d2;
-    float  InvMassI;
-    float  rij1sq;
-    float  rij2sq;
-    float  rij3sq;
-};
-static __constant__ cudaGmxSimulation cSim;
-void SetShakeHSim(gpuContext gpu)
-{
-    cudaError_t status;
-    status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
-    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
-}
-void GetShakeHSim(gpuContext gpu)
-{
-    cudaError_t status;
-    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
-    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
-}
-__global__ 
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(GF1XX_SHAKE_THREADS_PER_BLOCK, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(GT2XX_SHAKE_THREADS_PER_BLOCK, 1)
-#else
-__launch_bounds__(G8X_SHAKE_THREADS_PER_BLOCK, 1)
-#endif
-void kApplyShake_kernel()
-{
-    extern __shared__ Atom sA[];
-    Atom* psA = &sA[threadIdx.x];
-    unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
-    while (pos < cSim.ShakeConstraints)
-    {
-        int4 atomID         = cSim.pShakeID[pos];
-        float4 params       = cSim.pShakeParameter[pos];
-        float4 apos         = cSim.pOldPosq[atomID.x];
-        float4 xpi          = cSim.pPosqP[atomID.x];
-        float4 apos1        = cSim.pOldPosq[atomID.y];
-        float4 xpj1         = cSim.pPosqP[atomID.y];
-        float4 apos2        = {0.0f, 0.0f, 0.0f, 0.0f};
-        float4 xpj2         = {0.0f, 0.0f, 0.0f, 0.0f};
-        psA->InvMassI       = params.x;
-        psA->M              = params.y;
-        psA->d2             = params.z;
-        float invMassJ      = params.w;
-        if (atomID.z != -1)
-        {
-            apos2           = cSim.pOldPosq[atomID.z];
-            xpj2            = cSim.pPosqP[atomID.z];
-        }
-        float4 apos3        = {0.0f, 0.0f, 0.0f, 0.0f};
-        float4 xpj3         = {0.0f, 0.0f, 0.0f, 0.0f};
-        if (atomID.w != -1)
-        {
-            apos3           = cSim.pOldPosq[atomID.w];
-            xpj3            = cSim.pPosqP[atomID.w];
-        }
-        float3 xi, xj1, xj2, xj3;
-        xi.x                = apos.x;
-        xi.y                = apos.y;
-        xi.z                = apos.z;
-        xj1.x               = apos1.x;
-        xj1.y               = apos1.y;
-        xj1.z               = apos1.z;
-        xj2.x               = apos2.x;
-        xj2.y               = apos2.y;
-        xj2.z               = apos2.z;
-        xj3.x               = apos3.x;
-        xj3.y               = apos3.y;
-        xj3.z               = apos3.z;
-        psA->rij1.x         = xi.x - xj1.x;
-        psA->rij1.y         = xi.y - xj1.y;
-        psA->rij1.z         = xi.z - xj1.z;
-        psA->rij2.x         = xi.x - xj2.x;
-        psA->rij2.y         = xi.y - xj2.y;
-        psA->rij2.z         = xi.z - xj2.z;
-        psA->rij3.x         = xi.x - xj3.x;
-        psA->rij3.y         = xi.y - xj3.y;
-        psA->rij3.z         = xi.z - xj3.z;
-        psA->rij1sq         = psA->rij1.x * psA->rij1.x + psA->rij1.y * psA->rij1.y + psA->rij1.z * psA->rij1.z;
-        psA->rij2sq         = psA->rij2.x * psA->rij2.x + psA->rij2.y * psA->rij2.y + psA->rij2.z * psA->rij2.z;
-        psA->rij3sq         = psA->rij3.x * psA->rij3.x + psA->rij3.y * psA->rij3.y + psA->rij3.z * psA->rij3.z;
-        float ld1           = psA->d2 - psA->rij1sq;
-        float ld2           = psA->d2 - psA->rij2sq;
-        float ld3           = psA->d2 - psA->rij3sq;
-        bool converged = false;
-        int iteration = 0;
-        while (iteration < 15 && !converged)
-        {
-            converged = true;
-            float3 rpij;
-            rpij.x          = xpi.x - xpj1.x;
-            rpij.y          = xpi.y - xpj1.y;
-            rpij.z          = xpi.z - xpj1.z;
-		    float rpsqij    = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
-		    float rrpr      = psA->rij1.x * rpij.x + psA->rij1.y * rpij.y + psA->rij1.z * rpij.z;
-		    float diff      = fabs(ld1 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance);
-            if (diff >= 1.0f)
-            {
-                float acor  = (ld1 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij1sq);
-                float3 dr;
-                dr.x    = psA->rij1.x * acor;
-                dr.y    = psA->rij1.y * acor;
-                dr.z    = psA->rij1.z * acor;
-		        xpi.x  += dr.x * psA->InvMassI;
-		        xpi.y  += dr.y * psA->InvMassI;
-		        xpi.z  += dr.z * psA->InvMassI;
-		        xpj1.x -= dr.x * invMassJ;
-		        xpj1.y -= dr.y * invMassJ;
-		        xpj1.z -= dr.z * invMassJ;
-                converged = false;
-            }
-            if (atomID.z != -1)
-            {
-                rpij.x          = xpi.x - xpj2.x;
-                rpij.y          = xpi.y - xpj2.y;
-                rpij.z          = xpi.z - xpj2.z;
-		        rpsqij          = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
-		        rrpr            = psA->rij2.x * rpij.x + psA->rij2.y * rpij.y + psA->rij2.z * rpij.z;
-		        diff            = fabs(ld2 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance);
-                if (diff >= 1.0f)
-                {
-                    float acor  = (ld2 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij2sq);
-                    float3 dr;
-                    dr.x    = psA->rij2.x * acor;
-                    dr.y    = psA->rij2.y * acor;
-                    dr.z    = psA->rij2.z * acor;
-		            xpi.x  += dr.x * psA->InvMassI;
-		            xpi.y  += dr.y * psA->InvMassI;
-		            xpi.z  += dr.z * psA->InvMassI;
-		            xpj2.x -= dr.x * invMassJ;
-		            xpj2.y -= dr.y * invMassJ;
-		            xpj2.z -= dr.z * invMassJ;
-                    converged = false;
-                }
-            }
-            if (atomID.w != -1)
-            {
-                rpij.x          = xpi.x - xpj3.x;
-                rpij.y          = xpi.y - xpj3.y;
-                rpij.z          = xpi.z - xpj3.z;
-		        rpsqij          = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
-		        rrpr            = psA->rij3.x * rpij.x + psA->rij3.y * rpij.y + psA->rij3.z * rpij.z;
-		        diff            = fabs(ld3 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance);
-                if (diff >= 1.0f)
-                {
-                    float acor  = (ld3 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij3sq);
-                    float3 dr;
-                    dr.x    = psA->rij3.x * acor;
-                    dr.y    = psA->rij3.y * acor;
-                    dr.z    = psA->rij3.z * acor;
-		            xpi.x  += dr.x * psA->InvMassI;
-		            xpi.y  += dr.y * psA->InvMassI;
-		            xpi.z  += dr.z * psA->InvMassI;
-		            xpj3.x -= dr.x * invMassJ;
-		            xpj3.y -= dr.y * invMassJ;
-		            xpj3.z -= dr.z * invMassJ;
-                    converged = false;
-                }
-            }
-            iteration++;
-        }
-        cSim.pPosqP[atomID.x] = xpi;
-        cSim.pPosqP[atomID.y] = xpj1;
-        if (atomID.z != -1)
-            cSim.pPosqP[atomID.z] = xpj2;
-        if (atomID.w != -1)
-            cSim.pPosqP[atomID.w] = xpj3;
-        pos += blockDim.x * gridDim.x;
-    }
-}
-void kApplyShake(gpuContext gpu)
-{
-//    printf("kApplyShake\n");
-    if (gpu->sim.ShakeConstraints > 0)
-    {
-        kApplyShake_kernel<<<gpu->sim.blocks, gpu->sim.shake_threads_per_block, sizeof(Atom)*gpu->sim.shake_threads_per_block>>>();
-        LAUNCHERROR("kApplyShake");
-    }
-}
--- a/platforms/cuda-old/src/kernels/kVerletUpdate.cu
+++ b/platforms/cuda-old/src/kernels/kVerletUpdate.cu
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-#include <stdio.h>
-#include <cuda.h>
-#include <vector_functions.h>
-#include <cstdlib>
-#include <string>
-#include <iostream>
-using namespace std;
-#include "gputypes.h"
-static __constant__ cudaGmxSimulation cSim;
-void SetVerletUpdateSim(gpuContext gpu)
-{
-    cudaError_t status;
-    status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));     
-    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
-}
-void GetVerletUpdateSim(gpuContext gpu)
-{
-    cudaError_t status;
-    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));     
-    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
-}
-// Include versions of the kernels with and with center of mass motion removal.
-#include "kVerletUpdate.h"
-#define REMOVE_CM
-#include "kVerletUpdate.h"
-void kVerletUpdatePart1(gpuContext gpu)
-{
-//    printf("kVerletUpdatePart1\n");
-    if (gpu->bRemoveCM)
-    {
-        kVerletUpdatePart1CM_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block, gpu->sim.update_threads_per_block * sizeof(float3)>>>();
-        LAUNCHERROR("kVerletUpdatePart1CM");
-        gpu->bRemoveCM = false;
-    }
-    else
-    {    
-        kVerletUpdatePart1_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
-        LAUNCHERROR("kVerletUpdatePart1");
-    }
-}
-void kVerletUpdatePart2(gpuContext gpu)
-{
-//    printf("kVerletUpdatePart2\n");
-    if (gpu->bCalculateCM)
-    {
-        kVerletUpdatePart2CM_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block, gpu->sim.update_threads_per_block * sizeof(float3)>>>();
-        LAUNCHERROR("kVerletUpdatePart2CM");
-        gpu->bCalculateCM = false;
-        gpu->bRemoveCM = true;
-    }
-    else
-    {
-        kVerletUpdatePart2_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
-        LAUNCHERROR("kVerletUpdatePart2");
-    }
-}
-__global__ 
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(GF1XX_UPDATE_THREADS_PER_BLOCK, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(GT2XX_UPDATE_THREADS_PER_BLOCK, 1)
-#else
-__launch_bounds__(G8X_UPDATE_THREADS_PER_BLOCK, 1)
-#endif
-void kSelectVerletStepSize_kernel(float maxStepSize)
-{
-    // Calculate the error.
-    extern __shared__ float error[];
-    error[threadIdx.x] = 0.0f;
-    unsigned int pos = threadIdx.x;
-    while (pos < cSim.atoms)
-    {
-        float4 force  = cSim.pForce4[pos];
-        float invMass = cSim.pVelm4[pos].w;
-        error[threadIdx.x] += (force.x*force.x + force.y*force.y + force.z*force.z)*invMass;
-        pos += blockDim.x * gridDim.x;
-    }
-    __syncthreads();
-    // Sum the errors from all threads.
-    for (int offset = 1; offset < blockDim.x; offset *= 2)
-    {
-        if (threadIdx.x+offset < blockDim.x && (threadIdx.x&(2*offset-1)) == 0)
-            error[threadIdx.x] += error[threadIdx.x+offset];
-        __syncthreads();
-    }
-    if (threadIdx.x == 0)
-    {
-        float totalError = sqrt(error[0]/(cSim.atoms*3));
-        float newStepSize = sqrt(cSim.errorTol/totalError);
-        float oldStepSize = cSim.pStepSize[0].y;
-        if (oldStepSize > 0.0f)
-            newStepSize = min(newStepSize, oldStepSize*2.0f); // For safety, limit how quickly dt can increase.
-        if (newStepSize > oldStepSize && newStepSize < 1.1f*oldStepSize)
-            newStepSize = oldStepSize; // Keeping dt constant between steps improves the behavior of the integrator.
-        if (newStepSize > maxStepSize)
-            newStepSize = maxStepSize;
-        cSim.pStepSize[0].y = newStepSize;
-    }
-}
-void kSelectVerletStepSize(gpuContext gpu, float maxTimeStep)
-{
-//    printf("kSelectVerletStepSize\n");
-    kSelectVerletStepSize_kernel<<<1, gpu->sim.update_threads_per_block, sizeof(float)*gpu->sim.update_threads_per_block>>>(maxTimeStep);
-    LAUNCHERROR("kSelectVerletStepSize");
-}
--- a/platforms/cuda-old/src/kernels/kVerletUpdate.h
+++ b/platforms/cuda-old/src/kernels/kVerletUpdate.h
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-/**
- * This file contains the kernels for Verlet integration.  It is included
- * several times in kVerletUpdate.cu with different #defines to generate
- * different versions of the kernels.
- */
-__global__
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(GF1XX_UPDATE_THREADS_PER_BLOCK, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(GT2XX_UPDATE_THREADS_PER_BLOCK, 1)
-#else
-__launch_bounds__(G8X_UPDATE_THREADS_PER_BLOCK, 1)
-#endif
-#ifdef REMOVE_CM
-void kVerletUpdatePart1CM_kernel()
-#else
-void kVerletUpdatePart1_kernel()
-#endif
-{
-    // Load the step size to take.
-    __shared__ volatile float dtPos;
-    __shared__ volatile float dtVel;
-    if (threadIdx.x == 0)
-    {
-        float2 stepSize = cSim.pStepSize[0];
-        dtPos = stepSize.y;
-        dtVel = 0.5f*(stepSize.x+stepSize.y);
-    }
-    unsigned int pos    = threadIdx.x + blockIdx.x * blockDim.x;
-#ifdef REMOVE_CM
-    extern __shared__ float3 sCM[];
-    float3 CM           = { 0.0f, 0.0f, 0.0f};
-    float4 CM1          = { 0.0f, 0.0f, 0.0f, 0.0f };
-    // Read CM outputs from previous step
-    unsigned int cpos = threadIdx.x;
-    while (cpos < gridDim.x)
-    {
-        CM1             = cSim.pLinearMomentum[cpos];
-        CM.x           += CM1.x;
-        CM.y           += CM1.y;
-        CM.z           += CM1.z;
-        cpos           += blockDim.x;
-    }
-    sCM[threadIdx.x].x  = CM.x;
-    sCM[threadIdx.x].y  = CM.y;
-    sCM[threadIdx.x].z  = CM.z;
-    __syncthreads();
-    // Reduce CM
-    unsigned int offset = 1;
-    unsigned int mask   = 1;
-    while (offset < blockDim.x)
-    {
-        if (((threadIdx.x & mask) == 0) && (threadIdx.x + offset < blockDim.x))
-        {
-            sCM[threadIdx.x].x += sCM[threadIdx.x + offset].x;
-            sCM[threadIdx.x].y += sCM[threadIdx.x + offset].y;
-            sCM[threadIdx.x].z += sCM[threadIdx.x + offset].z;
-        }
-        mask = 2 * mask + 1;
-        offset *= 2;
-        __syncthreads();
-    }
-#else
-    __syncthreads();
-#endif
-    while (pos < cSim.atoms)
-    {
-        float4 apos             = cSim.pPosq[pos];
-        float4 velocity         = cSim.pVelm4[pos];
-        float4 force            = cSim.pForce4[pos];
-        float dtOverMass        = dtVel*velocity.w;
-        cSim.pOldPosq[pos]      = apos;
-        velocity.x             += dtOverMass*force.x;
-        velocity.y             += dtOverMass*force.y;
-        velocity.z             += dtOverMass*force.z;
-#ifdef REMOVE_CM
-        velocity.x             -= sCM[0].x;
-        velocity.y             -= sCM[0].y;
-        velocity.z             -= sCM[0].z;
-#endif
-        apos.x                  = velocity.x*dtPos;
-        apos.y                  = velocity.y*dtPos;
-        apos.z                  = velocity.z*dtPos;
-        cSim.pPosqP[pos]        = apos;
-        cSim.pVelm4[pos]        = velocity;
-        pos                    += blockDim.x * gridDim.x;
-    }
-}
-__global__
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(GF1XX_UPDATE_THREADS_PER_BLOCK, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(GT2XX_UPDATE_THREADS_PER_BLOCK, 1)
-#else
-__launch_bounds__(G8X_UPDATE_THREADS_PER_BLOCK, 1)
-#endif
-#ifdef REMOVE_CM
-void kVerletUpdatePart2CM_kernel()
-#else
-void kVerletUpdatePart2_kernel()
-#endif
-{
-    // Load the step size to take.
-    unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
-    __shared__ double oneOverDeltaT;
-    if (threadIdx.x == 0)
-    {
-        float dt = cSim.pStepSize[0].y;
-        oneOverDeltaT = 1.0/dt;
-        if (pos == 0)
-            cSim.pStepSize[0].x = dt;
-    }
-    __syncthreads();
-#ifdef REMOVE_CM
-    extern __shared__ float3 sCM[];
-    float3 CM                   = {0.0f, 0.0f, 0.0f};
-#endif
-    while (pos < cSim.atoms)
-    {
-        float4 velocity         = cSim.pVelm4[pos];
-        float4 apos             = cSim.pPosq[pos];
-        float4 xPrime           = cSim.pPosqP[pos];
-        velocity.x              = (float) (oneOverDeltaT*(double)xPrime.x);
-        velocity.y              = (float) (oneOverDeltaT*(double)xPrime.y);
-        velocity.z              = (float) (oneOverDeltaT*(double)xPrime.z);
-        xPrime.x               += apos.x;
-        xPrime.y               += apos.y;
-        xPrime.z               += apos.z;
-#ifdef REMOVE_CM
-        float mass              = 1.0f / velocity.w;
-        CM.x                   += mass * velocity.x;
-        CM.y                   += mass * velocity.y;
-        CM.z                   += mass * velocity.z;
-#endif
-        cSim.pPosq[pos]         = xPrime;
-        cSim.pVelm4[pos]        = velocity;
-        pos                    += blockDim.x * gridDim.x;
-    }
-#ifdef REMOVE_CM
-    // Scale CM
-    CM.x *= cSim.inverseTotalMass;
-    CM.y *= cSim.inverseTotalMass;
-    CM.z *= cSim.inverseTotalMass;
-    sCM[threadIdx.x] = CM;
-    __syncthreads();
-    // Reduce CM for CTA
-    unsigned int offset = 1;
-    unsigned int mask   = 1;
-    while (offset < blockDim.x)
-    {
-        if (((threadIdx.x & mask) == 0) && (threadIdx.x + offset < blockDim.x))
-        {
-            sCM[threadIdx.x].x += sCM[threadIdx.x + offset].x;
-            sCM[threadIdx.x].y += sCM[threadIdx.x + offset].y;
-            sCM[threadIdx.x].z += sCM[threadIdx.x + offset].z;
-        }
-        mask = 2 * mask + 1;
-        offset *= 2;
-        __syncthreads();
-    }
-    if (threadIdx.x == 0)
-    {
-        float4 CM;
-        CM.x                                = sCM[0].x;
-        CM.y                                = sCM[0].y;
-        CM.z                                = sCM[0].z;
-        CM.w                                = 0.0f;
-        cSim.pLinearMomentum[blockIdx.x]    = CM;
-    }
-#endif
-}
--- a/platforms/cuda-old/src/kernels/rng.cpp
+++ b/platforms/cuda-old/src/kernels/rng.cpp
-//________________________________________________________________________
-// See the header file rng.h for a description of the contents of this
-// file as well as references and credits.
-#include "rng.h"
-#include <cmath>
-#include <iostream>
-inline ulong32 ULONG32(slong x) { return (ulong32(x)); }
-inline ulong32 ULONG32(ulong32 x) { return (ulong32(x)); }
-inline ulong32 ULONG32(double x) { return (ulong32(x)); }
-static const double PI   =  3.1415926535897932;
-//________________________________________________________________________
-// Initialize the static component of RNG
-ulong32 RNG::tm = 1234567;
-ulong32 RNG::kn[128], RNG::ke[256];
-double RNG::wn[128], RNG::fn[128], RNG::we[256], RNG::fe[256];
-//________________________________________________________________________
-// RNG::RNOR generates normal variates with rejection.
-// nfix() generates variates after rejection in RNOR.
-// Despite rejection, this method is faster than Box-Muller.
-double RNG::nfix(slong h, ulong32 i)
-{
-  const double r = 3.442620; 	// The starting of the right tail
-  double x, y;
-  for(;;) {
-    x = h * wn[i];
-    // If i == 0, handle the base strip
-    if (i == 0) {
-      do {
-	x = -log(rand_open01()) * 0.2904764;   // .2904764 is 1/r
-	y = -log(rand_open01());
-      } while (y + y < x * x);
-      return ((h > 0) ? r + x : -r - x);
-    }
-    // If i > 0, handle the wedges of other strips
-    if (fn[i] + rand_open01() * (fn[i - 1] - fn[i]) < exp(-.5 * x * x))
-      return x;
-    // start all over
-    h = UL32toSL32(rand_int32());
-    i = h & 127;
-    if (ULONG32(std::abs(h)) < kn[i])
-      return (h * wn[i]);
-  }
-} // RNG::nfix
-// __________________________________________________________________________
-// RNG::REXP generates exponential variates with rejection.
-// efix() generates variates after rejection in REXP.
-double RNG::efix(ulong32 j, ulong32 i)
-{
-  for (;;) {
-    if (i == 0)
-      return (7.69711 - log(rand_open01()));
-    const double x = j * we[i];
-    if (fe[i] + rand_open01() * (fe[i - 1] - fe[i]) < exp(-x)) 
-      return x;
-    j = rand_int32();
-    i = (j & 255);
-    if (j < ke[i])
-      return (j * we[i]);	
-  }
-} // RNG::efix
-// __________________________________________________________________________
-// This procedure creates the tables used by RNOR and REXP
-void RNG::zigset()
-{
-  static bool inited = 0;
-  if (inited)
-    return;
-  inited = 1;
-  // Set up tables for RNOR
-  const double m1 = 2147483648.0; // 2^31
-  const double vn = 9.91256303526217e-3;
-  double tn = 3.442619855899;
-  double q = vn / exp(-.5 * tn * tn);
-  kn[0] = ULONG32((tn / q) * m1); kn[1] = 0;
-  wn[0] = q / m1; wn[127] = tn / m1;
-  fn[0]=1.; fn[127] = exp(-.5 * tn * tn);		
-  for (uint i = 126; i > 0; i--) {
-    const double dn = sqrtf(-2 * log(vn / tn + exp(-.5 * tn * tn)));
-    kn[i + 1] = ULONG32((dn / tn) * m1);
-    fn[i] = exp(-.5 * dn * dn);        
-    wn[i] = dn / m1;
-    tn = dn;
-  }
-  // Set up tables for REXP
-  const double m2 = 4294967296.0; // 2^32
-  const double ve = 3.949659822581572e-3;
-  double te = 7.697117470131487;
-  q = ve / exp(-te);
-  ke[0] = ULONG32((te / q) * m2); ke[1] = 0;
-  we[0] = q / m2; we[255] = te / m2;
-  fe[0] = 1.; fe[255] = exp(-te);		
-  for (uint i = 254; i > 0; i--) {
-    const double de = -log(ve / te + exp(-te));
-    ke[i+1] = ULONG32((de / te) * m2);
-    fe[i] = exp(-de);
-    we[i] = de / m2;
-    te = de;
-  }
-} // RNG::zigset
-// __________________________________________________________________________
-// Generate a gamma variate with parameters 'shape' and 'scale'
-double RNG::gamma(double shape, double scale)
-{
-  if (shape < 1.)
-    return gamma(shape + 1., scale) * pow(rand_open01(), 1.0 / shape);
-  const double d = shape - 1. / 3.;
-  const double c = 1. / sqrtf(9. * d);
-  double x, v;
-  for (;;) {
-    do {
-      x = RNOR();
-      v = 1.0 + c * x;
-    } while (v <= 0.0);
-    v = v * v * v;
-    const double u = rand_open01();
-    const double x2 = x * x;
-    if (u < 1.0 - 0.0331 * x2 * x2)
-      return (d * v / scale);
-    if (log(u) < 0.5 * x2 + d * (1.0 - v + log(v)))
-      return (d * v / scale);
-  }
-} // RNG::gamma
-// __________________________________________________________________________
-// Generate a Poisson variate 
-// Code essentially copied from R source that is essentially
-// ACM Algorithm 599 KPOISS converted to C.
-int RNG::poisson(double mu)
-{
-  const double a0=-0.5, a1= 0.3333333, a2=-0.2500068, a3= 0.2000118,
-    a4=-0.1661269, a5= 0.1421878, a6=-0.1384794, a7= 0.1250060;
-  const double one_7 = 1.0 / 7.0, one_12 = 1.0 / 12.0, one_24 = 1.0 / 24.0;
-  const double fact[10] =
-    { 1., 1., 2., 6., 24., 120., 720., 5040., 40320., 362880.  };
-  static int l, m;
-  static double b1, b2, c, c0, c1, c2, c3;
-  static double pp[36], p0, p, q, s, d, omega;
-  static double big_l;/* integer "w/o overflow" */
-  static double muprev = 0., muprev2 = 0.;/*, muold     = 0.*/
-  double del, difmuk= 0., E= 0., fk= 0., fx, fy, g, px, py, t, u= 0., v, x;
-  int pois = -1;
-  int k, kflag, big_mu, new_big_mu = 0;
-  if (mu <= 0.)
-    return 0;
-  big_mu = mu >= 10.;
-  if(big_mu)
-    new_big_mu = 0;
-  if (!(big_mu && mu == muprev)) {
-    if (big_mu) {
-      new_big_mu = 1;
-      muprev = mu;
-      s = sqrtf(mu);
-      d = 6. * mu * mu;
-      big_l = floor(mu - 1.1484);
-    }
-    else {
-      if (mu != muprev) {
-        muprev = mu;
-        m = (int) ((1.0 < mu) ? mu : 1.0);
-        l = 0; /* pp[] is already ok up to pp[l] */
-        q = p0 = p = exp(-mu);
-      }
-      for (;;) {
-        u = rand_open01();
-        if (u <= p0)
-          return 0;
-        if (l != 0) {
-          for (k = (u <= 0.458) ? 1 : ((l < m) ? l : m);  k <= l; k++)
-            if (u <= pp[k])
-              return k;
-          if (l == 35) /* u > pp[35] */
-            continue;
-        }
-        l++;
-        for (k = l; k <= 35; k++) {
-          p *= mu / k;
-          q += p;
-          pp[k] = q;
-          if (u <= q) {
-            l = k;
-            return k;
-          }
-        }
-        l = 35;
-      }
-    }
-  }
-  g = mu + s * RNOR();
-  if (g >= 0.) {
-    pois = int(g);
-    if (pois >= big_l)
-      return pois;
-    fk = pois;
-    difmuk = mu - fk;
-    u = rand_open01();
-    if (d * u >= difmuk * difmuk * difmuk)
-      return pois;
-  }
-  if (new_big_mu || mu != muprev2) {
-    muprev2 = mu;
-    omega = 1.0 / (sqrtf(2.0 * PI) * s);
-    b1 = one_24 / mu;
-    b2 = 0.3 * b1 * b1;
-    c3 = one_7 * b1 * b2;
-    c2 = b2 - 15. * c3;
-    c1 = b1 - 6. * b2 + 45. * c3;
-    c0 = 1. - b1 + 3. * b2 - 15. * c3;
-    c = 0.1069 / mu;
-  }
-  if (g >= 0.) {
-    kflag = 0;
-    goto Step_F;
-  }
-  for (;;) {
-    E = REXP();
-    u = 2 * rand_open01() - 1.;
-    t = 1.8 + ((u > 0) ? std::abs(E) : -std::abs(E));
-    if (t > -0.6744) {
-      pois = int(mu + s * t);
-      fk = pois;
-      difmuk = mu - fk;
-      kflag = 1;
-Step_F:
-      if (pois < 10) {
-        px = -mu;
-        py = pow(mu, pois) / fact[pois];
-      }
-      else {
-        del = one_12 / fk;
-        del = del * (1. - 4.8 * del * del);
-        v = difmuk / fk;
-        if (std::abs(v) <= 0.25)
-          px = fk * v * v * (((((((a7 * v + a6) * v + a5) * v + a4) *
-                      v + a3) * v + a2) * v + a1) * v + a0)
-            - del;
-        else
-          px = fk * log(1. + v) - difmuk - del;
-        py = 1.0 / (sqrtf(2.0 * PI) * sqrtf(fk));
-      }
-      x = (0.5 - difmuk) / s;
-      x *= x;/* x^2 */
-      fx = -0.5 * x;
-      fy = omega * (((c3 * x + c2) * x + c1) * x + c0);
-      if (kflag > 0) {
-        if (c * std::abs(u) <= py * exp(px + E) - fy * exp(fx + E))
-          break;
-      } else
-        if (fy - u * fy <= py * exp(px - fx))
-          break;
-    }
-  }
-  return pois;
-} // RNG::poisson
-// __________________________________________________________________________
-// Generate a binomial variate 
-// Code essentially copied from R source that is essentially
-// ACM Algorithm 678 BTPEC converted to C.
-int RNG::binomial(double pp, int n)
-{
-  static double c, fm, npq, p1, p2, p3, p4, qn, xl, xll, xlr, xm, xr;
-  static double psave = -1.0;
-  static int nsave = -1, m = 0;
-  double f, x;
-  if (n <= 0 || pp <= 0.) return 0;
-  if (pp >= 1.) return n;
-  const double p = (pp < 1. - pp) ? pp : 1. - pp;
-  const double q = 1. - p;
-  const double np = n * p;
-  const double r = p / q;
-  const double g = r * (n + 1);
-  if (pp != psave || n != nsave) {
-    psave = pp;
-    nsave = n;
-    if (np < 30.0) {
-      qn = pow(q, (double) n);
-      goto L_np_small;
-    } else {
-      double ffm = np + p;
-      m = (int) ffm;
-      fm = m;
-      npq = np * q;
-      p1 = (int)(2.195 * sqrtf(npq) - 4.6 * q) + 0.5;
-      xm = fm + 0.5;
-      xl = xm - p1;
-      xr = xm + p1;
-      c = 0.134 + 20.5 / (15.3 + fm);
-      double al = (ffm - xl) / (ffm - xl * p);
-      xll = al * (1.0 + 0.5 * al);
-      al = (xr - ffm) / (xr * q);
-      xlr = al * (1.0 + 0.5 * al);
-      p2 = p1 * (1.0 + c + c);
-      p3 = p2 + c / xll;
-      p4 = p3 + c / xlr;
-    }
-  } else if (n == nsave) {
-    if (np < 30.0)
-      goto L_np_small;
-  }
-  int i, ix;
-  for (;;) {
-    double u = rand_open01() * p4;
-    double v = rand_open01();
-    if (u <= p1) {
-      ix = (int) (xm - p1 * v + u);
-      goto finish;
-    }
-    if (u <= p2) {
-      x = xl + (u - p1) / c;
-      v = v * c + 1.0 - std::abs(xm - x) / p1;
-      if (v > 1.0 || v <= 0.)
-        continue;
-      ix = (int) x;
-    } else {
-      if (u > p3) {
-        ix = (int) (xr - log(v) / xlr);
-        if (ix > n)
-          continue;
-        v *= (u - p3) * xlr;
-      } else {/* left tail */
-        ix = (int) (xl + log(v) / xll);
-        if (ix < 0)
-          continue;
-        v *= (u - p2) * xll;
-      }
-    }
-    int k = std::abs(ix - m);
-    if (k <= 20 || k >= npq / 2 - 1) {
-      f = 1.0;
-      if (m < ix) {
-        for (i = m + 1; i <= ix; i++)
-          f *= (g / i - r);
-      } else if (m != ix) {
-        for (i = ix + 1; i <= m; i++)
-          f /= (g / i - r);
-      }
-      if (v <= f)
-        goto finish;
-    } else {
-      double amaxp, ynorm, alv;
-      amaxp = (k / npq) * ((k * (k / 3. + 0.625) + 0.1666666666666) /
-               npq + 0.5);
-      ynorm = -k * k / (2.0 * npq);
-      alv = log(v);
-      if (alv < ynorm - amaxp)
-        goto finish;
-      if (alv <= ynorm + amaxp) {
-        double x1, f1, z, w, z2, x2, f2, w2;
-        x1 = ix + 1;
-        f1 = fm + 1.0;
-        z = n + 1 - fm;
-        w = n - ix + 1.0;
-        z2 = z * z;
-        x2 = x1 * x1;
-        f2 = f1 * f1;
-        w2 = w * w;
-        if (alv <= xm * log(f1 / x1) + (n - m + 0.5) * log(z / w) +
-	  (ix - m) * log(w * p / (x1 * q)) + (13860.0 - (462.0 -
-	  (132.0 - (99.0 - 140.0 / f2) / f2) / f2) / f2) / f1 / 166320.0 +
-	  (13860.0 - (462.0 - (132.0 - (99.0 - 140.0 / z2) / z2) / z2) / z2) /
-	  z / 166320.0 + (13860.0 - (462.0 - (132.0 - (99.0 - 140.0 / x2) /
-	  x2) / x2) / x2) / x1 / 166320.0 + (13860.0 - (462.0 - (132.0 -
-	  (99.0 - 140.0 / w2) / w2) / w2) / w2) / w / 166320.)
-          goto finish;
-      }
-    }
-  }
- L_np_small:
-  for (;;) {
-    ix = 0;
-    f = qn;
-    double u = rand_open01();
-    for (;;) {
-      if (u < f)
-        goto finish;
-      if (ix > 110)
-        break;
-      u -= f;
-      ix++;
-      f *= (g / ix - r);
-    }
-  }
- finish:
-  if (psave > 0.5)
-     ix = n - ix;
-  return ix;
-} // RNG::binomial
-// __________________________________________________________________________
-// Generate a sample of size 'n' from a multinomial distribution with
-// probabilities given in 'probs'.  Inspired by R source code.
-void RNG::multinom(uint n, const vector<double>& probs, vector<uint>& samp)
-{
-  samp.resize(probs.size());
-  RNG::multinom(n, &probs[0], (uint) probs.size(), &samp[0]);
-}
-void RNG::multinom(uint size, const double* probs, uint num_probs, uint* samp)
-{
-  if (num_probs == 0) return;
-  for (uint i = 0; i < num_probs; i++) samp[i] = 0;
-  if (size == 0) return;
-  vector<double> fixed_probs(num_probs);
-  double total_prob = 0.;
-  for (uint i = 0; i < num_probs; i++) {
-    const double pp = probs[i];
-    //if (std::isfinite(pp) && pp >= 0)
-    if ((pp == pp) && pp >= 0)
-      total_prob += (fixed_probs[i] = pp);
-  }
-  if (total_prob == 0.) return;
-  for (uint i = 0; i < num_probs-1; i++) {
-    if (fixed_probs[i] > 0.) {
-      samp[i] = binomial(fixed_probs[i] / total_prob, size);
-      size -= samp[i];
-    }
-    if (size == 0) return;
-    total_prob -= fixed_probs[i];
-  }
-  samp[num_probs - 1] = size;
-} // RNG::multinomial
-// __________________________________________________________________________
-// rng.C