Deleted the old CUDA platform

2e451b9d · Peter Eastman · 352e2fc7 · 352e2fc7 · 352e2fc7 · 352e2fc7
Commit 2e451b9d authored Dec 13, 2012 by Peter Eastman
20 changed files
--- a/platforms/cuda-old/src/kernels/kApplyConstraints.cu
+++ b/platforms/cuda-old/src/kernels/kApplyConstraints.cu
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2010 Stanford University and the Authors.           *
- * Authors: Peter Eastman                                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-
-#include <stdio.h>
-#include <cuda.h>
-#include <vector_functions.h>
-#include <cstdlib>
-#include <string>
-#include <iostream>
-//#include <fstream>
-using namespace std;
-
-#include "gputypes.h"
-#include "cudaKernels.h"
-
-__global__ void kPrepareConstraints_kernel(int numAtoms, float4* oldPos, float4* posq, float4* posqP) {
-    for (int index = threadIdx.x+blockIdx.x*blockDim.x; index < numAtoms; index += blockDim.x*gridDim.x) {
-        float4 pos = posq[index];
-        oldPos[index] = pos;
-        posqP[index] = make_float4(0.0f, 0.0f, 0.0f, pos.w);
-    }
-}
-
-__global__ void kFinishConstraints_kernel(int numAtoms, float4* posq, float4* posqP) {
-    for (int index = threadIdx.x+blockIdx.x*blockDim.x; index < numAtoms; index += blockDim.x*gridDim.x) {
-        float4 pos = posq[index];
-        float4 delta = posqP[index];
-        posq[index] = make_float4(pos.x+delta.x, pos.y+delta.y, pos.z+delta.z, pos.w);
-    }
-}
-
-void kApplyConstraints(gpuContext gpu)
-{
-    kPrepareConstraints_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>(gpu->natoms, gpu->sim.pOldPosq, gpu->sim.pPosq, gpu->sim.pPosqP);
-    LAUNCHERROR("kPrepareConstraints");
-    kApplyShake(gpu);
-    kApplySettle(gpu);
-    kApplyCCMA(gpu);
-    kFinishConstraints_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>(gpu->natoms, gpu->sim.pPosq, gpu->sim.pPosqP);
-    LAUNCHERROR("kFinishConstraints");
-}
-
--- a/platforms/cuda-old/src/kernels/kBrownianUpdate.cu
+++ b/platforms/cuda-old/src/kernels/kBrownianUpdate.cu
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-
-#include <stdio.h>
-#include <cuda.h>
-#include <vector_functions.h>
-#include <cstdlib>
-#include <string>
-#include <iostream>
-//#include <fstream>
-using namespace std;
-
-#include "gputypes.h"
-
-static __constant__ cudaGmxSimulation cSim;
-
-void SetBrownianUpdateSim(gpuContext gpu)
-{
-    cudaError_t status;
-    status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));     
-    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
-}
-
-void GetBrownianUpdateSim(gpuContext gpu)
-{
-    cudaError_t status;
-    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));     
-    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
-}
-
-__global__ 
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(GF1XX_UPDATE_THREADS_PER_BLOCK, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(GT2XX_UPDATE_THREADS_PER_BLOCK, 1)
-#else
-__launch_bounds__(G8X_UPDATE_THREADS_PER_BLOCK, 1)
-#endif
-void kBrownianUpdatePart1_kernel()
-{
-    unsigned int pos    = threadIdx.x + blockIdx.x * blockDim.x;
-    unsigned int rpos   = cSim.pRandomPosition[blockIdx.x];
-    __syncthreads();
-    
-    while (pos < cSim.atoms)
-    {
-        float4 random4a         = cSim.pRandom4[rpos + pos];
-        float4 apos             = cSim.pPosq[pos];
-        float4 force            = cSim.pForce4[pos];
-        float invMass           = cSim.pVelm4[pos].w;
-        float forceScale        = cSim.tauDeltaT*invMass;
-        float noiseScale        = cSim.noiseAmplitude*sqrtf(invMass);
-
-        cSim.pOldPosq[pos]      = apos;
-        apos.x                  = force.x*forceScale + noiseScale*random4a.x;
-        apos.y                  = force.y*forceScale + noiseScale*random4a.y;
-        apos.z                  = force.z*forceScale + noiseScale*random4a.z;
-        cSim.pPosqP[pos]        = apos;
-        pos                    += blockDim.x * gridDim.x;
-    }
-}
-
-void kBrownianUpdatePart1(gpuContext gpu)
-{
-//    printf("kBrownianUpdatePart1\n");
-    kBrownianUpdatePart1_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
-    LAUNCHERROR("kBrownianUpdatePart1");
-}
-
-__global__ 
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(GF1XX_UPDATE_THREADS_PER_BLOCK, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(GT2XX_UPDATE_THREADS_PER_BLOCK, 1)
-#else
-__launch_bounds__(G8X_UPDATE_THREADS_PER_BLOCK, 1)
-#endif
-void kBrownianUpdatePart2_kernel()
-{
-    unsigned int pos            = threadIdx.x + blockIdx.x * blockDim.x;
-    unsigned int rpos           = cSim.pRandomPosition[blockIdx.x];
-    __syncthreads();
-    
-    while (pos < cSim.atoms)
-    {
-        float4 velocity         = cSim.pVelm4[pos];
-        float4 apos             = cSim.pPosq[pos];
-        float4 xPrime           = cSim.pPosqP[pos];
-
-        velocity.x              = cSim.oneOverDeltaT*(xPrime.x);
-        velocity.y              = cSim.oneOverDeltaT*(xPrime.y);
-        velocity.z              = cSim.oneOverDeltaT*(xPrime.z);
-
-        xPrime.x               += apos.x;
-        xPrime.y               += apos.y;
-        xPrime.z               += apos.z;
-
-        cSim.pPosq[pos]         = xPrime;
-        cSim.pVelm4[pos]        = velocity;
-         
-        pos                    += blockDim.x * gridDim.x;    
-    }
-
-    // Update random position pointer
-    if (threadIdx.x == 0)
-    {
-        rpos                   += cSim.paddedNumberOfAtoms;
-        if (rpos > cSim.randoms)
-            rpos               -= cSim.randoms;
-        cSim.pRandomPosition[blockIdx.x] = rpos;
-    }
-}
-
-extern void kGenerateRandoms(gpuContext gpu);
-void kBrownianUpdatePart2(gpuContext gpu)
-{
-//    printf("kBrownianUpdatePart2\n");
-    kBrownianUpdatePart2_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
-    LAUNCHERROR("kBrownianUpdatePart2");
-    
-    // Update randoms if necessary
-    gpu->iterations++;
-    if (gpu->iterations == gpu->sim.randomIterations)
-    {
-        kGenerateRandoms(gpu);
-        gpu->iterations = 0;
-    }
-}
-
--- a/platforms/cuda-old/src/kernels/kCCMA.cu
+++ b/platforms/cuda-old/src/kernels/kCCMA.cu
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-
-#include <cuda.h>
-#include <vector_functions.h>
-#include <vector>
-#include "gputypes.h"
-
-using namespace std;
-
-
-static __constant__ cudaGmxSimulation cSim;
-
-void SetCCMASim(gpuContext gpu)
-{
-    cudaError_t status;
-    status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
-    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
-}
-
-void GetCCMASim(gpuContext gpu)
-{
-    cudaError_t status;
-    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
-    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
-}
-
-__global__ void
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(1024, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(512, 1)
-#else
-__launch_bounds__(256, 1)
-#endif
-kComputeCCMAConstraintDirections()
-{
-    // Calculate the direction of each constraint.
-
-    for (unsigned int index = threadIdx.x+blockIdx.x*blockDim.x; index < cSim.ccmaConstraints; index += blockDim.x*gridDim.x)
-    {
-        int2 atoms = cSim.pCcmaAtoms[index];
-        float4 dir = cSim.pCcmaDistance[index];
-        float4 oldPos1 = cSim.pOldPosq[atoms.x];
-        float4 oldPos2 = cSim.pOldPosq[atoms.y];
-        dir.x = oldPos1.x-oldPos2.x;
-        dir.y = oldPos1.y-oldPos2.y;
-        dir.z = oldPos1.z-oldPos2.z;
-        cSim.pCcmaDistance[index] = dir;
-    }
-}
-
-__global__ void
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(1024, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(512, 1)
-#else
-__launch_bounds__(256, 1)
-#endif
-kComputeCCMAConstraintForces(float4* atomPositions, bool addOldPosition)
-{
-    __shared__ int converged;
-    float lowerTol = 1.0f-2.0f*cSim.shakeTolerance+cSim.shakeTolerance*cSim.shakeTolerance;
-    float upperTol = 1.0f+2.0f*cSim.shakeTolerance+cSim.shakeTolerance*cSim.shakeTolerance;
-    if (threadIdx.x == 0)
-        converged = 1;
-    __syncthreads();
-
-    // Calculate the constraint force for each constraint.
-
-    for (unsigned int index = threadIdx.x+blockIdx.x*blockDim.x; index < cSim.ccmaConstraints; index += blockDim.x*gridDim.x)
-    {
-        int2 atoms = cSim.pCcmaAtoms[index];
-        float4 delta1 = atomPositions[atoms.x];
-        float4 delta2 = atomPositions[atoms.y];
-        float4 dir = cSim.pCcmaDistance[index];
-        float3 rp_ij = make_float3(delta1.x-delta2.x, delta1.y-delta2.y, delta1.z-delta2.z);
-        if (addOldPosition)
-        {
-            rp_ij.x += dir.x;
-            rp_ij.y += dir.y;
-            rp_ij.z += dir.z;
-        }
-        float rp2 = rp_ij.x*rp_ij.x + rp_ij.y*rp_ij.y + rp_ij.z*rp_ij.z;
-        float dist2 = dir.w*dir.w;
-        float diff = dist2 - rp2;
-        float rrpr  = rp_ij.x*dir.x + rp_ij.y*dir.y + rp_ij.z*dir.z;
-        float d_ij2  = dir.x*dir.x + dir.y*dir.y + dir.z*dir.z;
-        float reducedMass = cSim.pCcmaReducedMass[index];
-        cSim.pCcmaDelta1[index] = (rrpr > d_ij2*1e-6f ? reducedMass*diff/rrpr : 0.0f);
-
-        // See whether it has converged.
-
-        if (converged && (rp2 < lowerTol*dist2 || rp2 > upperTol*dist2))
-        {
-            converged = 0;
-            *cSim.ccmaConvergedDeviceMarker = 0;
-        }
-    }
-}
-
-__global__ void
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(1024, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(512, 1)
-#else
-__launch_bounds__(256, 1)
-#endif
-kMultiplyByCCMAConstraintMatrix()
-{
-    if (*cSim.ccmaConvergedDeviceMarker)
-        return; // The constraint iteration has already converged
-
-    // Multiply by the inverse constraint matrix.
-
-    for (unsigned int index = threadIdx.x+blockIdx.x*blockDim.x; index < cSim.ccmaConstraints; index += blockDim.x*gridDim.x)
-    {
-        float sum = 0.0f;
-        for (unsigned int i = 0; ; i++)
-        {
-            unsigned int element = index+i*cSim.ccmaConstraints;
-            unsigned int column = cSim.pConstraintMatrixColumn[element];
-            if (column >= cSim.ccmaConstraints)
-                break;
-            sum += cSim.pCcmaDelta1[column]*cSim.pConstraintMatrixValue[element];
-        }
-        cSim.pCcmaDelta2[index] = sum;
-    }
-}
-
-__global__ void
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(1024, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(512, 1)
-#else
-__launch_bounds__(256, 1)
-#endif
-kUpdateCCMAAtomPositions(float4* atomPositions, int iteration)
-{
-    if (*cSim.ccmaConvergedDeviceMarker)
-        return; // The constraint iteration has already converged.
-    float damping = (iteration < 2 ? 0.5f : 1.0f);
-    for (unsigned int index = threadIdx.x+blockIdx.x*blockDim.x; index < cSim.atoms; index += blockDim.x*gridDim.x)
-    {
-        float4 atomPos = atomPositions[index];
-        float invMass = cSim.pVelm4[index].w;
-        int num = cSim.pCcmaNumAtomConstraints[index];
-        for (int i = 0; i < num; i++)
-        {
-            int constraint = cSim.pCcmaAtomConstraints[index+i*cSim.atoms];
-            bool forward = (constraint > 0);
-            constraint = (forward ? constraint-1 : -constraint-1);
-            float constraintForce = damping*invMass*cSim.pCcmaDelta2[constraint];
-            constraintForce = (forward ? constraintForce : -constraintForce);
-            float4 dir = cSim.pCcmaDistance[constraint];
-            atomPos.x += constraintForce*dir.x;
-            atomPos.y += constraintForce*dir.y;
-            atomPos.z += constraintForce*dir.z;
-        }
-        atomPositions[index] = atomPos;
-    }
-}
-
-void kApplyCCMA(gpuContext gpu, float4* posq, bool addOldPosition)
-{
-    kComputeCCMAConstraintDirections<<<gpu->sim.blocks, gpu->sim.ccma_threads_per_block>>>();
-    LAUNCHERROR("kComputeCCMAConstraintDirections");
-    const int checkInterval = 3;
-    for (int i = 0; i < 150; i++) {
-        if ((i+1)%checkInterval == 0)
-            *gpu->ccmaConvergedHostMarker = 1;
-        kComputeCCMAConstraintForces<<<gpu->sim.blocks, gpu->sim.ccma_threads_per_block, gpu->sim.ccma_threads_per_block*sizeof(int)>>>(posq, addOldPosition);
-        cudaEventRecord(gpu->ccmaEvent, 0);
-        kMultiplyByCCMAConstraintMatrix<<<gpu->sim.blocks, gpu->sim.ccma_threads_per_block, gpu->sim.ccma_threads_per_block*sizeof(int)>>>();
-        kUpdateCCMAAtomPositions<<<gpu->sim.blocks, gpu->sim.ccma_threads_per_block>>>(posq, 3*i+2);
-        cudaEventSynchronize(gpu->ccmaEvent);
-        if ((i+1)%checkInterval == 0 && *gpu->ccmaConvergedHostMarker)
-            break;
-    }
-}
-
-void kApplyCCMA(gpuContext gpu)
-{
-//    printf("kApplyCCMA\n");
-    if (gpu->sim.ccmaConstraints > 0)
-        kApplyCCMA(gpu, gpu->sim.pPosqP, true);
-}
--- a/platforms/cuda-old/src/kernels/kCalculateAndersenThermostat.cu
+++ b/platforms/cuda-old/src/kernels/kCalculateAndersenThermostat.cu
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-
-#include <stdio.h>
-#include <cuda.h>
-#include <vector_functions.h>
-#include <cstdlib>
-#include <string>
-#include <iostream>
-//#include <fstream>
-using namespace std;
-
-#include "gputypes.h"
-
-static __constant__ cudaGmxSimulation cSim;
-
-void SetCalculateAndersenThermostatSim(gpuContext gpu)
-{
-    cudaError_t status;
-    status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));     
-    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
-}
-
-void GetCalculateAndersenThermostatSim(gpuContext gpu)
-{
-    cudaError_t status;
-    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));     
-    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
-}
-
-__global__ void kCalculateAndersenThermostat_kernel(int* atomGroups)
-{
-    unsigned int pos            = threadIdx.x + blockIdx.x * blockDim.x;
-    unsigned int rpos           = cSim.pRandomPosition[blockIdx.x];
-    __syncthreads();
-
-    float collisionProbability = 1.0f-exp(-cSim.collisionFrequency*cSim.pStepSize[0].y);
-    float randomRange = erf(collisionProbability/sqrtf(2.0f));
-    while (pos < cSim.atoms)
-    {
-        float4 velocity         = cSim.pVelm4[pos];
-        float4 selectRand       = cSim.pRandom4[rpos + atomGroups[pos]];
-        float4 velRand          = cSim.pRandom4[rpos + pos];
-        float scale = (selectRand.w > -randomRange && selectRand.w < randomRange ? 0.0f : 1.0f);
-        float add = (1.0f-scale)*sqrtf(cSim.kT*velocity.w);
-        velocity.x = scale*velocity.x + add*velRand.x;
-        velocity.y = scale*velocity.y + add*velRand.y;
-        velocity.z = scale*velocity.z + add*velRand.z;
-        cSim.pVelm4[pos]        = velocity;
-
-        pos                    += blockDim.x * gridDim.x;
-    }
-
-    // Update random position pointer
-    if (threadIdx.x == 0)
-    {
-        rpos                   += cSim.paddedNumberOfAtoms;
-        if (rpos > cSim.randoms)
-            rpos               -= cSim.randoms;
-        cSim.pRandomPosition[blockIdx.x] = rpos;
-    }
-}
-
-extern void kGenerateRandoms(gpuContext gpu);
-void kCalculateAndersenThermostat(gpuContext gpu, CUDAStream<int>& atomGroups)
-{
-//    printf("kCalculateAndersenThermostat\n");
-    kCalculateAndersenThermostat_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>(atomGroups._pDevData);
-    LAUNCHERROR("kCalculateAndersenThermostat");
-    
-    // Update randoms if necessary
-    gpu->iterations++;
-    if (gpu->iterations == gpu->sim.randomIterations)
-    {
-        kGenerateRandoms(gpu);
-        gpu->iterations = 0;
-    }
-}
-
--- a/platforms/cuda-old/src/kernels/kCalculateCDLJEwaldFastReciprocal.h
+++ b/platforms/cuda-old/src/kernels/kCalculateCDLJEwaldFastReciprocal.h
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Rossen P. Apostolov, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-
-/**
- * This file contains the kernel for evaluating nonbonded forces using the
- * Ewald summation method (Reciprocal space summation).
- */
-
-/* Define multiply operations for floats */
-
-__device__ float2 MultofFloat2(float2 a, float2 b)
-{
-    float2 c;
-    c.x = a.x * b.x - a.y * b.y;
-    c.y = a.x * b.y + a.y * b.x;
-    return c;
-}
-
-__device__ float2 ConjMultofFloat2(float2 a, float2 b)
-{
-    float2 c;
-    c.x = a.x*b.x + a.y*b.y;
-    c.y = a.y*b.x - a.x*b.y;
-    return c;
-}
-
-/**
- * Precompute the cosine and sine sums which appear in each force term.
- */
-
-__global__ void kCalculateEwaldFastCosSinSums_kernel()
-{
-    const float epsilon =  1.0;
-    const float recipCoeff = cSim.epsfac*(4*LOCAL_HACK_PI/cSim.cellVolume/epsilon);
-    const unsigned int ksizex = 2*cSim.kmaxX-1;
-    const unsigned int ksizey = 2*cSim.kmaxY-1;
-    const unsigned int ksizez = 2*cSim.kmaxZ-1;
-    const unsigned int totalK = ksizex*ksizey*ksizez;
-    unsigned int index = threadIdx.x + blockIdx.x * blockDim.x;
-    float energy = 0.0f;
-    while (index < (cSim.kmaxY-1)*ksizez+cSim.kmaxZ)
-        index += blockDim.x * gridDim.x;
-    while (index < totalK)
-    {
-        // Find the wave vector (kx, ky, kz) this index corresponds to.
-
-        int rx = index/(ksizey*ksizez);
-        int remainder = index - rx*ksizey*ksizez;
-        int ry = remainder/ksizez;
-        int rz = remainder - ry*ksizez - cSim.kmaxZ + 1;
-        ry += -cSim.kmaxY + 1;
-        float kx = rx*cSim.recipBoxSizeX;
-        float ky = ry*cSim.recipBoxSizeY;
-        float kz = rz*cSim.recipBoxSizeZ;
-
-        // Compute the sum for this wave vector.
-
-        float2 sum = make_float2(0.0f, 0.0f);
-        for (int atom = 0; atom < cSim.atoms; atom++)
-        {
-            float4 apos = cSim.pPosq[atom];
-            float phase = apos.x*kx;
-            float2 structureFactor = make_float2(cosf(phase), sinf(phase));
-            phase = apos.y*ky;
-            structureFactor = MultofFloat2(structureFactor, make_float2(cosf(phase), sinf(phase)));
-            phase = apos.z*kz;
-            structureFactor = MultofFloat2(structureFactor, make_float2(cosf(phase), sinf(phase)));
-            sum.x += apos.w*structureFactor.x;
-            sum.y += apos.w*structureFactor.y;
-        }
-        cSim.pEwaldCosSinSum[index] = sum;
-
-        // Compute the contribution to the energy.
-
-        float k2 = kx*kx + ky*ky + kz*kz;
-        float ak = exp(k2*cSim.factorEwald) / k2;
-        energy += recipCoeff*ak*(sum.x*sum.x + sum.y*sum.y);
-        index += blockDim.x * gridDim.x;
-    }
-    cSim.pEnergy[blockIdx.x*blockDim.x+threadIdx.x] += energy;
-}
-
-/**
- * Compute the reciprocal space part of the Ewald force, using the precomputed sums from the
- * previous routine.
- */
-
-__global__ void kCalculateEwaldFastForces_kernel()
-{
-    const float epsilon =  1.0;
-    float recipCoeff = cSim.epsfac*(4*LOCAL_HACK_PI/cSim.cellVolume/epsilon);
-
-    unsigned int atom = threadIdx.x + blockIdx.x * blockDim.x;
-
-    while (atom < cSim.atoms)
-    {
-        float4 force = cSim.pForce4[atom];
-        float4 apos = cSim.pPosq[atom];
-
-        // Loop over all wave vectors.
-
-        int lowry = 0;
-        int lowrz = 1;
-        for (int rx = 0; rx < cSim.kmaxX; rx++) {
-            float kx = rx * cSim.recipBoxSizeX;
-            for (int ry = lowry; ry < cSim.kmaxY; ry++) {
-                float ky = ry * cSim.recipBoxSizeY;
-                float phase = apos.x*kx;
-                float2 tab_xy = make_float2(cosf(phase), sinf(phase));
-                phase = apos.y*ky;
-                tab_xy = MultofFloat2(tab_xy, make_float2(cosf(phase), sinf(phase)));
-                for (int rz = lowrz; rz < cSim.kmaxZ; rz++) {
-                    float kz = rz * cSim.recipBoxSizeZ;
-
-                    // Compute the force contribution of this wave vector.
-
-                    int index = rx*(cSim.kmaxY*2-1)*(cSim.kmaxZ*2-1) + (ry+cSim.kmaxY-1)*(cSim.kmaxZ*2-1) + (rz+cSim.kmaxZ-1);
-                    float k2 = kx*kx + ky*ky + kz*kz;
-                    float ak = exp(k2*cSim.factorEwald) / k2;
-                    phase = apos.z*kz;
-                    float2 structureFactor = MultofFloat2(tab_xy, make_float2(cosf(phase), sinf(phase)));
-                    float2 cosSinSum = cSim.pEwaldCosSinSum[index];
-                    float dEdR = ak * apos.w * (cosSinSum.x*structureFactor.y - cosSinSum.y*structureFactor.x);
-                    force.x += 2 * recipCoeff * dEdR * kx;
-                    force.y += 2 * recipCoeff * dEdR * ky;
-                    force.z += 2 * recipCoeff * dEdR * kz;
-                    lowrz = 1 - cSim.kmaxZ;
-                }
-                lowry = 1 - cSim.kmaxY;
-            }
-        }
-
-        // Record the force on the atom.
-
-        cSim.pForce4[atom] = force;
-        atom += blockDim.x * gridDim.x;
-    }
-}
--- a/platforms/cuda-old/src/kernels/kCalculateCDLJEwaldReciprocal.h
+++ b/platforms/cuda-old/src/kernels/kCalculateCDLJEwaldReciprocal.h
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Rossen P. Apostolov, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-
-/**
- * This file contains the kernel for evaluating nonbonded forces using the
- * Ewald summation method (Reciprocal space summation).
- */
-
-
-__global__ void kCalculateCDLJEwaldReciprocalForces_kernel()
-{
-    const float eps0 = 1.0f/(4.0f*3.1415926535f*cSim.epsfac);
-
-    unsigned int atomID1    = threadIdx.x + blockIdx.x * blockDim.x;
-
-    while (atomID1 < cSim.atoms)
-    {
-        float4 apos1 = cSim.pPosq[atomID1];
-        float4 af    = cSim.pForce4[atomID1];
-        unsigned int atomID2 = 0;
-        while (atomID2 < cSim.atoms)
-        {
-            float4 apos2 = cSim.pPosq[atomID2];
-            float scale = 2.0f*apos1.w*apos2.w/(cSim.cellVolume*eps0);
-
-            int lowry = 0;
-            int lowrz = 1;
-
-            for(int rx = 0; rx < cSim.kmaxX; rx++)
-            {
-                float kx = rx*cSim.recipBoxSizeX;
-                for(int ry = lowry; ry < cSim.kmaxY; ry++)
-                {
-                    float ky = ry*cSim.recipBoxSizeY;
-                    for (int rz = lowrz; rz < cSim.kmaxZ; rz++)
-                    {
-                        float kz = rz*cSim.recipBoxSizeZ;
-                        float k2 = kx*kx + ky*ky + kz*kz;
-                        float ek = exp(k2*cSim.factorEwald);
-
-                        float arg1 = kx*apos1.x + ky*apos1.y + kz*apos1.z;
-                        float arg2 = kx*apos2.x + ky*apos2.y + kz*apos2.z;
-                        float sinI = sinf(arg1);
-                        float sinJ = sinf(arg2);
-                        float cosI = cosf(arg1);
-                        float cosJ = cosf(arg2);
-
-                        float f = scale * ek * (-sinI*cosJ + cosI*sinJ) / k2;
-                        af.x -= kx*f;
-                        af.y -= ky*f;
-                        af.z -= kz*f;
-
-                        lowrz = 1 - cSim.kmaxZ;
-                    }
-                    lowry = 1 - cSim.kmaxY;
-                }
-            }
-            atomID2++;
-       }
-       cSim.pForce4[atomID1] = af;
-       atomID1 += blockDim.x * gridDim.x;
-
-    }
-}
--- a/platforms/cuda-old/src/kernels/kCalculateCDLJForces.cu
+++ b/platforms/cuda-old/src/kernels/kCalculateCDLJForces.cu
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-
-#include <stdio.h>
-#include <cuda.h>
-#include <vector_functions.h>
-#include <cstdlib>
-#include <string>
-#include <iostream>
-#include <fstream>
-using namespace std;
-
-#include "gputypes.h"
-#include "cudatypes.h"
-
-#define UNROLLXX 0
-#define UNROLLXY 0
-
-struct Atom {
-    float x;
-    float y;
-    float z;
-    float q;
-    float sig;
-    float eps;
-    float fx;
-    float fy;
-    float fz;
-};
-
-static __constant__ cudaGmxSimulation cSim;
-
-void SetCalculateCDLJForcesSim(gpuContext gpu)
-{
-    cudaError_t status;
-    status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
-    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
-}
-
-void GetCalculateCDLJForcesSim(gpuContext gpu)
-{
-    cudaError_t status;
-    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
-    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
-}
-
-texture<float, 1, cudaReadModeElementType> tabulatedErfcRef;
-
-static __device__ float fastErfc(float r)
-{
-    float normalized = cSim.tabulatedErfcScale*r;
-    int index = (int) normalized;
-    float fract2 = normalized-index;
-    float fract1 = 1.0f-fract2;
-    return fract1*tex1Dfetch(tabulatedErfcRef, index) + fract2*tex1Dfetch(tabulatedErfcRef, index+1);
-}
-
-// Include versions of the kernels for N^2 calculations.
-
-#define METHOD_NAME(a, b) a##N2##b
-#include "kCalculateCDLJForces.h"
-#define USE_OUTPUT_BUFFER_PER_WARP
-#undef METHOD_NAME
-#define METHOD_NAME(a, b) a##N2ByWarp##b
-#include "kCalculateCDLJForces.h"
-
-// Include versions of the kernels with cutoffs.
-
-#undef METHOD_NAME
-#undef USE_OUTPUT_BUFFER_PER_WARP
-#define USE_CUTOFF
-#define METHOD_NAME(a, b) a##Cutoff##b
-#include "kCalculateCDLJForces.h"
-#include "kFindInteractingBlocks.h"
-#define USE_OUTPUT_BUFFER_PER_WARP
-#undef METHOD_NAME
-#define METHOD_NAME(a, b) a##CutoffByWarp##b
-#include "kCalculateCDLJForces.h"
-
-// Include versions of the kernels with periodic boundary conditions.
-
-#undef METHOD_NAME
-#undef USE_OUTPUT_BUFFER_PER_WARP
-#define USE_PERIODIC
-#define METHOD_NAME(a, b) a##Periodic##b
-#include "kCalculateCDLJForces.h"
-#include "kFindInteractingBlocks.h"
-#define USE_OUTPUT_BUFFER_PER_WARP
-#undef METHOD_NAME
-#define METHOD_NAME(a, b) a##PeriodicByWarp##b
-#include "kCalculateCDLJForces.h"
-
-// Include versions of the kernels for Ewald
-
-#undef METHOD_NAME
-#undef USE_OUTPUT_BUFFER_PER_WARP
-#define USE_PERIODIC
-#define USE_EWALD
-#define METHOD_NAME(a, b) a##Ewald##b
-#include "kCalculateCDLJForces.h"
-#define USE_OUTPUT_BUFFER_PER_WARP
-#undef METHOD_NAME
-#define METHOD_NAME(a, b) a##EwaldByWarp##b
-#include "kCalculateCDLJForces.h"
-
-// Reciprocal Space Ewald summation is in a separate kernel
-#include "kCalculateCDLJEwaldFastReciprocal.h"
-
-void kCalculatePME(gpuContext gpu);
-
-void kCalculateCDLJForces(gpuContext gpu)
-{
-//    printf("kCalculateCDLJCutoffForces\n");
-    switch (gpu->sim.nonbondedMethod)
-    {
-        case NO_CUTOFF:
-            if (gpu->bOutputBufferPerWarp)
-                kCalculateCDLJN2ByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit);
-            else
-                kCalculateCDLJN2Forces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit);
-            LAUNCHERROR("kCalculateCDLJN2Forces");
-            break;
-        case CUTOFF:
-            kFindBlockBoundsCutoff_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
-            LAUNCHERROR("kFindBlockBoundsCutoff");
-            kFindBlocksWithInteractionsCutoff_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
-            LAUNCHERROR("kFindBlocksWithInteractionsCutoff");
-            compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
-            kFindInteractionsWithinBlocksCutoff_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                    sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
-            if (gpu->bOutputBufferPerWarp)
-                kCalculateCDLJCutoffByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
-            else
-                kCalculateCDLJCutoffForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
-            LAUNCHERROR("kCalculateCDLJCutoffForces");
-            break;
-        case PERIODIC:
-            kFindBlockBoundsPeriodic_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
-            LAUNCHERROR("kFindBlockBoundsPeriodic");
-            kFindBlocksWithInteractionsPeriodic_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
-            LAUNCHERROR("kFindBlocksWithInteractionsPeriodic");
-            compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
-            kFindInteractionsWithinBlocksPeriodic_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                    sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
-            if (gpu->bOutputBufferPerWarp)
-                kCalculateCDLJPeriodicByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
-            else
-                kCalculateCDLJPeriodicForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
-            LAUNCHERROR("kCalculateCDLJPeriodicForces");
-            break;
-        case EWALD:
-        case PARTICLE_MESH_EWALD:
-            kFindBlockBoundsPeriodic_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
-            LAUNCHERROR("kFindBlockBoundsPeriodic");
-            kFindBlocksWithInteractionsPeriodic_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
-            LAUNCHERROR("kFindBlocksWithInteractionsPeriodic");
-            compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
-            kFindInteractionsWithinBlocksPeriodic_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                    sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
-            LAUNCHERROR("kFindInteractionsWithinBlocksPeriodic");
-            cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
-            cudaBindTexture(NULL, &tabulatedErfcRef, gpu->psTabulatedErfc->_pDevData, &channelDesc, gpu->psTabulatedErfc->_length*sizeof(float));
-            if (gpu->bOutputBufferPerWarp)
-                kCalculateCDLJEwaldByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
-            else
-                kCalculateCDLJEwaldForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        (sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
-            LAUNCHERROR("kCalculateCDLJEwaldForces");
-            if (gpu->sim.nonbondedMethod == EWALD)
-            {
-                kCalculateEwaldFastCosSinSums_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
-                LAUNCHERROR("kCalculateEwaldFastCosSinSums");
-                kCalculateEwaldFastForces_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
-                LAUNCHERROR("kCalculateEwaldFastForces");
-            }
-            else
-                kCalculatePME(gpu);
-    }
-}
--- a/platforms/cuda-old/src/kernels/kCalculateCDLJForces.h
+++ b/platforms/cuda-old/src/kernels/kCalculateCDLJForces.h
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-
-/**
- * This file contains the kernels for evalauating nonbonded forces.  It is included
- * several times in kCalculateCDLJForces.cu with different #defines to generate
- * different versions of the kernels.
- */
-
-/* Cuda compiler on Windows does not recognized "static const float" values */
-#define LOCAL_HACK_PI 3.1415926535897932384626433832795f
-
-__global__ 
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(GF1XX_NONBOND_THREADS_PER_BLOCK, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1)
-#else
-__launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
-#endif
-void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUnit)
-{
-    extern __shared__ volatile Atom sA[];
-    unsigned int totalWarps = gridDim.x*blockDim.x/GRID;
-    unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
-    unsigned int numWorkUnits = cSim.pInteractionCount[0];
-    unsigned int pos = warp*numWorkUnits/totalWarps;
-    unsigned int end = (warp+1)*numWorkUnits/totalWarps;
-    float CDLJ_energy;
-    float energy = 0.0f;
-#ifdef USE_CUTOFF
-    volatile float3* tempBuffer = (volatile float3*) &sA[cSim.nonbond_threads_per_block];
-#endif
-
-#ifdef USE_EWALD
-    const float TWO_OVER_SQRT_PI = 2.0f/sqrtf(LOCAL_HACK_PI);
-#endif
-
-    unsigned int lasty = 0xFFFFFFFF;
-    while (pos < end)
-    {
-
-        // Extract cell coordinates from appropriate work unit
-        unsigned int x = workUnit[pos];
-        unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
-        bool bExclusionFlag = (x & 0x1);
-        x = (x >> 17) << GRIDBITS;
-        float4      apos;   // Local atom x, y, z, q
-        float3      af;     // Local atom fx, fy, fz
-        float dx;
-        float dy;
-        float dz;
-        float r2;
-        float invR;
-        float sig;
-        float sig2;
-        float sig6;
-        float eps;
-        float dEdR;
-        unsigned int tgx = threadIdx.x & (GRID - 1);
-        unsigned int tbx = threadIdx.x - tgx;
-        unsigned int tj = tgx;
-        volatile Atom* psA = &sA[tbx];
-        unsigned int i      = x + tgx;
-        apos                = cSim.pPosq[i];
-        float2 a            = cSim.pAttr[i];
-        af.x                = 0.0f;
-        af.y                = 0.0f;
-        af.z                = 0.0f;
-        if (x == y) // Handle diagonals uniquely at 50% efficiency
-        {
-            // Read fixed atom data into registers and GRF
-            sA[threadIdx.x].x   = apos.x;
-            sA[threadIdx.x].y   = apos.y;
-            sA[threadIdx.x].z   = apos.z;
-            sA[threadIdx.x].q   = apos.w;
-            sA[threadIdx.x].sig = a.x;
-            sA[threadIdx.x].eps = a.y;
-            apos.w             *= cSim.epsfac;
-            if (!bExclusionFlag)
-            {
-                for (unsigned int j = 0; j < GRID; j++)
-                {
-                    dx              = psA[j].x - apos.x;
-                    dy              = psA[j].y - apos.y;
-                    dz              = psA[j].z - apos.z;
-#ifdef USE_PERIODIC
-                    dx -= floorf(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
-                    dy -= floorf(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
-                    dz -= floorf(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
-#endif
-                    r2              = dx * dx + dy * dy + dz * dz;
-                    invR            = 1.0f / sqrtf(r2);
-                    sig             = a.x + psA[j].sig;
-                    sig2            = invR * sig;
-                    sig2           *= sig2;
-                    sig6            = sig2 * sig2 * sig2;
-                    eps             = a.y * psA[j].eps;
-                    dEdR            = eps * (12.0f * sig6 - 6.0f) * sig6;
-		    /* E */ 
-		    CDLJ_energy     = eps * (sig6 - 1.0f) * sig6;
-#ifdef USE_CUTOFF
-    #ifdef USE_EWALD
-                    float r         = sqrtf(r2);
-                    float alphaR    = cSim.alphaEwald * r;
-                    float erfcAlphaR = fastErfc(alphaR);
-                    dEdR           += apos.w * psA[j].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI );
-		    /* E */
-                    CDLJ_energy    += apos.w * psA[j].q * invR * erfcAlphaR;
-    #else
-                    dEdR           += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2);
-		    /* E */
-		    CDLJ_energy    += apos.w * psA[j].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
-    #endif
-#else
-                    dEdR           += apos.w * psA[j].q * invR;
-		    /* E */
-		    CDLJ_energy    += apos.w * psA[j].q * invR;
-#endif
-                    dEdR           *= invR * invR;
-#ifdef USE_CUTOFF
-                    if (r2 > cSim.nonbondedCutoffSqr)
-                    {
-                        dEdR = 0.0f;
-			/* E */
-			CDLJ_energy = 0.0f;
-                    }
-#endif
-		    /* E */
-		    energy         += 0.5f*CDLJ_energy;
-                    dx             *= dEdR;
-                    dy             *= dEdR;
-                    dz             *= dEdR;
-                    af.x           -= dx;
-                    af.y           -= dy;
-                    af.z           -= dz;
-                }
-            }
-            else  // bExclusion
-            {
-                unsigned int xi   = x>>GRIDBITS;
-                unsigned int cell          = xi+xi*cSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
-                unsigned int excl = cSim.pExclusion[cSim.pExclusionIndex[cell]+tgx];
-                for (unsigned int j = 0; j < GRID; j++)
-                {
-                    dx              = psA[j].x - apos.x;
-                    dy              = psA[j].y - apos.y;
-                    dz              = psA[j].z - apos.z;
-#ifdef USE_PERIODIC
-                    dx -= floorf(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
-                    dy -= floorf(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
-                    dz -= floorf(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
-#endif
-                    r2              = dx * dx + dy * dy + dz * dz;
-                    invR            = 1.0f / sqrtf(r2);
-                    sig             = a.x + psA[j].sig;
-                    sig2            = invR * sig;
-                    sig2           *= sig2;
-                    sig6            = sig2 * sig2 * sig2;
-                    eps             = a.y * psA[j].eps;
-                    dEdR            = eps * (12.0f * sig6 - 6.0f) * sig6;
-		    /* E */
-		    CDLJ_energy     = eps * (sig6 - 1.0f) * sig6;
-#ifdef USE_CUTOFF
-    #ifdef USE_EWALD
-                    float r         = sqrtf(r2);
-                    float alphaR    = cSim.alphaEwald * r;
-                    float erfcAlphaR = fastErfc(alphaR);
-                    dEdR           += apos.w * psA[j].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
-		    /* E */
-		    CDLJ_energy    += apos.w * psA[j].q * invR * erfcAlphaR;
-                    bool needCorrection = !(excl & 0x1) && x+tgx != y+j && x+tgx < cSim.atoms && y+j < cSim.atoms;
-                    if (needCorrection)
-                    {
-                        // Subtract off the part of this interaction that was included in the reciprocal space contribution.
-
-                        dEdR        = -apos.w * psA[j].q * invR * ((1.0f-erfcAlphaR) - alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
-                        CDLJ_energy = -apos.w * psA[j].q * invR * (1.0f-erfcAlphaR);
-                    }
-    #else
-                    dEdR           += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2);
-                    /* E */
-		    CDLJ_energy    += apos.w * psA[j].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
-    #endif
-#else
-                    dEdR           += apos.w * psA[j].q * invR;
-                    /* E */
-		    CDLJ_energy    += apos.w * psA[j].q * invR;
-#endif
-                    dEdR           *= invR * invR;
-#ifdef USE_CUTOFF
-    #ifdef USE_EWALD
-                    if (!needCorrection && (!(excl & 0x1) || r2 > cSim.nonbondedCutoffSqr))
-    #else
-                    if (!(excl & 0x1) || r2 > cSim.nonbondedCutoffSqr)
-    #endif
-#else
-                    if (!(excl & 0x1))
-#endif
-                    {
-                        dEdR = 0.0f;
-			/* E */
-		        CDLJ_energy  = 0.0f;
-                    }
-		    /* E */
-                    energy         += 0.5f*CDLJ_energy;
-                    dx             *= dEdR;
-                    dy             *= dEdR;
-                    dz             *= dEdR;
-                    af.x           -= dx;
-                    af.y           -= dy;
-                    af.z           -= dz;
-                    excl          >>= 1;
-                }
-            }
-
-            // Write results
-#ifdef USE_OUTPUT_BUFFER_PER_WARP
-            unsigned int offset                 = x + tgx + warp*cSim.stride;
-#else
-            unsigned int offset                 = x + tgx + (x >> GRIDBITS) * cSim.stride;
-#endif
-            float4 of                           = cSim.pForce4[offset];
-            of.x                               += af.x;
-            of.y                               += af.y;
-            of.z                               += af.z;
-            cSim.pForce4[offset]               = of;
-        }
-        else        // 100% utilization
-        {
-            // Read fixed atom data into registers and GRF
-            if (lasty != y)
-            {
-                unsigned int j                   = y + tgx;
-                float4 temp             = cSim.pPosq[j];
-                float2 temp1            = cSim.pAttr[j];
-                sA[threadIdx.x].x       = temp.x;
-                sA[threadIdx.x].y       = temp.y;
-                sA[threadIdx.x].z       = temp.z;
-                sA[threadIdx.x].q       = temp.w;
-                sA[threadIdx.x].sig     = temp1.x;
-                sA[threadIdx.x].eps     = temp1.y;
-            }
-            sA[threadIdx.x].fx      = 0.0f;
-            sA[threadIdx.x].fy      = 0.0f;
-            sA[threadIdx.x].fz      = 0.0f;
-            apos.w                 *= cSim.epsfac;
-            if (!bExclusionFlag)
-            {
-#ifdef USE_CUTOFF
-                unsigned int flags = cSim.pInteractionFlag[pos];
-                if (flags == 0)
-                {
-                    // No interactions in this block.
-                }
-                else if (flags == 0xFFFFFFFF)
-#endif
-                {
-                    // Compute all interactions within this block.
-
-                    for (unsigned int j = 0; j < GRID; j++)
-                    {
-                        dx              = psA[tj].x - apos.x;
-                        dy              = psA[tj].y - apos.y;
-                        dz              = psA[tj].z - apos.z;
-#ifdef USE_PERIODIC
-                        dx -= floorf(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
-                        dy -= floorf(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
-                        dz -= floorf(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
-#endif
-                        r2              = dx * dx + dy * dy + dz * dz;
-                        invR            = 1.0f / sqrtf(r2);
-                        sig             = a.x + psA[tj].sig;
-                        sig2            = invR * sig;
-                        sig2           *= sig2;
-                        sig6            = sig2 * sig2 * sig2;
-                        eps             = a.y * psA[tj].eps;
-                        dEdR            = eps * (12.0f * sig6 - 6.0f) * sig6;
-			/* E */
-			CDLJ_energy     = eps * (sig6 - 1.0f) * sig6;
-#ifdef USE_CUTOFF
-    #ifdef USE_EWALD
-                        float r         = sqrtf(r2);
-                        float alphaR    = cSim.alphaEwald * r;
-                        float erfcAlphaR = fastErfc(alphaR);
-                        dEdR           += apos.w * psA[tj].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
-                        /* E */
-                        CDLJ_energy    += apos.w * psA[tj].q * invR * erfcAlphaR;
-    #else
-                        dEdR           += apos.w * psA[tj].q * (invR - 2.0f * cSim.reactionFieldK * r2);
-			/* E */
-                        CDLJ_energy    += apos.w * psA[tj].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
-    #endif
-#else
-                        dEdR           += apos.w * psA[tj].q * invR;
-                        /* E */
-                        CDLJ_energy    += apos.w * psA[tj].q * invR;
-#endif
-                        dEdR           *= invR * invR;
-#ifdef USE_CUTOFF
-                        if (r2 > cSim.nonbondedCutoffSqr)
-                        {
-                            dEdR = 0.0f;
-			    /* E */
-       			    CDLJ_energy = 0.0f;
-                        }
-#endif
-			/* E */
-			energy         += CDLJ_energy;
-                        dx             *= dEdR;
-                        dy             *= dEdR;
-                        dz             *= dEdR;
-                        af.x           -= dx;
-                        af.y           -= dy;
-                        af.z           -= dz;
-                        psA[tj].fx     += dx;
-                        psA[tj].fy     += dy;
-                        psA[tj].fz     += dz;
-                        tj              = (tj + 1) & (GRID - 1);
-                    }
-                }
-#ifdef USE_CUTOFF
-                else
-                {
-                    // Compute only a subset of the interactions in this block.
-
-                    for (unsigned int j = 0; j < GRID; j++)
-                    {
-                        if ((flags&(1<<j)) != 0)
-                        {
-                            dx              = psA[j].x - apos.x;
-                            dy              = psA[j].y - apos.y;
-                            dz              = psA[j].z - apos.z;
-#ifdef USE_PERIODIC
-                            dx -= floorf(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
-                            dy -= floorf(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
-                            dz -= floorf(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
-#endif
-                            r2              = dx * dx + dy * dy + dz * dz;
-                            invR            = 1.0f / sqrtf(r2);
-                            sig             = a.x + psA[j].sig;
-                            sig2            = invR * sig;
-                            sig2           *= sig2;
-                            sig6            = sig2 * sig2 * sig2;
-                            eps             = a.y * psA[j].eps;
-                            dEdR            = eps * (12.0f * sig6 - 6.0f) * sig6;
-			    /* E */
-			    CDLJ_energy     = eps * (sig6 - 1.0f) * sig6;
-#ifdef USE_CUTOFF
-    #ifdef USE_EWALD
-                            float r         = sqrtf(r2);
-                            float alphaR    = cSim.alphaEwald * r;
-                            float erfcAlphaR = fastErfc(alphaR);
-                            dEdR           += apos.w * psA[j].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
-                            CDLJ_energy    += apos.w * psA[j].q * invR * erfcAlphaR;
-    #else
-                            dEdR           += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2);
-                            /* E */
-                            CDLJ_energy    += apos.w * psA[j].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
-    #endif
-#else
-                            dEdR           += apos.w * psA[j].q * invR;
-                            /* E */
-                            CDLJ_energy    += apos.w * psA[j].q * invR;
-#endif
-                            dEdR           *= invR * invR;
-#ifdef USE_CUTOFF
-                            if (r2 > cSim.nonbondedCutoffSqr)
-                            {
-                                dEdR = 0.0f;
-				/* E */
-				CDLJ_energy = 0.0f;
-                            }
-#endif
-			    /* E */
-			    energy         += CDLJ_energy;
-                            dx             *= dEdR;
-                            dy             *= dEdR;
-                            dz             *= dEdR;
-                            af.x           -= dx;
-                            af.y           -= dy;
-                            af.z           -= dz;
-                            tempBuffer[threadIdx.x].x = dx;
-                            tempBuffer[threadIdx.x].y = dy;
-                            tempBuffer[threadIdx.x].z = dz;
-
-                            // Sum the forces on atom j.
-
-                            if (tgx % 2 == 0)
-                            {
-                                tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+1].x;
-                                tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+1].y;
-                                tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+1].z;
-                            }
-                            if (tgx % 4 == 0)
-                            {
-                                tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+2].x;
-                                tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+2].y;
-                                tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+2].z;
-                            }
-                            if (tgx % 8 == 0)
-                            {
-                                tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+4].x;
-                                tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+4].y;
-                                tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+4].z;
-                            }
-                            if (tgx % 16 == 0)
-                            {
-                                tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+8].x;
-                                tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+8].y;
-                                tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+8].z;
-                            }
-                            if (tgx == 0)
-                            {
-                                psA[j].fx += tempBuffer[threadIdx.x].x + tempBuffer[threadIdx.x+16].x;
-                                psA[j].fy += tempBuffer[threadIdx.x].y + tempBuffer[threadIdx.x+16].y;
-                                psA[j].fz += tempBuffer[threadIdx.x].z + tempBuffer[threadIdx.x+16].z;
-                            }
-                        }
-                    }
-                }
-#endif
-            }
-            else  // bExclusion
-            {
-                // Read fixed atom data into registers and GRF
-                unsigned int xi   = x>>GRIDBITS;
-                unsigned int yi   = y>>GRIDBITS;
-                unsigned int cell          = xi+yi*cSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
-                unsigned int excl = cSim.pExclusion[cSim.pExclusionIndex[cell]+tgx];
-                excl              = (excl >> tgx) | (excl << (GRID - tgx));
-                for (unsigned int j = 0; j < GRID; j++)
-                {
-                    dx              = psA[tj].x - apos.x;
-                    dy              = psA[tj].y - apos.y;
-                    dz              = psA[tj].z - apos.z;
-#ifdef USE_PERIODIC
-                    dx -= floorf(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
-                    dy -= floorf(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
-                    dz -= floorf(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
-#endif
-                    r2              = dx * dx + dy * dy + dz * dz;
-                    invR            = 1.0f / sqrtf(r2);
-                    sig             = a.x + psA[tj].sig;
-                    sig2            = invR * sig;
-                    sig2           *= sig2;
-                    sig6            = sig2 * sig2 * sig2;
-                    eps             = a.y * psA[tj].eps;
-                    dEdR            = eps * (12.0f * sig6 - 6.0f) * sig6;
-		    /* E */
-		    CDLJ_energy     = eps * (sig6 - 1.0f) * sig6;
-#ifdef USE_CUTOFF
-    #ifdef USE_EWALD
-                    float r         = sqrtf(r2);
-                    float alphaR    = cSim.alphaEwald * r;
-                    float erfcAlphaR = fastErfc(alphaR);
-                    dEdR           += apos.w * psA[tj].q * invR * (erfcAlphaR + alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
-                    /* E */
-                    CDLJ_energy    += apos.w * psA[tj].q * invR * erfcAlphaR;
-                    bool needCorrection = !(excl & 0x1) && x+tgx != y+tj && x+tgx < cSim.atoms && y+tj < cSim.atoms;
-                    if (needCorrection)
-                    {
-                        // Subtract off the part of this interaction that was included in the reciprocal space contribution.
-
-                        dEdR        = -apos.w * psA[tj].q * invR * ((1.0f-erfcAlphaR) - alphaR * exp ( - alphaR * alphaR) * TWO_OVER_SQRT_PI);
-                        CDLJ_energy = -apos.w * psA[tj].q * invR * (1.0f-erfcAlphaR);
-                    }
-    #else
-                    dEdR           += apos.w * psA[tj].q * (invR - 2.0f * cSim.reactionFieldK * r2);
-                    /* E */
-		    CDLJ_energy    += apos.w * psA[tj].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
-    #endif
-#else
-                    dEdR           += apos.w * psA[tj].q * invR;
-                    /* E */
-                    CDLJ_energy    += apos.w * psA[tj].q * invR;
-#endif
-                    dEdR           *= invR * invR;
-#ifdef USE_CUTOFF
-    #ifdef USE_EWALD
-                    if (!needCorrection && (!(excl & 0x1) || r2 > cSim.nonbondedCutoffSqr))
-    #else
-                    if (!(excl & 0x1) || r2 > cSim.nonbondedCutoffSqr)
-    #endif
-#else
-                    if (!(excl & 0x1))
-#endif
-                    {
-                        dEdR = 0.0f;			
-                        /* E */
-			CDLJ_energy  = 0.0f;
-                    }
-		    /* E */
-		    energy         += CDLJ_energy;
-                    dx             *= dEdR;
-                    dy             *= dEdR;
-                    dz             *= dEdR;
-                    af.x           -= dx;
-                    af.y           -= dy;
-                    af.z           -= dz;
-                    psA[tj].fx     += dx;
-                    psA[tj].fy     += dy;
-                    psA[tj].fz     += dz;
-                    excl          >>= 1;
-                    tj              = (tj + 1) & (GRID - 1);
-                }
-            }
-
-            // Write results
-            float4 of;
-#ifdef USE_OUTPUT_BUFFER_PER_WARP
-            unsigned int offset                 = x + tgx + warp*cSim.stride;
-#else
-            unsigned int offset                 = x + tgx + (y >> GRIDBITS) * cSim.stride;
-#endif
-            of                                  = cSim.pForce4[offset];
-            of.x                               += af.x;
-            of.y                               += af.y;
-            of.z                               += af.z;
-            cSim.pForce4[offset]               = of;
-#ifdef USE_OUTPUT_BUFFER_PER_WARP
-            offset                              = y + tgx + warp*cSim.stride;
-#else
-            offset                              = y + tgx + (x >> GRIDBITS) * cSim.stride;
-#endif
-            of                                  = cSim.pForce4[offset];
-            of.x                               += sA[threadIdx.x].fx;
-            of.y                               += sA[threadIdx.x].fy;
-            of.z                               += sA[threadIdx.x].fz;
-            cSim.pForce4[offset]               = of;
-            lasty = y;
-        }
-
-        pos++;
-    }
-    cSim.pEnergy[blockIdx.x*blockDim.x+threadIdx.x] += energy;
-}
--- a/platforms/cuda-old/src/kernels/kCalculateCDLJObcGbsaForces1.cu
+++ b/platforms/cuda-old/src/kernels/kCalculateCDLJObcGbsaForces1.cu
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-
-#include <stdio.h>
-#include <cuda.h>
-#include <vector_functions.h>
-#include <cstdlib>
-using namespace std;
-
-#include "gputypes.h"
-#include "cudatypes.h"
-#include "cudaKernels.h"
-
-struct Atom {
-    float x;
-    float y;
-    float z;
-    float q;
-    float sig;
-    float eps;
-    float br;
-    float fx;
-    float fy;
-    float fz;
-    float fb;
-};
-
-static __constant__ cudaGmxSimulation cSim;
-
-void SetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu)
-{
-    cudaError_t status;
-    status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));     
-    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
-
-}
-
-void GetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu)
-{
-    cudaError_t status;
-    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));     
-    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
-}
-
-// Include versions of the kernel for N^2 calculations.
-
-#define METHOD_NAME(a, b) a##N2##b
-#include "kCalculateCDLJObcGbsaForces1.h"
-#define USE_OUTPUT_BUFFER_PER_WARP
-#undef METHOD_NAME
-#define METHOD_NAME(a, b) a##N2ByWarp##b
-#include "kCalculateCDLJObcGbsaForces1.h"
-
-// Include versions of the kernel with cutoffs.
-
-#undef METHOD_NAME
-#undef USE_OUTPUT_BUFFER_PER_WARP
-#define USE_CUTOFF
-#define METHOD_NAME(a, b) a##Cutoff##b
-#include "kCalculateCDLJObcGbsaForces1.h"
-#define USE_OUTPUT_BUFFER_PER_WARP
-#undef METHOD_NAME
-#define METHOD_NAME(a, b) a##CutoffByWarp##b
-#include "kCalculateCDLJObcGbsaForces1.h"
-
-// Include versions of the kernel with periodic boundary conditions.
-
-#undef METHOD_NAME
-#undef USE_OUTPUT_BUFFER_PER_WARP
-#define USE_PERIODIC
-#define METHOD_NAME(a, b) a##Periodic##b
-#include "kCalculateCDLJObcGbsaForces1.h"
-#define USE_OUTPUT_BUFFER_PER_WARP
-#undef METHOD_NAME
-#define METHOD_NAME(a, b) a##PeriodicByWarp##b
-#include "kCalculateCDLJObcGbsaForces1.h"
-
-// Include versions of the kernels for Ewald
-
-#undef METHOD_NAME
-#undef USE_OUTPUT_BUFFER_PER_WARP
-#define USE_PERIODIC
-#define USE_EWALD
-#define METHOD_NAME(a, b) a##Ewald##b
-#include "kCalculateCDLJObcGbsaForces1.h"
-#define USE_OUTPUT_BUFFER_PER_WARP
-#undef METHOD_NAME
-#define METHOD_NAME(a, b) a##EwaldByWarp##b
-#include "kCalculateCDLJObcGbsaForces1.h"
-
-extern __global__ void kFindBlockBoundsCutoff_kernel();
-extern __global__ void kFindBlockBoundsPeriodic_kernel();
-extern __global__ void kFindBlocksWithInteractionsCutoff_kernel();
-extern __global__ void kFindBlocksWithInteractionsPeriodic_kernel();
-extern __global__ void kFindInteractionsWithinBlocksCutoff_kernel(unsigned int*);
-extern __global__ void kFindInteractionsWithinBlocksPeriodic_kernel(unsigned int*);
-extern __global__ void kCalculateEwaldFastCosSinSums_kernel();
-extern __global__ void kCalculateEwaldFastForces_kernel();
-extern void kCalculatePME(gpuContext gpu);
-
-void kCalculateCDLJObcGbsaForces1(gpuContext gpu)
-{
-//    printf("kCalculateCDLJObcGbsaForces1\n");
-
-    switch (gpu->sim.nonbondedMethod)
-    {
-        case NO_CUTOFF:
-
-            if (gpu->bRecalculateBornRadii)
-            {
-                if( gpu->bIncludeGBVI ){
-                   kCalculateGBVIBornSum(gpu);
-                   kReduceGBVIBornSum(gpu);
-                } else {
-                   kCalculateObcGbsaBornSum(gpu);
-                   kReduceObcGbsaBornSum(gpu);
-                }
-
-            }
-
-            if (gpu->bOutputBufferPerWarp)
-                   kCalculateCDLJObcGbsaN2ByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                           sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit);
-               else
-                   kCalculateCDLJObcGbsaN2Forces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                           sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit);
-
-            LAUNCHERROR("kCalculateCDLJObcGbsaN2Forces1");
-            break;
-
-        case CUTOFF:
-
-            kFindBlockBoundsCutoff_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
-            LAUNCHERROR("kFindBlockBoundsCutoff");
-            kFindBlocksWithInteractionsCutoff_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
-            LAUNCHERROR("kFindBlocksWithInteractionsCutoff");
-            compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
-            kFindInteractionsWithinBlocksCutoff_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                    sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
-
-            if (gpu->bRecalculateBornRadii)
-            {
-                if( gpu->bIncludeGBVI ){
-                    kCalculateGBVIBornSum(gpu);
-                    kReduceGBVIBornSum(gpu);
-                } else {
-                    kCalculateObcGbsaBornSum(gpu);
-                    kReduceObcGbsaBornSum(gpu);
-                }
-            }
-
-            if (gpu->bOutputBufferPerWarp)
-                kCalculateCDLJObcGbsaCutoffByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
-            else
-                kCalculateCDLJObcGbsaCutoffForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
-
-
-            LAUNCHERROR("kCalculateCDLJObcGbsaCutoffForces1");
-            break;
-
-        case PERIODIC:
-
-            kFindBlockBoundsPeriodic_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
-            LAUNCHERROR("kFindBlockBoundsPeriodic");
-            kFindBlocksWithInteractionsPeriodic_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
-            LAUNCHERROR("kFindBlocksWithInteractionsPeriodic");
-            compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
-            kFindInteractionsWithinBlocksPeriodic_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                    sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
-
-            if (gpu->bRecalculateBornRadii)
-            {
-                if( gpu->bIncludeGBVI ){
-                    kCalculateGBVIBornSum(gpu);
-                    kReduceGBVIBornSum(gpu);
-                } else {
-                    kCalculateObcGbsaBornSum(gpu);
-                    kReduceObcGbsaBornSum(gpu);
-                }
-            }
-
-            if (gpu->bOutputBufferPerWarp)
-                kCalculateCDLJObcGbsaPeriodicByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
-            else
-                kCalculateCDLJObcGbsaPeriodicForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
-
-            LAUNCHERROR("kCalculateCDLJObcGbsaPeriodicForces1");
-
-            break;
-    }
-
-}
--- a/platforms/cuda-old/src/kernels/kCalculateCDLJObcGbsaForces1.h
+++ b/platforms/cuda-old/src/kernels/kCalculateCDLJObcGbsaForces1.h
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-
-/**
- * This file contains the kernel for evalauating nonbonded forces and the first stage of GBSA.
- * It is included several times in kCalculateCDLJObcGbsaForces1.cu with different #defines to generate
- * different versions of the kernels.
- */
-
-__global__ 
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(GF1XX_NONBOND_THREADS_PER_BLOCK, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1)
-#else
-__launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
-#endif
-void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit )
-{
-    extern __shared__ volatile Atom sA[];
-    unsigned int totalWarps   = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
-    unsigned int warp         = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
-    unsigned int numWorkUnits = cSim.pInteractionCount[0];
-    unsigned int pos          = warp*numWorkUnits/totalWarps;
-    unsigned int end          = (warp+1)*numWorkUnits/totalWarps;
-    float CDLJObcGbsa_energy;
-    float energy = 0.0f;
-#ifdef USE_CUTOFF
-    volatile float* tempBuffer = (volatile float*) &sA[cSim.nonbond_threads_per_block];
-#endif
-
-    unsigned int lasty        = -0xFFFFFFFF;
-    while (pos < end)
-    {
-
-        // Extract cell coordinates from appropriate work unit
-        unsigned int x                      = workUnit[pos];
-        unsigned int y                      = ((x >> 2) & 0x7fff) << GRIDBITS;
-        bool bExclusionFlag                 = (x & 0x1);
-        x                                   = (x >> 17) << GRIDBITS;
-        unsigned int tgx                    = threadIdx.x & (GRID - 1);
-        unsigned int i                      = x + tgx;
-        float4 apos                         = cSim.pPosq[i];
-        float2 a                            = cSim.pAttr[i];
-        float br                            = cSim.pBornRadii[i];
-        unsigned int tbx                    = threadIdx.x - tgx;
-        unsigned int tj                     = tgx;
-        volatile Atom* psA                  = &sA[tbx];
-        float4 af;
-        af.x                        = 0.0f;
-        af.y                        = 0.0f;
-        af.z                        = 0.0f;
-        af.w                        = 0.0f;
-        if (x == y) // Handle diagonals uniquely at 50% efficiency
-        {
-            // Read fixed atom data into registers and GRF
-            sA[threadIdx.x].x           = apos.x;
-            sA[threadIdx.x].y           = apos.y;
-            sA[threadIdx.x].z           = apos.z;
-            sA[threadIdx.x].q           = apos.w;
-            float q2                    = cSim.preFactor * apos.w;
-            apos.w                     *= cSim.epsfac;
-            sA[threadIdx.x].sig         = a.x;
-            sA[threadIdx.x].eps         = a.y;
-            sA[threadIdx.x].br          = br;
-            if (!bExclusionFlag)
-            {
-                for (unsigned int j = 0; j < GRID; j++)
-                {
-                    float dx                = psA[j].x - apos.x;
-                    float dy                = psA[j].y - apos.y;
-                    float dz                = psA[j].z - apos.z;
-#ifdef USE_PERIODIC
-                    dx                     -= floorf(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
-                    dy                     -= floorf(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
-                    dz                     -= floorf(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
-#endif
-                    float r2                = dx * dx + dy * dy + dz * dz;
-
-                    // CDLJ part
-                    float invR              = 1.0f / sqrtf(r2);
-                    float sig               = a.x + psA[j].sig;
-                    float sig2              = invR * sig;
-                    sig2                   *= sig2;
-                    float sig6              = sig2 * sig2 * sig2;
-                    float eps               = a.y * psA[j].eps;
-                    float dEdR              = eps * (12.0f * sig6 - 6.0f) * sig6;
-                    CDLJObcGbsa_energy      = eps * (sig6 - 1.0f) * sig6;
-#ifdef USE_CUTOFF
-                    dEdR                   += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2);
-                    CDLJObcGbsa_energy     += apos.w * psA[j].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
-#else
-                    float factorX           = apos.w * psA[j].q * invR;
-
-                    dEdR                   += factorX;
-                    CDLJObcGbsa_energy     += factorX;
-#endif
-                    dEdR                   *= invR * invR;
-
-                    // ObcGbsaForce1 part
-                    float alpha2_ij         = br * psA[j].br;
-                    float D_ij              = r2 / (4.0f * alpha2_ij);
-                    float expTerm           = expf(-D_ij);
-                    float denominator2      = r2 + alpha2_ij * expTerm;
-                    float denominator       = sqrtf(denominator2);
-                    float Gpol              = (q2 * psA[j].q) / (denominator * denominator2);
-                    float dGpol_dalpha2_ij  = -0.5f * Gpol * expTerm * (1.0f + D_ij);
-                    dEdR                   += Gpol * (1.0f - 0.25f * expTerm);
-                    CDLJObcGbsa_energy     += (q2 * psA[j].q) / denominator;
-
-#ifdef USE_CUTOFF
-                    if ( i >= cSim.atoms || (x+j) >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
-#else
-                    if ( i >= cSim.atoms || (x+j) >= cSim.atoms)
-#endif
-                    {
-                        dEdR                = 0.0f;
-                        dGpol_dalpha2_ij    = 0.0f;
-                        CDLJObcGbsa_energy  = 0.0f;
-                    }
-                    energy                 += 0.5f*CDLJObcGbsa_energy;
-                    // Add Forces
-                    dx                     *= dEdR;
-                    dy                     *= dEdR;
-                    dz                     *= dEdR;
-                    af.x                   -= dx;
-                    af.y                   -= dy;
-                    af.z                   -= dz;
-                    af.w                   += dGpol_dalpha2_ij * psA[j].br;
-
-                }
-
-            } else {
-
-                unsigned int xi   = x>>GRIDBITS;
-                unsigned int cell = xi+xi*cSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
-                unsigned int excl = cSim.pExclusion[cSim.pExclusionIndex[cell]+tgx];
-                for (unsigned int j = 0; j < GRID; j++)
-                {
-                    float dx                = psA[j].x - apos.x;
-                    float dy                = psA[j].y - apos.y;
-                    float dz                = psA[j].z - apos.z;
-#ifdef USE_PERIODIC
-                    dx                     -= floorf(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
-                    dy                     -= floorf(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
-                    dz                     -= floorf(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
-#endif
-                    float r2                = dx * dx + dy * dy + dz * dz;
-
-                    // CDLJ part
-                    float invR              = 1.0f / sqrtf(r2);
-                    float sig               = a.x + psA[j].sig;
-                    float sig2              = invR * sig;
-                    sig2                   *= sig2;
-                    float sig6              = sig2 * sig2 * sig2;
-                    float eps               = a.y * psA[j].eps;
-                    float dEdR              = eps * (12.0f * sig6 - 6.0f) * sig6;
-                    CDLJObcGbsa_energy      = eps * (sig6 - 1.0f) * sig6;
-#ifdef USE_CUTOFF
-                    dEdR                   += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2);
-                    CDLJObcGbsa_energy     += apos.w * psA[j].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
-#else
-                    float factorX           = apos.w * psA[j].q * invR;
-
-                    dEdR                   += factorX;
-                    CDLJObcGbsa_energy     += factorX;
-#endif
-                    dEdR                   *= invR * invR;
-                    if (!(excl & 0x1))
-                    {
-                        dEdR                = 0.0f;
-		                  CDLJObcGbsa_energy  = 0.0f;
-                    }
-
-                    // ObcGbsaForce1 part
-                    float alpha2_ij         = br * psA[j].br;
-                    float D_ij              = r2 / (4.0f * alpha2_ij);
-                    float expTerm           = expf(-D_ij);
-                    float denominator2      = r2 + alpha2_ij * expTerm;
-                    float denominator       = sqrtf(denominator2);
-                    float Gpol              = (q2 * psA[j].q) / (denominator * denominator2);
-                    float dGpol_dalpha2_ij  = -0.5f * Gpol * expTerm * (1.0f + D_ij);
-                    dEdR                   += Gpol * (1.0f - 0.25f * expTerm);
-                    CDLJObcGbsa_energy     += (q2 * psA[j].q) / denominator;
-#if defined USE_CUTOFF
-                    if (i >= cSim.atoms || x+j >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
-#else
-                    if (i >= cSim.atoms || x+j >= cSim.atoms )
-#endif
-                    {
-                        dEdR                = 0.0f;
-                        dGpol_dalpha2_ij    = 0.0f;
-		                  CDLJObcGbsa_energy  = 0.0f;
-                    }
-                    energy                 += 0.5f*CDLJObcGbsa_energy;
-
-                    // Add Forces
-                    dx                     *= dEdR;
-                    dy                     *= dEdR;
-                    dz                     *= dEdR;
-                    af.x                   -= dx;
-                    af.y                   -= dy;
-                    af.z                   -= dz;
-                    af.w                   += dGpol_dalpha2_ij * psA[j].br;
-                    excl                  >>= 1;
-                }
-            }
-
-            // Write results
-#ifdef USE_OUTPUT_BUFFER_PER_WARP
-            unsigned int offset         = x + tgx + warp*cSim.stride;
-#else
-            unsigned int offset         = x + tgx + (x >> GRIDBITS) * cSim.stride;
-#endif
-            float4 of                   = cSim.pForce4[offset];
-            of.x                       += af.x;
-            of.y                       += af.y;
-            of.z                       += af.z;
-            of.w                       += af.w;
-            cSim.pForce4[offset]        = of;
-            cSim.pBornForce[offset]     = of.w;
-
-        } else {
-            // Read fixed atom data into registers and GRF
-            if (lasty != y)
-            {
-                unsigned int j              = y + tgx;
-                float4 temp                 = cSim.pPosq[j];
-                float2 temp1                = cSim.pAttr[j];
-                sA[threadIdx.x].br          = cSim.pBornRadii[j];
-                sA[threadIdx.x].x           = temp.x;
-                sA[threadIdx.x].y           = temp.y;
-                sA[threadIdx.x].z           = temp.z;
-                sA[threadIdx.x].q           = temp.w;
-                sA[threadIdx.x].sig         = temp1.x;
-                sA[threadIdx.x].eps         = temp1.y;
-            }
-            sA[threadIdx.x].fx          = 0.0f;
-            sA[threadIdx.x].fy          = 0.0f;
-            sA[threadIdx.x].fz          = 0.0f;
-            sA[threadIdx.x].fb          = 0.0f;
-            float q2                    = apos.w * cSim.preFactor;
-            apos.w                     *= cSim.epsfac;
-            if (!bExclusionFlag)
-            {
-#ifdef USE_CUTOFF
-                unsigned int flags = cSim.pInteractionFlag[pos];
-                if (flags == 0)
-                {
-                    // No interactions in this block.
-                }
-                else if (flags == 0xFFFFFFFF)
-#endif
-                {
-                    // Compute all interactions within this block.
-
-                    for (unsigned int j = 0; j < GRID; j++)
-                    {
-                        float dx                = psA[tj].x - apos.x;
-                        float dy                = psA[tj].y - apos.y;
-                        float dz                = psA[tj].z - apos.z;
-#ifdef USE_PERIODIC
-                        dx                     -= floorf(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
-                        dy                     -= floorf(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
-                        dz                     -= floorf(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
-#endif
-                        float r2                = dx * dx + dy * dy + dz * dz;
-
-                        // CDLJ part
-                        float invR              = 1.0f / sqrtf(r2);
-                        float sig               = a.x + psA[tj].sig;
-                        float sig2              = invR * sig;
-                        sig2                   *= sig2;
-                        float sig6              = sig2 * sig2 * sig2;
-                        float eps               = a.y * psA[tj].eps;
-                        float dEdR              = eps * (12.0f * sig6 - 6.0f) * sig6;
-                        CDLJObcGbsa_energy      = eps * (sig6 - 1.0f) * sig6;
-#ifdef USE_CUTOFF
-                        dEdR                   += apos.w * psA[tj].q * (invR - 2.0f * cSim.reactionFieldK * r2);
-                        CDLJObcGbsa_energy     += apos.w * psA[tj].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
-#else
-                        float factorX           = apos.w * psA[tj].q * invR;
-
-                        dEdR                   += factorX;
-                        CDLJObcGbsa_energy     += factorX;
-#endif
-                        dEdR                   *= invR * invR;
-
-                        // ObcGbsaForce1 part
-                        float alpha2_ij         = br * psA[tj].br;
-                        float D_ij              = r2 / (4.0f * alpha2_ij);
-                        float expTerm           = expf(-D_ij);
-                        float denominator2      = r2 + alpha2_ij * expTerm;
-                        float denominator       = sqrtf(denominator2);
-                        float Gpol              = (q2 * psA[tj].q) / (denominator * denominator2);
-                        float dGpol_dalpha2_ij  = -0.5f * Gpol * expTerm * (1.0f + D_ij);
-                        dEdR                   += Gpol * (1.0f - 0.25f * expTerm);
-                        CDLJObcGbsa_energy     += (q2 * psA[tj].q) / denominator;
-#ifdef USE_CUTOFF
-                        if ( i >= cSim.atoms || (y+tj) >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
-#else
-                        if ( i >= cSim.atoms || (y+tj) >= cSim.atoms)
-#endif
-                        {
-                            dEdR               = 0.0f;
-                            dGpol_dalpha2_ij   = 0.0f;
-       			             CDLJObcGbsa_energy = 0.0f;
-                        }
-                        energy                += CDLJObcGbsa_energy;
-
-                        // Add forces
-                        dx                     *= dEdR;
-                        dy                     *= dEdR;
-                        dz                     *= dEdR;
-                        af.x                   -= dx;
-                        af.y                   -= dy;
-                        af.z                   -= dz;
-                        af.w                   += dGpol_dalpha2_ij * psA[tj].br;
-                        psA[tj].fx             += dx;
-                        psA[tj].fy             += dy;
-                        psA[tj].fz             += dz;
-                        psA[tj].fb             += dGpol_dalpha2_ij * br;
-
-                        tj                      = (tj + 1) & (GRID - 1);
-
-
-                    }
-                }
-#ifdef USE_CUTOFF
-                else
-                {
-                    // Compute only a subset of the interactions in this block.
-
-                    for (unsigned int j = 0; j < GRID; j++)
-                    {
-                        if ((flags&(1<<j)) != 0)
-                        {
-                            float dx                = psA[j].x - apos.x;
-                            float dy                = psA[j].y - apos.y;
-                            float dz                = psA[j].z - apos.z;
-#ifdef USE_PERIODIC
-                            dx -= floorf(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
-                            dy -= floorf(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
-                            dz -= floorf(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
-#endif
-                            float r2                = dx * dx + dy * dy + dz * dz;
-
-                            // CDLJ part
-                            float invR              = 1.0f / sqrtf(r2);
-                            float sig               = a.x + psA[j].sig;
-                            float sig2              = invR * sig;
-                            sig2                   *= sig2;
-                            float sig6              = sig2 * sig2 * sig2;
-                            float eps               = a.y * psA[j].eps;
-                            float dEdR              = eps * (12.0f * sig6 - 6.0f) * sig6;
-                            CDLJObcGbsa_energy      = eps * (sig6 - 1.0f) * sig6;
-#ifdef USE_CUTOFF
-                            dEdR                   += apos.w * psA[j].q * (invR - 2.0f * cSim.reactionFieldK * r2);
-                            CDLJObcGbsa_energy     += apos.w * psA[j].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
-#else
-                            float factorX           = apos.w * psA[j].q * invR;
-
-                            dEdR                   += factorX;
-                            CDLJObcGbsa_energy     += factorX;
-#endif
-                            dEdR                   *= invR * invR;
-
-                            // ObcGbsaForce1 part
-                            float alpha2_ij         = br * psA[j].br;
-                            float D_ij              = r2 / (4.0f * alpha2_ij);
-                            float expTerm           = expf(-D_ij);
-                            float denominator2      = r2 + alpha2_ij * expTerm;
-                            float denominator       = sqrtf(denominator2);
-                            float Gpol              = (q2 * psA[j].q) / (denominator * denominator2);
-                            float dGpol_dalpha2_ij  = -0.5f * Gpol * expTerm * (1.0f + D_ij);
-                            dEdR                   += Gpol * (1.0f - 0.25f * expTerm);
-                            CDLJObcGbsa_energy     += (q2 * psA[j].q) / denominator;
-
-#ifdef USE_CUTOFF
-                            if ( i >= cSim.atoms || (y+j) >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
-#else
-                            if ( i >= cSim.atoms || (y+j) >= cSim.atoms)
-#endif
-                            {
-                                dEdR                = 0.0f;
-                                dGpol_dalpha2_ij    = 0.0f;
-				                    CDLJObcGbsa_energy  = 0.0f;
-                            }
-                            energy                 += CDLJObcGbsa_energy;
-                             
-                            // Add forces
-                            dx                     *= dEdR;
-                            dy                     *= dEdR;
-                            dz                     *= dEdR;
-                            af.x                   -= dx;
-                            af.y                   -= dy;
-                            af.z                   -= dz;
-                            af.w                   += dGpol_dalpha2_ij * psA[j].br;
-
-                            tempBuffer[threadIdx.x] = dx;
-                            if (tgx % 2 == 0)
-                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+1];
-                            if (tgx % 4 == 0)
-                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+2];
-                            if (tgx % 8 == 0)
-                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+4];
-                            if (tgx % 16 == 0)
-                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+8];
-                            if (tgx == 0)
-                                psA[j].fx += tempBuffer[threadIdx.x] + tempBuffer[threadIdx.x+16];
-
-                            tempBuffer[threadIdx.x] = dy;
-                            if (tgx % 2 == 0)
-                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+1];
-                            if (tgx % 4 == 0)
-                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+2];
-                            if (tgx % 8 == 0)
-                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+4];
-                            if (tgx % 16 == 0)
-                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+8];
-                            if (tgx == 0)
-                                psA[j].fy += tempBuffer[threadIdx.x] + tempBuffer[threadIdx.x+16];
-
-                            tempBuffer[threadIdx.x] = dz;
-                            if (tgx % 2 == 0)
-                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+1];
-                            if (tgx % 4 == 0)
-                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+2];
-                            if (tgx % 8 == 0)
-                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+4];
-                            if (tgx % 16 == 0)
-                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+8];
-                            if (tgx == 0)
-                                psA[j].fz += tempBuffer[threadIdx.x] + tempBuffer[threadIdx.x+16];
-
-                            // Sum the Born forces.
-
-                            tempBuffer[threadIdx.x] = dGpol_dalpha2_ij * br;
-                            if (tgx % 2 == 0)
-                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+1];
-                            if (tgx % 4 == 0)
-                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+2];
-                            if (tgx % 8 == 0)
-                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+4];
-                            if (tgx % 16 == 0)
-                                tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+8];
-                            if (tgx == 0)
-                                psA[j].fb += tempBuffer[threadIdx.x] + tempBuffer[threadIdx.x+16];
-
-                        }
-                    }
-                }
-#endif
-            } else {
-                unsigned int xi   = x>>GRIDBITS;
-                unsigned int yi   = y>>GRIDBITS;
-                unsigned int cell = xi+yi*cSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
-                unsigned int excl = cSim.pExclusion[cSim.pExclusionIndex[cell]+tgx];
-                excl              = (excl >> tgx) | (excl << (GRID - tgx));
-                for (unsigned int j = 0; j < GRID; j++)
-                {
-                    float dx                = psA[tj].x - apos.x;
-                    float dy                = psA[tj].y - apos.y;
-                    float dz                = psA[tj].z - apos.z;
-#ifdef USE_PERIODIC
-                    dx                     -= floorf(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
-                    dy                     -= floorf(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
-                    dz                     -= floorf(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
-#endif
-                    float r2                = dx * dx + dy * dy + dz * dz;
-
-                    // CDLJ part
-                    float invR              = 1.0f / sqrtf(r2);
-                    float sig               = a.x + psA[tj].sig;
-                    float sig2              = invR * sig;
-                    sig2                   *= sig2;
-                    float sig6              = sig2 * sig2 * sig2;
-                    float eps               = a.y * psA[tj].eps;
-                    float dEdR              = eps * (12.0f * sig6 - 6.0f) * sig6;
-		              CDLJObcGbsa_energy      = eps * (sig6 - 1.0f) * sig6;
-#ifdef USE_CUTOFF
-                    dEdR                   += apos.w * psA[tj].q * (invR - 2.0f * cSim.reactionFieldK * r2);
-                    CDLJObcGbsa_energy     += apos.w * psA[tj].q * (invR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
-#else
-                    float factorX           = apos.w * psA[tj].q * invR;
-                    dEdR                   += factorX;
-                    CDLJObcGbsa_energy     += factorX;
-#endif
-                    dEdR                   *= invR * invR;
-                    if (!(excl & 0x1))
-                    {
-                        dEdR                = 0.0f;
-			               CDLJObcGbsa_energy  = 0.0f;
-                    }
-
-                    // ObcGbsaForce1 part
-                    float alpha2_ij         = br * psA[tj].br;
-                    float D_ij              = r2 / (4.0f * alpha2_ij);
-                    float expTerm           = expf(-D_ij);
-                    float denominator2      = r2 + alpha2_ij * expTerm;
-                    float denominator       = sqrtf(denominator2);
-                    float Gpol              = (q2 * psA[tj].q) / (denominator * denominator2);
-                    float dGpol_dalpha2_ij  = -0.5f * Gpol * expTerm * (1.0f + D_ij);
-                    dEdR                   += Gpol * (1.0f - 0.25f * expTerm);
-		              CDLJObcGbsa_energy     += (q2 * psA[tj].q) / denominator;
-
-#ifdef USE_CUTOFF
-                    if (i >= cSim.atoms || y+tj >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
-#else
-                    if (i >= cSim.atoms || y+tj >= cSim.atoms )
-#endif
-                    {
-                        dEdR               = 0.0f;
-                        dGpol_dalpha2_ij   = 0.0f;
-			               CDLJObcGbsa_energy = 0.0f;
-                    }
-                    energy                += CDLJObcGbsa_energy;
-                     
-                    // Add forces
-                    dx                     *= dEdR;
-                    dy                     *= dEdR;
-                    dz                     *= dEdR;
-                    af.x                   -= dx;
-                    af.y                   -= dy;
-                    af.z                   -= dz;
-                    af.w                   += dGpol_dalpha2_ij * psA[tj].br;
-                    psA[tj].fx             += dx;
-                    psA[tj].fy             += dy;
-                    psA[tj].fz             += dz;
-                    psA[tj].fb             += dGpol_dalpha2_ij * br;
-                    excl                  >>= 1;
-
-                    tj                      = (tj + 1) & (GRID - 1);
-                }
-            }
-
-            // Write results
-#ifdef USE_OUTPUT_BUFFER_PER_WARP
-            unsigned int offset         = x + tgx + warp*cSim.stride;
-#else
-            unsigned int offset         = x + tgx + (y >> GRIDBITS) * cSim.stride;
-#endif
-            float4 of                   = cSim.pForce4[offset];
-            of.x                       += af.x;
-            of.y                       += af.y;
-            of.z                       += af.z;
-            of.w                       += af.w;
-            cSim.pForce4[offset]       = of;
-            cSim.pBornForce[offset]     = of.w;
-#ifdef USE_OUTPUT_BUFFER_PER_WARP
-            offset                      = y + tgx + warp*cSim.stride;
-#else
-            offset                      = y + tgx + (x >> GRIDBITS) * cSim.stride;
-#endif
-            of                          = cSim.pForce4[offset];
-            of.x                       += sA[threadIdx.x].fx;
-            of.y                       += sA[threadIdx.x].fy;
-            of.z                       += sA[threadIdx.x].fz;
-            of.w                       += sA[threadIdx.x].fb;
-            cSim.pForce4[offset]       = of;
-            cSim.pBornForce[offset]     = of.w;
-            lasty = y;
-        }
-        pos++;
-    }
-    cSim.pEnergy[blockIdx.x*blockDim.x+threadIdx.x] += energy;
-}
--- a/platforms/cuda-old/src/kernels/kCalculateCMAPTorsionForces.cu
+++ b/platforms/cuda-old/src/kernels/kCalculateCMAPTorsionForces.cu
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2010 Stanford University and the Authors.           *
- * Authors: Peter Eastman                                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-
-#include <stdio.h>
-#include <cuda.h>
-#include <vector_functions.h>
-#include <cstdlib>
-#include <string>
-#include <iostream>
-#include <fstream>
-using namespace std;
-
-#include "gputypes.h"
-#include "cudatypes.h"
-
-
-#define LOCAL_HACK_PI 3.1415926535897932384626433832795
-
-#define DOT3(v1, v2) (v1.x*v2.x + v1.y*v2.y + v1.z*v2.z)
-
-#define CROSS_PRODUCT(v1, v2) make_float3(v1.y*v2.z - v1.z*v2.y, v1.z*v2.x - v1.x*v2.z, v1.x*v2.y - v1.y*v2.x)
-
-#define GETNORMEDDOTPRODUCT(v1, v2, dp) \
-{ \
-    dp          = DOT3(v1, v2); \
-    float norm1 = DOT3(v1, v1); \
-    float norm2 = DOT3(v2, v2); \
-    dp /= sqrtf(norm1 * norm2); \
-    dp = min(dp, 1.0f); \
-    dp = max(dp, -1.0f); \
-}
-
-#define GETANGLEBETWEENTWOVECTORS(v1, v2, angle) \
-{ \
-    float dp; \
-    GETNORMEDDOTPRODUCT(v1, v2, dp); \
-    if (dp > 0.99f || dp < -0.99f) { \
-        float3 cross = CROSS_PRODUCT(v1, v2); \
-        float scale = DOT3(v1, v1)*DOT3(v2, v2); \
-        angle = asinf(sqrtf(DOT3(cross, cross)/scale)); \
-        if (dp < 0.0f) \
-            angle = LOCAL_HACK_PI-angle; \
-    } \
-    else { \
-        angle = acosf(dp); \
-    } \
-}
-
-#define GETDIHEDRALANGLEBETWEENTHREEVECTORS(vector1, vector2, vector3, signVector, cp0, cp1, angle) \
-{ \
-    cp0 = CROSS_PRODUCT(vector1, vector2); \
-    cp1 = CROSS_PRODUCT(vector2, vector3); \
-    GETANGLEBETWEENTWOVECTORS(cp0, cp1, angle); \
-    float dp = DOT3(signVector, cp1); \
-    angle = (dp >= 0) ? angle : -angle; \
-}
-
-__global__
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(GF1XX_LOCALFORCES_THREADS_PER_BLOCK, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(GT2XX_LOCALFORCES_THREADS_PER_BLOCK, 1)
-#else
-__launch_bounds__(G8X_LOCALFORCES_THREADS_PER_BLOCK, 1)
-#endif
-void kCalculateCMAPTorsionForces_kernel(int numAtoms, int numTorsions, float4* forceBuffers, float* energyBuffer,
-        float4* posq, float4* coeff, int2* mapPositions, int4* indices, int* maps)
-{
-    const float PI = 3.14159265358979323846f;
-    float energy = 0.0f;
-    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numTorsions; index += blockDim.x*gridDim.x) {
-        int4 atoms1 = indices[4*index];
-        int4 atoms2 = indices[4*index+1];
-        int4 atoms3 = indices[4*index+2];
-        int4 atoms4 = indices[4*index+3];
-        float4 a1 = posq[atoms1.x];
-        float4 a2 = posq[atoms1.y];
-        float4 a3 = posq[atoms1.z];
-        float4 a4 = posq[atoms1.w];
-        float4 b1 = posq[atoms2.x];
-        float4 b2 = posq[atoms2.y];
-        float4 b3 = posq[atoms2.z];
-        float4 b4 = posq[atoms2.w];
-
-        // Compute the first angle.
-
-        float3 v0a = make_float3(a1.x-a2.x, a1.y-a2.y, a1.z-a2.z);
-        float3 v1a = make_float3(a3.x-a2.x, a3.y-a2.y, a3.z-a2.z);
-        float3 v2a = make_float3(a3.x-a4.x, a3.y-a4.y, a3.z-a4.z);
-        float3 cp0a, cp1a;
-        float angleA;
-        GETDIHEDRALANGLEBETWEENTHREEVECTORS(v0a, v1a, v2a, v0a, cp0a, cp1a, angleA);
-        angleA = fmod(angleA+2.0f*PI, 2.0f*PI);
-
-        // Compute the second angle.
-
-        float3 v0b = make_float3(b1.x-b2.x, b1.y-b2.y, b1.z-b2.z);
-        float3 v1b = make_float3(b3.x-b2.x, b3.y-b2.y, b3.z-b2.z);
-        float3 v2b = make_float3(b3.x-b4.x, b3.y-b4.y, b3.z-b4.z);
-        float3 cp0b, cp1b;
-        float angleB;
-        GETDIHEDRALANGLEBETWEENTHREEVECTORS(v0b, v1b, v2b, v0b, cp0b, cp1b, angleB);
-        angleB = fmod(angleB+2.0f*PI, 2.0f*PI);
-
-        // Identify which patch this is in.
-
-        int2 pos = mapPositions[maps[index]];
-        int size = pos.y;
-        float delta = 2.0f*PI/size;
-        int s = (int) (angleA/delta);
-        int t = (int) (angleB/delta);
-        float4 c[4];
-        int coeffIndex = pos.x+4*(s+size*t);
-        c[0] = coeff[coeffIndex];
-        c[1] = coeff[coeffIndex+1];
-        c[2] = coeff[coeffIndex+2];
-        c[3] = coeff[coeffIndex+3];
-        float da = angleA/delta-s;
-        float db = angleB/delta-t;
-
-        // Evaluate the spline to determine the energy and gradients.
-
-        float torsionEnergy = 0.0f;
-        float dEdA = 0.0f;
-        float dEdB = 0.0f;
-        torsionEnergy = da*torsionEnergy + ((c[3].w*db + c[3].z)*db + c[3].y)*db + c[3].x;
-        dEdA = db*dEdA + (3.0f*c[3].w*da + 2.0f*c[2].w)*da + c[1].w;
-        dEdB = da*dEdB + (3.0f*c[3].w*db + 2.0f*c[3].z)*db + c[3].y;
-        torsionEnergy = da*torsionEnergy + ((c[2].w*db + c[2].z)*db + c[2].y)*db + c[2].x;
-        dEdA = db*dEdA + (3.0f*c[3].z*da + 2.0f*c[2].z)*da + c[1].z;
-        dEdB = da*dEdB + (3.0f*c[2].w*db + 2.0f*c[2].z)*db + c[2].y;
-        torsionEnergy = da*torsionEnergy + ((c[1].w*db + c[1].z)*db + c[1].y)*db + c[1].x;
-        dEdA = db*dEdA + (3.0f*c[3].y*da + 2.0f*c[2].y)*da + c[1].y;
-        dEdB = da*dEdB + (3.0f*c[1].w*db + 2.0f*c[1].z)*db + c[1].y;
-        torsionEnergy = da*torsionEnergy + ((c[0].w*db + c[0].z)*db + c[0].y)*db + c[0].x;
-        dEdA = db*dEdA + (3.0f*c[3].x*da + 2.0f*c[2].x)*da + c[1].x;
-        dEdB = da*dEdB + (3.0f*c[0].w*db + 2.0f*c[0].z)*db + c[0].y;
-        dEdA /= delta;
-        dEdB /= delta;
-        energy += torsionEnergy;
-
-        // Apply the force to the first torsion.
-
-        float normCross1 = DOT3(cp0a, cp0a);
-        float normSqrBC = DOT3(v1a, v1a);
-        float normBC = sqrtf(normSqrBC);
-        float normCross2 = DOT3(cp1a, cp1a);
-        float dp = 1.0f/normSqrBC;
-        float4 ff = make_float4((-dEdA*normBC)/normCross1, DOT3(v0a, v1a)*dp, DOT3(v2a, v1a)*dp, (dEdA*normBC)/normCross2);
-        float3 internalF0 = make_float3(ff.x*cp0a.x, ff.x*cp0a.y, ff.x*cp0a.z);
-        float3 internalF3 = make_float3(ff.w*cp1a.x, ff.w*cp1a.y, ff.w*cp1a.z);
-        float3 d = make_float3(ff.y*internalF0.x - ff.z*internalF3.x,
-                               ff.y*internalF0.y - ff.z*internalF3.y,
-                               ff.y*internalF0.z - ff.z*internalF3.z);
-        unsigned int offsetA = atoms1.x+atoms3.x*numAtoms;
-        unsigned int offsetB = atoms1.y+atoms3.y*numAtoms;
-        unsigned int offsetC = atoms1.z+atoms3.z*numAtoms;
-        unsigned int offsetD = atoms1.w+atoms3.w*numAtoms;
-        float4 forceA = forceBuffers[offsetA];
-        float4 forceB = forceBuffers[offsetB];
-        float4 forceC = forceBuffers[offsetC];
-        float4 forceD = forceBuffers[offsetD];
-        forceA.x += internalF0.x;
-        forceA.y += internalF0.y;
-        forceA.z += internalF0.z;
-        forceB.x += d.x-internalF0.x;
-        forceB.y += d.y-internalF0.y;
-        forceB.z += d.z-internalF0.z;
-        forceC.x += -d.x-internalF3.x;
-        forceC.y += -d.y-internalF3.y;
-        forceC.z += -d.z-internalF3.z;
-        forceD.x += internalF3.x;
-        forceD.y += internalF3.y;
-        forceD.z += internalF3.z;
-        forceBuffers[offsetA] = forceA;
-        forceBuffers[offsetB] = forceB;
-        forceBuffers[offsetC] = forceC;
-        forceBuffers[offsetD] = forceD;
-
-        // Apply the force to the second torsion.
-
-        normCross1 = DOT3(cp0b, cp0b);
-        normSqrBC = DOT3(v1b, v1b);
-        normBC = sqrtf(normSqrBC);
-        normCross2 = DOT3(cp1b, cp1b);
-        dp = 1.0f/normSqrBC;
-        ff = make_float4((-dEdB*normBC)/normCross1, DOT3(v0b, v1b)*dp, DOT3(v2b, v1b)*dp, (dEdB*normBC)/normCross2);
-        internalF0 = make_float3(ff.x*cp0b.x, ff.x*cp0b.y, ff.x*cp0b.z);
-        internalF3 = make_float3(ff.w*cp1b.x, ff.w*cp1b.y, ff.w*cp1b.z);
-        d = make_float3(ff.y*internalF0.x - ff.z*internalF3.x,
-                        ff.y*internalF0.y - ff.z*internalF3.y,
-                        ff.y*internalF0.z - ff.z*internalF3.z);
-        offsetA = atoms2.x+atoms4.x*numAtoms;
-        offsetB = atoms2.y+atoms4.y*numAtoms;
-        offsetC = atoms2.z+atoms4.z*numAtoms;
-        offsetD = atoms2.w+atoms4.w*numAtoms;
-        forceA = forceBuffers[offsetA];
-        forceB = forceBuffers[offsetB];
-        forceC = forceBuffers[offsetC];
-        forceD = forceBuffers[offsetD];
-        forceA.x += internalF0.x;
-        forceA.y += internalF0.y;
-        forceA.z += internalF0.z;
-        forceB.x += d.x-internalF0.x;
-        forceB.y += d.y-internalF0.y;
-        forceB.z += d.z-internalF0.z;
-        forceC.x += -d.x-internalF3.x;
-        forceC.y += -d.y-internalF3.y;
-        forceC.z += -d.z-internalF3.z;
-        forceD.x += internalF3.x;
-        forceD.y += internalF3.y;
-        forceD.z += internalF3.z;
-        forceBuffers[offsetA] = forceA;
-        forceBuffers[offsetB] = forceB;
-        forceBuffers[offsetC] = forceC;
-        forceBuffers[offsetD] = forceD;
-    }
-    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
-}
-
-void kCalculateCMAPTorsionForces(gpuContext gpu, CUDAStream<float4>& coefficients, CUDAStream<int2>& mapPositions, CUDAStream<int4>& torsionIndices, CUDAStream<int>& torsionMaps)
-{
-    kCalculateCMAPTorsionForces_kernel<<<gpu->sim.blocks, gpu->sim.localForces_threads_per_block>>>(gpu->sim.stride,
-            torsionMaps._length, gpu->sim.pForce4, gpu->sim.pEnergy, gpu->sim.pPosq, coefficients._pDevData,
-            mapPositions._pDevData, torsionIndices._pDevData, torsionMaps._pDevData);
-    LAUNCHERROR("kCalculateCMAPTorsionForces");
-}
--- a/platforms/cuda-old/src/kernels/kCalculateCustomAngleForces.cu
+++ b/platforms/cuda-old/src/kernels/kCalculateCustomAngleForces.cu
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2010 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-
-#include <stdio.h>
-#include <cuda.h>
-#include <vector_functions.h>
-#include <cstdlib>
-#include <string>
-#include <iostream>
-#include <fstream>
-using namespace std;
-
-#include "gputypes.h"
-#include "cudatypes.h"
-
-static __constant__ cudaGmxSimulation cSim;
-static __constant__ Expression<256> forceExp;
-static __constant__ Expression<256> energyExp;
-
-#include "kEvaluateExpression.h"
-
-void SetCalculateCustomAngleForcesSim(gpuContext gpu)
-{
-    cudaError_t status;
-    status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
-    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
-}
-
-void GetCalculateCustomAngleForcesSim(gpuContext gpu)
-{
-    cudaError_t status;
-    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
-    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
-}
-
-void SetCustomAngleForceExpression(const Expression<256>& expression)
-{
-    cudaError_t status;
-    status = cudaMemcpyToSymbol(forceExp, &expression, sizeof(forceExp));
-    RTERROR(status, "SetCustomAngleForceExpression: cudaMemcpyToSymbol failed");
-}
-
-void SetCustomAngleEnergyExpression(const Expression<256>& expression)
-{
-    cudaError_t status;
-    status = cudaMemcpyToSymbol(energyExp, &expression, sizeof(energyExp));
-    RTERROR(status, "SetCustomAngleEnergyExpression: cudaMemcpyToSymbol failed");
-}
-
-void SetCustomAngleGlobalParams(const vector<float>& paramValues)
-{
-    cudaError_t status;
-    status = cudaMemcpyToSymbol(globalParams, &paramValues[0], paramValues.size()*sizeof(float));
-    RTERROR(status, "SetCustomAngleGlobalParams: cudaMemcpyToSymbol failed");
-}
-
-#define DOT3(v1, v2) (v1.x*v2.x + v1.y*v2.y + v1.z*v2.z)
-
-#define CROSS_PRODUCT(v1, v2) make_float3(v1.y*v2.z - v1.z*v2.y, v1.z*v2.x - v1.x*v2.z, v1.x*v2.y - v1.y*v2.x)
-
-__global__ 
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(1024, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(512, 1)
-#else
-__launch_bounds__(256, 1)
-#endif
-void kCalculateCustomAngleForces_kernel()
-{
-    extern __shared__ float stack[];
-    float* variables = (float*) &stack[cSim.customExpressionStackSize*blockDim.x];
-    unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
-    float totalEnergy = 0.0f;
-
-    while (pos < cSim.customAngles)
-    {
-        int4 atom       = cSim.pCustomAngleID1[pos];
-        int2 atom2      = cSim.pCustomAngleID2[pos];
-        float4 params   = cSim.pCustomAngleParams[pos];
-        float4 a1       = cSim.pPosq[atom.x];
-        float4 a2       = cSim.pPosq[atom.y];
-        float4 a3       = cSim.pPosq[atom.z];
-        float3 v0 = make_float3(a2.x-a1.x, a2.y-a1.y, a2.z-a1.z);
-        float3 v1 = make_float3(a2.x-a3.x, a2.y-a3.y, a2.z-a3.z);
-        float3 cp = CROSS_PRODUCT(v0, v1);
-        float rp = DOT3(cp, cp);
-        rp = max(sqrtf(rp), 1.0e-06f);
-        float r21 = DOT3(v0, v0);
-        float r23 = DOT3(v1, v1);
-        float dot = DOT3(v0, v1);
-        float cosine = max(-1.0f, min(1.0f, dot/sqrtf(r21*r23)));
-        VARIABLE(0) = acosf(cosine);
-        VARIABLE(1) = params.x;
-        VARIABLE(2) = params.y;
-        VARIABLE(3) = params.z;
-        VARIABLE(4) = params.w;
-        float dEdR = kEvaluateExpression_kernel(&forceExp, stack, variables);
-        totalEnergy += kEvaluateExpression_kernel(&energyExp, stack, variables);
-        float termA =  dEdR/(r21*rp);
-        float termC = -dEdR/(r23*rp);
-        float3 c21 = CROSS_PRODUCT(v0, cp);
-        float3 c23 = CROSS_PRODUCT(v1, cp);
-        c21.x *= termA;
-        c21.y *= termA;
-        c21.z *= termA;
-        c23.x *= termC;
-        c23.y *= termC;
-        c23.z *= termC;
-        unsigned int offsetA  = atom.x + atom.w * cSim.stride;
-        unsigned int offsetB  = atom.y + atom2.x * cSim.stride;
-        unsigned int offsetC  = atom.z + atom2.y * cSim.stride;
-        float4 forceA         = cSim.pForce4[offsetA];
-        float4 forceB         = cSim.pForce4[offsetB];
-        float4 forceC         = cSim.pForce4[offsetC];
-        forceA.x             += c21.x;
-        forceA.y             += c21.y;
-        forceA.z             += c21.z;
-        forceB.x             -= c21.x+c23.x;
-        forceB.y             -= c21.y+c23.y;
-        forceB.z             -= c21.z+c23.z;
-        forceC.x             += c23.x;
-        forceC.y             += c23.y;
-        forceC.z             += c23.z;
-        cSim.pForce4[offsetA] = forceA;
-        cSim.pForce4[offsetB] = forceB;
-        cSim.pForce4[offsetC] = forceC;
-        pos                  += blockDim.x * gridDim.x;
-    }
-    cSim.pEnergy[blockIdx.x * blockDim.x + threadIdx.x] += totalEnergy;
-}
-
-void kCalculateCustomAngleForces(gpuContext gpu)
-{
-//    printf("kCalculateCustomAngleForces\n");
-    int memoryPerThread = (gpu->sim.customExpressionStackSize+9)*sizeof(float);
-    int maxThreads = (gpu->sharedMemoryPerBlock-16)/memoryPerThread;
-    int threads = min(gpu->sim.localForces_threads_per_block, (maxThreads/64)*64);
-    kCalculateCustomAngleForces_kernel<<<gpu->sim.blocks, threads, memoryPerThread*threads>>>();
-    LAUNCHERROR("kCalculateCustomAngleForces");
-}
--- a/platforms/cuda-old/src/kernels/kCalculateCustomBondForces.cu
+++ b/platforms/cuda-old/src/kernels/kCalculateCustomBondForces.cu
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-
-#include <stdio.h>
-#include <cuda.h>
-#include <vector_functions.h>
-#include <cstdlib>
-#include <string>
-#include <iostream>
-#include <fstream>
-using namespace std;
-
-#include "gputypes.h"
-#include "cudatypes.h"
-
-static __constant__ cudaGmxSimulation cSim;
-static __constant__ Expression<256> forceExp;
-static __constant__ Expression<256> energyExp;
-
-#include "kEvaluateExpression.h"
-
-void SetCalculateCustomBondForcesSim(gpuContext gpu)
-{
-    cudaError_t status;
-    status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
-    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
-}
-
-void GetCalculateCustomBondForcesSim(gpuContext gpu)
-{
-    cudaError_t status;
-    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
-    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
-}
-
-void SetCustomBondForceExpression(const Expression<256>& expression)
-{
-    cudaError_t status;
-    status = cudaMemcpyToSymbol(forceExp, &expression, sizeof(forceExp));
-    RTERROR(status, "SetCustomBondForceExpression: cudaMemcpyToSymbol failed");
-}
-
-void SetCustomBondEnergyExpression(const Expression<256>& expression)
-{
-    cudaError_t status;
-    status = cudaMemcpyToSymbol(energyExp, &expression, sizeof(energyExp));
-    RTERROR(status, "SetCustomBondEnergyExpression: cudaMemcpyToSymbol failed");
-}
-
-void SetCustomBondGlobalParams(const vector<float>& paramValues)
-{
-    cudaError_t status;
-    status = cudaMemcpyToSymbol(globalParams, &paramValues[0], paramValues.size()*sizeof(float));
-    RTERROR(status, "SetCustomBondGlobalParams: cudaMemcpyToSymbol failed");
-}
-
-
-__global__ 
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(1024, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(512, 1)
-#else
-__launch_bounds__(256, 1)
-#endif
-void kCalculateCustomBondForces_kernel()
-{
-    extern __shared__ float stack[];
-    float* variables = (float*) &stack[cSim.customExpressionStackSize*blockDim.x];
-    unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
-    float totalEnergy = 0.0f;
-
-    while (pos < cSim.customBonds)
-    {
-        int4 atom       = cSim.pCustomBondID[pos];
-        float4 params   = cSim.pCustomBondParams[pos];
-        float4 a1       = cSim.pPosq[atom.x];
-        float4 a2       = cSim.pPosq[atom.y];
-        float dx        = a1.x - a2.x;
-        float dy        = a1.y - a2.y;
-        float dz        = a1.z - a2.z;
-        float r         = sqrtf(dx*dx + dy*dy + dz*dz);
-        float invR      = 1.0f/r;
-        VARIABLE(0)     = r;
-        VARIABLE(1)     = params.x;
-        VARIABLE(2)     = params.y;
-        VARIABLE(3)     = params.z;
-        VARIABLE(4)     = params.w;
-        float dEdR      = -kEvaluateExpression_kernel(&forceExp, stack, variables)*invR;
-        float energy    = kEvaluateExpression_kernel(&energyExp, stack, variables);
-        totalEnergy          += energy;
-        dx                   *= dEdR;
-        dy                   *= dEdR;
-        dz                   *= dEdR;
-        unsigned int offsetA  = atom.x + atom.z * cSim.stride;
-        unsigned int offsetB  = atom.y + atom.w * cSim.stride;
-        float4 forceA         = cSim.pForce4[offsetA];
-        float4 forceB         = cSim.pForce4[offsetB];
-        forceA.x             += dx;
-        forceA.y             += dy;
-        forceA.z             += dz;
-        forceB.x             -= dx;
-        forceB.y             -= dy;
-        forceB.z             -= dz;
-        cSim.pForce4[offsetA] = forceA;
-        cSim.pForce4[offsetB] = forceB;
-        pos                  += blockDim.x * gridDim.x;
-    }
-    cSim.pEnergy[blockIdx.x * blockDim.x + threadIdx.x] += totalEnergy;
-}
-
-void kCalculateCustomBondForces(gpuContext gpu)
-{
-//    printf("kCalculateCustomBondForces\n");
-    int memoryPerThread = (gpu->sim.customExpressionStackSize+9)*sizeof(float);
-    int maxThreads = (gpu->sharedMemoryPerBlock-16)/memoryPerThread;
-    int threads = min(gpu->sim.localForces_threads_per_block, (maxThreads/64)*64);
-    kCalculateCustomBondForces_kernel<<<gpu->sim.blocks, threads, memoryPerThread*threads>>>();
-    LAUNCHERROR("kCalculateCustomBondForces");
-}
--- a/platforms/cuda-old/src/kernels/kCalculateCustomExternalForces.cu
+++ b/platforms/cuda-old/src/kernels/kCalculateCustomExternalForces.cu
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-
-#include <stdio.h>
-#include <cuda.h>
-#include <vector_functions.h>
-#include <cstdlib>
-#include <string>
-#include <iostream>
-#include <fstream>
-using namespace std;
-
-#include "gputypes.h"
-#include "cudatypes.h"
-
-static __constant__ cudaGmxSimulation cSim;
-static __constant__ Expression<256> forceExpX;
-static __constant__ Expression<256> forceExpY;
-static __constant__ Expression<256> forceExpZ;
-static __constant__ Expression<256> energyExp;
-
-#include "kEvaluateExpression.h"
-
-void SetCalculateCustomExternalForcesSim(gpuContext gpu)
-{
-    cudaError_t status;
-    status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
-    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
-}
-
-void GetCalculateCustomExternalForcesSim(gpuContext gpu)
-{
-    cudaError_t status;
-    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
-    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
-}
-
-void SetCustomExternalForceExpressions(const Expression<256>& expressionX, const Expression<256>& expressionY, const Expression<256>& expressionZ)
-{
-    cudaError_t status;
-    status = cudaMemcpyToSymbol(forceExpX, &expressionX, sizeof(forceExpX));
-    status = cudaMemcpyToSymbol(forceExpY, &expressionY, sizeof(forceExpY));
-    status = cudaMemcpyToSymbol(forceExpZ, &expressionZ, sizeof(forceExpZ));
-    RTERROR(status, "SetCustomExternalForceExpression: cudaMemcpyToSymbol failed");
-}
-
-void SetCustomExternalEnergyExpression(const Expression<256>& expression)
-{
-    cudaError_t status;
-    status = cudaMemcpyToSymbol(energyExp, &expression, sizeof(energyExp));
-    RTERROR(status, "SetCustomExternalEnergyExpression: cudaMemcpyToSymbol failed");
-}
-
-void SetCustomExternalGlobalParams(const vector<float>& paramValues)
-{
-    cudaError_t status;
-    status = cudaMemcpyToSymbol(globalParams, &paramValues[0], paramValues.size()*sizeof(float));
-    RTERROR(status, "SetCustomExternalGlobalParams: cudaMemcpyToSymbol failed");
-}
-
-
-__global__ 
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(1024, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(512, 1)
-#else
-__launch_bounds__(256, 1)
-#endif
-void kCalculateCustomExternalForces_kernel()
-{
-    extern __shared__ float stack[];
-    float* variables = (float*) &stack[cSim.customExpressionStackSize*blockDim.x];
-    unsigned int index = blockIdx.x * blockDim.x + threadIdx.x;
-    float totalEnergy = 0.0f;
-
-    while (index < cSim.customExternals)
-    {
-        int atom        = cSim.pCustomExternalID[index];
-        float4 params   = cSim.pCustomExternalParams[index];
-        float4 pos      = cSim.pPosq[atom];
-        VARIABLE(0)     = pos.x;
-        VARIABLE(1)     = pos.y;
-        VARIABLE(2)     = pos.z;
-        VARIABLE(3)     = params.x;
-        VARIABLE(4)     = params.y;
-        VARIABLE(5)     = params.z;
-        VARIABLE(6)     = params.w;
-        totalEnergy       += kEvaluateExpression_kernel(&energyExp, stack, variables);;
-        float4 force       = cSim.pForce4[atom];
-        force.x           -= kEvaluateExpression_kernel(&forceExpX, stack, variables);
-        force.y           -= kEvaluateExpression_kernel(&forceExpY, stack, variables);
-        force.z           -= kEvaluateExpression_kernel(&forceExpZ, stack, variables);
-        cSim.pForce4[atom] = force;
-        index             += blockDim.x * gridDim.x;
-    }
-    cSim.pEnergy[blockIdx.x * blockDim.x + threadIdx.x] += totalEnergy;
-}
-
-void kCalculateCustomExternalForces(gpuContext gpu)
-{
-//    printf("kCalculateCustomExternalForces\n");
-    int memoryPerThread = (gpu->sim.customExpressionStackSize+9)*sizeof(float);
-    int maxThreads = (gpu->sharedMemoryPerBlock-16)/memoryPerThread;
-    int threads = min(gpu->sim.localForces_threads_per_block, (maxThreads/64)*64);
-    kCalculateCustomExternalForces_kernel<<<gpu->sim.blocks, threads, memoryPerThread*threads>>>();
-    LAUNCHERROR("kCalculateCustomExternalForces");
-}
--- a/platforms/cuda-old/src/kernels/kCalculateCustomNonbondedForces.cu
+++ b/platforms/cuda-old/src/kernels/kCalculateCustomNonbondedForces.cu
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-
-#include <stdio.h>
-#include <cuda.h>
-#include <vector_functions.h>
-#include <cstdlib>
-#include <string>
-#include <iostream>
-#include <fstream>
-using namespace std;
-
-#include "gputypes.h"
-#include "cudatypes.h"
-
-#define UNROLLXX 0
-#define UNROLLXY 0
-
-struct Atom {
-    float x;
-    float y;
-    float z;
-    float4 params;
-    float fx;
-    float fy;
-    float fz;
-};
-
-static __constant__ cudaGmxSimulation cSim;
-static __constant__ Expression<256> forceExp;
-static __constant__ Expression<256> energyExp;
-
-#include "kEvaluateExpression.h"
-
-void SetCalculateCustomNonbondedForcesSim(gpuContext gpu)
-{
-    cudaError_t status;
-    status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
-    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
-}
-
-void GetCalculateCustomNonbondedForcesSim(gpuContext gpu)
-{
-    cudaError_t status;
-    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
-    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
-}
-
-void SetCustomNonbondedForceExpression(const Expression<256>& expression)
-{
-    cudaError_t status;
-    status = cudaMemcpyToSymbol(forceExp, &expression, sizeof(forceExp));
-    RTERROR(status, "SetCustomNonbondedForceExpression: cudaMemcpyToSymbol failed");
-}
-
-void SetCustomNonbondedEnergyExpression(const Expression<256>& expression)
-{
-    cudaError_t status;
-    status = cudaMemcpyToSymbol(energyExp, &expression, sizeof(energyExp));
-    RTERROR(status, "SetCustomNonbondedEnergyExpression: cudaMemcpyToSymbol failed");
-}
-
-void SetCustomNonbondedGlobalParams(const vector<float>& paramValues)
-{
-    cudaError_t status;
-    status = cudaMemcpyToSymbol(globalParams, &paramValues[0], paramValues.size()*sizeof(float));
-    RTERROR(status, "SetCustomNonbondedGlobalParams: cudaMemcpyToSymbol failed");
-}
-
-// Include versions of the kernels for N^2 calculations.
-
-#define METHOD_NAME(a, b) a##N2##b
-#include "kCalculateCustomNonbondedForces.h"
-#define USE_OUTPUT_BUFFER_PER_WARP
-#undef METHOD_NAME
-#define METHOD_NAME(a, b) a##N2ByWarp##b
-#include "kCalculateCustomNonbondedForces.h"
-
-// Include versions of the kernels with cutoffs.
-
-#undef METHOD_NAME
-#undef USE_OUTPUT_BUFFER_PER_WARP
-#define USE_CUTOFF
-#define METHOD_NAME(a, b) a##Cutoff##b
-#include "kCalculateCustomNonbondedForces.h"
-#define USE_OUTPUT_BUFFER_PER_WARP
-#undef METHOD_NAME
-#define METHOD_NAME(a, b) a##CutoffByWarp##b
-#include "kCalculateCustomNonbondedForces.h"
-
-// Include versions of the kernels with periodic boundary conditions.
-
-#undef METHOD_NAME
-#undef USE_OUTPUT_BUFFER_PER_WARP
-#define USE_PERIODIC
-#define METHOD_NAME(a, b) a##Periodic##b
-#include "kCalculateCustomNonbondedForces.h"
-#define USE_OUTPUT_BUFFER_PER_WARP
-#undef METHOD_NAME
-#define METHOD_NAME(a, b) a##PeriodicByWarp##b
-#include "kCalculateCustomNonbondedForces.h"
-
-__global__ void kFindBlockBoundsCutoff_kernel();
-__global__ void kFindBlocksWithInteractionsCutoff_kernel();
-__global__ void kFindInteractionsWithinBlocksCutoff_kernel(unsigned int* workUnit);
-__global__ void kFindBlockBoundsPeriodic_kernel();
-__global__ void kFindBlocksWithInteractionsPeriodic_kernel();
-__global__ void kFindInteractionsWithinBlocksPeriodic_kernel(unsigned int* workUnit);
-
-void kCalculateCustomNonbondedForces(gpuContext gpu, bool neighborListValid)
-{
-//    printf("kCalculateCustomNonbondedCutoffForces\n");
-    if (gpu->tabulatedFunctionsChanged)
-    {
-        cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float4>();
-        if (gpu->tabulatedFunctions[0].coefficients != NULL)
-            cudaBindTexture(NULL, &texRef0, gpu->tabulatedFunctions[0].coefficients->_pDevData, &channelDesc, gpu->tabulatedFunctions[0].coefficients->_length*sizeof(float4));
-        if (gpu->tabulatedFunctions[1].coefficients != NULL)
-            cudaBindTexture(NULL, &texRef1, gpu->tabulatedFunctions[1].coefficients->_pDevData, &channelDesc, gpu->tabulatedFunctions[1].coefficients->_length*sizeof(float4));
-        if (gpu->tabulatedFunctions[2].coefficients != NULL)
-            cudaBindTexture(NULL, &texRef2, gpu->tabulatedFunctions[2].coefficients->_pDevData, &channelDesc, gpu->tabulatedFunctions[2].coefficients->_length*sizeof(float4));
-        if (gpu->tabulatedFunctions[3].coefficients != NULL)
-            cudaBindTexture(NULL, &texRef3, gpu->tabulatedFunctions[3].coefficients->_pDevData, &channelDesc, gpu->tabulatedFunctions[3].coefficients->_length*sizeof(float4));
-        gpu->tabulatedFunctionsChanged = false;
-    }
-    int sharedPerThread = sizeof(Atom)+gpu->sim.customExpressionStackSize*sizeof(float)+9*sizeof(float);
-    if (gpu->sim.customNonbondedMethod != NO_CUTOFF)
-        sharedPerThread += sizeof(float3);
-    int threads = gpu->sim.nonbond_threads_per_block;
-    int maxThreads = (gpu->sharedMemoryPerBlock-16)/sharedPerThread;
-    if (threads > maxThreads)
-        threads = (maxThreads/32)*32;
-    switch (gpu->sim.customNonbondedMethod)
-    {
-        case NO_CUTOFF:
-            if (gpu->bOutputBufferPerWarp)
-                kCalculateCustomNonbondedN2ByWarpForces_kernel<<<gpu->sim.nonbond_blocks, threads, sharedPerThread*threads>>>(gpu->sim.pWorkUnit);
-            else
-                kCalculateCustomNonbondedN2Forces_kernel<<<gpu->sim.nonbond_blocks, threads, sharedPerThread*threads>>>(gpu->sim.pWorkUnit);
-            LAUNCHERROR("kCalculateCustomNonbondedN2Forces");
-            break;
-        case CUTOFF:
-            if (!neighborListValid)
-            {
-                kFindBlockBoundsCutoff_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
-                LAUNCHERROR("kFindBlockBoundsCutoff");
-                kFindBlocksWithInteractionsCutoff_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
-                LAUNCHERROR("kFindBlocksWithInteractionsCutoff");
-                compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
-                kFindInteractionsWithinBlocksCutoff_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
-            }
-            if (gpu->bOutputBufferPerWarp)
-                kCalculateCustomNonbondedCutoffByWarpForces_kernel<<<gpu->sim.nonbond_blocks, threads, sharedPerThread*threads>>>(gpu->sim.pInteractingWorkUnit);
-            else
-                kCalculateCustomNonbondedCutoffForces_kernel<<<gpu->sim.nonbond_blocks, threads, sharedPerThread*threads>>>(gpu->sim.pInteractingWorkUnit);
-            LAUNCHERROR("kCalculateCustomNonbondedCutoffForces");
-            break;
-        case PERIODIC:
-            if (!neighborListValid)
-            {
-                kFindBlockBoundsPeriodic_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
-                LAUNCHERROR("kFindBlockBoundsPeriodic");
-                kFindBlocksWithInteractionsPeriodic_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
-                LAUNCHERROR("kFindBlocksWithInteractionsPeriodic");
-                compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
-                kFindInteractionsWithinBlocksPeriodic_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
-            }
-            if (gpu->bOutputBufferPerWarp)
-                kCalculateCustomNonbondedPeriodicByWarpForces_kernel<<<gpu->sim.nonbond_blocks, threads, sharedPerThread*threads>>>(gpu->sim.pInteractingWorkUnit);
-            else
-                kCalculateCustomNonbondedPeriodicForces_kernel<<<gpu->sim.nonbond_blocks, threads, sharedPerThread*threads>>>(gpu->sim.pInteractingWorkUnit);
-            LAUNCHERROR("kCalculateCustomNonbondedPeriodicForces");
-            break;
-    }
-}
--- a/platforms/cuda-old/src/kernels/kCalculateCustomNonbondedForces.h
+++ b/platforms/cuda-old/src/kernels/kCalculateCustomNonbondedForces.h
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-
-/**
- * This file contains the kernels for evalauating custom nonbonded forces.  It is included
- * several times in kCalculateCustomNonbondedForces.cu with different #defines to generate
- * different versions of the kernels.
- */
-
-__global__ void METHOD_NAME(kCalculateCustomNonbonded, Forces_kernel)(unsigned int* workUnit)
-{
-    extern __shared__ float stack[];
-    volatile Atom* sA = (volatile Atom*) &stack[cSim.customExpressionStackSize*blockDim.x];
-    float* variables = (float*) &sA[blockDim.x];
-    unsigned int totalWarps = gridDim.x*blockDim.x/GRID;
-    unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
-    unsigned int numWorkUnits = cSim.pInteractionCount[0];
-    unsigned int pos = warp*numWorkUnits/totalWarps;
-    unsigned int end = (warp+1)*numWorkUnits/totalWarps;
-    float totalEnergy = 0.0f;
-#ifdef USE_CUTOFF
-    volatile float3* tempBuffer = (volatile float3*) &variables[9*blockDim.x];
-#endif
-
-    unsigned int lasty = 0xFFFFFFFF;
-    while (pos < end)
-    {
-        // Extract cell coordinates from appropriate work unit
-        unsigned int x = workUnit[pos];
-        unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
-        bool bExclusionFlag = (x & 0x1);
-        x = (x >> 17) << GRIDBITS;
-        float4      apos;   // Local atom x, y, z, q
-        float3      af;     // Local atom fx, fy, fz
-        unsigned int tgx = threadIdx.x & (GRID - 1);
-        unsigned int tbx = threadIdx.x - tgx;
-        unsigned int tj = tgx;
-        volatile Atom* psA = &sA[tbx];
-        unsigned int i      = x + tgx;
-        apos                = cSim.pPosq[i];
-        float4 params       = cSim.pCustomParams[i];
-        af.x                = 0.0f;
-        af.y                = 0.0f;
-        af.z                = 0.0f;
-        if (x == y) // Handle diagonals uniquely at 50% efficiency
-        {
-            // Read fixed atom data into registers and GRF
-            sA[threadIdx.x].x   = apos.x;
-            sA[threadIdx.x].y   = apos.y;
-            sA[threadIdx.x].z   = apos.z;
-            sA[threadIdx.x].params.x = params.x;
-            sA[threadIdx.x].params.y = params.y;
-            sA[threadIdx.x].params.z = params.z;
-            sA[threadIdx.x].params.w = params.w;
-            unsigned int xi   = x>>GRIDBITS;
-            unsigned int cell = xi+xi*cSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
-            unsigned int excl = cSim.pExclusion[cSim.pExclusionIndex[cell]+tgx];
-            for (unsigned int j = 0; j < GRID; j++)
-            {
-                // Record the parameters.
-
-                VARIABLE(0) = params.x;
-                VARIABLE(1) = params.y;
-                VARIABLE(2) = params.z;
-                VARIABLE(3) = params.w;
-                VARIABLE(4) = psA[j].params.x;
-                VARIABLE(5) = psA[j].params.y;
-                VARIABLE(6) = psA[j].params.z;
-                VARIABLE(7) = psA[j].params.w;
-
-                // Compute the force.
-
-                float dx        = psA[j].x - apos.x;
-                float dy        = psA[j].y - apos.y;
-                float dz        = psA[j].z - apos.z;
-#ifdef USE_PERIODIC
-                dx -= floorf(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
-                dy -= floorf(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
-                dz -= floorf(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
-#endif
-                float r         = sqrtf(dx*dx + dy*dy + dz*dz);
-                float invR      = 1.0f/r;
-                VARIABLE(8)     = r;
-                float dEdR      = -kEvaluateExpression_kernel(&forceExp, stack, variables)*invR;
-                float energy    = kEvaluateExpression_kernel(&energyExp, stack, variables);
-#ifdef USE_CUTOFF
-                if (!(excl & 0x1) || r > cSim.nonbondedCutoff)
-#else
-                if (!(excl & 0x1))
-#endif
-                {
-                    dEdR = 0.0f;
-                    energy = 0.0f;
-                }
-                totalEnergy    += 0.5f*energy;
-                dx             *= dEdR;
-                dy             *= dEdR;
-                dz             *= dEdR;
-                af.x           -= dx;
-                af.y           -= dy;
-                af.z           -= dz;
-                excl          >>= 1;
-            }
-
-            // Write results
-#ifdef USE_OUTPUT_BUFFER_PER_WARP
-            unsigned int offset                 = x + tgx + warp*cSim.stride;
-#else
-            unsigned int offset                 = x + tgx + (x >> GRIDBITS) * cSim.stride;
-#endif
-            float4 of                           = cSim.pForce4[offset];
-            of.x                               += af.x;
-            of.y                               += af.y;
-            of.z                               += af.z;
-            cSim.pForce4[offset]               = of;
-
-        }
-        else        // 100% utilization
-        {
-            // Read fixed atom data into registers and GRF
-            if (lasty != y)
-            {
-                unsigned int j                   = y + tgx;
-                float4 temp             = cSim.pPosq[j];
-                sA[threadIdx.x].x       = temp.x;
-                sA[threadIdx.x].y       = temp.y;
-                sA[threadIdx.x].z       = temp.z;
-                sA[threadIdx.x].params.x = cSim.pCustomParams[j].x;
-                sA[threadIdx.x].params.y = cSim.pCustomParams[j].y;
-                sA[threadIdx.x].params.z = cSim.pCustomParams[j].z;
-                sA[threadIdx.x].params.w = cSim.pCustomParams[j].w;
-            }
-            sA[threadIdx.x].fx      = 0.0f;
-            sA[threadIdx.x].fy      = 0.0f;
-            sA[threadIdx.x].fz      = 0.0f;
-            if (!bExclusionFlag)
-            {
-#ifdef USE_CUTOFF
-                unsigned int flags = cSim.pInteractionFlag[pos];
-                if (flags == 0)
-                {
-                    // No interactions in this block.
-                }
-                else if (flags == 0xFFFFFFFF)
-#endif
-                {
-                    // Compute all interactions within this block.
-
-                    for (unsigned int j = 0; j < GRID; j++)
-                    {
-                        // Record the parameters.
-
-                        VARIABLE(0) = params.x;
-                        VARIABLE(1) = params.y;
-                        VARIABLE(2) = params.z;
-                        VARIABLE(3) = params.w;
-                        VARIABLE(4) = psA[tj].params.x;
-                        VARIABLE(5) = psA[tj].params.y;
-                        VARIABLE(6) = psA[tj].params.z;
-                        VARIABLE(7) = psA[tj].params.w;
-
-                        // Compute the force.
-
-                        float dx        = psA[tj].x - apos.x;
-                        float dy        = psA[tj].y - apos.y;
-                        float dz        = psA[tj].z - apos.z;
-#ifdef USE_PERIODIC
-                        dx -= floorf(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
-                        dy -= floorf(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
-                        dz -= floorf(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
-#endif
-                        float r         = sqrtf(dx*dx + dy*dy + dz*dz);
-                        float invR      = 1.0f/r;
-                        VARIABLE(8)     = r;
-                        float dEdR      = -kEvaluateExpression_kernel(&forceExp, stack, variables)*invR;
-                        float energy    = kEvaluateExpression_kernel(&energyExp, stack, variables);
-#ifdef USE_CUTOFF
-                        if (r > cSim.nonbondedCutoff)
-                        {
-                            dEdR = 0.0f;
-                            energy = 0.0f;
-                        }
-#endif
-                        totalEnergy    += energy;
-                        dx             *= dEdR;
-                        dy             *= dEdR;
-                        dz             *= dEdR;
-                        af.x           -= dx;
-                        af.y           -= dy;
-                        af.z           -= dz;
-                        psA[tj].fx     += dx;
-                        psA[tj].fy     += dy;
-                        psA[tj].fz     += dz;
-                        tj              = (tj + 1) & (GRID - 1);
-                    }
-                }
-#ifdef USE_CUTOFF
-                else
-                {
-                    // Compute only a subset of the interactions in this block.
-
-                    for (unsigned int j = 0; j < GRID; j++)
-                    {
-                        if ((flags&(1<<j)) != 0)
-                        {
-                            // Record the parameters.
-
-                            VARIABLE(0) = params.x;
-                            VARIABLE(1) = params.y;
-                            VARIABLE(2) = params.z;
-                            VARIABLE(3) = params.w;
-                            VARIABLE(4) = psA[j].params.x;
-                            VARIABLE(5) = psA[j].params.y;
-                            VARIABLE(6) = psA[j].params.z;
-                            VARIABLE(7) = psA[j].params.w;
-
-                            // Compute the force.
-
-                            float dx        = psA[j].x - apos.x;
-                            float dy        = psA[j].y - apos.y;
-                            float dz        = psA[j].z - apos.z;
-#ifdef USE_PERIODIC
-                            dx -= floorf(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
-                            dy -= floorf(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
-                            dz -= floorf(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
-#endif
-                            float r         = sqrtf(dx*dx + dy*dy + dz*dz);
-                            float invR      = 1.0f/r;
-                            VARIABLE(8)     = r;
-                            float dEdR      = -kEvaluateExpression_kernel(&forceExp, stack, variables)*invR;
-                            float energy    = kEvaluateExpression_kernel(&energyExp, stack, variables);
-#ifdef USE_CUTOFF
-                            if (r > cSim.nonbondedCutoff)
-                            {
-                                dEdR = 0.0f;
-                                energy = 0.0f;
-                            }
-#endif
-                            totalEnergy    += energy;
-                            dx             *= dEdR;
-                            dy             *= dEdR;
-                            dz             *= dEdR;
-                            af.x           -= dx;
-                            af.y           -= dy;
-                            af.z           -= dz;
-                            tempBuffer[threadIdx.x].x = dx;
-                            tempBuffer[threadIdx.x].y = dy;
-                            tempBuffer[threadIdx.x].z = dz;
-
-                            // Sum the forces on atom j.
-
-                            if (tgx % 2 == 0)
-                            {
-                                tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+1].x;
-                                tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+1].y;
-                                tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+1].z;
-                            }
-                            if (tgx % 4 == 0)
-                            {
-                                tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+2].x;
-                                tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+2].y;
-                                tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+2].z;
-                            }
-                            if (tgx % 8 == 0)
-                            {
-                                tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+4].x;
-                                tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+4].y;
-                                tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+4].z;
-                            }
-                            if (tgx % 16 == 0)
-                            {
-                                tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+8].x;
-                                tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+8].y;
-                                tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+8].z;
-                            }
-                            if (tgx == 0)
-                            {
-                                psA[j].fx += tempBuffer[threadIdx.x].x + tempBuffer[threadIdx.x+16].x;
-                                psA[j].fy += tempBuffer[threadIdx.x].y + tempBuffer[threadIdx.x+16].y;
-                                psA[j].fz += tempBuffer[threadIdx.x].z + tempBuffer[threadIdx.x+16].z;
-                            }
-                        }
-                    }
-                }
-#endif
-            }
-            else  // bExclusion
-            {
-                // Read fixed atom data into registers and GRF
-                unsigned int xi   = x>>GRIDBITS;
-                unsigned int yi   = y>>GRIDBITS;
-                unsigned int cell          = xi+yi*cSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
-                unsigned int excl = cSim.pExclusion[cSim.pExclusionIndex[cell]+tgx];
-                excl              = (excl >> tgx) | (excl << (GRID - tgx));
-                for (unsigned int j = 0; j < GRID; j++)
-                {
-                    // Record the parameters.
-
-                    VARIABLE(0) = params.x;
-                    VARIABLE(1) = params.y;
-                    VARIABLE(2) = params.z;
-                    VARIABLE(3) = params.w;
-                    VARIABLE(4) = psA[tj].params.x;
-                    VARIABLE(5) = psA[tj].params.y;
-                    VARIABLE(6) = psA[tj].params.z;
-                    VARIABLE(7) = psA[tj].params.w;
-
-                    // Compute the force.
-
-                    float dx        = psA[tj].x - apos.x;
-                    float dy        = psA[tj].y - apos.y;
-                    float dz        = psA[tj].z - apos.z;
-#ifdef USE_PERIODIC
-                    dx -= floorf(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
-                    dy -= floorf(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
-                    dz -= floorf(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
-#endif
-                    float r         = sqrtf(dx*dx + dy*dy + dz*dz);
-                    float invR      = 1.0f/r;
-                    VARIABLE(8)     = r;
-                    float dEdR      = -kEvaluateExpression_kernel(&forceExp, stack, variables)*invR;
-                    float energy    = kEvaluateExpression_kernel(&energyExp, stack, variables);
-#ifdef USE_CUTOFF
-                    if (!(excl & 0x1) || r > cSim.nonbondedCutoff)
-#else
-                    if (!(excl & 0x1))
-#endif
-                    {
-                        dEdR = 0.0f;
-                        energy = 0.0f;
-                    }
-                    totalEnergy    += energy;
-                    dx             *= dEdR;
-                    dy             *= dEdR;
-                    dz             *= dEdR;
-                    af.x           -= dx;
-                    af.y           -= dy;
-                    af.z           -= dz;
-                    psA[tj].fx     += dx;
-                    psA[tj].fy     += dy;
-                    psA[tj].fz     += dz;
-                    excl          >>= 1;
-                    tj              = (tj + 1) & (GRID - 1);
-                }
-            }
-
-            // Write results
-            float4 of;
-#ifdef USE_OUTPUT_BUFFER_PER_WARP
-            unsigned int offset                 = x + tgx + warp*cSim.stride;
-#else
-            unsigned int offset                 = x + tgx + (y >> GRIDBITS) * cSim.stride;
-#endif
-            of                                  = cSim.pForce4[offset];
-            of.x                               += af.x;
-            of.y                               += af.y;
-            of.z                               += af.z;
-            cSim.pForce4[offset]               = of;
-#ifdef USE_OUTPUT_BUFFER_PER_WARP
-            offset                              = y + tgx + warp*cSim.stride;
-#else
-            offset                              = y + tgx + (x >> GRIDBITS) * cSim.stride;
-#endif
-            of                                  = cSim.pForce4[offset];
-            of.x                               += sA[threadIdx.x].fx;
-            of.y                               += sA[threadIdx.x].fy;
-            of.z                               += sA[threadIdx.x].fz;
-            cSim.pForce4[offset]               = of;
-            lasty = y;
-        }
-
-        pos++;
-    }
-    cSim.pEnergy[blockIdx.x*blockDim.x+threadIdx.x] += totalEnergy;
-}
--- a/platforms/cuda-old/src/kernels/kCalculateCustomTorsionForces.cu
+++ b/platforms/cuda-old/src/kernels/kCalculateCustomTorsionForces.cu
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2010 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * This program is free software: you can redistribute it and/or modify       *
- * it under the terms of the GNU Lesser General Public License as published   *
- * by the Free Software Foundation, either version 3 of the License, or       *
- * (at your option) any later version.                                        *
- *                                                                            *
- * This program is distributed in the hope that it will be useful,            *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
- * GNU Lesser General Public License for more details.                        *
- *                                                                            *
- * You should have received a copy of the GNU Lesser General Public License   *
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
- * -------------------------------------------------------------------------- */
-
-#include <stdio.h>
-#include <cuda.h>
-#include <vector_functions.h>
-#include <cstdlib>
-#include <string>
-#include <iostream>
-#include <fstream>
-using namespace std;
-
-#include "gputypes.h"
-#include "cudatypes.h"
-
-static __constant__ cudaGmxSimulation cSim;
-static __constant__ Expression<256> forceExp;
-static __constant__ Expression<256> energyExp;
-
-#include "kEvaluateExpression.h"
-
-void SetCalculateCustomTorsionForcesSim(gpuContext gpu)
-{
-    cudaError_t status;
-    status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
-    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
-}
-
-void GetCalculateCustomTorsionForcesSim(gpuContext gpu)
-{
-    cudaError_t status;
-    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
-    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
-}
-
-void SetCustomTorsionForceExpression(const Expression<256>& expression)
-{
-    cudaError_t status;
-    status = cudaMemcpyToSymbol(forceExp, &expression, sizeof(forceExp));
-    RTERROR(status, "SetCustomTorsionForceExpression: cudaMemcpyToSymbol failed");
-}
-
-void SetCustomTorsionEnergyExpression(const Expression<256>& expression)
-{
-    cudaError_t status;
-    status = cudaMemcpyToSymbol(energyExp, &expression, sizeof(energyExp));
-    RTERROR(status, "SetCustomTorsionEnergyExpression: cudaMemcpyToSymbol failed");
-}
-
-void SetCustomTorsionGlobalParams(const vector<float>& paramValues)
-{
-    cudaError_t status;
-    status = cudaMemcpyToSymbol(globalParams, &paramValues[0], paramValues.size()*sizeof(float));
-    RTERROR(status, "SetCustomTorsionGlobalParams: cudaMemcpyToSymbol failed");
-}
-
-#define LOCAL_HACK_PI 3.1415926535897932384626433832795
-
-#define DOT3(v1, v2) (v1.x*v2.x + v1.y*v2.y + v1.z*v2.z)
-
-#define CROSS_PRODUCT(v1, v2) make_float3(v1.y*v2.z - v1.z*v2.y, v1.z*v2.x - v1.x*v2.z, v1.x*v2.y - v1.y*v2.x)
-
-#define GETNORMEDDOTPRODUCT(v1, v2, dp) \
-{ \
-    dp          = DOT3(v1, v2); \
-    float norm1 = DOT3(v1, v1); \
-    float norm2 = DOT3(v2, v2); \
-    dp /= sqrtf(norm1 * norm2); \
-    dp = min(dp, 1.0f); \
-    dp = max(dp, -1.0f); \
-}
-
-#define GETANGLEBETWEENTWOVECTORS(v1, v2, angle) \
-{ \
-    float dp; \
-    GETNORMEDDOTPRODUCT(v1, v2, dp); \
-    if (dp > 0.99f || dp < -0.99f) { \
-        float3 cross = CROSS_PRODUCT(v1, v2); \
-        float scale = DOT3(v1, v1)*DOT3(v2, v2); \
-        angle = asinf(sqrtf(DOT3(cross, cross)/scale)); \
-        if (dp < 0.0f) \
-            angle = LOCAL_HACK_PI-angle; \
-    } \
-    else { \
-        angle = acosf(dp); \
-    } \
-}
-
-#define GETDIHEDRALANGLEBETWEENTHREEVECTORS(vector1, vector2, vector3, signVector, cp0, cp1, angle) \
-{ \
-    cp0 = CROSS_PRODUCT(vector1, vector2); \
-    cp1 = CROSS_PRODUCT(vector2, vector3); \
-    GETANGLEBETWEENTWOVECTORS(cp0, cp1, angle); \
-    float dp = DOT3(signVector, cp1); \
-    angle = (dp >= 0) ? angle : -angle; \
-}
-
-__global__ 
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(GF1XX_LOCALFORCES_THREADS_PER_BLOCK, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(GT2XX_LOCALFORCES_THREADS_PER_BLOCK, 1)
-#else
-__launch_bounds__(G8X_LOCALFORCES_THREADS_PER_BLOCK, 1)
-#endif
-void kCalculateCustomTorsionForces_kernel()
-{
-    extern __shared__ float stack[];
-    float* variables = (float*) &stack[cSim.customExpressionStackSize*blockDim.x];
-    unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
-    float totalEnergy = 0.0f;
-
-    while (pos < cSim.customTorsions)
-    {
-        int4 atom       = cSim.pCustomTorsionID1[pos];
-        int4 atom2      = cSim.pCustomTorsionID2[pos];
-        float4 params   = cSim.pCustomTorsionParams[pos];
-        float4 a1       = cSim.pPosq[atom.x];
-        float4 a2       = cSim.pPosq[atom.y];
-        float4 a3       = cSim.pPosq[atom.z];
-        float4 a4       = cSim.pPosq[atom.w];
-        float3 v0 = make_float3(a1.x-a2.x, a1.y-a2.y, a1.z-a2.z);
-        float3 v1 = make_float3(a3.x-a2.x, a3.y-a2.y, a3.z-a2.z);
-        float3 v2 = make_float3(a3.x-a4.x, a3.y-a4.y, a3.z-a4.z);
-        float3 cp0, cp1;
-        float dihedralAngle;
-        GETDIHEDRALANGLEBETWEENTHREEVECTORS(v0, v1, v2, v0, cp0, cp1, dihedralAngle);
-        VARIABLE(0) = dihedralAngle;
-        VARIABLE(1) = params.x;
-        VARIABLE(2) = params.y;
-        VARIABLE(3) = params.z;
-        VARIABLE(4) = params.w;
-        float dEdAngle = kEvaluateExpression_kernel(&forceExp, stack, variables);
-        totalEnergy += kEvaluateExpression_kernel(&energyExp, stack, variables);
-        float normBC = sqrtf(DOT3(v1, v1));
-        float dp = 1.0f / DOT3(v1, v1);
-        float4 ff = make_float4((-dEdAngle*normBC)/DOT3(cp0, cp0), DOT3(v0, v1)*dp, DOT3(v2, v1)*dp, (dEdAngle*normBC)/DOT3(cp1, cp1));
-        float3 internalF0 = make_float3(ff.x*cp0.x, ff.x*cp0.y, ff.x*cp0.z);
-        float3 internalF3 = make_float3(ff.w*cp1.x, ff.w*cp1.y, ff.w*cp1.z);
-        float3 s = make_float3(ff.y*internalF0.x - ff.z*internalF3.x,
-                               ff.y*internalF0.y - ff.z*internalF3.y,
-                               ff.y*internalF0.z - ff.z*internalF3.z);
-        unsigned int offsetA  = atom.x + atom2.x * cSim.stride;
-        unsigned int offsetB  = atom.y + atom2.y * cSim.stride;
-        unsigned int offsetC  = atom.z + atom2.z * cSim.stride;
-        unsigned int offsetD  = atom.w + atom2.w * cSim.stride;
-        float4 forceA         = cSim.pForce4[offsetA];
-        float4 forceB         = cSim.pForce4[offsetB];
-        float4 forceC         = cSim.pForce4[offsetC];
-        float4 forceD         = cSim.pForce4[offsetD];
-        forceA.x             += internalF0.x;
-        forceA.y             += internalF0.y;
-        forceA.z             += internalF0.z;
-        forceB.x             += -internalF0.x + s.x;
-        forceB.y             += -internalF0.y + s.y;
-        forceB.z             += -internalF0.z + s.z;
-        forceC.x             += -internalF3.x - s.x;
-        forceC.y             += -internalF3.y - s.y;
-        forceC.z             += -internalF3.z - s.z;
-        forceD.x             += internalF3.x;
-        forceD.y             += internalF3.y;
-        forceD.z             += internalF3.z;
-        cSim.pForce4[offsetA] = forceA;
-        cSim.pForce4[offsetB] = forceB;
-        cSim.pForce4[offsetC] = forceC;
-        cSim.pForce4[offsetD] = forceD;
-        pos                  += blockDim.x * gridDim.x;
-    }
-    cSim.pEnergy[blockIdx.x * blockDim.x + threadIdx.x] += totalEnergy;
-}
-
-void kCalculateCustomTorsionForces(gpuContext gpu)
-{
-//    printf("kCalculateCustomTorsionForces\n");
-    int memoryPerThread = (gpu->sim.customExpressionStackSize+9)*sizeof(float);
-    int maxThreads = (gpu->sharedMemoryPerBlock-16)/memoryPerThread;
-    int threads = min(gpu->sim.localForces_threads_per_block, (maxThreads/64)*64);
-    kCalculateCustomTorsionForces_kernel<<<gpu->sim.blocks, threads, memoryPerThread*threads>>>();
-    LAUNCHERROR("kCalculateCustomTorsionForces");
-}
--- a/platforms/cuda-old/src/kernels/kCalculateGBVIAux.h
+++ b/platforms/cuda-old/src/kernels/kCalculateGBVIAux.h
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Mark Friedrichs                                                   *
- * Contributors:                                                              *
- *                                                                            *
- * Permission is hereby granted, free of charge, to any person obtaining a    *
- * copy of this software and associated documentation files (the "Software"), *
- * to deal in the Software without restriction, including without limitation  *
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
- * and/or sell copies of the Software, and to permit persons to whom the      *
- * Software is furnished to do so, subject to the following conditions:       *
- *                                                                            *
- * The above copyright notice and this permission notice shall be included in *
- * all copies or substantial portions of the Software.                        *
- *                                                                            *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
- * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
- * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
- * -------------------------------------------------------------------------- */
-
-#ifndef __GpuGBVIAUX_H__
-#define __GpuGBVIAUX_H__
-
-/**
- * This file contains subroutines used in evaluating quantities associated w/ the GB/VI function
- */
-
-static __device__ float getGBVI_L( float r, float x, float S )
-{
-
-   float rInv   = 1.0f/r;
-   float xInv   = 1.0f/x;
-
-   float xInv2  = xInv*xInv;
-   float diff2  = (r + S)*(r - S);
-
-   return (1.5f*xInv2)*( (0.25f*rInv) - (xInv/3.0f) + (0.125f*diff2*xInv2*rInv) );
-}
-
-static __device__ float getGBVI_Volume( float r, float R, float S )
-{
-
-    float addOn      = 0.0f;
-    int mask         = 1;
-    float lowerBound = (r - S); 
-
-    float diff       = (S - R); 
-    if( fabsf( diff ) < r ){
-        lowerBound = R > lowerBound ? R : lowerBound; 
-    } else if( r <= diff ){
-        addOn      = (1.0f/(R*R*R));
-    } else {
-        mask       = 0;
-    }   
-    float s2         = getGBVI_L( r, lowerBound, S );
-    float s1         = getGBVI_L( r, (r + S),    S );
-    s1               = mask ? (s1 - s2 + addOn) : 0.0f;
-    return s1;
-}
-
-static __device__ float getGBVI_dL_dr( float r, float x, float S )
-{
-
-   float rInv   = 1.0f/r;
-   float rInv2  = rInv*rInv;
-
-   float xInv   = 1.0f/x;
-   float xInv2  = xInv*xInv;
-   float xInv3  = xInv2*xInv;
-   
-   float diff2  = (r + S)*(r - S);
-
-   return ( (-1.5f*xInv2*rInv2)*( 0.25f + 0.125f*diff2*xInv2 ) + 0.375f*xInv3*xInv );
-
-}
-
-static __device__ float getGBVI_dL_drNew( float r, float x, float S )
-{
-
-   float rInv   = 1.0f/r;
-   float rInv2  = rInv*rInv;
-
-   float xInv   = 1.0f/x;
-   float xInv2  = xInv*xInv;
-   
-   float t1     = (S*rInv);
-         t1     = 1.0f + t1*t1;
-
-   return (-0.375f*xInv2)*( rInv2 - 0.5f*xInv2*t1 );
-
-}
-
-static __device__ float getGBVI_dL_dx( float r, float x, float S )
-{
-
-   float rInv   = 1.0f/r;
-
-   float xInv   = 1.0f/x;
-   float xInv2  = xInv*xInv;
-   float xInv3  = xInv2*xInv;
-
-   float diff   = (r + S)*(r - S);
-
-   return ( (-1.5f*xInv3)*( (0.5f*rInv) - xInv + (0.5f*diff*xInv2*rInv) ));
-
-}
-
-static __device__ float getGBVI_dE2Old( float r, float R, float S, float bornForce )
-{
-
-    float diff              = S - R;
-    float absDiff           = fabsf( S - R );
-    float dE                = getGBVI_dL_dr( r, r+S, S ) + getGBVI_dL_dx( r, r+S, S );
-    float mask;
-    float lowerBound;
-    if( (R > (r - S)) && (absDiff < r) ){
-        mask       = 0.0f;
-        lowerBound = R;
-    } else {
-        mask       = 1.0f;
-        lowerBound = (r - S);  
-    }   
-    float dE2              = getGBVI_dL_dr( r, lowerBound, S ) + mask*getGBVI_dL_dx( r, lowerBound, S );
-    dE                    -= (absDiff >= r) && r >= diff ? 0.0f : dE2;
-    dE                     = r < -diff ? 0.0f : dE;
-
-    dE                    *= ( (r > 1.0e-08f) ? (bornForce/r) : 0.0f);
-
-    return (-dE);
-
-}
-
-static __device__ float getGBVI_dE2( float r, float R, float S, float bornForce )
-{
-    float diff  = S - R;
-    float dE    = 0.0f;
-
-    if( fabsf( diff ) < r ){
-        dE = getGBVI_dL_dr( r, r+S, S ) + getGBVI_dL_dx( r, r+S, S );   
-        float lowerBound;
-        float mask;
-        if( R > (r - S) ){
-            lowerBound = R;
-            mask       = 0.0f;
-        } else {
-            lowerBound = r - S;
-            mask       = 1.0f;
-        }
-        dE -= getGBVI_dL_dr( r, lowerBound, S ) + mask*getGBVI_dL_dx( r, lowerBound, S );
-    } else if( r < (S - R) ){
-        dE  =   getGBVI_dL_dr( r, r+S, S ) + getGBVI_dL_dx( r, r+S, S );   
-        dE -= ( getGBVI_dL_dr( r, r-S, S ) + getGBVI_dL_dx( r, r-S, S ) );   
-    }   
-   
-    dE         *= ( (r > 1.0e-08f) ? (bornForce/r) : 0.0f);
-    return (-dE);
-
-}
-
-static __device__ float getGBVIBornForce2( float bornRadius, float R, float bornForce, float gamma )
-{ 
-    float ratio                     = (R/bornRadius);
-    float returnBornForce           = bornForce + (3.0f*gamma*ratio*ratio*ratio)/bornRadius; // 'cavity' term
-    float br2                       = bornRadius*bornRadius;
-          returnBornForce          *= (1.0f/3.0f)*br2*br2;
-
-   return returnBornForce;
-
-}
-
-#endif // __GpuGBVIAUX_H__
--- a/platforms/cuda-old/src/kernels/kCalculateGBVIBornSum.cu
+++ b/platforms/cuda-old/src/kernels/kCalculateGBVIBornSum.cu
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * Permission is hereby granted, free of charge, to any person obtaining a    *
- * copy of this software and associated documentation files (the "Software"), *
- * to deal in the Software without restriction, including without limitation  *
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
- * and/or sell copies of the Software, and to permit persons to whom the      *
- * Software is furnished to do so, subject to the following conditions:       *
- *                                                                            *
- * The above copyright notice and this permission notice shall be included in *
- * all copies or substantial portions of the Software.                        *
- *                                                                            *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
- * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
- * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
- * -------------------------------------------------------------------------- */
-
-#include <stdio.h>
-#include <cuda.h>
-#include <vector_functions.h>
-#include <cstdlib>
-#include <string>
-#include <iostream>
-#include <fstream>
-using namespace std;
-
-#include "gputypes.h"
-
-#define UNROLLXX 0
-#define UNROLLXY 0
-
-struct Atom {
-    float x;
-    float y;
-    float z;
-    float r;
-    float sr;
-    float sum;
-    float gamma;
-};
-
-static __constant__ cudaGmxSimulation cSim;
-
-void SetCalculateGBVIBornSumSim(gpuContext gpu)
-{
-    cudaError_t status;
-    status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));     
-    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
-}
-
-void GetCalculateGBVIBornSumSim(gpuContext gpu)
-{
-    cudaError_t status;
-    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));     
-    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
-}
-
-// Include versions of the kernels for N^2 calculations.
-
-#define METHOD_NAME(a, b) a##N2##b
-#include "kCalculateGBVIBornSum.h"
-#define USE_OUTPUT_BUFFER_PER_WARP
-#undef METHOD_NAME
-#define METHOD_NAME(a, b) a##N2ByWarp##b
-#include "kCalculateGBVIBornSum.h"
-
-// Include versions of the kernels with cutoffs.
-
-#undef METHOD_NAME
-#undef USE_OUTPUT_BUFFER_PER_WARP
-#define USE_CUTOFF
-#define METHOD_NAME(a, b) a##Cutoff##b
-#include "kCalculateGBVIBornSum.h"
-#define USE_OUTPUT_BUFFER_PER_WARP
-#undef METHOD_NAME
-#define METHOD_NAME(a, b) a##CutoffByWarp##b
-#include "kCalculateGBVIBornSum.h"
-
-// Include versions of the kernels with periodic boundary conditions.
-
-#undef METHOD_NAME
-#undef USE_OUTPUT_BUFFER_PER_WARP
-#define USE_PERIODIC
-#define METHOD_NAME(a, b) a##Periodic##b
-#include "kCalculateGBVIBornSum.h"
-#define USE_OUTPUT_BUFFER_PER_WARP
-#undef METHOD_NAME
-#define METHOD_NAME(a, b) a##PeriodicByWarp##b
-#include "kCalculateGBVIBornSum.h"
-
-/**---------------------------------------------------------------------------------------
-
-   Compute quintic spline value and associated derviative
-
-   @param x                   value to compute spline at
-   @param rl                  lower cutoff value
-   @param ru                  upper cutoff value
-   @param outValue            value of spline at x
-   @param outDerivative       value of derivative of spline at x
-
-   --------------------------------------------------------------------------------------- */
-
-static __device__ void quinticSpline_kernel( float x, float rl, float ru,
-                                             float* outValue, float* outDerivative ){
-
-   // ---------------------------------------------------------------------------------------
-
-   const float one           =    1.0f;
-   const float minusSix      =   -6.0f;
-   const float minusTen      =  -10.0f;
-   const float minusThirty   =  -30.0f;
-   const float fifteen       =   15.0f;
-   const float sixty         =   60.0f;
-
-   // ---------------------------------------------------------------------------------------
-
-   float numerator    = x  - rl;
-   float denominator  = ru - rl;
-   float ratio        = numerator/denominator;
-   float ratio2       = ratio*ratio;
-   float ratio3       = ratio2*ratio;
-
-   *outValue               = one + ratio3*(minusTen + fifteen*ratio + minusSix*ratio2);
-   *outDerivative          = ratio2*(minusThirty + sixty*ratio + minusThirty*ratio2)/denominator;
-}
-
-/**---------------------------------------------------------------------------------------
-
-   Compute Born radii based on Eq. 3 of Labute paper [JCC 29 p. 1693-1698 2008])
-   and quintic splice switching function
-
-   @param atomicRadius3       atomic radius cubed
-   @param bornSum             Born sum (volume integral)
-   @param bornRadius          output Born radius
-   @param switchDeriviative   output switching function deriviative
-
-   --------------------------------------------------------------------------------------- */
-
-__device__ void computeBornRadiiUsingQuinticSpline( float atomicRadius3, float bornSum,
-                                                    float* bornRadius, float* switchDeriviative ){
-
-   // ---------------------------------------------------------------------------------------
-
-   const float zero          =   0.0f;
-   const float one           =   1.0f;
-   const float minusOneThird =  (-1.0f/3.0f);
-
-   // ---------------------------------------------------------------------------------------
-
-   // R                = [ S(V)*(A - V) ]**(-1/3)
-
-   // S(V)             = 1                                 V < L
-   // S(V)             = qSpline + U/(A-V)                 L < V < A
-   // S(V)             = U/(A-V)                           U < V 
-
-   // dR/dr            = (-1/3)*[ S(V)*(A - V) ]**(-4/3)*[ d{ S(V)*(A-V) }/dr
-
-   // d{ S(V)*(A-V) }/dr   = (dV/dr)*[ (A-V)*dS/dV - S(V) ]
-
-   //  (A - V)*dS/dV - S(V)  = 0 - 1                             V < L
-
-   //  (A - V)*dS/dV - S(V)  = (A-V)*d(qSpline) + (A-V)*U/(A-V)**2 - qSpline - U/(A-V) 
-
-	//                        = (A-V)*d(qSpline) - qSpline        L < V < A**(-3)
-
-   //  (A - V)*dS/dV - S(V)  = (A-V)*U*/(A-V)**2 - U/(A-V) = 0   U < V
-
-   float splineL          = cSim.gbviQuinticLowerLimitFactor*atomicRadius3;
-   float sum;
-   if( bornSum > splineL ){
-      if( bornSum < atomicRadius3 ){
-         float splineValue, splineDerivative;
-         quinticSpline_kernel( bornSum, splineL, atomicRadius3, &splineValue, &splineDerivative ); 
-         sum                 = (atomicRadius3 - bornSum)*splineValue + cSim.gbviQuinticUpperBornRadiusLimit;
-         *switchDeriviative  = splineValue - (atomicRadius3 - bornSum)*splineDerivative;
-      } else {   
-         sum                 = cSim.gbviQuinticUpperBornRadiusLimit;
-         *switchDeriviative  = zero;
-      }
-   } else {
-      sum                = atomicRadius3 - bornSum; 
-      *switchDeriviative = one;
-   }
-   *bornRadius = powf( sum, minusOneThird );
-}
-
-__global__ void kReduceGBVIBornSum_kernel()
-{
-    unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
-    
-    while (pos < cSim.atoms)
-    {
-        float sum = 0.0f;
-        float* pSt = cSim.pBornSum + pos;
-        float4 atom = cSim.pGBVIData[pos];
-        
-        // Get summed Born data
-        for (int i = 0; i < cSim.nonbondOutputBuffers; i++)
-        {
-            sum += *pSt;
-            pSt += cSim.stride;
-        }
-        
-        // Now calculate Born radius
-
-        float Rinv           = 1.0f/atom.x;
-        Rinv                 = Rinv*Rinv*Rinv;
-        if( cSim.gbviBornRadiusScalingMethod == 0 ){
-            sum                             = Rinv - sum; 
-            cSim.pBornRadii[pos]            = powf( sum, (-1.0f/3.0f) ); 
-            cSim.pGBVISwitchDerivative[pos] = 1.0f; 
-        } else {
-            float bornRadius;
-            float switchDeriviative;
-            computeBornRadiiUsingQuinticSpline( Rinv, sum, &bornRadius, &switchDeriviative );
-            cSim.pBornRadii[pos]             = bornRadius; 
-            cSim.pGBVISwitchDerivative[pos]  = switchDeriviative; 
-        }
-        pos += gridDim.x * blockDim.x;
-    }   
-}
-
-void kReduceGBVIBornSum(gpuContext gpu)
-{
-    //printf("kReduceGBVIBornSum\n");
-    kReduceGBVIBornSum_kernel<<<gpu->sim.blocks, 384>>>();
-    gpu->bRecalculateBornRadii = false;
-    LAUNCHERROR("kReduceGBVIBornSum");
-}
-
-static int isNanOrInfinity( float number ){
-    return (number != number || number == std::numeric_limits<float>::infinity() || number == -std::numeric_limits<float>::infinity()) ? 1 : 0; 
-}
-
-void kPrintGBVI( gpuContext gpu, std::string callId, int call, FILE* log)
-{
-
-    gpu->psGBVIData->Download();
-    gpu->psBornRadii->Download();
-    gpu->psGBVISwitchDerivative->Download();
-    gpu->psBornForce->Download();
-    gpu->psPosq4->Download();
-    gpu->psSigEps2->Download();
-
-    int printOnlyOnNan = 1;
-    int foundNan       = 0;
-    if( printOnlyOnNan ){
-        for( unsigned int ii = 0; ii < gpu->sim.paddedNumberOfAtoms && foundNan == 0; ii++ ){
-            foundNan  += isNanOrInfinity( gpu->psBornRadii->_pSysData[ii] );
-            foundNan  += isNanOrInfinity( gpu->psBornForce->_pSysData[ii] );
-            foundNan  += isNanOrInfinity( gpu->psGBVISwitchDerivative->_pSysData[ii] );
-        }
-        if( foundNan ){
-            log = stderr;
-            (void) fprintf( log, "kPrintGBVI found nan \n", gpu->sim.paddedNumberOfAtoms );
-            for( unsigned int ii = 0; ii < gpu->sim.paddedNumberOfAtoms; ii++ ){
-                (void) fprintf( log, "%6d %15.7e %15.7e %15.7e\n", ii,
-                                gpu->psPosq4->_pSysData[ii].x,
-                                gpu->psPosq4->_pSysData[ii].y,
-                                gpu->psPosq4->_pSysData[ii].z );
-            }
-        }
-    }
-    if( !printOnlyOnNan || foundNan ){
-        (void) fprintf( log, "kPrintGBVI Cuda comp bR bF prm    sigeps2\n" );
-        (void) fprintf( stderr, "kCalculateGBVIBornSum: bOutputBufferPerWarp=%u blks=%u th/blk=%u wu=%u %u shrd=%u\n", gpu->bOutputBufferPerWarp,
-                        gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, gpu->sim.workUnits, gpu->psWorkUnit->_pSysStream[0][0],
-                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block );
-        (void) fprintf( stderr, "bR bF swd r scR ...\n" );
-        for( unsigned int ii = 0; ii < gpu->sim.paddedNumberOfAtoms; ii++ ){
-            (void) fprintf( log, "%6d %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e\n", ii,
-                        gpu->psBornRadii->_pSysData[ii],
-                        gpu->psBornForce->_pSysData[ii],
-                        gpu->psGBVISwitchDerivative->_pSysData[ii],
-                        gpu->psGBVIData->_pSysData[ii].x,
-                        gpu->psGBVIData->_pSysData[ii].y,
-                        gpu->psGBVIData->_pSysData[ii].z,
-                        gpu->psGBVIData->_pSysData[ii].w,
-
-                        gpu->psSigEps2->_pSysData[ii].x,
-                        gpu->psSigEps2->_pSysData[ii].y );
-
-        }   
-        if( foundNan ){
-            exit(0);
-        }
-    }   
-
-}
-
-void kCalculateGBVIBornSum(gpuContext gpu)
-{
-    //printf("kCalculateGBVIBornSum\n");
-
-    switch (gpu->sim.nonbondedMethod)
-    {
-        case NO_CUTOFF:
-            if (gpu->bOutputBufferPerWarp){
-                kCalculateGBVIN2ByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit);
-            } else {
-                kCalculateGBVIN2BornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit);
-            }
-            break;
-
-        case CUTOFF:
-            if (gpu->bOutputBufferPerWarp)
-                kCalculateGBVICutoffByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
-            else
-                kCalculateGBVICutoffBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit );
-            break;
-
-        case PERIODIC:
-            if (gpu->bOutputBufferPerWarp)
-                kCalculateGBVIPeriodicByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit );
-            else
-                kCalculateGBVIPeriodicBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
-                        (sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit );
-            break;
-
-    }
-    LAUNCHERROR("kCalculateGBVIBornSum");
-}
--- a/platforms/cuda-old/src/kernels/kCalculateGBVIBornSum.h
+++ b/platforms/cuda-old/src/kernels/kCalculateGBVIBornSum.h
-/* -------------------------------------------------------------------------- *
- *                                   OpenMM                                   *
- * -------------------------------------------------------------------------- *
- * This is part of the OpenMM molecular simulation toolkit originating from   *
- * Simbios, the NIH National Center for Physics-Based Simulation of           *
- * Biological Structures at Stanford, funded under the NIH Roadmap for        *
- * Medical Research, grant U54 GM072970. See https://simtk.org.               *
- *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
- * Authors: Scott Le Grand, Peter Eastman                                     *
- * Contributors:                                                              *
- *                                                                            *
- * Permission is hereby granted, free of charge, to any person obtaining a    *
- * copy of this software and associated documentation files (the "Software"), *
- * to deal in the Software without restriction, including without limitation  *
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
- * and/or sell copies of the Software, and to permit persons to whom the      *
- * Software is furnished to do so, subject to the following conditions:       *
- *                                                                            *
- * The above copyright notice and this permission notice shall be included in *
- * all copies or substantial portions of the Software.                        *
- *                                                                            *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
- * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
- * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
- * -------------------------------------------------------------------------- */
-
-/**
- * This file contains the kernel for calculating Born sums.  It is included
- * several times in kCalculateGBVIBornSum.cu with different #defines to generate
- * different versions of the kernels.
- */
-
-#include "kCalculateGBVIAux.h"
-
-__global__ 
-#if (__CUDA_ARCH__ >= 200)
-__launch_bounds__(GF1XX_NONBOND_THREADS_PER_BLOCK, 1)
-#elif (__CUDA_ARCH__ >= 120)
-__launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1)
-#else
-__launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
-#endif
-void METHOD_NAME(kCalculateGBVI, BornSum_kernel)(unsigned int* workUnit)
-{
-    extern __shared__ volatile Atom sA[];
-
-    unsigned int totalWarps       = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
-    unsigned int warp             = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
-    unsigned int numWorkUnits     = cSim.pInteractionCount[0];
-    unsigned int pos              = warp*numWorkUnits/totalWarps;
-    unsigned int end              = (warp+1)*numWorkUnits/totalWarps;
-
-#ifdef USE_CUTOFF
-    volatile float* tempBuffer    = (volatile float*) &sA[cSim.nonbond_threads_per_block];
-#endif
-
-    while ( pos < end )
-    {
-        // Extract cell coordinates from appropriate work unit
-        unsigned int x = workUnit[pos];
-        unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
-        x              = (x >> 17)           << GRIDBITS;
-
-        float       dx;
-        float       dy;
-        float       dz;
-        float       r2;
-        float       r;
-
-        // forces tgx into interval [0,31] 
-        // forces tbx 0 
-        unsigned int tgx     = threadIdx.x & (GRID - 1);
-        unsigned int tbx     = threadIdx.x - tgx;
-        unsigned int tj      = tgx;
-        volatile Atom* psA   = &sA[tbx];
-
-        if (x == y) // Handle diagonals uniquely at 50% efficiency
-        {
-            // Read fixed atom data into registers and GRF
-            unsigned int i                          = x + tgx;
-            float4 apos                             = cSim.pPosq[i];    // Local atom x, y, z, sum
-            float4 ar                               = cSim.pGBVIData[i];  // Local atom vr, sr
-            sA[threadIdx.x].x                       = apos.x;
-            sA[threadIdx.x].y                       = apos.y;
-            sA[threadIdx.x].z                       = apos.z;
-            sA[threadIdx.x].r                       = ar.x;
-            sA[threadIdx.x].sr                      = ar.y;
-            apos.w                                  = 0.0f;
-
-            for (unsigned int j = 0; j < GRID; j++)
-            {
-                dx                                  = psA[j].x - apos.x;
-                dy                                  = psA[j].y - apos.y;
-                dz                                  = psA[j].z - apos.z;
-#ifdef USE_PERIODIC
-                dx                                 -= floorf(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
-                dy                                 -= floorf(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
-                dz                                 -= floorf(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
-#endif
-                r2                                  = dx * dx + dy * dy + dz * dz;
-#if defined USE_CUTOFF
-                if (i < cSim.atoms && x+j < cSim.atoms && r2 < cSim.nonbondedCutoffSqr && j != tgx)
-#else
-                if (i < cSim.atoms && x+j < cSim.atoms && j != tgx )
-#endif
-                {
-                    r                       = sqrtf(r2);
-                    apos.w                 += getGBVI_Volume( r, ar.x, psA[j].sr );
-                }
-            }
-
-            // Write results
-#ifdef USE_OUTPUT_BUFFER_PER_WARP
-            unsigned int offset = x + tgx + warp*cSim.stride;
-            cSim.pBornSum[offset] += apos.w;
-#else
-            unsigned int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
-            cSim.pBornSum[offset] = apos.w;
-#endif
-        } else {
-
-            // Read fixed atom data into registers and GRF
-            unsigned int j                              = y + tgx;
-            unsigned int i                              = x + tgx;
-
-            float4 temp                                 = cSim.pPosq[j];
-            float4 temp1                                = cSim.pGBVIData[j];
-
-            float4 apos                                 = cSim.pPosq[i];        // Local atom x, y, z, sum
-            float4 ar                                   = cSim.pGBVIData[i];    // Local atom vr, sr
-
-            sA[threadIdx.x].x                           = temp.x;
-            sA[threadIdx.x].y                           = temp.y;
-            sA[threadIdx.x].z                           = temp.z;
-
-            sA[threadIdx.x].r                           = temp1.x;
-            sA[threadIdx.x].sr                          = temp1.y;
-
-            sA[threadIdx.x].sum                         = 0.0f;
-            apos.w                                      = 0.0f;
-
-#ifdef USE_CUTOFF
-            unsigned int flags                          = cSim.pInteractionFlag[pos];
-            if (flags == 0)
-            {
-                // No interactions in this block.
-            }
-            else if (flags == 0xFFFFFFFF)
-#endif
-            {
-                // Compute all interactions within this block.
-
-                for (unsigned int j = 0; j < GRID; j++)
-                {
-                    dx                      = psA[tj].x - apos.x;
-                    dy                      = psA[tj].y - apos.y;
-                    dz                      = psA[tj].z - apos.z;
-#ifdef USE_PERIODIC
-                    dx                     -= floorf(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
-                    dy                     -= floorf(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
-                    dz                     -= floorf(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
-#endif
-                    r2                      = dx * dx + dy * dy + dz * dz;
-#ifdef USE_CUTOFF
-                    if (i < cSim.atoms && y+tj < cSim.atoms && r2 < cSim.nonbondedCutoffSqr)
-#else
-                    if (i < cSim.atoms && y+tj < cSim.atoms )
-#endif
-
-                    {
-                        r                       = sqrtf(r2);
-                        apos.w                 += getGBVI_Volume( r, ar.x,      psA[tj].sr );
-                        psA[tj].sum            += getGBVI_Volume( r, psA[tj].r, ar.y );
-                    }
-                    tj = (tj - 1) & (GRID - 1);
-                }
-            }
-#ifdef USE_CUTOFF
-            else
-            {
-                // Compute only a subset of the interactions in this block.
-
-                for (unsigned int j = 0; j < GRID; j++)
-                {
-                    if ((flags&(1<<j)) != 0)
-                    {
-                        tempBuffer[threadIdx.x] = 0.0f;
-                        dx                      = psA[j].x - apos.x;
-                        dy                      = psA[j].y - apos.y;
-                        dz                      = psA[j].z - apos.z;
-#ifdef USE_PERIODIC
-                        dx                     -= floorf(dx*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
-                        dy                     -= floorf(dy*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
-                        dz                     -= floorf(dz*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
-#endif
-                        r2                      = dx * dx + dy * dy + dz * dz;
-#ifdef USE_CUTOFF
-                        if (i < cSim.atoms && y+j < cSim.atoms && r2 < cSim.nonbondedCutoffSqr)
-#else
-                        if (i < cSim.atoms && y+j < cSim.atoms)
-#endif
-                        {
-                            r                       = sqrtf(r2);
-                            apos.w                 += getGBVI_Volume( r, ar.x,      psA[j].sr );
-                            tempBuffer[threadIdx.x] = getGBVI_Volume( r, psA[j].r, ar.y );
-                        }
-
-                        // Sum the terms.
-
-                        if (tgx % 2 == 0)
-                            tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+1];
-                        if (tgx % 4 == 0)
-                            tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+2];
-                        if (tgx % 8 == 0)
-                            tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+4];
-                        if (tgx % 16 == 0)
-                            tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+8];
-                        if (tgx == 0)
-                            psA[j].sum += tempBuffer[threadIdx.x] + tempBuffer[threadIdx.x+16];
-                    }
-                }
-            }
-#endif
-
-            // Write results
-#ifdef USE_OUTPUT_BUFFER_PER_WARP
-            unsigned int offset = x + tgx + warp*cSim.stride;
-            cSim.pBornSum[offset] += apos.w;
-            offset = y + tgx + warp*cSim.stride;
-            cSim.pBornSum[offset] += sA[threadIdx.x].sum;
-#else
-            unsigned int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
-            cSim.pBornSum[offset] = apos.w;
-            offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
-            cSim.pBornSum[offset] = sA[threadIdx.x].sum;
-#endif
-        }
-
-        pos++;
-    }
-}