Merge https://github.com/openmm/openmm

5a06df78 · tic20 · 8dd60914 · a9223eea · 5a06df78 · 5a06df78
Commit 5a06df78 authored Mar 04, 2020 by tic20
20 changed files
--- a/platforms/cuda/src/kernels/customNonbonded.cu
+++ b/platforms/cuda/src/kernels/customNonbonded.cu
--- a/platforms/opencl/src/kernels/customNonbondedGroups.cl
+++ b/platforms/opencl/src/kernels/customNonbondedGroups.cl
-#ifdef SUPPORTS_64_BIT_ATOMICS
-#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
-#endif
-
 typedef struct {
    real x, y, z;
    real q;
@@ -16,60 +12,69 @@ typedef struct {
 * Find the maximum of a value across all threads in a warp, and return that to
 * every thread.
 */
-int reduceMax(int val, __local int* temp) {
-    int indexInWarp = get_local_id(0)%32;
-    temp[get_local_id(0)] = val;
+DEVICE int reduceMax(int val, LOCAL_ARG int* temp) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+    // CUDA lets us do this slightly more efficiently by using shuffle operations.
+    for (int mask = 16; mask > 0; mask /= 2)
+        val = max(val, __shfl_xor_sync(0xffffffff, val, mask));
+    return val;
+#else
+    int indexInWarp = LOCAL_ID%32;
+    temp[LOCAL_ID] = val;
    SYNC_WARPS;
    for (int offset = 16; offset > 0; offset /= 2) {
-        if (offset < indexInWarp)
-            temp[get_local_id(0)] = max(temp[get_local_id(0)], temp[get_local_id(0)+offset]);
+        if (indexInWarp < offset)
+            temp[LOCAL_ID] = max(temp[LOCAL_ID], temp[LOCAL_ID+offset]);
        SYNC_WARPS;
    }
-    return temp[get_local_id(0)-indexInWarp];
+    return temp[LOCAL_ID-indexInWarp];
+#endif
 }

+#ifndef SUPPORTS_64_BIT_ATOMICS
 /**
 * This function is used on devices that don't support 64 bit atomics.  Multiple threads within
 * a single tile might have computed forces on the same atom.  This loops over them and makes sure
 * that only one thread updates the force on any given atom.
 */
-void writeForces(__global real4* forceBuffers,__local AtomData* localData, int atomIndex) {
-    localData[get_local_id(0)].x = atomIndex;
+void writeForces(GLOBAL real4* forceBuffers, LOCAL AtomData* localData, int atomIndex) {
+    localData[LOCAL_ID].x = atomIndex;
    SYNC_WARPS;
-    real4 forceSum = (real4) 0;
-    int start = (get_local_id(0)/TILE_SIZE)*TILE_SIZE;
+    real4 forceSum = make_real4(0);
+    int start = (LOCAL_ID/TILE_SIZE)*TILE_SIZE;
    int end = start+32;
    bool isFirst = true;
    for (int i = start; i < end; i++)
        if (localData[i].x == atomIndex) {
            forceSum += (real4) (localData[i].fx, localData[i].fy, localData[i].fz, 0);
-            isFirst &= (i >= get_local_id(0));
+            isFirst &= (i >= LOCAL_ID);
        }
-    const unsigned int warp = get_global_id(0)/TILE_SIZE;
+    const unsigned int warp = GLOBAL_ID/TILE_SIZE;
    unsigned int offset = atomIndex + warp*PADDED_NUM_ATOMS;
    if (isFirst)
        forceBuffers[offset] += forceSum;
    SYNC_WARPS;
 }
+#endif

-__kernel void computeInteractionGroups(
+KERNEL void computeInteractionGroups(
 #ifdef SUPPORTS_64_BIT_ATOMICS
-        __global long* restrict forceBuffers,
+        GLOBAL mm_ulong* RESTRICT forceBuffers,
 #else
-        __global real4* restrict forceBuffers,
+        GLOBAL real4* RESTRICT forceBuffers,
 #endif
-        __global mixed* restrict energyBuffer, __global const real4* restrict posq, __global const int4* restrict groupData,
-        __global int* restrict numGroupTiles, int useNeighborList,
+        GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT posq, GLOBAL const int4* RESTRICT groupData,
+        GLOBAL const int* RESTRICT numGroupTiles, int useNeighborList,
        real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
        PARAMETER_ARGUMENTS) {
-    const unsigned int totalWarps = get_global_size(0)/TILE_SIZE;
-    const unsigned int warp = get_global_id(0)/TILE_SIZE; // global warpIndex
-    const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); // index within the warp
-    const unsigned int tbx = get_local_id(0) - tgx;           // block warpIndex
+    const unsigned int totalWarps = GLOBAL_SIZE/TILE_SIZE;
+    const unsigned int warp = GLOBAL_ID/TILE_SIZE; // global warpIndex
+    const unsigned int tgx = LOCAL_ID & (TILE_SIZE-1); // index within the warp
+    const unsigned int tbx = LOCAL_ID - tgx;           // block warpIndex
    mixed energy = 0;
    INIT_DERIVATIVES
-    __local AtomData localData[LOCAL_MEMORY_SIZE];
-    __local int reductionBuffer[LOCAL_MEMORY_SIZE];
+    LOCAL AtomData localData[LOCAL_MEMORY_SIZE];
+    LOCAL int reductionBuffer[LOCAL_MEMORY_SIZE];

    const unsigned int startTile = (useNeighborList ? warp*numGroupTiles[0]/totalWarps : FIRST_TILE+warp*(LAST_TILE-FIRST_TILE)/totalWarps);
    const unsigned int endTile = (useNeighborList ? (warp+1)*numGroupTiles[0]/totalWarps : FIRST_TILE+(warp+1)*(LAST_TILE-FIRST_TILE)/totalWarps);
@@ -82,16 +87,16 @@ __kernel void computeInteractionGroups(
        const int exclusions = atomData.w;
        real4 posq1 = posq[atom1];
        LOAD_ATOM1_PARAMETERS
-        real4 force = (real4) (0);
+        real3 force = make_real3(0);
        real4 posq2 = posq[atom2];
-        localData[get_local_id(0)].x = posq2.x;
-        localData[get_local_id(0)].y = posq2.y;
-        localData[get_local_id(0)].z = posq2.z;
-        localData[get_local_id(0)].q = posq2.w;
+        localData[LOCAL_ID].x = posq2.x;
+        localData[LOCAL_ID].y = posq2.y;
+        localData[LOCAL_ID].z = posq2.z;
+        localData[LOCAL_ID].q = posq2.w;
        LOAD_LOCAL_PARAMETERS
-        localData[get_local_id(0)].fx = 0.0f;
-        localData[get_local_id(0)].fy = 0.0f;
-        localData[get_local_id(0)].fz = 0.0f;
+        localData[LOCAL_ID].fx = 0.0f;
+        localData[LOCAL_ID].fy = 0.0f;
+        localData[LOCAL_ID].fz = 0.0f;
        int tj = tgx;
        int rangeStop = rangeStart + reduceMax(rangeEnd-rangeStart, reductionBuffer);
        SYNC_WARPS;
@@ -99,8 +104,8 @@ __kernel void computeInteractionGroups(
            if (j < rangeEnd) {
                bool isExcluded = (((exclusions>>tj)&1) == 0);
                int localIndex = tbx+tj;
-                posq2 = (real4) (localData[localIndex].x, localData[localIndex].y, localData[localIndex].z, localData[localIndex].q);
-                real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
+                posq2 = make_real4(localData[localIndex].x, localData[localIndex].y, localData[localIndex].z, localData[localIndex].q);
+                real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
 #ifdef USE_PERIODIC
                APPLY_PERIODIC_TO_DELTA(delta)
 #endif
@@ -117,35 +122,38 @@ __kernel void computeInteractionGroups(
                    COMPUTE_INTERACTION
                    energy += tempEnergy;
                    delta *= dEdR;
-                    force.xyz -= delta.xyz;
+                    force.x -= delta.x;
+                    force.y -= delta.y;
+                    force.z -= delta.z;
                    localData[localIndex].fx += delta.x;
                    localData[localIndex].fy += delta.y;
                    localData[localIndex].fz += delta.z;
 #ifdef USE_CUTOFF
                }
 #endif
+                tj = (tj == rangeEnd-1 ? rangeStart : tj+1);
            }
-            tj = (tj == rangeEnd-1 ? rangeStart : tj+1);
            SYNC_WARPS;
        }
 #ifdef SUPPORTS_64_BIT_ATOMICS
        if (exclusions != 0) {
-            atom_add(&forceBuffers[atom1], (long) (force.x*0x100000000));
-            atom_add(&forceBuffers[atom1+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
-            atom_add(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
+            ATOMIC_ADD(&forceBuffers[atom1], (mm_ulong) ((mm_long) (force.x*0x100000000)));
+            ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.y*0x100000000)));
+            ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.z*0x100000000)));
        }
-        atom_add(&forceBuffers[atom2], (long) (localData[get_local_id(0)].fx*0x100000000));
-        atom_add(&forceBuffers[atom2+PADDED_NUM_ATOMS], (long) (localData[get_local_id(0)].fy*0x100000000));
-        atom_add(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (long) (localData[get_local_id(0)].fz*0x100000000));
+        ATOMIC_ADD(&forceBuffers[atom2], (mm_ulong) ((mm_long) (localData[LOCAL_ID].fx*0x100000000)));
+        ATOMIC_ADD(&forceBuffers[atom2+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (localData[LOCAL_ID].fy*0x100000000)));
+        ATOMIC_ADD(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (localData[LOCAL_ID].fz*0x100000000)));
+        SYNC_WARPS;
 #else
        writeForces(forceBuffers, localData, atom2);
-        localData[get_local_id(0)].fx = force.x;
-        localData[get_local_id(0)].fy = force.y;
-        localData[get_local_id(0)].fz = force.z;
+        localData[LOCAL_ID].fx = force.x;
+        localData[LOCAL_ID].fy = force.y;
+        localData[LOCAL_ID].fz = force.z;
        writeForces(forceBuffers, localData, atom1);
 #endif
    }
-    energyBuffer[get_global_id(0)] += energy;
+    energyBuffer[GLOBAL_ID] += energy;
    SAVE_DERIVATIVES
 }

@@ -153,7 +161,7 @@ __kernel void computeInteractionGroups(
 * If the neighbor list needs to be rebuilt, reset the number of tiles to 0.  This is
 * executed by a single thread.
 */
-__kernel void prepareToBuildNeighborList(__global int* restrict rebuildNeighborList, __global int* restrict numGroupTiles) {
+KERNEL void prepareToBuildNeighborList(GLOBAL int* RESTRICT rebuildNeighborList, GLOBAL int* RESTRICT numGroupTiles) {
    if (rebuildNeighborList[0] == 1)
        numGroupTiles[0] = 0;
 }
@@ -162,8 +170,8 @@ __kernel void prepareToBuildNeighborList(__global int* restrict rebuildNeighborL
 * Filter the list of tiles to include only ones that have interactions within the
 * padded cutoff.
 */
-__kernel void buildNeighborList(__global int* restrict rebuildNeighborList, __global int* restrict numGroupTiles,
-        __global const real4* restrict posq, __global const int4* restrict groupData, __global int4* restrict filteredGroupData,
+KERNEL void buildNeighborList(GLOBAL int* RESTRICT rebuildNeighborList, GLOBAL int* RESTRICT numGroupTiles,
+        GLOBAL const real4* RESTRICT posq, GLOBAL const int4* RESTRICT groupData, GLOBAL int4* RESTRICT filteredGroupData,
        real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) {
    
    // If the neighbor list doesn't need to be rebuilt on this step, return immediately.
@@ -171,15 +179,15 @@ __kernel void buildNeighborList(__global int* restrict rebuildNeighborList, __gl
    if (rebuildNeighborList[0] == 0)
        return;

-    const unsigned int totalWarps = get_global_size(0)/TILE_SIZE;
-    const unsigned int warp = get_global_id(0)/TILE_SIZE; // global warpIndex
-    const unsigned int local_warp = get_local_id(0)/TILE_SIZE; // local warpIndex
-    const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); // index within the warp
-    const unsigned int tbx = get_local_id(0) - tgx;           // block warpIndex
-    __local real4 localPos[LOCAL_MEMORY_SIZE];
-    __local volatile bool anyInteraction[WARPS_IN_BLOCK];
-    __local volatile int tileIndex[WARPS_IN_BLOCK];
-    __local int reductionBuffer[LOCAL_MEMORY_SIZE];
+    const unsigned int totalWarps = GLOBAL_SIZE/TILE_SIZE;
+    const unsigned int warp = GLOBAL_ID/TILE_SIZE; // global warpIndex
+    const unsigned int local_warp = LOCAL_ID/TILE_SIZE; // local warpIndex
+    const unsigned int tgx = LOCAL_ID & (TILE_SIZE-1); // index within the warp
+    const unsigned int tbx = LOCAL_ID - tgx;           // block warpIndex
+    LOCAL real4 localPos[LOCAL_MEMORY_SIZE];
+    LOCAL volatile bool anyInteraction[WARPS_IN_BLOCK];
+    LOCAL volatile int tileIndex[WARPS_IN_BLOCK];
+    LOCAL int reductionBuffer[LOCAL_MEMORY_SIZE];

    const unsigned int startTile = warp*NUM_TILES/totalWarps;
    const unsigned int endTile = (warp+1)*NUM_TILES/totalWarps;
@@ -191,7 +199,7 @@ __kernel void buildNeighborList(__global int* restrict rebuildNeighborList, __gl
        const int rangeEnd = (atomData.z>>16)&0xFFFF;
        const int exclusions = atomData.w;
        real4 posq1 = posq[atom1];
-        localPos[get_local_id(0)] = posq[atom2];
+        localPos[LOCAL_ID] = posq[atom2];
        if (tgx == 0)
            anyInteraction[local_warp] = false;
        int tj = tgx;
@@ -199,10 +207,10 @@ __kernel void buildNeighborList(__global int* restrict rebuildNeighborList, __gl
        SYNC_WARPS;
        for (int j = rangeStart; j < rangeStop && !anyInteraction[local_warp]; j++) {
            SYNC_WARPS;
-            if (j < rangeEnd) {
+            if (j < rangeEnd && tj < rangeEnd) {
                bool isExcluded = (((exclusions>>tj)&1) == 0);
                int localIndex = tbx+tj;
-                real4 delta = (real4) (localPos[localIndex].xyz - posq1.xyz, 0);
+                real3 delta = make_real3(localPos[localIndex].x-posq1.x, localPos[localIndex].y-posq1.y, localPos[localIndex].z-posq1.z);
 #ifdef USE_PERIODIC
                APPLY_PERIODIC_TO_DELTA(delta)
 #endif
@@ -216,7 +224,7 @@ __kernel void buildNeighborList(__global int* restrict rebuildNeighborList, __gl
        if (anyInteraction[local_warp]) {
            SYNC_WARPS;
            if (tgx == 0)
-                tileIndex[local_warp] = atomic_add(numGroupTiles, 1);
+                tileIndex[local_warp] = ATOMIC_ADD(numGroupTiles, 1);
            SYNC_WARPS;
            filteredGroupData[TILE_SIZE*tileIndex[local_warp]+tgx] = atomData;
        }

--- a/platforms/cuda/src/kernels/gayBerne.cu
+++ b/platforms/cuda/src/kernels/gayBerne.cu
@@ -4,10 +4,10 @@
 /**
 * Calculate the ellipsoid coordinate frames and associated matrices.
 */
-extern "C" __global__ void computeEllipsoidFrames(int numParticles, const real4* __restrict__ posq, int2* const __restrict__ axisParticleIndices,
-        const float4* __restrict__ sigParams, const float4* __restrict__ scale, real* __restrict__ aMatrix,
-        real* __restrict__ bMatrix, real* __restrict__ gMatrix, const int* sortedParticles) {
-    for (int sortedIndex = blockIdx.x*blockDim.x+threadIdx.x; sortedIndex < numParticles; sortedIndex += blockDim.x*gridDim.x) {
+KERNEL void computeEllipsoidFrames(int numParticles, GLOBAL const real4* RESTRICT posq, GLOBAL int2* const RESTRICT axisParticleIndices,
+        GLOBAL const float4* RESTRICT sigParams, GLOBAL const float4* RESTRICT scale, GLOBAL real* RESTRICT aMatrix,
+        GLOBAL real* RESTRICT bMatrix, GLOBAL real* RESTRICT gMatrix, GLOBAL const int* sortedParticles) {
+    for (int sortedIndex = GLOBAL_ID; sortedIndex < numParticles; sortedIndex += GLOBAL_SIZE) {
        // Compute the local coordinate system of the ellipsoid;

        int originalIndex = sortedParticles[sortedIndex];
@@ -36,9 +36,9 @@ extern "C" __global__ void computeEllipsoidFrames(int numParticles, const real4*

        // Compute matrices we will need later.

-        real (*a)[3] = (real (*)[3]) (aMatrix+sortedIndex*9);
-        real (*b)[3] = (real (*)[3]) (bMatrix+sortedIndex*9);
-        real (*g)[3] = (real (*)[3]) (gMatrix+sortedIndex*9);
+        GLOBAL real (*a)[3] = (GLOBAL real (*)[3]) (aMatrix+sortedIndex*9);
+        GLOBAL real (*b)[3] = (GLOBAL real (*)[3]) (bMatrix+sortedIndex*9);
+        GLOBAL real (*g)[3] = (GLOBAL real (*)[3]) (gMatrix+sortedIndex*9);
        a[0][0] = xdir.x;
        a[0][1] = xdir.y;
        a[0][2] = xdir.z;
@@ -62,10 +62,10 @@ extern "C" __global__ void computeEllipsoidFrames(int numParticles, const real4*
 /**
 * Find a bounding box for the atoms in each block.
 */
-extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
-        const int* sortedAtoms, const real4* __restrict__ posq, real4* __restrict__ sortedPos, real4* __restrict__ blockCenter,
-        real4* __restrict__ blockBoundingBox, int* __restrict__ neighborBlockCount) {
-    int index = blockIdx.x*blockDim.x+threadIdx.x;
+KERNEL void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
+        GLOBAL const int* sortedAtoms, GLOBAL const real4* RESTRICT posq, GLOBAL real4* RESTRICT sortedPos, GLOBAL real4* RESTRICT blockCenter,
+        GLOBAL real4* RESTRICT blockBoundingBox, GLOBAL int* RESTRICT neighborBlockCount) {
+    int index = GLOBAL_ID;
    int base = index*TILE_SIZE;
    while (base < numAtoms) {
        real4 pos = posq[sortedAtoms[base]];
@@ -89,19 +89,19 @@ extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize,
        real4 blockSize = 0.5f*(maxPos-minPos);
        blockBoundingBox[index] = blockSize;
        blockCenter[index] = 0.5f*(maxPos+minPos);
-        index += blockDim.x*gridDim.x;
+        index += GLOBAL_SIZE;
        base = index*TILE_SIZE;
    }
-    if (blockIdx.x*blockDim.x+threadIdx.x == 0)
+    if (GLOBAL_ID == 0)
        *neighborBlockCount = 0;
 }

 /**
 * This is called by findNeighbors() to write a block to the neighbor list.
 */
-__device__ void storeNeighbors(int atom1, int* neighborBuffer, int numAtomsInBuffer, int maxNeighborBlocks, int* __restrict__ neighbors,
-        int* __restrict__ neighborIndex, int* __restrict__ neighborBlockCount) {
-    int blockIndex = atomicAdd(neighborBlockCount, 1);
+DEVICE void storeNeighbors(int atom1, int* neighborBuffer, int numAtomsInBuffer, int maxNeighborBlocks, GLOBAL int* RESTRICT neighbors,
+        GLOBAL int* RESTRICT neighborIndex, GLOBAL int* RESTRICT neighborBlockCount) {
+    int blockIndex = ATOMIC_ADD(neighborBlockCount, 1);
    if (blockIndex >= maxNeighborBlocks)
        return; // We don't have enough room for the neighbor list.
    neighborIndex[blockIndex] = atom1;
@@ -115,12 +115,12 @@ __device__ void storeNeighbors(int atom1, int* neighborBuffer, int numAtomsInBuf
 /**
 * Build a list of neighbors for each atom.
 */
-extern "C" __global__ void findNeighbors(int numAtoms, int maxNeighborBlocks, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
-        real4* __restrict__ sortedPos, real4* __restrict__ blockCenter, real4* __restrict__ blockBoundingBox, int* __restrict__ neighbors,
-        int* __restrict__ neighborIndex, int* __restrict__ neighborBlockCount, const int* __restrict__ exclusions, const int* __restrict__ exclusionStartIndex) {
+KERNEL void findNeighbors(int numAtoms, int maxNeighborBlocks, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
+        GLOBAL real4* RESTRICT sortedPos, GLOBAL real4* RESTRICT blockCenter, GLOBAL real4* RESTRICT blockBoundingBox, GLOBAL int* RESTRICT neighbors,
+        GLOBAL int* RESTRICT neighborIndex, GLOBAL int* RESTRICT neighborBlockCount, GLOBAL const int* RESTRICT exclusions, GLOBAL const int* RESTRICT exclusionStartIndex) {
    const int numBlocks = (numAtoms+TILE_SIZE-1)/TILE_SIZE;
    int neighborBuffer[NEIGHBOR_BLOCK_SIZE];
-    for (int atom1 = blockIdx.x*blockDim.x+threadIdx.x; atom1 < numAtoms; atom1 += blockDim.x*gridDim.x) {
+    for (int atom1 = GLOBAL_ID; atom1 < numAtoms; atom1 += GLOBAL_SIZE) {
        int nextExclusion = exclusionStartIndex[atom1];
        int lastExclusion = exclusionStartIndex[atom1+1];
        real4 pos = sortedPos[atom1];
@@ -178,8 +178,8 @@ typedef struct {
    real a[3][3], b[3][3], g[3][3];
 } AtomData;

-__device__ void loadAtomData(AtomData* data, int sortedIndex, int originalIndex, const real4* __restrict__ pos, const float4* __restrict__ sigParams,
-        const float2* __restrict__ epsParams, const real* __restrict__ aMatrix, const real* __restrict__ bMatrix, const real* __restrict__ gMatrix) {
+DEVICE void loadAtomData(AtomData* data, int sortedIndex, int originalIndex, GLOBAL const real4* RESTRICT pos, GLOBAL const float4* RESTRICT sigParams,
+        GLOBAL const float2* RESTRICT epsParams, GLOBAL const real* RESTRICT aMatrix, GLOBAL const real* RESTRICT bMatrix, GLOBAL const real* RESTRICT gMatrix) {
    data->sig = sigParams[originalIndex];
    data->eps = epsParams[originalIndex];
    data->pos = trimTo3(pos[sortedIndex]);
@@ -192,19 +192,19 @@ __device__ void loadAtomData(AtomData* data, int sortedIndex, int originalIndex,
        }
 }

-inline __device__ real3 matrixVectorProduct(real (*m)[3], real3 v) {
+inline DEVICE real3 matrixVectorProduct(real (*m)[3], real3 v) {
    return make_real3(m[0][0]*v.x + m[0][1]*v.y + m[0][2]*v.z,
                      m[1][0]*v.x + m[1][1]*v.y + m[1][2]*v.z,
                      m[2][0]*v.x + m[2][1]*v.y + m[2][2]*v.z);
 }

-inline __device__ real3 vectorMatrixProduct(real3 v, real (*m)[3]) {
+inline DEVICE real3 vectorMatrixProduct(real3 v, real (*m)[3]) {
    return make_real3(m[0][0]*v.x + m[1][0]*v.y + m[2][0]*v.z,
                      m[0][1]*v.x + m[1][1]*v.y + m[2][1]*v.z,
                      m[0][2]*v.x + m[1][2]*v.y + m[2][2]*v.z);
 }

-inline __device__ void matrixSum(real (*result)[3], real (*a)[3], real (*b)[3]) {
+inline DEVICE void matrixSum(real (*result)[3], real (*a)[3], real (*b)[3]) {
    result[0][0] = a[0][0]+b[0][0];
    result[0][1] = a[0][1]+b[0][1];
    result[0][2] = a[0][2]+b[0][2];
@@ -216,12 +216,12 @@ inline __device__ void matrixSum(real (*result)[3], real (*a)[3], real (*b)[3])
    result[2][2] = a[2][2]+b[2][2];
 }

-inline __device__ real determinant(real (*m)[3]) {
+inline DEVICE real determinant(real (*m)[3]) {
    return (m[0][0]*m[1][1]*m[2][2] + m[0][1]*m[1][2]*m[2][0] + m[0][2]*m[1][0]*m[2][1] -
            m[0][0]*m[1][2]*m[2][1] - m[0][1]*m[1][0]*m[2][2] - m[0][2]*m[1][1]*m[2][0]);
 }

-inline __device__ void matrixInverse(real (*result)[3], real (*m)[3]) {
+inline DEVICE void matrixInverse(real (*result)[3], real (*m)[3]) {
    real invDet = RECIP(determinant(m));
    result[0][0] = invDet*(m[1][1]*m[2][2] - m[1][2]*m[2][1]);
    result[1][0] = -invDet*(m[1][0]*m[2][2] - m[1][2]*m[2][0]);
@@ -234,7 +234,7 @@ inline __device__ void matrixInverse(real (*result)[3], real (*m)[3]) {
    result[2][2] = invDet*(m[0][0]*m[1][1] - m[0][1]*m[1][0]);
 }

-__device__ void computeOneInteraction(AtomData* data1, AtomData* data2, real sigma, real epsilon, real3 dr, real r2, real3* force1, real3* force2, real3* torque1, real3* torque2, mixed *totalEnergy) {
+DEVICE void computeOneInteraction(AtomData* data1, AtomData* data2, real sigma, real epsilon, real3 dr, real r2, real3* force1, real3* force2, real3* torque1, real3* torque2, mixed *totalEnergy) {
    real rInv = RSQRT(r2);
    real r = r2*rInv;
    real3 drUnit = dr*rInv;
@@ -335,25 +335,25 @@ __device__ void computeOneInteraction(AtomData* data1, AtomData* data2, real sig
 /**
 * Compute the interactions.
 */
-extern "C" __global__ void computeForce(
-        unsigned long long* __restrict__ forceBuffers, unsigned long long* __restrict__ torqueBuffers,
-        int numAtoms, int numExceptions, mixed* __restrict__ energyBuffer, const real4* __restrict__ pos,
-        const float4* __restrict__ sigParams, const float2* __restrict__ epsParams, const int* __restrict__ sortedAtoms,
-        const real* __restrict__ aMatrix, const real* __restrict__ bMatrix, const real* __restrict__ gMatrix,
-        const int* __restrict__ exclusions, const int* __restrict__ exclusionStartIndex,
-        const int4* __restrict__ exceptionParticles, const float2* __restrict__ exceptionParams
+KERNEL void computeForce(
+        GLOBAL mm_ulong* RESTRICT forceBuffers, GLOBAL mm_ulong* RESTRICT torqueBuffers,
+        int numAtoms, int numExceptions, GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT pos,
+        GLOBAL const float4* RESTRICT sigParams, GLOBAL const float2* RESTRICT epsParams, GLOBAL const int* RESTRICT sortedAtoms,
+        GLOBAL const real* RESTRICT aMatrix, GLOBAL const real* RESTRICT bMatrix, GLOBAL const real* RESTRICT gMatrix,
+        GLOBAL const int* RESTRICT exclusions, GLOBAL const int* RESTRICT exclusionStartIndex,
+        GLOBAL const int4* RESTRICT exceptionParticles, GLOBAL const float2* RESTRICT exceptionParams
 #ifdef USE_CUTOFF
-        , int maxNeighborBlocks, int* __restrict__ neighbors, int* __restrict__ neighborIndex, int* __restrict__ neighborBlockCount,
+        , int maxNeighborBlocks, GLOBAL int* RESTRICT neighbors, GLOBAL int* RESTRICT neighborIndex, GLOBAL int* RESTRICT neighborBlockCount,
        real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
 #endif
        ) {
-    const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
+    const unsigned int warp = GLOBAL_ID/TILE_SIZE;
    mixed energy = 0;
 #ifdef USE_CUTOFF
    const int numBlocks = *neighborBlockCount;
    if (numBlocks > maxNeighborBlocks)
        return; // There wasn't enough memory for the neighbor list.
-    for (int block = blockIdx.x*blockDim.x+threadIdx.x; block < numBlocks; block += blockDim.x*gridDim.x) {
+    for (int block = GLOBAL_ID; block < numBlocks; block += GLOBAL_SIZE) {
        // Load parameters for atom1.
        
        int atom1 = neighborIndex[block];
@@ -384,22 +384,22 @@ extern "C" __global__ void computeForce(
            real sigma = data1.sig.x+data2.sig.x;
            real epsilon = data1.eps.x*data2.eps.x;
            computeOneInteraction(&data1, &data2, sigma, epsilon, delta, r2, &force1, &force2, &torque1, &torque2, &energy);
-            atomicAdd(&forceBuffers[index2], static_cast<unsigned long long>((long long) (force2.x*0x100000000)));
-            atomicAdd(&forceBuffers[index2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force2.y*0x100000000)));
-            atomicAdd(&forceBuffers[index2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force2.z*0x100000000)));
-            atomicAdd(&torqueBuffers[index2], static_cast<unsigned long long>((long long) (torque2.x*0x100000000)));
-            atomicAdd(&torqueBuffers[index2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (torque2.y*0x100000000)));
-            atomicAdd(&torqueBuffers[index2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (torque2.z*0x100000000)));
+            ATOMIC_ADD(&forceBuffers[index2], (mm_ulong) ((mm_long) (force2.x*0x100000000)));
+            ATOMIC_ADD(&forceBuffers[index2+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force2.y*0x100000000)));
+            ATOMIC_ADD(&forceBuffers[index2+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force2.z*0x100000000)));
+            ATOMIC_ADD(&torqueBuffers[index2], (mm_ulong) ((mm_long) (torque2.x*0x100000000)));
+            ATOMIC_ADD(&torqueBuffers[index2+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (torque2.y*0x100000000)));
+            ATOMIC_ADD(&torqueBuffers[index2+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (torque2.z*0x100000000)));
        }
-        atomicAdd(&forceBuffers[index1], static_cast<unsigned long long>((long long) (force1.x*0x100000000)));
-        atomicAdd(&forceBuffers[index1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force1.y*0x100000000)));
-        atomicAdd(&forceBuffers[index1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force1.z*0x100000000)));
-        atomicAdd(&torqueBuffers[index1], static_cast<unsigned long long>((long long) (torque1.x*0x100000000)));
-        atomicAdd(&torqueBuffers[index1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (torque1.y*0x100000000)));
-        atomicAdd(&torqueBuffers[index1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (torque1.z*0x100000000)));
+        ATOMIC_ADD(&forceBuffers[index1], (mm_ulong) ((mm_long) (force1.x*0x100000000)));
+        ATOMIC_ADD(&forceBuffers[index1+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force1.y*0x100000000)));
+        ATOMIC_ADD(&forceBuffers[index1+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force1.z*0x100000000)));
+        ATOMIC_ADD(&torqueBuffers[index1], (mm_ulong) ((mm_long) (torque1.x*0x100000000)));
+        ATOMIC_ADD(&torqueBuffers[index1+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (torque1.y*0x100000000)));
+        ATOMIC_ADD(&torqueBuffers[index1+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (torque1.z*0x100000000)));
    }
 #else
-    for (int atom1 = blockIdx.x*blockDim.x+threadIdx.x; atom1 < numAtoms; atom1 += blockDim.x*gridDim.x) {
+    for (int atom1 = GLOBAL_ID; atom1 < numAtoms; atom1 += GLOBAL_SIZE) {
        // Load parameters for atom1.
        
        int index1 = sortedAtoms[atom1];
@@ -432,25 +432,25 @@ extern "C" __global__ void computeForce(
            real sigma = data1.sig.x+data2.sig.x;
            real epsilon = data1.eps.x*data2.eps.x;
            computeOneInteraction(&data1, &data2, sigma, epsilon, delta, r2, &force1, &force2, &torque1, &torque2, &energy);
-            atomicAdd(&forceBuffers[index2], static_cast<unsigned long long>((long long) (force2.x*0x100000000)));
-            atomicAdd(&forceBuffers[index2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force2.y*0x100000000)));
-            atomicAdd(&forceBuffers[index2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force2.z*0x100000000)));
-            atomicAdd(&torqueBuffers[index2], static_cast<unsigned long long>((long long) (torque2.x*0x100000000)));
-            atomicAdd(&torqueBuffers[index2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (torque2.y*0x100000000)));
-            atomicAdd(&torqueBuffers[index2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (torque2.z*0x100000000)));
+            ATOMIC_ADD(&forceBuffers[index2], (mm_ulong) ((mm_long) (force2.x*0x100000000)));
+            ATOMIC_ADD(&forceBuffers[index2+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force2.y*0x100000000)));
+            ATOMIC_ADD(&forceBuffers[index2+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force2.z*0x100000000)));
+            ATOMIC_ADD(&torqueBuffers[index2], (mm_ulong) ((mm_long) (torque2.x*0x100000000)));
+            ATOMIC_ADD(&torqueBuffers[index2+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (torque2.y*0x100000000)));
+            ATOMIC_ADD(&torqueBuffers[index2+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (torque2.z*0x100000000)));
        }
-        atomicAdd(&forceBuffers[index1], static_cast<unsigned long long>((long long) (force1.x*0x100000000)));
-        atomicAdd(&forceBuffers[index1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force1.y*0x100000000)));
-        atomicAdd(&forceBuffers[index1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force1.z*0x100000000)));
-        atomicAdd(&torqueBuffers[index1], static_cast<unsigned long long>((long long) (torque1.x*0x100000000)));
-        atomicAdd(&torqueBuffers[index1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (torque1.y*0x100000000)));
-        atomicAdd(&torqueBuffers[index1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (torque1.z*0x100000000)));
+        ATOMIC_ADD(&forceBuffers[index1], (mm_ulong) ((mm_long) (force1.x*0x100000000)));
+        ATOMIC_ADD(&forceBuffers[index1+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force1.y*0x100000000)));
+        ATOMIC_ADD(&forceBuffers[index1+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force1.z*0x100000000)));
+        ATOMIC_ADD(&torqueBuffers[index1], (mm_ulong) ((mm_long) (torque1.x*0x100000000)));
+        ATOMIC_ADD(&torqueBuffers[index1+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (torque1.y*0x100000000)));
+        ATOMIC_ADD(&torqueBuffers[index1+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (torque1.z*0x100000000)));
    }
 #endif
    
    // Now compute exceptions.
    
-    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numExceptions; index += blockDim.x*gridDim.x) {
+    for (int index = GLOBAL_ID; index < numExceptions; index += GLOBAL_SIZE) {
        int4 atomIndices = exceptionParticles[index];
        float2 params = exceptionParams[index];
        int index1 = atomIndices.x, index2 = atomIndices.y;
@@ -466,34 +466,34 @@ extern "C" __global__ void computeForce(
        if (r2 < CUTOFF_SQUARED) {
 #endif
            computeOneInteraction(&data1, &data2, params.x, params.y, delta, r2, &force1, &force2, &torque1, &torque2, &energy);
-            atomicAdd(&forceBuffers[index1], static_cast<unsigned long long>((long long) (force1.x*0x100000000)));
-            atomicAdd(&forceBuffers[index1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force1.y*0x100000000)));
-            atomicAdd(&forceBuffers[index1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force1.z*0x100000000)));
-            atomicAdd(&forceBuffers[index2], static_cast<unsigned long long>((long long) (force2.x*0x100000000)));
-            atomicAdd(&forceBuffers[index2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force2.y*0x100000000)));
-            atomicAdd(&forceBuffers[index2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force2.z*0x100000000)));
-            atomicAdd(&torqueBuffers[index1], static_cast<unsigned long long>((long long) (torque1.x*0x100000000)));
-            atomicAdd(&torqueBuffers[index1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (torque1.y*0x100000000)));
-            atomicAdd(&torqueBuffers[index1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (torque1.z*0x100000000)));
-            atomicAdd(&torqueBuffers[index2], static_cast<unsigned long long>((long long) (torque2.x*0x100000000)));
-            atomicAdd(&torqueBuffers[index2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (torque2.y*0x100000000)));
-            atomicAdd(&torqueBuffers[index2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (torque2.z*0x100000000)));
+            ATOMIC_ADD(&forceBuffers[index1], (mm_ulong) ((mm_long) (force1.x*0x100000000)));
+            ATOMIC_ADD(&forceBuffers[index1+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force1.y*0x100000000)));
+            ATOMIC_ADD(&forceBuffers[index1+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force1.z*0x100000000)));
+            ATOMIC_ADD(&forceBuffers[index2], (mm_ulong) ((mm_long) (force2.x*0x100000000)));
+            ATOMIC_ADD(&forceBuffers[index2+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force2.y*0x100000000)));
+            ATOMIC_ADD(&forceBuffers[index2+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force2.z*0x100000000)));
+            ATOMIC_ADD(&torqueBuffers[index1], (mm_ulong) ((mm_long) (torque1.x*0x100000000)));
+            ATOMIC_ADD(&torqueBuffers[index1+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (torque1.y*0x100000000)));
+            ATOMIC_ADD(&torqueBuffers[index1+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (torque1.z*0x100000000)));
+            ATOMIC_ADD(&torqueBuffers[index2], (mm_ulong) ((mm_long) (torque2.x*0x100000000)));
+            ATOMIC_ADD(&torqueBuffers[index2+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (torque2.y*0x100000000)));
+            ATOMIC_ADD(&torqueBuffers[index2+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (torque2.z*0x100000000)));
 #ifdef USE_CUTOFF
        }
 #endif
    }
-    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
+    energyBuffer[GLOBAL_ID] += energy;
 }

 /**
 * Convert the torques to forces on the connected particles.
 */
-extern "C" __global__ void applyTorques(
-        unsigned long long* __restrict__ forceBuffers, long long* __restrict__ torqueBuffers,
-        int numParticles, const real4* __restrict__ posq, int2* const __restrict__ axisParticleIndices,
-        const int* sortedParticles) {
-    const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
-    for (int sortedIndex = blockIdx.x*blockDim.x+threadIdx.x; sortedIndex < numParticles; sortedIndex += blockDim.x*gridDim.x) {
+KERNEL void applyTorques(
+        GLOBAL mm_ulong* RESTRICT forceBuffers, GLOBAL const mm_long* RESTRICT torqueBuffers,
+        int numParticles, GLOBAL const real4* RESTRICT posq, GLOBAL int2* const RESTRICT axisParticleIndices,
+        GLOBAL const int* sortedParticles) {
+    const unsigned int warp = GLOBAL_ID/TILE_SIZE;
+    for (int sortedIndex = GLOBAL_ID; sortedIndex < numParticles; sortedIndex += GLOBAL_SIZE) {
        int originalIndex = sortedParticles[sortedIndex];
        real3 pos = trimTo3(posq[originalIndex]);
        int2 axisParticles = axisParticleIndices[originalIndex];
@@ -522,16 +522,16 @@ extern "C" __global__ void applyTorques(
                yforce += f;
                force -= f;
            }
-            atomicAdd(&forceBuffers[originalIndex], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
-            atomicAdd(&forceBuffers[originalIndex+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
-            atomicAdd(&forceBuffers[originalIndex+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
-            atomicAdd(&forceBuffers[axisParticles.x], static_cast<unsigned long long>((long long) (xforce.x*0x100000000)));
-            atomicAdd(&forceBuffers[axisParticles.x+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (xforce.y*0x100000000)));
-            atomicAdd(&forceBuffers[axisParticles.x+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (xforce.z*0x100000000)));
+            ATOMIC_ADD(&forceBuffers[originalIndex], (mm_ulong) ((mm_long) (force.x*0x100000000)));
+            ATOMIC_ADD(&forceBuffers[originalIndex+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.y*0x100000000)));
+            ATOMIC_ADD(&forceBuffers[originalIndex+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.z*0x100000000)));
+            ATOMIC_ADD(&forceBuffers[axisParticles.x], (mm_ulong) ((mm_long) (xforce.x*0x100000000)));
+            ATOMIC_ADD(&forceBuffers[axisParticles.x+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (xforce.y*0x100000000)));
+            ATOMIC_ADD(&forceBuffers[axisParticles.x+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (xforce.z*0x100000000)));
            if (axisParticles.y != -1) {
-                atomicAdd(&forceBuffers[axisParticles.y], static_cast<unsigned long long>((long long) (yforce.x*0x100000000)));
-                atomicAdd(&forceBuffers[axisParticles.y+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (yforce.y*0x100000000)));
-                atomicAdd(&forceBuffers[axisParticles.y+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (yforce.z*0x100000000)));
+                ATOMIC_ADD(&forceBuffers[axisParticles.y], (mm_ulong) ((mm_long) (yforce.x*0x100000000)));
+                ATOMIC_ADD(&forceBuffers[axisParticles.y+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (yforce.y*0x100000000)));
+                ATOMIC_ADD(&forceBuffers[axisParticles.y+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (yforce.z*0x100000000)));
            }
        }
    }

--- a/platforms/opencl/src/kernels/gbsaObc.cl
+++ b/platforms/opencl/src/kernels/gbsaObc.cl
-#ifdef SUPPORTS_64_BIT_ATOMICS
-#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
-#endif
 #define WARPS_PER_GROUP (FORCE_WORK_GROUP_SIZE/TILE_SIZE)

 typedef struct {
@@ -13,26 +10,26 @@ typedef struct {
 /**
 * Compute the Born sum.
 */
-__kernel void computeBornSum(
+KERNEL void computeBornSum(
 #ifdef SUPPORTS_64_BIT_ATOMICS
-        __global long* restrict global_bornSum,
+        GLOBAL mm_ulong* RESTRICT global_bornSum,
 #else
-        __global real* restrict global_bornSum,
+        GLOBAL real* RESTRICT global_bornSum,
 #endif
-        __global const real4* restrict posq, __global const real* restrict charge, __global const float2* restrict global_params,
+        GLOBAL const real4* RESTRICT posq, GLOBAL const real* RESTRICT charge, GLOBAL const float2* RESTRICT global_params,
 #ifdef USE_CUTOFF
-        __global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, 
-        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, __global const real4* restrict blockCenter,
-        __global const real4* restrict blockSize, __global const int* restrict interactingAtoms,
+        GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
+        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, GLOBAL const real4* RESTRICT blockCenter,
+        GLOBAL const real4* RESTRICT blockSize, GLOBAL const int* RESTRICT interactingAtoms,
 #else
        unsigned int numTiles,
 #endif
-        __global const ushort2* exclusionTiles) {
-    const unsigned int totalWarps = get_global_size(0)/TILE_SIZE;
-    const unsigned int warp = get_global_id(0)/TILE_SIZE;
-    const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
-    const unsigned int tbx = get_local_id(0) - tgx;
-    __local AtomData1 localData[FORCE_WORK_GROUP_SIZE];
+        GLOBAL const ushort2* RESTRICT exclusionTiles) {
+    const unsigned int totalWarps = GLOBAL_SIZE/TILE_SIZE;
+    const unsigned int warp = GLOBAL_ID/TILE_SIZE;
+    const unsigned int tgx = LOCAL_ID & (TILE_SIZE-1);
+    const unsigned int tbx = LOCAL_ID - tgx;
+    LOCAL AtomData1 localData[FORCE_WORK_GROUP_SIZE];

    // First loop: process tiles that contain exclusions.
    
@@ -42,7 +39,7 @@ __kernel void computeBornSum(
        const ushort2 tileIndices = exclusionTiles[pos];
        const unsigned int x = tileIndices.x;
        const unsigned int y = tileIndices.y;
-        real bornSum = 0.0f;
+        real bornSum = 0;
        unsigned int atom1 = x*TILE_SIZE + tgx;
        real4 posq1 = posq[atom1];
        real charge1 = charge[atom1];
@@ -50,15 +47,15 @@ __kernel void computeBornSum(
        if (x == y) {
            // This tile is on the diagonal.

-            localData[get_local_id(0)].x = posq1.x;
-            localData[get_local_id(0)].y = posq1.y;
-            localData[get_local_id(0)].z = posq1.z;
-            localData[get_local_id(0)].q = charge1;
-            localData[get_local_id(0)].radius = params1.x;
-            localData[get_local_id(0)].scaledRadius = params1.y;
+            localData[LOCAL_ID].x = posq1.x;
+            localData[LOCAL_ID].y = posq1.y;
+            localData[LOCAL_ID].z = posq1.z;
+            localData[LOCAL_ID].q = charge1;
+            localData[LOCAL_ID].radius = params1.x;
+            localData[LOCAL_ID].scaledRadius = params1.y;
            SYNC_WARPS;
            for (unsigned int j = 0; j < TILE_SIZE; j++) {
-                real4 delta = (real4) (localData[tbx+j].x-posq1.x, localData[tbx+j].y-posq1.y, localData[tbx+j].z-posq1.z, 0);
+                real3 delta = make_real3(localData[tbx+j].x-posq1.x, localData[tbx+j].y-posq1.y, localData[tbx+j].z-posq1.z);
 #ifdef USE_PERIODIC
                APPLY_PERIODIC_TO_DELTA(delta)
 #endif
@@ -70,7 +67,7 @@ __kernel void computeBornSum(
 #endif
                    real invR = RSQRT(r2);
                    real r = r2*invR;
-                    float2 params2 = (float2) (localData[tbx+j].radius, localData[tbx+j].scaledRadius);
+                    float2 params2 = make_float2(localData[tbx+j].radius, localData[tbx+j].scaledRadius);
                    real rScaledRadiusJ = r+params2.y;
                    if ((j != tgx) && (params1.x < rScaledRadiusJ)) {
                        real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
@@ -91,21 +88,21 @@ __kernel void computeBornSum(

            unsigned int j = y*TILE_SIZE + tgx;
            real4 tempPosq = posq[j];
-            localData[get_local_id(0)].x = tempPosq.x;
-            localData[get_local_id(0)].y = tempPosq.y;
-            localData[get_local_id(0)].z = tempPosq.z;
-            localData[get_local_id(0)].q = charge[j];
+            localData[LOCAL_ID].x = tempPosq.x;
+            localData[LOCAL_ID].y = tempPosq.y;
+            localData[LOCAL_ID].z = tempPosq.z;
+            localData[LOCAL_ID].q = charge[j];
            float2 tempParams = global_params[j];
-            localData[get_local_id(0)].radius = tempParams.x;
-            localData[get_local_id(0)].scaledRadius = tempParams.y;
-            localData[get_local_id(0)].bornSum = 0.0f;
+            localData[LOCAL_ID].radius = tempParams.x;
+            localData[LOCAL_ID].scaledRadius = tempParams.y;
+            localData[LOCAL_ID].bornSum = 0.0f;
            SYNC_WARPS;

            // Compute the full set of interactions in this tile.

            unsigned int tj = tgx;
            for (j = 0; j < TILE_SIZE; j++) {
-                real4 delta = (real4) (localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z, 0);
+                real3 delta = make_real3(localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z);
 #ifdef USE_PERIODIC
                APPLY_PERIODIC_TO_DELTA(delta)
 #endif
@@ -117,7 +114,7 @@ __kernel void computeBornSum(
 #endif
                    real invR = RSQRT(r2);
                    real r = r2*invR;
-                    float2 params2 = (float2) (localData[tbx+tj].radius, localData[tbx+tj].scaledRadius);
+                    float2 params2 = make_float2(localData[tbx+tj].radius, localData[tbx+tj].scaledRadius);
                    real rScaledRadiusJ = r+params2.y;
                    if (params1.x < rScaledRadiusJ) {
                        real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
@@ -151,17 +148,17 @@ __kernel void computeBornSum(

 #ifdef SUPPORTS_64_BIT_ATOMICS
        unsigned int offset = x*TILE_SIZE + tgx;
-        atom_add(&global_bornSum[offset], (long) (bornSum*0x100000000));
+        ATOMIC_ADD(&global_bornSum[offset], (mm_ulong) ((mm_long) (bornSum*0x100000000)));
        if (x != y) {
            offset = y*TILE_SIZE + tgx;
-            atom_add(&global_bornSum[offset], (long) (localData[get_local_id(0)].bornSum*0x100000000));
+            ATOMIC_ADD(&global_bornSum[offset], (mm_ulong) ((mm_long) (localData[LOCAL_ID].bornSum*0x100000000)));
        }
 #else
        unsigned int offset1 = x*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
        unsigned int offset2 = y*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
        global_bornSum[offset1] += bornSum;
        if (x != y)
-            global_bornSum[offset2] += localData[get_local_id(0)].bornSum;
+            global_bornSum[offset2] += localData[LOCAL_ID].bornSum;
 #endif
    }

@@ -172,17 +169,17 @@ __kernel void computeBornSum(
    unsigned int numTiles = interactionCount[0];
    if (numTiles > maxTiles)
        return; // There wasn't enough memory for the neighbor list.
-    int pos = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
-    int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
+    int pos = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : (mm_long)numTiles)/totalWarps);
+    int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : (mm_long)numTiles)/totalWarps);
 #else
-    int pos = (int) (warp*(long)numTiles/totalWarps);
-    int end = (int) ((warp+1)*(long)numTiles/totalWarps);
+    int pos = (int) (warp*(mm_long)numTiles/totalWarps);
+    int end = (int) ((warp+1)*(mm_long)numTiles/totalWarps);
 #endif
    int skipBase = 0;
    int currentSkipIndex = tbx;
-    __local int atomIndices[FORCE_WORK_GROUP_SIZE];
-    __local volatile int skipTiles[FORCE_WORK_GROUP_SIZE];
-    skipTiles[get_local_id(0)] = -1;
+    LOCAL int atomIndices[FORCE_WORK_GROUP_SIZE];
+    LOCAL volatile int skipTiles[FORCE_WORK_GROUP_SIZE];
+    skipTiles[LOCAL_ID] = -1;

    while (pos < end) {
        real bornSum = 0;
@@ -213,10 +210,10 @@ __kernel void computeBornSum(
            SYNC_WARPS;
            if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
                ushort2 tile = exclusionTiles[skipBase+tgx];
-                skipTiles[get_local_id(0)] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
+                skipTiles[LOCAL_ID] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
            }
            else
-                skipTiles[get_local_id(0)] = end;
+                skipTiles[LOCAL_ID] = end;
            skipBase += TILE_SIZE;            
            currentSkipIndex = tbx;
            SYNC_WARPS;
@@ -238,17 +235,17 @@ __kernel void computeBornSum(
 #else
            unsigned int j = y*TILE_SIZE + tgx;
 #endif
-            atomIndices[get_local_id(0)] = j;
+            atomIndices[LOCAL_ID] = j;
            if (j < PADDED_NUM_ATOMS) {
                real4 tempPosq = posq[j];
-                localData[get_local_id(0)].x = tempPosq.x;
-                localData[get_local_id(0)].y = tempPosq.y;
-                localData[get_local_id(0)].z = tempPosq.z;
-                localData[get_local_id(0)].q = charge[j];
+                localData[LOCAL_ID].x = tempPosq.x;
+                localData[LOCAL_ID].y = tempPosq.y;
+                localData[LOCAL_ID].z = tempPosq.z;
+                localData[LOCAL_ID].q = charge[j];
                float2 tempParams = global_params[j];
-                localData[get_local_id(0)].radius = tempParams.x;
-                localData[get_local_id(0)].scaledRadius = tempParams.y;
-                localData[get_local_id(0)].bornSum = 0.0f;
+                localData[LOCAL_ID].radius = tempParams.x;
+                localData[LOCAL_ID].scaledRadius = tempParams.y;
+                localData[LOCAL_ID].bornSum = 0.0f;
            }
            SYNC_WARPS;
 #ifdef USE_PERIODIC
@@ -258,17 +255,17 @@ __kernel void computeBornSum(

                real4 blockCenterX = blockCenter[x];
                APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX)
-                APPLY_PERIODIC_TO_POS_WITH_CENTER(localData[get_local_id(0)], blockCenterX)
+                APPLY_PERIODIC_TO_POS_WITH_CENTER(localData[LOCAL_ID], blockCenterX)
                SYNC_WARPS;
                unsigned int tj = tgx;
                for (j = 0; j < TILE_SIZE; j++) {
-                    real4 delta = (real4) (localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z, 0);
+                    real3 delta = make_real3(localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z);
                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                    int atom2 = atomIndices[tbx+tj];
                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
                        real invR = RSQRT(r2);
                        real r = r2*invR;
-                        float2 params2 = (float2) (localData[tbx+tj].radius, localData[tbx+tj].scaledRadius);
+                        float2 params2 = make_float2(localData[tbx+tj].radius, localData[tbx+tj].scaledRadius);
                        real rScaledRadiusJ = r+params2.y;
                        if (params1.x < rScaledRadiusJ) {
                            real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
@@ -304,7 +301,7 @@ __kernel void computeBornSum(

                unsigned int tj = tgx;
                for (j = 0; j < TILE_SIZE; j++) {
-                    real4 delta = (real4) (localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z, 0);
+                    real3 delta = make_real3(localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z);
 #ifdef USE_PERIODIC
                    APPLY_PERIODIC_TO_DELTA(delta)
 #endif
@@ -317,7 +314,7 @@ __kernel void computeBornSum(
 #endif
                        real invR = RSQRT(r2);
                        real r = r2*invR;
-                        float2 params2 = (float2) (localData[tbx+tj].radius, localData[tbx+tj].scaledRadius);
+                        float2 params2 = make_float2(localData[tbx+tj].radius, localData[tbx+tj].scaledRadius);
                        real rScaledRadiusJ = r+params2.y;
                        if (params1.x < rScaledRadiusJ) {
                            real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
@@ -350,20 +347,20 @@ __kernel void computeBornSum(
            // Write results.

 #ifdef USE_CUTOFF
-            unsigned int atom2 = atomIndices[get_local_id(0)];
+            unsigned int atom2 = atomIndices[LOCAL_ID];
 #else
            unsigned int atom2 = y*TILE_SIZE + tgx;
 #endif
 #ifdef SUPPORTS_64_BIT_ATOMICS
-            atom_add(&global_bornSum[atom1], (long) (bornSum*0x100000000));
+            ATOMIC_ADD(&global_bornSum[atom1], (mm_ulong) ((mm_long) (bornSum*0x100000000)));
            if (atom2 < PADDED_NUM_ATOMS)
-                atom_add(&global_bornSum[atom2], (long) (localData[get_local_id(0)].bornSum*0x100000000));
+                ATOMIC_ADD(&global_bornSum[atom2], (mm_ulong) ((mm_long) (localData[LOCAL_ID].bornSum*0x100000000)));
 #else
            unsigned int offset1 = atom1 + warp*PADDED_NUM_ATOMS;
            unsigned int offset2 = atom2 + warp*PADDED_NUM_ATOMS;
            global_bornSum[offset1] += bornSum;
            if (atom2 < PADDED_NUM_ATOMS)
-                global_bornSum[offset2] += localData[get_local_id(0)].bornSum;
+                global_bornSum[offset2] += localData[LOCAL_ID].bornSum;
 #endif
        }
        pos++;
@@ -381,28 +378,28 @@ typedef struct {
 * First part of computing the GBSA interaction.
 */

-__kernel void computeGBSAForce1(
+KERNEL void computeGBSAForce1(
 #ifdef SUPPORTS_64_BIT_ATOMICS
-        __global long* restrict forceBuffers, __global long* restrict global_bornForce,
+        GLOBAL mm_ulong* RESTRICT forceBuffers, GLOBAL mm_ulong* RESTRICT global_bornForce,
 #else
-        __global real4* restrict forceBuffers, __global real* restrict global_bornForce,
+        GLOBAL real4* RESTRICT forceBuffers, GLOBAL real* RESTRICT global_bornForce,
 #endif
-        __global mixed* restrict energyBuffer, __global const real4* restrict posq, __global const real* restrict charge,
-        __global const real* restrict global_bornRadii, int needEnergy,
+        GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT posq, GLOBAL const real* RESTRICT charge,
+        GLOBAL const real* RESTRICT global_bornRadii, int needEnergy,
 #ifdef USE_CUTOFF
-        __global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, 
-        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, __global const real4* restrict blockCenter,
-        __global const real4* restrict blockSize, __global const int* restrict interactingAtoms,
+        GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, 
+        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, GLOBAL const real4* RESTRICT blockCenter,
+        GLOBAL const real4* RESTRICT blockSize, GLOBAL const int* RESTRICT interactingAtoms,
 #else
        unsigned int numTiles,
 #endif
-        __global const ushort2* exclusionTiles) {
-    const unsigned int totalWarps = get_global_size(0)/TILE_SIZE;
-    const unsigned int warp = get_global_id(0)/TILE_SIZE;
-    const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
-    const unsigned int tbx = get_local_id(0) - tgx;
+        GLOBAL const ushort2* RESTRICT exclusionTiles) {
+    const unsigned int totalWarps = GLOBAL_SIZE/TILE_SIZE;
+    const unsigned int warp = GLOBAL_ID/TILE_SIZE;
+    const unsigned int tgx = LOCAL_ID & (TILE_SIZE-1);
+    const unsigned int tbx = LOCAL_ID - tgx;
    mixed energy = 0;
-    __local AtomData2 localData[FORCE_WORK_GROUP_SIZE];
+    LOCAL AtomData2 localData[FORCE_WORK_GROUP_SIZE];

    // First loop: process tiles that contain exclusions.
    
@@ -412,7 +409,7 @@ __kernel void computeGBSAForce1(
        const ushort2 tileIndices = exclusionTiles[pos];
        const unsigned int x = tileIndices.x;
        const unsigned int y = tileIndices.y;
-        real4 force = 0.0f;
+        real4 force = make_real4(0);
        unsigned int atom1 = x*TILE_SIZE + tgx;
        real4 posq1 = posq[atom1];
        real charge1 = charge[atom1];
@@ -420,18 +417,17 @@ __kernel void computeGBSAForce1(
        if (x == y) {
            // This tile is on the diagonal.

-            const unsigned int localAtomIndex = get_local_id(0);
-            localData[localAtomIndex].x = posq1.x;
-            localData[localAtomIndex].y = posq1.y;
-            localData[localAtomIndex].z = posq1.z;
-            localData[localAtomIndex].q = charge1;
-            localData[get_local_id(0)].bornRadius = bornRadius1;
+            localData[LOCAL_ID].x = posq1.x;
+            localData[LOCAL_ID].y = posq1.y;
+            localData[LOCAL_ID].z = posq1.z;
+            localData[LOCAL_ID].q = charge1;
+            localData[LOCAL_ID].bornRadius = bornRadius1;
            SYNC_WARPS;
            for (unsigned int j = 0; j < TILE_SIZE; j++) {
                if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
-                    real3 pos2 = (real3) (localData[tbx+j].x, localData[tbx+j].y, localData[tbx+j].z);
+                    real3 pos2 = make_real3(localData[tbx+j].x, localData[tbx+j].y, localData[tbx+j].z);
                    real charge2 = localData[tbx+j].q;
-                    real4 delta = (real4) (pos2 - posq1.xyz, 0);
+                    real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
 #ifdef USE_PERIODIC
                    APPLY_PERIODIC_TO_DELTA(delta)
 #endif
@@ -459,8 +455,10 @@ __kernel void computeGBSAForce1(
 #endif
                        if (needEnergy)
                            energy += 0.5f*tempEnergy;
-                        delta.xyz *= dEdR;
-                        force.xyz -= delta.xyz;
+                        delta *= dEdR;
+                        force.x -= delta.x;
+                        force.y -= delta.y;
+                        force.z -= delta.z;
 #ifdef USE_CUTOFF
                    }
 #endif
@@ -473,22 +471,22 @@ __kernel void computeGBSAForce1(

            unsigned int j = y*TILE_SIZE + tgx;
            real4 tempPosq = posq[j];
-            localData[get_local_id(0)].x = tempPosq.x;
-            localData[get_local_id(0)].y = tempPosq.y;
-            localData[get_local_id(0)].z = tempPosq.z;
-            localData[get_local_id(0)].q = charge[j];
-            localData[get_local_id(0)].bornRadius = global_bornRadii[j];
-            localData[get_local_id(0)].fx = 0.0f;
-            localData[get_local_id(0)].fy = 0.0f;
-            localData[get_local_id(0)].fz = 0.0f;
-            localData[get_local_id(0)].fw = 0.0f;
+            localData[LOCAL_ID].x = tempPosq.x;
+            localData[LOCAL_ID].y = tempPosq.y;
+            localData[LOCAL_ID].z = tempPosq.z;
+            localData[LOCAL_ID].q = charge[j];
+            localData[LOCAL_ID].bornRadius = global_bornRadii[j];
+            localData[LOCAL_ID].fx = 0.0f;
+            localData[LOCAL_ID].fy = 0.0f;
+            localData[LOCAL_ID].fz = 0.0f;
+            localData[LOCAL_ID].fw = 0.0f;
            SYNC_WARPS;
            unsigned int tj = tgx;
            for (j = 0; j < TILE_SIZE; j++) {
                if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS) {
-                    real3 pos2 = (real3) (localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z);
+                    real3 pos2 = make_real3(localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z);
                    real charge2 = localData[tbx+tj].q;
-                    real4 delta = (real4) (pos2 - posq1.xyz, 0);
+                    real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
 #ifdef USE_PERIODIC
                    APPLY_PERIODIC_TO_DELTA(delta)
 #endif
@@ -515,8 +513,10 @@ __kernel void computeGBSAForce1(
 #endif
                        if (needEnergy)
                            energy += tempEnergy;
-                        delta.xyz *= dEdR;
-                        force.xyz -= delta.xyz;
+                        delta *= dEdR;
+                        force.x -= delta.x;
+                        force.y -= delta.y;
+                        force.z -= delta.z;
                        localData[tbx+tj].fx += delta.x;
                        localData[tbx+tj].fy += delta.y;
                        localData[tbx+tj].fz += delta.z;
@@ -534,25 +534,25 @@ __kernel void computeGBSAForce1(
        
 #ifdef SUPPORTS_64_BIT_ATOMICS
        unsigned int offset = x*TILE_SIZE + tgx;
-        atom_add(&forceBuffers[offset], (long) (force.x*0x100000000));
-        atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
-        atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
-        atom_add(&global_bornForce[offset], (long) (force.w*0x100000000));
+        ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) ((mm_long) (force.x*0x100000000)));
+        ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.y*0x100000000)));
+        ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.z*0x100000000)));
+        ATOMIC_ADD(&global_bornForce[offset], (mm_ulong) ((mm_long) (force.w*0x100000000)));
        if (x != y) {
            offset = y*TILE_SIZE + tgx;
-            atom_add(&forceBuffers[offset], (long) (localData[get_local_id(0)].fx*0x100000000));
-            atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (localData[get_local_id(0)].fy*0x100000000));
-            atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (localData[get_local_id(0)].fz*0x100000000));
-            atom_add(&global_bornForce[offset], (long) (localData[get_local_id(0)].fw*0x100000000));
+            ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) ((mm_long) (localData[LOCAL_ID].fx*0x100000000)));
+            ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (localData[LOCAL_ID].fy*0x100000000)));
+            ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (localData[LOCAL_ID].fz*0x100000000)));
+            ATOMIC_ADD(&global_bornForce[offset], (mm_ulong) ((mm_long) (localData[LOCAL_ID].fw*0x100000000)));
        }
 #else
        unsigned int offset1 = x*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
        unsigned int offset2 = y*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
-        forceBuffers[offset1].xyz += force.xyz;
+        forceBuffers[offset1] += make_real4(force.x, force.y, force.z, 0);
        global_bornForce[offset1] += force.w;
        if (x != y) {
-            forceBuffers[offset2] += (real4) (localData[get_local_id(0)].fx, localData[get_local_id(0)].fy, localData[get_local_id(0)].fz, 0.0f);
-            global_bornForce[offset2] += localData[get_local_id(0)].fw;
+            forceBuffers[offset2] += (real4) (localData[LOCAL_ID].fx, localData[LOCAL_ID].fy, localData[LOCAL_ID].fz, 0.0f);
+            global_bornForce[offset2] += localData[LOCAL_ID].fw;
        }
 #endif
    }
@@ -564,20 +564,20 @@ __kernel void computeGBSAForce1(
    unsigned int numTiles = interactionCount[0];
    if (numTiles > maxTiles)
        return; // There wasn't enough memory for the neighbor list.
-    int pos = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
-    int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
+    int pos = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : (mm_long)numTiles)/totalWarps);
+    int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : (mm_long)numTiles)/totalWarps);
 #else
-    int pos = (int) (warp*(long)numTiles/totalWarps);
-    int end = (int) ((warp+1)*(long)numTiles/totalWarps);
+    int pos = (int) (warp*(mm_long)numTiles/totalWarps);
+    int end = (int) ((warp+1)*(mm_long)numTiles/totalWarps);
 #endif
    int skipBase = 0;
    int currentSkipIndex = tbx;
-    __local int atomIndices[FORCE_WORK_GROUP_SIZE];
-    __local volatile int skipTiles[FORCE_WORK_GROUP_SIZE];
-    skipTiles[get_local_id(0)] = -1;
+    LOCAL int atomIndices[FORCE_WORK_GROUP_SIZE];
+    LOCAL volatile int skipTiles[FORCE_WORK_GROUP_SIZE];
+    skipTiles[LOCAL_ID] = -1;

    while (pos < end) {
-        real4 force = 0;
+        real4 force = make_real4(0);
        bool includeTile = true;

        // Extract the coordinates of this tile.
@@ -605,10 +605,10 @@ __kernel void computeGBSAForce1(
            SYNC_WARPS;
            if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
                ushort2 tile = exclusionTiles[skipBase+tgx];
-                skipTiles[get_local_id(0)] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
+                skipTiles[LOCAL_ID] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
            }
            else
-                skipTiles[get_local_id(0)] = end;
+                skipTiles[LOCAL_ID] = end;
            skipBase += TILE_SIZE;            
            currentSkipIndex = tbx;
            SYNC_WARPS;
@@ -630,18 +630,18 @@ __kernel void computeGBSAForce1(
 #else
            unsigned int j = y*TILE_SIZE + tgx;
 #endif
-            atomIndices[get_local_id(0)] = j;
+            atomIndices[LOCAL_ID] = j;
            if (j < PADDED_NUM_ATOMS) {
                real4 tempPosq = posq[j];
-                localData[get_local_id(0)].x = tempPosq.x;
-                localData[get_local_id(0)].y = tempPosq.y;
-                localData[get_local_id(0)].z = tempPosq.z;
-                localData[get_local_id(0)].q = charge[j];
-                localData[get_local_id(0)].bornRadius = global_bornRadii[j];
-                localData[get_local_id(0)].fx = 0.0f;
-                localData[get_local_id(0)].fy = 0.0f;
-                localData[get_local_id(0)].fz = 0.0f;
-                localData[get_local_id(0)].fw = 0.0f;
+                localData[LOCAL_ID].x = tempPosq.x;
+                localData[LOCAL_ID].y = tempPosq.y;
+                localData[LOCAL_ID].z = tempPosq.z;
+                localData[LOCAL_ID].q = charge[j];
+                localData[LOCAL_ID].bornRadius = global_bornRadii[j];
+                localData[LOCAL_ID].fx = 0.0f;
+                localData[LOCAL_ID].fy = 0.0f;
+                localData[LOCAL_ID].fz = 0.0f;
+                localData[LOCAL_ID].fw = 0.0f;
            }
            SYNC_WARPS;
 #ifdef USE_PERIODIC
@@ -651,15 +651,15 @@ __kernel void computeGBSAForce1(

                real4 blockCenterX = blockCenter[x];
                APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX)
-                APPLY_PERIODIC_TO_POS_WITH_CENTER(localData[get_local_id(0)], blockCenterX)
+                APPLY_PERIODIC_TO_POS_WITH_CENTER(localData[LOCAL_ID], blockCenterX)
                SYNC_WARPS;
                unsigned int tj = tgx;
                for (j = 0; j < TILE_SIZE; j++) {
                    int atom2 = atomIndices[tbx+tj];
                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
-                        real3 pos2 = (real3) (localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z);
+                        real3 pos2 = make_real3(localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z);
                        real charge2 = localData[tbx+tj].q;
-                        real4 delta = (real4) (pos2 - posq1.xyz, 0);
+                        real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
                        real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                        if (r2 < CUTOFF_SQUARED) {
                            real invR = RSQRT(r2);
@@ -681,8 +681,10 @@ __kernel void computeGBSAForce1(
 #endif
                            if (needEnergy)
                                energy += tempEnergy;
-                            delta.xyz *= dEdR;
-                            force.xyz -= delta.xyz;
+                            delta *= dEdR;
+                            force.x -= delta.x;
+                            force.y -= delta.y;
+                            force.z -= delta.z;
                            localData[tbx+tj].fx += delta.x;
                            localData[tbx+tj].fy += delta.y;
                            localData[tbx+tj].fz += delta.z;
@@ -702,9 +704,9 @@ __kernel void computeGBSAForce1(
                for (j = 0; j < TILE_SIZE; j++) {
                    int atom2 = atomIndices[tbx+tj];
                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
-                        real3 pos2 = (real3) (localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z);
+                        real3 pos2 = make_real3(localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z);
                        real charge2 = localData[tbx+tj].q;
-                        real4 delta = (real4) (pos2 - posq1.xyz, 0);
+                        real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
 #ifdef USE_PERIODIC
                        APPLY_PERIODIC_TO_DELTA(delta)
 #endif
@@ -731,8 +733,10 @@ __kernel void computeGBSAForce1(
 #endif
                            if (needEnergy)
                                energy += tempEnergy;
-                            delta.xyz *= dEdR;
-                            force.xyz -= delta.xyz;
+                            delta *= dEdR;
+                            force.x -= delta.x;
+                            force.y -= delta.y;
+                            force.z -= delta.z;
                            localData[tbx+tj].fx += delta.x;
                            localData[tbx+tj].fy += delta.y;
                            localData[tbx+tj].fz += delta.z;
@@ -745,37 +749,37 @@ __kernel void computeGBSAForce1(
                    SYNC_WARPS;
                }
            }
-        
+
            // Write results.
-        
+
 #ifdef USE_CUTOFF
-            unsigned int atom2 = atomIndices[get_local_id(0)];
+            unsigned int atom2 = atomIndices[LOCAL_ID];
 #else
            unsigned int atom2 = y*TILE_SIZE + tgx;
 #endif
 #ifdef SUPPORTS_64_BIT_ATOMICS
-            atom_add(&forceBuffers[atom1], (long) (force.x*0x100000000));
-            atom_add(&forceBuffers[atom1+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
-            atom_add(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
-            atom_add(&global_bornForce[atom1], (long) (force.w*0x100000000));
+            ATOMIC_ADD(&forceBuffers[atom1], (mm_ulong) ((mm_long) (force.x*0x100000000)));
+            ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.y*0x100000000)));
+            ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.z*0x100000000)));
+            ATOMIC_ADD(&global_bornForce[atom1], (mm_ulong) ((mm_long) (force.w*0x100000000)));
            if (atom2 < PADDED_NUM_ATOMS) {
-                atom_add(&forceBuffers[atom2], (long) (localData[get_local_id(0)].fx*0x100000000));
-                atom_add(&forceBuffers[atom2+PADDED_NUM_ATOMS], (long) (localData[get_local_id(0)].fy*0x100000000));
-                atom_add(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (long) (localData[get_local_id(0)].fz*0x100000000));
-                atom_add(&global_bornForce[atom2], (long) (localData[get_local_id(0)].fw*0x100000000));
+                ATOMIC_ADD(&forceBuffers[atom2], (mm_ulong) ((mm_long) (localData[LOCAL_ID].fx*0x100000000)));
+                ATOMIC_ADD(&forceBuffers[atom2+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (localData[LOCAL_ID].fy*0x100000000)));
+                ATOMIC_ADD(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (localData[LOCAL_ID].fz*0x100000000)));
+                ATOMIC_ADD(&global_bornForce[atom2], (mm_ulong) ((mm_long) (localData[LOCAL_ID].fw*0x100000000)));
            }
 #else
            unsigned int offset1 = atom1 + warp*PADDED_NUM_ATOMS;
            unsigned int offset2 = atom2 + warp*PADDED_NUM_ATOMS;
-            forceBuffers[offset1].xyz += force.xyz;
+            forceBuffers[offset1] += make_real4(force.x, force.y, force.z, 0);
            global_bornForce[offset1] += force.w;
            if (atom2 < PADDED_NUM_ATOMS) {
-                forceBuffers[offset2] += (real4) (localData[get_local_id(0)].fx, localData[get_local_id(0)].fy, localData[get_local_id(0)].fz, 0.0f);
-                global_bornForce[offset2] += localData[get_local_id(0)].fw;
+                forceBuffers[offset2] += (real4) (localData[LOCAL_ID].fx, localData[LOCAL_ID].fy, localData[LOCAL_ID].fz, 0.0f);
+                global_bornForce[offset2] += localData[LOCAL_ID].fw;
            }
 #endif
        }
        pos++;
    }
-    energyBuffer[get_global_id(0)] += energy;
+    energyBuffer[GLOBAL_ID] += energy;
 }
--- a/platforms/cuda/src/kernels/gbsaObc2.cu
+++ b/platforms/cuda/src/kernels/gbsaObc2.cu
@@ -2,8 +2,8 @@
    real invRSquaredOver4 = 0.25f*invR*invR;
    real rScaledRadiusJ = r+OBC_PARAMS2.y;
    real rScaledRadiusI = r+OBC_PARAMS1.y;
-    real l_ijJ = RECIP(max(OBC_PARAMS1.x, fabs(r-OBC_PARAMS2.y)));
-    real l_ijI = RECIP(max(OBC_PARAMS2.x, fabs(r-OBC_PARAMS1.y)));
+    real l_ijJ = RECIP(max((real) OBC_PARAMS1.x, fabs(r-OBC_PARAMS2.y)));
+    real l_ijI = RECIP(max((real) OBC_PARAMS2.x, fabs(r-OBC_PARAMS1.y)));
    real u_ijJ = RECIP(rScaledRadiusJ);
    real u_ijI = RECIP(rScaledRadiusI);
    real l_ij2J = l_ijJ*l_ijJ;
@@ -16,12 +16,17 @@
    real t2I = (l_ij2I-u_ij2I);
    real term1 = (0.5f*(0.25f+OBC_PARAMS2.y*OBC_PARAMS2.y*invRSquaredOver4)*t2J + t1J*invRSquaredOver4)*invR;
    real term2 = (0.5f*(0.25f+OBC_PARAMS1.y*OBC_PARAMS1.y*invRSquaredOver4)*t2I + t1I*invRSquaredOver4)*invR;
+#ifdef SUPPORTS_64_BIT_ATOMICS
    real tempdEdR = (OBC_PARAMS1.x < rScaledRadiusJ ? BORN_FORCE1*term1/0x100000000 : 0);
    tempdEdR += (OBC_PARAMS2.x < rScaledRadiusI ? BORN_FORCE2*term2/0x100000000 : 0);
+#else
+    real tempdEdR = (OBC_PARAMS1.x < rScaledRadiusJ ? BORN_FORCE1*term1 : (real) 0);
+    tempdEdR += (OBC_PARAMS2.x < rScaledRadiusI ? BORN_FORCE2*term2 : (real) 0);
+#endif
 #ifdef USE_CUTOFF
    unsigned int includeInteraction = (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2 && r2 < CUTOFF_SQUARED);
 #else
    unsigned int includeInteraction = (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2);
 #endif
-    dEdR += (includeInteraction ? tempdEdR : 0);
+    dEdR += (includeInteraction ? tempdEdR : (real) 0);
 }
--- a/platforms/opencl/src/kernels/gbsaObcReductions.cl
+++ b/platforms/opencl/src/kernels/gbsaObcReductions.cl
@@ -5,22 +5,21 @@
 * Reduce the Born sums to compute the Born radii.
 */

-__kernel void reduceBornSum(int bufferSize, int numBuffers, float alpha, float beta, float gamma,
+KERNEL void reduceBornSum(float alpha, float beta, float gamma,
 #ifdef SUPPORTS_64_BIT_ATOMICS
-            __global const long* restrict bornSum,
+            GLOBAL const mm_long* RESTRICT bornSum,
 #else
-            __global const real* restrict bornSum,
+            GLOBAL const real* RESTRICT bornSum, int bufferSize, int numBuffers,
 #endif
-            __global const float2* restrict params, __global real* restrict bornRadii, __global real* restrict obcChain) {
-    unsigned int index = get_global_id(0);
-    while (index < NUM_ATOMS) {
+            GLOBAL const float2* RESTRICT params, GLOBAL real* RESTRICT bornRadii, GLOBAL real* RESTRICT obcChain) {
+    for (unsigned int index = GLOBAL_ID; index < NUM_ATOMS; index += GLOBAL_SIZE) {
        // Get summed Born data

-        int totalSize = bufferSize*numBuffers;
 #ifdef SUPPORTS_64_BIT_ATOMICS
-        real sum = (1/(real) 0x100000000)*bornSum[index];
+        real sum = RECIP((real) 0x100000000)*bornSum[index];
 #else
        real sum = bornSum[index];
+        int totalSize = bufferSize*numBuffers;
        for (int i = index+bufferSize; i < totalSize; i += bufferSize)
            sum += bornSum[i];
 #endif
@@ -33,12 +32,11 @@ __kernel void reduceBornSum(int bufferSize, int numBuffers, float alpha, float b
        real sum3 = sum*sum2;
        real tanhSum = tanh(alpha*sum - beta*sum2 + gamma*sum3);
        real nonOffsetRadius = offsetRadius + DIELECTRIC_OFFSET;
-        real radius = 1/(1/offsetRadius - tanhSum/nonOffsetRadius);
+        real radius = RECIP(RECIP(offsetRadius) - tanhSum/nonOffsetRadius);
        real chain = offsetRadius*(alpha - 2*beta*sum + 3*gamma*sum2);
        chain = (1-tanhSum*tanhSum)*chain / nonOffsetRadius;
        bornRadii[index] = radius;
        obcChain[index] = chain;
-        index += get_global_size(0);
    }
 }

@@ -46,21 +44,22 @@ __kernel void reduceBornSum(int bufferSize, int numBuffers, float alpha, float b
 * Reduce the Born force.
 */

-__kernel void reduceBornForce(int bufferSize, int numBuffers, __global real* bornForce,
+KERNEL void reduceBornForce(
 #ifdef SUPPORTS_64_BIT_ATOMICS
-            __global const long* restrict bornForceIn,
+            GLOBAL mm_long* RESTRICT bornForce,
+#else
+            GLOBAL real* bornForce, int bufferSize, int numBuffers,
 #endif
-            __global mixed* restrict energyBuffer, __global const float2* restrict params, __global const real* restrict bornRadii, __global const real* restrict obcChain) {
+            GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const float2* RESTRICT params, GLOBAL const real* RESTRICT bornRadii, GLOBAL const real* RESTRICT obcChain) {
    mixed energy = 0;
-    unsigned int index = get_global_id(0);
-    while (index < NUM_ATOMS) {
-        // Sum the Born force
+    for (unsigned int index = GLOBAL_ID; index < NUM_ATOMS; index += GLOBAL_SIZE) {
+        // Get summed Born force

-        int totalSize = bufferSize*numBuffers;
 #ifdef SUPPORTS_64_BIT_ATOMICS
-        real force = (1/(real) 0x100000000)*bornForceIn[index];
+        real force = RECIP((real) 0x100000000)*bornForce[index];
 #else
        real force = bornForce[index];
+        int totalSize = bufferSize*numBuffers;
        for (int i = index+bufferSize; i < totalSize; i += bufferSize)
            force += bornForce[i];
 #endif
@@ -69,13 +68,16 @@ __kernel void reduceBornForce(int bufferSize, int numBuffers, __global real* bor
        float offsetRadius = params[index].x;
        real bornRadius = bornRadii[index];
        real r = offsetRadius+DIELECTRIC_OFFSET+PROBE_RADIUS;
-        real ratio6 = pow((offsetRadius+DIELECTRIC_OFFSET)/bornRadius, (real) 6);
+        real ratio6 = POW((offsetRadius+DIELECTRIC_OFFSET)/bornRadius, (real) 6);
        real saTerm = SURFACE_AREA_FACTOR*r*r*ratio6;
        force += saTerm/bornRadius;
        energy += saTerm;
        force *= bornRadius*bornRadius*obcChain[index];
+#ifdef SUPPORTS_64_BIT_ATOMICS
+        bornForce[index] = (mm_long) (force*0x100000000);
+#else
        bornForce[index] = force;
-        index += get_global_size(0);
+#endif
    }
-    energyBuffer[get_global_id(0)] += energy/-6.0f;
+    energyBuffer[GLOBAL_ID] += energy/-6;
 }
\ No newline at end of file
--- a/platforms/opencl/src/kernels/gbsaObc_cpu.cl
+++ b/platforms/opencl/src/kernels/gbsaObc_cpu.cl
-#ifdef SUPPORTS_64_BIT_ATOMICS
-#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
-#endif
-
 typedef struct {
    real x, y, z;
    real q;
@@ -12,27 +8,27 @@ typedef struct {
 /**
 * Compute the Born sum.
 */
-__kernel void computeBornSum(
+KERNEL void computeBornSum(
 #ifdef SUPPORTS_64_BIT_ATOMICS
-        __global long* restrict global_bornSum,
+        GLOBAL mm_long* RESTRICT global_bornSum,
 #else
-        __global real* restrict global_bornSum,
+        GLOBAL real* RESTRICT global_bornSum,
 #endif
-        __global const real4* restrict posq, __global const real* restrict charge, __global const float2* restrict global_params,
+        GLOBAL const real4* RESTRICT posq, GLOBAL const real* RESTRICT charge, GLOBAL const float2* RESTRICT global_params,
 #ifdef USE_CUTOFF
-        __global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
-        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, __global const real4* restrict blockCenter,
-        __global const real4* restrict blockSize, __global const int* restrict interactingAtoms,
+        GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
+        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, GLOBAL const real4* RESTRICT blockCenter,
+        GLOBAL const real4* RESTRICT blockSize, GLOBAL const int* RESTRICT interactingAtoms,
 #else
        unsigned int numTiles,
 #endif
-        __global const ushort2* exclusionTiles) {
-    __local AtomData1 localData[TILE_SIZE];
+        GLOBAL const ushort2* exclusionTiles) {
+    LOCAL AtomData1 localData[TILE_SIZE];

    // First loop: process tiles that contain exclusions.
    
-    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+get_group_id(0)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0);
-    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(get_group_id(0)+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0);
+    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+GROUP_ID*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/NUM_GROUPS;
+    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(GROUP_ID+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/NUM_GROUPS;
    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
        const ushort2 tileIndices = exclusionTiles[pos];
        const unsigned int x = tileIndices.x;
@@ -56,17 +52,17 @@ __kernel void computeBornSum(

            for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
                unsigned int atom1 = x*TILE_SIZE+tgx;
-                real bornSum = 0.0f;
+                real bornSum = 0;
                real4 posq1 = posq[atom1];
                float2 params1 = global_params[atom1];
                for (unsigned int j = 0; j < TILE_SIZE; j++) {
-                    real3 pos2 = (real3) (localData[j].x, localData[j].y, localData[j].z);
+                    real3 pos2 = make_real3(localData[j].x, localData[j].y, localData[j].z);
                    real charge2 = localData[j].q;
-                    real4 delta = (real4) (pos2 - posq1.xyz, 0);
+                    real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
 #ifdef USE_PERIODIC
                    APPLY_PERIODIC_TO_DELTA(delta)
 #endif
-                    real r2 = dot(delta.xyz, delta.xyz);
+                    real r2 = dot(trimTo3(delta), trimTo3(delta));
 #ifdef USE_CUTOFF
                    if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
 #else
@@ -74,7 +70,7 @@ __kernel void computeBornSum(
 #endif
                        real invR = RSQRT(r2);
                        real r = r2*invR;
-                        float2 params2 = (float2) (localData[j].radius, localData[j].scaledRadius);
+                        float2 params2 = make_float2(localData[j].radius, localData[j].scaledRadius);
                        real rScaledRadiusJ = r+params2.y;
                        if ((j != tgx) && (params1.x < rScaledRadiusJ)) {
                            real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
@@ -92,9 +88,9 @@ __kernel void computeBornSum(
                // Write results.

 #ifdef SUPPORTS_64_BIT_ATOMICS
-                atom_add(&global_bornSum[atom1], (long) (bornSum*0x100000000));
+                ATOMIC_ADD(&global_bornSum[atom1], (mm_long) (bornSum*0x100000000));
 #else
-                unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
+                unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
                global_bornSum[offset] += bornSum;
 #endif
            }
@@ -110,9 +106,9 @@ __kernel void computeBornSum(
                real4 posq1 = posq[atom1];
                float2 params1 = global_params[atom1];
                for (unsigned int j = 0; j < TILE_SIZE; j++) {
-                    real3 pos2 = (real3) (localData[j].x, localData[j].y, localData[j].z);
+                    real3 pos2 = make_real3(localData[j].x, localData[j].y, localData[j].z);
                    real charge2 = localData[j].q;
-                    real4 delta = (real4) (pos2 - posq1.xyz, 0);
+                    real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
 #ifdef USE_PERIODIC
                    APPLY_PERIODIC_TO_DELTA(delta)
 #endif
@@ -124,7 +120,7 @@ __kernel void computeBornSum(
 #endif
                        real invR = RSQRT(r2);
                        real r = r2*invR;
-                        float2 params2 = (float2) (localData[j].radius, localData[j].scaledRadius);
+                        float2 params2 = make_float2(localData[j].radius, localData[j].scaledRadius);
                        real rScaledRadiusJ = r+params2.y;
                        if (params1.x < rScaledRadiusJ) {
                            real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
@@ -154,9 +150,9 @@ __kernel void computeBornSum(
               // Write results for atom1.

 #ifdef SUPPORTS_64_BIT_ATOMICS
-                atom_add(&global_bornSum[atom1], (long) (bornSum*0x100000000));
+                ATOMIC_ADD(&global_bornSum[atom1], (mm_long) (bornSum*0x100000000));
 #else
-                unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
+                unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
                global_bornSum[offset] += bornSum;
 #endif
            }
@@ -166,9 +162,9 @@ __kernel void computeBornSum(
            for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
 #ifdef SUPPORTS_64_BIT_ATOMICS
                unsigned int offset = y*TILE_SIZE + tgx;
-                atom_add(&global_bornSum[offset], (long) (localData[tgx].bornSum*0x100000000));
+                ATOMIC_ADD(&global_bornSum[offset], (mm_long) (localData[tgx].bornSum*0x100000000));
 #else
-                unsigned int offset = y*TILE_SIZE+tgx + get_group_id(0)*PADDED_NUM_ATOMS;
+                unsigned int offset = y*TILE_SIZE+tgx + GROUP_ID*PADDED_NUM_ATOMS;
                global_bornSum[offset] += localData[tgx].bornSum;
 #endif
            }
@@ -182,15 +178,15 @@ __kernel void computeBornSum(
    unsigned int numTiles = interactionCount[0];
    if (numTiles > maxTiles)
        return; // There wasn't enough memory for the neighbor list.
-    int pos = (int) (get_group_id(0)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0));
-    int end = (int) ((get_group_id(0)+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0));
+    int pos = (int) (GROUP_ID*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : numTiles)/NUM_GROUPS);
+    int end = (int) ((GROUP_ID+1)*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : numTiles)/NUM_GROUPS);
 #else
-    int pos = (int) (get_group_id(0)*(long)numTiles/get_num_groups(0));
-    int end = (int) ((get_group_id(0)+1)*(long)numTiles/get_num_groups(0));
+    int pos = (int) (GROUP_ID*(mm_long)numTiles/NUM_GROUPS);
+    int end = (int) ((GROUP_ID+1)*(mm_long)numTiles/NUM_GROUPS);
 #endif
    int nextToSkip = -1;
    int currentSkipIndex = 0;
-    __local int atomIndices[TILE_SIZE];
+    LOCAL int atomIndices[TILE_SIZE];

    while (pos < end) {
        bool includeTile = true;
@@ -263,15 +259,15 @@ __kernel void computeBornSum(
                    APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX)
                    float2 params1 = global_params[atom1];
                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
-                        real3 pos2 = (real3) (localData[j].x, localData[j].y, localData[j].z);
+                        real3 pos2 = make_real3(localData[j].x, localData[j].y, localData[j].z);
                        real charge2 = localData[j].q;
-                        real4 delta = (real4) (pos2 - posq1.xyz, 0);
+                        real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
                        real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                        int atom2 = atomIndices[j];
                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
                            real invR = RSQRT(r2);
                            real r = r2*invR;
-                            float2 params2 = (float2) (localData[j].radius, localData[j].scaledRadius);
+                            float2 params2 = make_float2(localData[j].radius, localData[j].scaledRadius);
                            real rScaledRadiusJ = r+params2.y;
                            if (params1.x < rScaledRadiusJ) {
                                real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
@@ -301,9 +297,9 @@ __kernel void computeBornSum(
                    // Write results for atom1.

 #ifdef SUPPORTS_64_BIT_ATOMICS
-                    atom_add(&global_bornSum[atom1], (long) (bornSum*0x100000000));
+                    ATOMIC_ADD(&global_bornSum[atom1], (mm_long) (bornSum*0x100000000));
 #else
-                    unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
+                    unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
                    global_bornSum[offset] += bornSum;
 #endif
                }
@@ -319,9 +315,9 @@ __kernel void computeBornSum(
                    real4 posq1 = posq[atom1];
                    float2 params1 = global_params[atom1];
                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
-                        real3 pos2 = (real3) (localData[j].x, localData[j].y, localData[j].z);
+                        real3 pos2 = make_real3(localData[j].x, localData[j].y, localData[j].z);
                        real charge2 = localData[j].q;
-                        real4 delta = (real4) (pos2 - posq1.xyz, 0);
+                        real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
 #ifdef USE_PERIODIC
                        APPLY_PERIODIC_TO_DELTA(delta)
 #endif
@@ -334,7 +330,7 @@ __kernel void computeBornSum(
 #endif
                            real invR = RSQRT(r2);
                            real r = r2*invR;
-                            float2 params2 = (float2) (localData[j].radius, localData[j].scaledRadius);
+                            float2 params2 = make_float2(localData[j].radius, localData[j].scaledRadius);
                            real rScaledRadiusJ = r+params2.y;
                            if (params1.x < rScaledRadiusJ) {
                                real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
@@ -364,9 +360,9 @@ __kernel void computeBornSum(
                    // Write results for atom1.

 #ifdef SUPPORTS_64_BIT_ATOMICS
-                    atom_add(&global_bornSum[atom1], (long) (bornSum*0x100000000));
+                    ATOMIC_ADD(&global_bornSum[atom1], (mm_long) (bornSum*0x100000000));
 #else
-                    unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
+                    unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
                    global_bornSum[offset] += bornSum;
 #endif
                }
@@ -382,9 +378,9 @@ __kernel void computeBornSum(
 #endif
                if (atom2 < PADDED_NUM_ATOMS) {
 #ifdef SUPPORTS_64_BIT_ATOMICS
-                    atom_add(&global_bornSum[atom2], (long) (localData[tgx].bornSum*0x100000000));
+                    ATOMIC_ADD(&global_bornSum[atom2], (mm_long) (localData[tgx].bornSum*0x100000000));
 #else
-                    unsigned int offset = atom2 + get_group_id(0)*PADDED_NUM_ATOMS;
+                    unsigned int offset = atom2 + GROUP_ID*PADDED_NUM_ATOMS;
                    global_bornSum[offset] += localData[tgx].bornSum;
 #endif
                }
@@ -405,29 +401,29 @@ typedef struct {
 * First part of computing the GBSA interaction.
 */

-__kernel void computeGBSAForce1(
+KERNEL void computeGBSAForce1(
 #ifdef SUPPORTS_64_BIT_ATOMICS
-        __global long* restrict forceBuffers, __global long* restrict global_bornForce,
+        GLOBAL mm_long* RESTRICT forceBuffers, GLOBAL mm_long* RESTRICT global_bornForce,
 #else
-        __global real4* restrict forceBuffers, __global real* restrict global_bornForce,
+        GLOBAL real4* RESTRICT forceBuffers, GLOBAL real* RESTRICT global_bornForce,
 #endif
-        __global mixed* restrict energyBuffer, __global const real4* restrict posq, __global const real* restrict charge,
-        __global const real* restrict global_bornRadii, int needEnergy,
+        GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT posq, GLOBAL const real* RESTRICT charge,
+        GLOBAL const real* RESTRICT global_bornRadii, int needEnergy,
 #ifdef USE_CUTOFF
-        __global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
-        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, __global const real4* restrict blockCenter,
-        __global const real4* restrict blockSize, __global const int* restrict interactingAtoms,
+        GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
+        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, GLOBAL const real4* RESTRICT blockCenter,
+        GLOBAL const real4* RESTRICT blockSize, GLOBAL const int* RESTRICT interactingAtoms,
 #else
        unsigned int numTiles,
 #endif
-        __global const ushort2* exclusionTiles) {
+        GLOBAL const ushort2* exclusionTiles) {
    mixed energy = 0;
-    __local AtomData2 localData[TILE_SIZE];
+    LOCAL AtomData2 localData[TILE_SIZE];

    // First loop: process tiles that contain exclusions.
    
-    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+get_group_id(0)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0);
-    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(get_group_id(0)+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0);
+    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+GROUP_ID*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/NUM_GROUPS;
+    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(GROUP_ID+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/NUM_GROUPS;
    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
        const ushort2 tileIndices = exclusionTiles[pos];
        const unsigned int x = tileIndices.x;
@@ -449,14 +445,14 @@ __kernel void computeGBSAForce1(

            for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
                unsigned int atom1 = x*TILE_SIZE+tgx;
-                real4 force = 0;
+                real4 force = make_real4(0);
                real4 posq1 = posq[atom1];
                real charge1 = charge[atom1];
                real bornRadius1 = global_bornRadii[atom1];
                for (unsigned int j = 0; j < TILE_SIZE; j++) {
-                    real3 pos2 = (real3) (localData[j].x, localData[j].y, localData[j].z);
+                    real3 pos2 = make_real3(localData[j].x, localData[j].y, localData[j].z);
                    real charge2 = localData[j].q;
-                    real4 delta = (real4) (pos2 - posq1.xyz, 0);
+                    real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
 #ifdef USE_PERIODIC
                    APPLY_PERIODIC_TO_DELTA(delta)
 #endif
@@ -485,21 +481,23 @@ __kernel void computeGBSAForce1(
                            tempEnergy -= scaledChargeProduct/CUTOFF;
 #endif
                        energy += 0.5f*tempEnergy;
-                        delta.xyz *= dEdR;
-                        force.xyz -= delta.xyz;
+                        delta *= dEdR;
+                        force.x -= delta.x;
+                        force.y -= delta.y;
+                        force.z -= delta.z;
                    }
                }

                // Write results.

 #ifdef SUPPORTS_64_BIT_ATOMICS
-                atom_add(&forceBuffers[atom1], (long) (force.x*0x100000000));
-                atom_add(&forceBuffers[atom1+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
-                atom_add(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
-                atom_add(&global_bornForce[atom1], (long) (force.w*0x100000000));
+                ATOMIC_ADD(&forceBuffers[atom1], (mm_long) (force.x*0x100000000));
+                ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_long) (force.y*0x100000000));
+                ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (mm_long) (force.z*0x100000000));
+                ATOMIC_ADD(&global_bornForce[atom1], (mm_long) (force.w*0x100000000));
 #else
-                unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
-                forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz;
+                unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
+                forceBuffers[offset] += make_real4(force.x, force.y, force.z, 0);
                global_bornForce[offset] += force.w;
 #endif
            }
@@ -515,14 +513,14 @@ __kernel void computeGBSAForce1(
            }
            for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
                unsigned int atom1 = x*TILE_SIZE+tgx;
-                real4 force = 0;
+                real4 force = make_real4(0);
                real4 posq1 = posq[atom1];
                real charge1 = charge[atom1];
                real bornRadius1 = global_bornRadii[atom1];
                for (unsigned int j = 0; j < TILE_SIZE; j++) {
-                    real3 pos2 = (real3) (localData[j].x, localData[j].y, localData[j].z);
+                    real3 pos2 = make_real3(localData[j].x, localData[j].y, localData[j].z);
                    real charge2 = localData[j].q;
-                    real4 delta = (real4) (pos2 - posq1.xyz, 0);
+                    real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
 #ifdef USE_PERIODIC
                    APPLY_PERIODIC_TO_DELTA(delta)
 #endif
@@ -550,8 +548,10 @@ __kernel void computeGBSAForce1(
                        tempEnergy -= scaledChargeProduct/CUTOFF;
 #endif
                        energy += tempEnergy;
-                        delta.xyz *= dEdR;
-                        force.xyz -= delta.xyz;
+                        delta *= dEdR;
+                        force.x -= delta.x;
+                        force.y -= delta.y;
+                        force.z -= delta.z;
                        localData[j].fx += delta.x;
                        localData[j].fy += delta.y;
                        localData[j].fz += delta.z;
@@ -562,13 +562,13 @@ __kernel void computeGBSAForce1(
               // Write results for atom1.

 #ifdef SUPPORTS_64_BIT_ATOMICS
-                atom_add(&forceBuffers[atom1], (long) (force.x*0x100000000));
-                atom_add(&forceBuffers[atom1+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
-                atom_add(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
-                atom_add(&global_bornForce[atom1], (long) (force.w*0x100000000));
+                ATOMIC_ADD(&forceBuffers[atom1], (mm_long) (force.x*0x100000000));
+                ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_long) (force.y*0x100000000));
+                ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (mm_long) (force.z*0x100000000));
+                ATOMIC_ADD(&global_bornForce[atom1], (mm_long) (force.w*0x100000000));
 #else
-                unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
-                forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz;
+                unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
+                forceBuffers[offset] += make_real4(force.x, force.y, force.z, 0);
                global_bornForce[offset] += force.w;
 #endif
            }
@@ -578,12 +578,12 @@ __kernel void computeGBSAForce1(
            for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
 #ifdef SUPPORTS_64_BIT_ATOMICS
                unsigned int offset = y*TILE_SIZE + tgx;
-                atom_add(&forceBuffers[offset], (long) (localData[tgx].fx*0x100000000));
-                atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (localData[tgx].fy*0x100000000));
-                atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (localData[tgx].fz*0x100000000));
-                atom_add(&global_bornForce[offset], (long) (localData[tgx].fw*0x100000000));
+                ATOMIC_ADD(&forceBuffers[offset], (mm_long) (localData[tgx].fx*0x100000000));
+                ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_long) (localData[tgx].fy*0x100000000));
+                ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_long) (localData[tgx].fz*0x100000000));
+                ATOMIC_ADD(&global_bornForce[offset], (mm_long) (localData[tgx].fw*0x100000000));
 #else
-                unsigned int offset = y*TILE_SIZE+tgx + get_group_id(0)*PADDED_NUM_ATOMS;
+                unsigned int offset = y*TILE_SIZE+tgx + GROUP_ID*PADDED_NUM_ATOMS;
                real4 f = forceBuffers[offset];
                f.x += localData[tgx].fx;
                f.y += localData[tgx].fy;
@@ -602,15 +602,15 @@ __kernel void computeGBSAForce1(
    unsigned int numTiles = interactionCount[0];
    if (numTiles > maxTiles)
        return; // There wasn't enough memory for the neighbor list.
-    int pos = (int) (get_group_id(0)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0));
-    int end = (int) ((get_group_id(0)+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0));
+    int pos = (int) (GROUP_ID*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : numTiles)/NUM_GROUPS);
+    int end = (int) ((GROUP_ID+1)*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : numTiles)/NUM_GROUPS);
 #else
-    int pos = (int) (get_group_id(0)*(long)numTiles/get_num_groups(0));
-    int end = (int) ((get_group_id(0)+1)*(long)numTiles/get_num_groups(0));
+    int pos = (int) (GROUP_ID*(mm_long)numTiles/NUM_GROUPS);
+    int end = (int) ((GROUP_ID+1)*(mm_long)numTiles/NUM_GROUPS);
 #endif
    int nextToSkip = -1;
    int currentSkipIndex = 0;
-    __local int atomIndices[TILE_SIZE];
+    LOCAL int atomIndices[TILE_SIZE];

    while (pos < end) {
        bool includeTile = true;
@@ -679,15 +679,15 @@ __kernel void computeGBSAForce1(
                }
                for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
                    unsigned int atom1 = x*TILE_SIZE+tgx;
-                    real4 force = 0;
+                    real4 force = make_real4(0);
                    real4 posq1 = posq[atom1];
                    real charge1 = charge[atom1];
                    APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX)
                    float bornRadius1 = global_bornRadii[atom1];
                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
-                        real3 pos2 = (real3) (localData[j].x, localData[j].y, localData[j].z);
+                        real3 pos2 = make_real3(localData[j].x, localData[j].y, localData[j].z);
                        real charge2 = localData[j].q;
-                        real4 delta = (real4) (pos2 - posq1.xyz, 0);
+                        real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
                        real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                        int atom2 = atomIndices[j];
                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
@@ -709,8 +709,10 @@ __kernel void computeGBSAForce1(
                            tempEnergy -= scaledChargeProduct/CUTOFF;
 #endif
                            energy += tempEnergy;
-                            delta.xyz *= dEdR;
-                            force.xyz -= delta.xyz;
+                            delta *= dEdR;
+                            force.x -= delta.x;
+                            force.y -= delta.y;
+                            force.z -= delta.z;
                            localData[j].fx += delta.x;
                            localData[j].fy += delta.y;
                            localData[j].fz += delta.z;
@@ -721,13 +723,13 @@ __kernel void computeGBSAForce1(
                    // Write results for atom1.

 #ifdef SUPPORTS_64_BIT_ATOMICS
-                    atom_add(&forceBuffers[atom1], (long) (force.x*0x100000000));
-                    atom_add(&forceBuffers[atom1+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
-                    atom_add(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
-                    atom_add(&global_bornForce[atom1], (long) (force.w*0x100000000));
+                    ATOMIC_ADD(&forceBuffers[atom1], (mm_long) (force.x*0x100000000));
+                    ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_long) (force.y*0x100000000));
+                    ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (mm_long) (force.z*0x100000000));
+                    ATOMIC_ADD(&global_bornForce[atom1], (mm_long) (force.w*0x100000000));
 #else
-                    unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
-                    forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz;
+                    unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
+                    forceBuffers[offset] += make_real4(force.x, force.y, force.z, 0);
                    global_bornForce[offset] += force.w;
 #endif
                }
@@ -739,14 +741,14 @@ __kernel void computeGBSAForce1(

                for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
                    unsigned int atom1 = x*TILE_SIZE+tgx;
-                    real4 force = 0;
+                    real4 force = make_real4(0);
                    real4 posq1 = posq[atom1];
                    real charge1 = charge[atom1];
                    float bornRadius1 = global_bornRadii[atom1];
                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
-                        real3 pos2 = (real3) (localData[j].x, localData[j].y, localData[j].z);
+                        real3 pos2 = make_real3(localData[j].x, localData[j].y, localData[j].z);
                        real charge2 = localData[j].q;
-                        real4 delta = (real4) (pos2 - posq1.xyz, 0);
+                        real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
 #ifdef USE_PERIODIC
                        APPLY_PERIODIC_TO_DELTA(delta)
 #endif
@@ -775,8 +777,10 @@ __kernel void computeGBSAForce1(
                            tempEnergy -= scaledChargeProduct/CUTOFF;
 #endif
                            energy += tempEnergy;
-                            delta.xyz *= dEdR;
-                            force.xyz -= delta.xyz;
+                            delta *= dEdR;
+                            force.x -= delta.x;
+                            force.y -= delta.y;
+                            force.z -= delta.z;
                            localData[j].fx += delta.x;
                            localData[j].fy += delta.y;
                            localData[j].fz += delta.z;
@@ -787,13 +791,13 @@ __kernel void computeGBSAForce1(
                    // Write results for atom1.

 #ifdef SUPPORTS_64_BIT_ATOMICS
-                    atom_add(&forceBuffers[atom1], (long) (force.x*0x100000000));
-                    atom_add(&forceBuffers[atom1+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
-                    atom_add(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
-                    atom_add(&global_bornForce[atom1], (long) (force.w*0x100000000));
+                    ATOMIC_ADD(&forceBuffers[atom1], (mm_long) (force.x*0x100000000));
+                    ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_long) (force.y*0x100000000));
+                    ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (mm_long) (force.z*0x100000000));
+                    ATOMIC_ADD(&global_bornForce[atom1], (mm_long) (force.w*0x100000000));
 #else
-                    unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
-                    forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz;
+                    unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
+                    forceBuffers[offset] += make_real4(force.x, force.y, force.z, 0);
                    global_bornForce[offset] += force.w;
 #endif
                }
@@ -809,12 +813,12 @@ __kernel void computeGBSAForce1(
 #endif
                if (atom2 < PADDED_NUM_ATOMS) {
 #ifdef SUPPORTS_64_BIT_ATOMICS
-                    atom_add(&forceBuffers[atom2], (long) (localData[tgx].fx*0x100000000));
-                    atom_add(&forceBuffers[atom2+PADDED_NUM_ATOMS], (long) (localData[tgx].fy*0x100000000));
-                    atom_add(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (long) (localData[tgx].fz*0x100000000));
-                    atom_add(&global_bornForce[atom2], (long) (localData[tgx].fw*0x100000000));
+                    ATOMIC_ADD(&forceBuffers[atom2], (mm_long) (localData[tgx].fx*0x100000000));
+                    ATOMIC_ADD(&forceBuffers[atom2+PADDED_NUM_ATOMS], (mm_long) (localData[tgx].fy*0x100000000));
+                    ATOMIC_ADD(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (mm_long) (localData[tgx].fz*0x100000000));
+                    ATOMIC_ADD(&global_bornForce[atom2], (mm_long) (localData[tgx].fw*0x100000000));
 #else
-                    unsigned int offset = atom2 + get_group_id(0)*PADDED_NUM_ATOMS;
+                    unsigned int offset = atom2 + GROUP_ID*PADDED_NUM_ATOMS;
                    real4 f = forceBuffers[offset];
                    f.x += localData[tgx].fx;
                    f.y += localData[tgx].fy;
@@ -827,5 +831,5 @@ __kernel void computeGBSAForce1(
        }
        pos++;
    }
-    energyBuffer[get_global_id(0)] += energy;
+    energyBuffer[GLOBAL_ID] += energy;
 }
--- a/platforms/cuda/src/kernels/harmonicAngleForce.cu
+++ b/platforms/cuda/src/kernels/harmonicAngleForce.cu
--- a/platforms/cuda/src/kernels/harmonicBondForce.cu
+++ b/platforms/cuda/src/kernels/harmonicBondForce.cu
--- a/platforms/cuda/src/kernels/integrationUtilities.cu
+++ b/platforms/cuda/src/kernels/integrationUtilities.cu
 /**
 * Generate random numbers
 */
-extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restrict__ random, uint4* __restrict__ seed) {
-    int index = blockIdx.x*blockDim.x+threadIdx.x;
+KERNEL void generateRandomNumbers(int numValues, GLOBAL float4* RESTRICT random, GLOBAL uint4* RESTRICT seed) {
+    int index = GLOBAL_ID;
    uint4 state = seed[index];
    unsigned int carry = 0;
    while (index < numValues) {
@@ -63,15 +63,15 @@ extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restri
        // Record the values.

        random[index] = value;
-        index += blockDim.x*gridDim.x;
+        index += GLOBAL_SIZE;
    }
-    seed[blockIdx.x*blockDim.x+threadIdx.x] = state;
+    seed[GLOBAL_ID] = state;
 }

 /**
 * Load the position of a particle.
 */
-inline __device__ mixed4 loadPos(const real4* __restrict__ posq, const real4* __restrict__ posqCorrection, int index) {
+inline DEVICE mixed4 loadPos(GLOBAL const real4* RESTRICT posq, GLOBAL const real4* RESTRICT posqCorrection, int index) {
 #ifdef USE_MIXED_PRECISION
    real4 pos1 = posq[index];
    real4 pos2 = posqCorrection[index];
@@ -84,7 +84,7 @@ inline __device__ mixed4 loadPos(const real4* __restrict__ posq, const real4* __
 /**
 * Store the position of a particle.
 */
-inline __device__ void storePos(real4* __restrict__ posq, real4* __restrict__ posqCorrection, int index, mixed4 pos) {
+inline DEVICE void storePos(GLOBAL real4* RESTRICT posq, GLOBAL real4* RESTRICT posqCorrection, int index, mixed4 pos) {
 #ifdef USE_MIXED_PRECISION
    posq[index] = make_real4((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w);
    posqCorrection[index] = make_real4(pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
@@ -96,16 +96,24 @@ inline __device__ void storePos(real4* __restrict__ posq, real4* __restrict__ po
 /**
 * Enforce constraints on SHAKE clusters
 */
-extern "C" __global__ void applyShakeToPositions(int numClusters, mixed tol, const real4* __restrict__ oldPos, real4* __restrict__ posCorrection, mixed4* __restrict__ posDelta, const int4* __restrict__ clusterAtoms, const float4* __restrict__ clusterParams) {
-    int index = blockIdx.x*blockDim.x+threadIdx.x;
+KERNEL void applyShakeToPositions(int numClusters, mixed tol, GLOBAL const real4* RESTRICT oldPos,
+        GLOBAL mixed4* RESTRICT posDelta, GLOBAL const int4* RESTRICT clusterAtoms, GLOBAL const float4* RESTRICT clusterParams
+#ifdef USE_MIXED_PRECISION
+        , GLOBAL const real4* RESTRICT posqCorrection
+#endif
+        ) {
+#ifndef USE_MIXED_PRECISION
+        GLOBAL real4* posqCorrection = 0;
+#endif
+    int index = GLOBAL_ID;
    while (index < numClusters) {
        // Load the data for this cluster.

        int4 atoms = clusterAtoms[index];
        float4 params = clusterParams[index];
-        mixed4 pos = loadPos(oldPos, posCorrection, atoms.x);
+        mixed4 pos = loadPos(oldPos, posqCorrection, atoms.x);
        mixed4 xpi = posDelta[atoms.x];
-        mixed4 pos1 = loadPos(oldPos, posCorrection, atoms.y);
+        mixed4 pos1 = loadPos(oldPos, posqCorrection, atoms.y);
        mixed4 xpj1 = posDelta[atoms.y];
        mixed4 pos2 = make_mixed4(0);
        mixed4 xpj2 = make_mixed4(0);
@@ -114,13 +122,13 @@ extern "C" __global__ void applyShakeToPositions(int numClusters, mixed tol, con
        float d2 = params.z;
        float invMassPeripheral = params.w;
        if (atoms.z != -1) {
-            pos2 = loadPos(oldPos, posCorrection, atoms.z);
+            pos2 = loadPos(oldPos, posqCorrection, atoms.z);
            xpj2 = posDelta[atoms.z];
        }
        mixed4 pos3 = make_mixed4(0);
        mixed4 xpj3 = make_mixed4(0);
        if (atoms.w != -1) {
-            pos3 = loadPos(oldPos, posCorrection, atoms.w);
+            pos3 = loadPos(oldPos, posqCorrection, atoms.w);
            xpj3 = posDelta[atoms.w];
        }

@@ -202,23 +210,31 @@ extern "C" __global__ void applyShakeToPositions(int numClusters, mixed tol, con
            posDelta[atoms.z] = xpj2;
        if (atoms.w != -1)
            posDelta[atoms.w] = xpj3;
-        index += blockDim.x*gridDim.x;
+        index += GLOBAL_SIZE;
    }
 }

 /**
 * Enforce velocity constraints on SHAKE clusters
 */
-extern "C" __global__ void applyShakeToVelocities(int numClusters, mixed tol, const real4* __restrict__ oldPos, real4* __restrict__ posCorrection, mixed4* __restrict__ posDelta, const int4* __restrict__ clusterAtoms, const float4* __restrict__ clusterParams) {
-    int index = blockIdx.x*blockDim.x+threadIdx.x;
+KERNEL void applyShakeToVelocities(int numClusters, mixed tol, GLOBAL const real4* RESTRICT oldPos,
+        GLOBAL mixed4* RESTRICT posDelta, GLOBAL const int4* RESTRICT clusterAtoms, GLOBAL const float4* RESTRICT clusterParams
+#ifdef USE_MIXED_PRECISION
+        , GLOBAL const real4* RESTRICT posqCorrection
+#endif
+        ) {
+#ifndef USE_MIXED_PRECISION
+        GLOBAL real4* posqCorrection = 0;
+#endif
+    int index = GLOBAL_ID;
    while (index < numClusters) {
        // Load the data for this cluster.

        int4 atoms = clusterAtoms[index];
        float4 params = clusterParams[index];
-        mixed4 pos = loadPos(oldPos, posCorrection, atoms.x);
+        mixed4 pos = loadPos(oldPos, posqCorrection, atoms.x);
        mixed4 xpi = posDelta[atoms.x];
-        mixed4 pos1 = loadPos(oldPos, posCorrection, atoms.y);
+        mixed4 pos1 = loadPos(oldPos, posqCorrection, atoms.y);
        mixed4 xpj1 = posDelta[atoms.y];
        mixed4 pos2 = make_mixed4(0);
        mixed4 xpj2 = make_mixed4(0);
@@ -226,13 +242,13 @@ extern "C" __global__ void applyShakeToVelocities(int numClusters, mixed tol, co
        float avgMass = params.y;
        float invMassPeripheral = params.w;
        if (atoms.z != -1) {
-            pos2 = loadPos(oldPos, posCorrection, atoms.z);
+            pos2 = loadPos(oldPos, posqCorrection, atoms.z);
            xpj2 = posDelta[atoms.z];
        }
        mixed4 pos3 = make_mixed4(0);
        mixed4 xpj3 = make_mixed4(0);
        if (atoms.w != -1) {
-            pos3 = loadPos(oldPos, posCorrection, atoms.w);
+            pos3 = loadPos(oldPos, posqCorrection, atoms.w);
            xpj3 = posDelta[atoms.w];
        }

@@ -302,25 +318,34 @@ extern "C" __global__ void applyShakeToVelocities(int numClusters, mixed tol, co
            posDelta[atoms.z] = xpj2;
        if (atoms.w != -1)
            posDelta[atoms.w] = xpj3;
-        index += blockDim.x*gridDim.x;
+        index += GLOBAL_SIZE;
    }
 }

 /**
 * Enforce constraints on SETTLE clusters
 */
-extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, const real4* __restrict__ oldPos, real4* __restrict__ posCorrection, mixed4* __restrict__ posDelta, const mixed4* __restrict__ velm, const int4* __restrict__ clusterAtoms, const float2* __restrict__ clusterParams) {
-    int index = blockIdx.x*blockDim.x+threadIdx.x;
+KERNEL void applySettleToPositions(int numClusters, mixed tol, GLOBAL const real4* RESTRICT oldPos,
+        GLOBAL mixed4* RESTRICT posDelta, GLOBAL const mixed4* RESTRICT velm, GLOBAL const int4* RESTRICT clusterAtoms,
+        GLOBAL const float2* RESTRICT clusterParams
+#ifdef USE_MIXED_PRECISION
+        , GLOBAL const real4* RESTRICT posqCorrection
+#endif
+        ) {
+#ifndef USE_MIXED_PRECISION
+        GLOBAL real4* posqCorrection = 0;
+#endif
+    int index = GLOBAL_ID;
    while (index < numClusters) {
        // Load the data for this cluster.

        int4 atoms = clusterAtoms[index];
        float2 params = clusterParams[index];
-        mixed4 apos0 = loadPos(oldPos, posCorrection, atoms.x);
+        mixed4 apos0 = loadPos(oldPos, posqCorrection, atoms.x);
        mixed4 xp0 = posDelta[atoms.x];
-        mixed4 apos1 = loadPos(oldPos, posCorrection, atoms.y);
+        mixed4 apos1 = loadPos(oldPos, posqCorrection, atoms.y);
        mixed4 xp1 = posDelta[atoms.y];
-        mixed4 apos2 = loadPos(oldPos, posCorrection, atoms.z);
+        mixed4 apos2 = loadPos(oldPos, posqCorrection, atoms.z);
        mixed4 xp2 = posDelta[atoms.z];
        mixed m0 = 1/velm[atoms.x].w;
        mixed m1 = 1/velm[atoms.y].w;
@@ -454,21 +479,30 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co
        posDelta[atoms.x] = xp0;
        posDelta[atoms.y] = xp1;
        posDelta[atoms.z] = xp2;
-        index += blockDim.x*gridDim.x;
+        index += GLOBAL_SIZE;
    }
 }

 /**
 * Enforce velocity constraints on SETTLE clusters
 */
-extern "C" __global__ void applySettleToVelocities(int numClusters, mixed tol, const real4* __restrict__ oldPos, real4* __restrict__ posCorrection, mixed4* __restrict__ posDelta, mixed4* __restrict__ velm, const int4* __restrict__ clusterAtoms, const float2* __restrict__ clusterParams) {
-    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numClusters; index += blockDim.x*gridDim.x) {
+KERNEL void applySettleToVelocities(int numClusters, mixed tol, GLOBAL const real4* RESTRICT oldPos,
+        GLOBAL mixed4* RESTRICT posDelta, GLOBAL mixed4* RESTRICT velm, GLOBAL const int4* RESTRICT clusterAtoms,
+        GLOBAL const float2* RESTRICT clusterParams
+#ifdef USE_MIXED_PRECISION
+        , GLOBAL const real4* RESTRICT posqCorrection
+#endif
+        ) {
+#ifndef USE_MIXED_PRECISION
+        GLOBAL real4* posqCorrection = 0;
+#endif
+    for (int index = GLOBAL_ID; index < numClusters; index += GLOBAL_SIZE) {
        // Load the data for this cluster.

        int4 atoms = clusterAtoms[index];
-        mixed4 apos0 = loadPos(oldPos, posCorrection, atoms.x);
-        mixed4 apos1 = loadPos(oldPos, posCorrection, atoms.y);
-        mixed4 apos2 = loadPos(oldPos, posCorrection, atoms.z);
+        mixed4 apos0 = loadPos(oldPos, posqCorrection, atoms.x);
+        mixed4 apos1 = loadPos(oldPos, posqCorrection, atoms.y);
+        mixed4 apos2 = loadPos(oldPos, posqCorrection, atoms.z);
        mixed4 v0 = velm[atoms.x];
        mixed4 v1 = velm[atoms.y];
        mixed4 v2 = velm[atoms.z];
@@ -522,9 +556,16 @@ extern "C" __global__ void applySettleToVelocities(int numClusters, mixed tol, c
 /**
 * Compute the direction each CCMA constraint is pointing in.  This is called once at the beginning of constraint evaluation.
 */
-extern "C" __global__ void computeCCMAConstraintDirections(const int2* __restrict__ constraintAtoms, mixed4* __restrict__ constraintDistance,
-        const real4* __restrict__ atomPositions, const real4* __restrict__ posqCorrection, int* __restrict__ converged) {
-    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_CCMA_CONSTRAINTS; index += blockDim.x*gridDim.x) {
+KERNEL void computeCCMAConstraintDirections(GLOBAL const int2* RESTRICT constraintAtoms, GLOBAL mixed4* RESTRICT constraintDistance,
+        GLOBAL const real4* RESTRICT atomPositions, GLOBAL int* RESTRICT converged
+#ifdef USE_MIXED_PRECISION
+        , GLOBAL const real4* RESTRICT posqCorrection
+#endif
+        ) {
+#ifndef USE_MIXED_PRECISION
+        GLOBAL real4* posqCorrection = 0;
+#endif
+    for (int index = GLOBAL_ID; index < NUM_CCMA_CONSTRAINTS; index += GLOBAL_SIZE) {
        // Compute the direction for this constraint.

        int2 atoms = constraintAtoms[index];
@@ -536,7 +577,7 @@ extern "C" __global__ void computeCCMAConstraintDirections(const int2* __restric
        dir.z = oldPos1.z-oldPos2.z;
        constraintDistance[index] = dir;
    }
-    if (threadIdx.x == 0 && blockIdx.x == 0) {
+    if (GLOBAL_ID == 0) {
        converged[0] = 1;
        converged[1] = 0;
    }
@@ -545,23 +586,24 @@ extern "C" __global__ void computeCCMAConstraintDirections(const int2* __restric
 /**
 * Compute the force applied by each CCMA position constraint.
 */
-extern "C" __global__ void computeCCMAPositionConstraintForce(const int2* __restrict__ constraintAtoms, const mixed4* __restrict__ constraintDistance, const mixed4* __restrict__ atomPositions,
-        const mixed* __restrict__ reducedMass, mixed* __restrict__ delta1, int* __restrict__ converged, int* __restrict__ hostConvergedFlag, mixed tol, int iteration) {
-    __shared__ int groupConverged;
+KERNEL void computeCCMAPositionConstraintForce(GLOBAL const int2* RESTRICT constraintAtoms, GLOBAL const mixed4* RESTRICT constraintDistance,
+        GLOBAL const mixed4* RESTRICT atomPositions, GLOBAL const mixed* RESTRICT reducedMass, GLOBAL mixed* RESTRICT delta1,
+        GLOBAL int* RESTRICT converged, GLOBAL int* RESTRICT hostConvergedFlag, mixed tol, int iteration) {
+    LOCAL int groupConverged;
    if (converged[1-iteration%2]) {
-        if (blockIdx.x == 0 && threadIdx.x == 0) {
+        if (GLOBAL_ID == 0) {
            converged[iteration%2] = 1;
            hostConvergedFlag[0] = 1;
        }
        return; // The constraint iteration has already converged.
    }
-    if (threadIdx.x == 0)
+    if (LOCAL_ID == 0)
        groupConverged = 1;
-    __syncthreads();
+    SYNC_THREADS;
    mixed lowerTol = 1-2*tol+tol*tol;
    mixed upperTol = 1+2*tol+tol*tol;
    bool threadConverged = true;
-    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_CCMA_CONSTRAINTS; index += blockDim.x*gridDim.x) {
+    for (int index = GLOBAL_ID; index < NUM_CCMA_CONSTRAINTS; index += GLOBAL_SIZE) {
        // Compute the force due to this constraint.

        int2 atoms = constraintAtoms[index];
@@ -580,28 +622,29 @@ extern "C" __global__ void computeCCMAPositionConstraintForce(const int2* __rest
    }
    if (groupConverged && !threadConverged)
        groupConverged = 0;
-    __syncthreads();
-    if (threadIdx.x == 0 && !groupConverged)
+    SYNC_THREADS;
+    if (LOCAL_ID == 0 && !groupConverged)
        converged[iteration%2] = 0;
 }

 /**
 * Compute the force applied by each CCMA velocity constraint.
 */
-extern "C" __global__ void computeCCMAVelocityConstraintForce(const int2* __restrict__ constraintAtoms, const mixed4* __restrict__ constraintDistance, const mixed4* __restrict__ atomPositions,
-        const mixed* __restrict__ reducedMass, mixed* __restrict__ delta1, int* __restrict__ converged, int* __restrict__ hostConvergedFlag, mixed tol, int iteration) {
-    __shared__ int groupConverged;
+KERNEL void computeCCMAVelocityConstraintForce(GLOBAL const int2* RESTRICT constraintAtoms, GLOBAL const mixed4* RESTRICT constraintDistance,
+        GLOBAL const mixed4* RESTRICT atomPositions, GLOBAL const mixed* RESTRICT reducedMass, GLOBAL mixed* RESTRICT delta1,
+        GLOBAL int* RESTRICT converged, GLOBAL int* RESTRICT hostConvergedFlag, mixed tol, int iteration) {
+    LOCAL int groupConverged;
    if (converged[1-iteration%2]) {
-        if (blockIdx.x == 0 && threadIdx.x == 0) {
+        if (GROUP_ID == 0 && LOCAL_ID == 0) {
            converged[iteration%2] = 1;
            hostConvergedFlag[0] = 1;
        }
        return; // The constraint iteration has already converged.
    }
-    if (threadIdx.x == 0)
+    if (LOCAL_ID == 0)
        groupConverged = 1;
-    __syncthreads();
-    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_CCMA_CONSTRAINTS; index += blockDim.x*gridDim.x) {
+    SYNC_THREADS;
+    for (int index = GLOBAL_ID; index < NUM_CCMA_CONSTRAINTS; index += GLOBAL_SIZE) {
        // Compute the force due to this constraint.

        int2 atoms = constraintAtoms[index];
@@ -623,14 +666,14 @@ extern "C" __global__ void computeCCMAVelocityConstraintForce(const int2* __rest
 /**
 * Multiply the vector of CCMA constraint forces by the constraint matrix.
 */
-extern "C" __global__ void multiplyByCCMAConstraintMatrix(const mixed* __restrict__ delta1, mixed* __restrict__ delta2, const int* __restrict__ constraintMatrixColumn,
-        const mixed* __restrict__ constraintMatrixValue, const int* __restrict__ converged, int iteration) {
+KERNEL void multiplyByCCMAConstraintMatrix(GLOBAL const mixed* RESTRICT delta1, GLOBAL mixed* RESTRICT delta2, GLOBAL const int* RESTRICT constraintMatrixColumn,
+        GLOBAL const mixed* RESTRICT constraintMatrixValue, GLOBAL const int* RESTRICT converged, int iteration) {
    if (converged[iteration%2])
        return; // The constraint iteration has already converged.

    // Multiply by the inverse constraint matrix.

-    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_CCMA_CONSTRAINTS; index += blockDim.x*gridDim.x) {
+    for (int index = GLOBAL_ID; index < NUM_CCMA_CONSTRAINTS; index += GLOBAL_SIZE) {
        mixed sum = 0;
        for (int i = 0; ; i++) {
            int element = index+i*NUM_CCMA_CONSTRAINTS;
@@ -646,14 +689,15 @@ extern "C" __global__ void multiplyByCCMAConstraintMatrix(const mixed* __restric
 /**
 * Update the atom positions based on CCMA constraint forces.
 */
-extern "C" __global__ void updateCCMAAtomPositions(const int* __restrict__ numAtomConstraints, const int* __restrict__ atomConstraints, const mixed4* __restrict__ constraintDistance,
-        mixed4* __restrict__ atomPositions, const mixed4* __restrict__ velm, const mixed* __restrict__ delta1, const mixed* __restrict__ delta2, int* __restrict__ converged, int iteration) {
-    if (blockIdx.x == 0 && threadIdx.x == 0)
+KERNEL void updateCCMAAtomPositions(GLOBAL const int* RESTRICT numAtomConstraints, GLOBAL const int* RESTRICT atomConstraints,
+        GLOBAL const mixed4* RESTRICT constraintDistance, GLOBAL mixed4* RESTRICT atomPositions, GLOBAL const mixed4* RESTRICT velm,
+        GLOBAL const mixed* RESTRICT delta1, GLOBAL const mixed* RESTRICT delta2, GLOBAL int* RESTRICT converged, int iteration) {
+    if (GROUP_ID == 0 && LOCAL_ID == 0)
        converged[1-iteration%2] = 1;
    if (converged[iteration%2])
        return; // The constraint iteration has already converged.
    mixed damping = (iteration < 2 ? 0.5f : 1.0f);
-    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
+    for (int index = GLOBAL_ID; index < NUM_ATOMS; index += GLOBAL_SIZE) {
        // Compute the new position of this atom.

        mixed4 atomPos = atomPositions[index];
@@ -677,16 +721,17 @@ extern "C" __global__ void updateCCMAAtomPositions(const int* __restrict__ numAt
 /**
 * Compute the positions of virtual sites
 */
-extern "C" __global__ void computeVirtualSites(real4* __restrict__ posq, real4* __restrict__ posqCorrection, const int4* __restrict__ avg2Atoms, const real2* __restrict__ avg2Weights,
-        const int4* __restrict__ avg3Atoms, const real4* __restrict__ avg3Weights,
-        const int4* __restrict__ outOfPlaneAtoms, const real4* __restrict__ outOfPlaneWeights,
-        const int* __restrict__ localCoordsIndex, const int* __restrict__ localCoordsAtoms,
-        const real* __restrict__ localCoordsWeights, const real4* __restrict__ localCoordsPos,
-        const int* __restrict__ localCoordsStartIndex) {
+KERNEL void computeVirtualSites(GLOBAL real4* RESTRICT posq, GLOBAL real4* RESTRICT posqCorrection,
+        GLOBAL const int4* RESTRICT avg2Atoms, GLOBAL const real2* RESTRICT avg2Weights,
+        GLOBAL const int4* RESTRICT avg3Atoms, GLOBAL const real4* RESTRICT avg3Weights,
+        GLOBAL const int4* RESTRICT outOfPlaneAtoms, GLOBAL const real4* RESTRICT outOfPlaneWeights,
+        GLOBAL const int* RESTRICT localCoordsIndex, GLOBAL const int* RESTRICT localCoordsAtoms,
+        GLOBAL const real* RESTRICT localCoordsWeights, GLOBAL const real4* RESTRICT localCoordsPos,
+        GLOBAL const int* RESTRICT localCoordsStartIndex) {
    
    // Two particle average sites.
    
-    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_2_AVERAGE; index += blockDim.x*gridDim.x) {
+    for (int index = GLOBAL_ID; index < NUM_2_AVERAGE; index += GLOBAL_SIZE) {
        int4 atoms = avg2Atoms[index];
        real2 weights = avg2Weights[index];
        mixed4 pos = loadPos(posq, posqCorrection, atoms.x);
@@ -700,7 +745,7 @@ extern "C" __global__ void computeVirtualSites(real4* __restrict__ posq, real4*
    
    // Three particle average sites.
    
-    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_3_AVERAGE; index += blockDim.x*gridDim.x) {
+    for (int index = GLOBAL_ID; index < NUM_3_AVERAGE; index += GLOBAL_SIZE) {
        int4 atoms = avg3Atoms[index];
        real4 weights = avg3Weights[index];
        mixed4 pos = loadPos(posq, posqCorrection, atoms.x);
@@ -715,7 +760,7 @@ extern "C" __global__ void computeVirtualSites(real4* __restrict__ posq, real4*
    
    // Out of plane sites.
    
-    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_OUT_OF_PLANE; index += blockDim.x*gridDim.x) {
+    for (int index = GLOBAL_ID; index < NUM_OUT_OF_PLANE; index += GLOBAL_SIZE) {
        int4 atoms = outOfPlaneAtoms[index];
        real4 weights = outOfPlaneWeights[index];
        mixed4 pos = loadPos(posq, posqCorrection, atoms.x);
@@ -724,7 +769,7 @@ extern "C" __global__ void computeVirtualSites(real4* __restrict__ posq, real4*
        mixed4 pos3 = loadPos(posq, posqCorrection, atoms.w);
        mixed4 v12 = pos2-pos1;
        mixed4 v13 = pos3-pos1;
-        mixed3 cr = cross(v12, v13);
+        mixed4 cr = cross(v12, v13);
        pos.x = pos1.x + v12.x*weights.x + v13.x*weights.y + cr.x*weights.z;
        pos.y = pos1.y + v12.y*weights.x + v13.y*weights.y + cr.y*weights.z;
        pos.z = pos1.z + v12.z*weights.x + v13.z*weights.y + cr.z*weights.z;
@@ -733,7 +778,7 @@ extern "C" __global__ void computeVirtualSites(real4* __restrict__ posq, real4*
    
    // Local coordinates sites.
    
-    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_LOCAL_COORDS; index += blockDim.x*gridDim.x) {
+    for (int index = GLOBAL_ID; index < NUM_LOCAL_COORDS; index += GLOBAL_SIZE) {
        int siteAtomIndex = localCoordsIndex[index];
        int start = localCoordsStartIndex[index];
        int end = localCoordsStartIndex[index+1];
@@ -762,32 +807,38 @@ extern "C" __global__ void computeVirtualSites(real4* __restrict__ posq, real4*
    }
 }

-inline __device__ real3 loadForce(int index, long long* __restrict__ force) {
+inline DEVICE real3 loadForce(int index, GLOBAL const mm_long* RESTRICT force) {
    real scale = 1/((real) 0x100000000);
    return make_real3(scale*force[index], scale*force[index+PADDED_NUM_ATOMS], scale*force[index+PADDED_NUM_ATOMS*2]);
 }

-inline __device__ void addForce(int index, long long* __restrict__ force, real3 value) {
-    unsigned long long* f = (unsigned long long*) force;
-    atomicAdd(&f[index], static_cast<unsigned long long>((long long) (value.x*0x100000000)));
-    atomicAdd(&f[index+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (value.y*0x100000000)));
-    atomicAdd(&f[index+PADDED_NUM_ATOMS*2], static_cast<unsigned long long>((long long) (value.z*0x100000000)));
+inline DEVICE void addForce(int index, GLOBAL mm_long* RESTRICT force, real3 value) {
+    GLOBAL mm_ulong* f = (GLOBAL mm_ulong*) force;
+#ifdef HAS_OVERLAPPING_VSITES
+    ATOMIC_ADD(&f[index], (mm_ulong) ((mm_long) (value.x*0x100000000)));
+    ATOMIC_ADD(&f[index+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (value.y*0x100000000)));
+    ATOMIC_ADD(&f[index+PADDED_NUM_ATOMS*2], (mm_ulong) ((mm_long) (value.z*0x100000000)));
+#else
+    f[index] += (mm_ulong) ((mm_long) (value.x*0x100000000));
+    f[index+PADDED_NUM_ATOMS] += (mm_ulong) ((mm_long) (value.y*0x100000000));
+    f[index+PADDED_NUM_ATOMS*2] += (mm_ulong) ((mm_long) (value.z*0x100000000));
+#endif
 }

 /**
 * Distribute forces from virtual sites to the atoms they are based on.
 */
-extern "C" __global__ void distributeVirtualSiteForces(const real4* __restrict__ posq, const real4* __restrict__ posqCorrection, long long* __restrict__ force,
-        const int4* __restrict__ avg2Atoms, const real2* __restrict__ avg2Weights,
-        const int4* __restrict__ avg3Atoms, const real4* __restrict__ avg3Weights,
-        const int4* __restrict__ outOfPlaneAtoms, const real4* __restrict__ outOfPlaneWeights,
-        const int* __restrict__ localCoordsIndex, const int* __restrict__ localCoordsAtoms,
-        const real* __restrict__ localCoordsWeights, const real4* __restrict__ localCoordsPos,
-        const int* __restrict__ localCoordsStartIndex) {
+KERNEL void distributeVirtualSiteForces(GLOBAL const real4* RESTRICT posq, GLOBAL const real4* RESTRICT posqCorrection, GLOBAL mm_long* RESTRICT force,
+        GLOBAL const int4* RESTRICT avg2Atoms, GLOBAL const real2* RESTRICT avg2Weights,
+        GLOBAL const int4* RESTRICT avg3Atoms, GLOBAL const real4* RESTRICT avg3Weights,
+        GLOBAL const int4* RESTRICT outOfPlaneAtoms, GLOBAL const real4* RESTRICT outOfPlaneWeights,
+        GLOBAL const int* RESTRICT localCoordsIndex, GLOBAL const int* RESTRICT localCoordsAtoms,
+        GLOBAL const real* RESTRICT localCoordsWeights, GLOBAL const real4* RESTRICT localCoordsPos,
+        GLOBAL const int* RESTRICT localCoordsStartIndex) {
    
    // Two particle average sites.
    
-    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_2_AVERAGE; index += blockDim.x*gridDim.x) {
+    for (int index = GLOBAL_ID; index < NUM_2_AVERAGE; index += GLOBAL_SIZE) {
        int4 atoms = avg2Atoms[index];
        real2 weights = avg2Weights[index];
        real3 f = loadForce(atoms.x, force);
@@ -797,7 +848,7 @@ extern "C" __global__ void distributeVirtualSiteForces(const real4* __restrict__
    
    // Three particle average sites.
    
-    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_3_AVERAGE; index += blockDim.x*gridDim.x) {
+    for (int index = GLOBAL_ID; index < NUM_3_AVERAGE; index += GLOBAL_SIZE) {
        int4 atoms = avg3Atoms[index];
        real4 weights = avg3Weights[index];
        real3 f = loadForce(atoms.x, force);
@@ -808,7 +859,7 @@ extern "C" __global__ void distributeVirtualSiteForces(const real4* __restrict__
    
    // Out of plane sites.
    
-    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_OUT_OF_PLANE; index += blockDim.x*gridDim.x) {
+    for (int index = GLOBAL_ID; index < NUM_OUT_OF_PLANE; index += GLOBAL_SIZE) {
        int4 atoms = outOfPlaneAtoms[index];
        real4 weights = outOfPlaneWeights[index];
        mixed4 pos1 = loadPos(posq, posqCorrection, atoms.y);
@@ -830,7 +881,7 @@ extern "C" __global__ void distributeVirtualSiteForces(const real4* __restrict__
    
    // Local coordinates sites.
    
-    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_LOCAL_COORDS; index += blockDim.x*gridDim.x) {
+    for (int index = GLOBAL_ID; index < NUM_LOCAL_COORDS; index += GLOBAL_SIZE) {
        int siteAtomIndex = localCoordsIndex[index];
        int start = localCoordsStartIndex[index];
        int end = localCoordsStartIndex[index+1];
@@ -884,12 +935,22 @@ extern "C" __global__ void distributeVirtualSiteForces(const real4* __restrict__
    }
 }

+/**
+ * Copy the distributed forces from the long buffer back to the float buffer.
+ */
+KERNEL void saveDistributedForces(GLOBAL const mm_long* RESTRICT longForces, GLOBAL real4* RESTRICT forces) {
+    for (int index = GLOBAL_ID; index < NUM_ATOMS; index += GLOBAL_SIZE) {
+        real3 f = loadForce(index, longForces);
+        forces[index] = make_real4(f.x, f.y, f.z, 0);
+    }
+}
+
 /**
 * Apply a time shift to the velocities before computing kinetic energy.
 */
-extern "C" __global__ void timeShiftVelocities(mixed4* __restrict__ velm, const long long* __restrict__ force, real timeShift) {
+KERNEL void timeShiftVelocities(GLOBAL mixed4* RESTRICT velm, GLOBAL const mm_long* RESTRICT force, real timeShift) {
    const mixed scale = timeShift/(mixed) 0x100000000;
-    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
+    for (int index = GLOBAL_ID; index < NUM_ATOMS; index += GLOBAL_SIZE) {
        mixed4 velocity = velm[index];
        if (velocity.w != 0.0) {
            velocity.x += scale*force[index]*velocity.w;

--- a/platforms/cuda/src/kernels/langevin.cu
+++ b/platforms/cuda/src/kernels/langevin.cu
@@ -4,13 +4,13 @@ enum {VelScale, ForceScale, NoiseScale, MaxParams};
 * Perform the first step of Langevin integration.
 */

-extern "C" __global__ void integrateLangevinPart1(int numAtoms, int paddedNumAtoms, mixed4* __restrict__ velm, const long long* __restrict__ force, mixed4* __restrict__ posDelta,
-        const mixed* __restrict__ paramBuffer, const mixed2* __restrict__ dt, const float4* __restrict__ random, unsigned int randomIndex) {
+KERNEL void integrateLangevinPart1(int numAtoms, int paddedNumAtoms, GLOBAL mixed4* RESTRICT velm, GLOBAL const mm_long* RESTRICT force, GLOBAL mixed4* RESTRICT posDelta,
+        GLOBAL const mixed* RESTRICT paramBuffer, GLOBAL const mixed2* RESTRICT dt, GLOBAL const float4* RESTRICT random, unsigned int randomIndex) {
    mixed vscale = paramBuffer[VelScale];
    mixed fscale = paramBuffer[ForceScale]/(mixed) 0x100000000;
    mixed noisescale = paramBuffer[NoiseScale];
    mixed stepSize = dt[0].y;
-    int index = blockIdx.x*blockDim.x+threadIdx.x;
+    int index = GLOBAL_ID;
    randomIndex += index;
    while (index < numAtoms) {
        mixed4 velocity = velm[index];
@@ -22,8 +22,8 @@ extern "C" __global__ void integrateLangevinPart1(int numAtoms, int paddedNumAto
            velm[index] = velocity;
            posDelta[index] = make_mixed4(stepSize*velocity.x, stepSize*velocity.y, stepSize*velocity.z, 0);
        }
-        randomIndex += blockDim.x*gridDim.x;
-        index += blockDim.x*gridDim.x;
+        randomIndex += GLOBAL_SIZE;
+        index += GLOBAL_SIZE;
    }
 }

@@ -31,14 +31,18 @@ extern "C" __global__ void integrateLangevinPart1(int numAtoms, int paddedNumAto
 * Perform the second step of Langevin integration.
 */

-extern "C" __global__ void integrateLangevinPart2(int numAtoms, real4* __restrict__ posq, real4* __restrict__ posqCorrection, const mixed4* __restrict__ posDelta, mixed4* __restrict__ velm, const mixed2* __restrict__ dt) {
-#if __CUDA_ARCH__ >= 130
+KERNEL void integrateLangevinPart2(int numAtoms, GLOBAL real4* RESTRICT posq, GLOBAL const mixed4* RESTRICT posDelta, GLOBAL mixed4* RESTRICT velm, GLOBAL const mixed2* RESTRICT dt
+#ifdef USE_MIXED_PRECISION
+        , GLOBAL real4* RESTRICT posqCorrection
+#endif
+        ) {
+#ifdef SUPPORTS_DOUBLE_PRECISION
    double invStepSize = 1.0/dt[0].y;
 #else
    float invStepSize = 1.0f/dt[0].y;
    float correction = (1.0f-invStepSize*dt[0].y)/dt[0].y;
 #endif
-    int index = blockIdx.x*blockDim.x+threadIdx.x;
+    int index = GLOBAL_ID;
    while (index < numAtoms) {
        mixed4 vel = velm[index];
        if (vel.w != 0) {
@@ -53,7 +57,7 @@ extern "C" __global__ void integrateLangevinPart2(int numAtoms, real4* __restric
            pos.x += delta.x;
            pos.y += delta.y;
            pos.z += delta.z;
-#if __CUDA_ARCH__ >= 130
+#ifdef SUPPORTS_DOUBLE_PRECISION
            vel.x = (mixed) (invStepSize*delta.x);
            vel.y = (mixed) (invStepSize*delta.y);
            vel.z = (mixed) (invStepSize*delta.z);
@@ -70,7 +74,7 @@ extern "C" __global__ void integrateLangevinPart2(int numAtoms, real4* __restric
 #endif
            velm[index] = vel;
        }
-        index += blockDim.x*gridDim.x;
+        index += GLOBAL_SIZE;
    }
 }

@@ -78,32 +82,30 @@ extern "C" __global__ void integrateLangevinPart2(int numAtoms, real4* __restric
 * Select the step size to use for the next step.
 */

-extern "C" __global__ void selectLangevinStepSize(int numAtoms, int paddedNumAtoms, mixed maxStepSize, mixed errorTol, mixed friction, mixed kT, mixed2* __restrict__ dt,
-        const mixed4* __restrict__ velm, const long long* __restrict__ force, mixed* __restrict__ paramBuffer) {
+KERNEL void selectLangevinStepSize(int numAtoms, int paddedNumAtoms, mixed maxStepSize, mixed errorTol, mixed friction, mixed kT, GLOBAL mixed2* RESTRICT dt,
+        GLOBAL const mixed4* RESTRICT velm, GLOBAL const mm_long* RESTRICT force, GLOBAL mixed* RESTRICT paramBuffer) {
    // Calculate the error.

-    extern __shared__ mixed params[];
-    mixed* error = &params[MaxParams];
+    LOCAL mixed error[256];
+    LOCAL mixed params[MaxParams];
    mixed err = 0;
-    unsigned int index = threadIdx.x;
    const mixed scale = RECIP((mixed) 0x100000000);
-    while (index < numAtoms) {
+    for (int index = LOCAL_ID; index < numAtoms; index += LOCAL_SIZE) {
        mixed3 f = make_mixed3(scale*force[index], scale*force[index+paddedNumAtoms], scale*force[index+paddedNumAtoms*2]);
        mixed invMass = velm[index].w;
        err += (f.x*f.x + f.y*f.y + f.z*f.z)*invMass*invMass;
-        index += blockDim.x*gridDim.x;
    }
-    error[threadIdx.x] = err;
-    __syncthreads();
+    error[LOCAL_ID] = err;
+    SYNC_THREADS;

    // Sum the errors from all threads.

-    for (unsigned int offset = 1; offset < blockDim.x; offset *= 2) {
-        if (threadIdx.x+offset < blockDim.x && (threadIdx.x&(2*offset-1)) == 0)
-            error[threadIdx.x] += error[threadIdx.x+offset];
-        __syncthreads();
+    for (unsigned int offset = 1; offset < LOCAL_SIZE; offset *= 2) {
+        if (LOCAL_ID+offset < LOCAL_SIZE && (LOCAL_ID&(2*offset-1)) == 0)
+            error[LOCAL_ID] += error[LOCAL_ID+offset];
+        SYNC_THREADS;
    }
-    if (blockIdx.x*blockDim.x+threadIdx.x == 0) {
+    if (GLOBAL_ID == 0) {
        // Select the new step size.

        mixed totalError = SQRT(error[0]/(numAtoms*3));
@@ -126,7 +128,7 @@ extern "C" __global__ void selectLangevinStepSize(int numAtoms, int paddedNumAto
        params[ForceScale] = fscale;
        params[NoiseScale] = noisescale;
    }
-    __syncthreads();
-    if (threadIdx.x < MaxParams)
-        paramBuffer[threadIdx.x] = params[threadIdx.x];
+    SYNC_THREADS;
+    if (LOCAL_ID < MaxParams)
+        paramBuffer[LOCAL_ID] = params[LOCAL_ID];
 }
--- a/platforms/cuda/src/kernels/baoab.cu
+++ b/platforms/cuda/src/kernels/baoab.cu
 enum {VelScale, NoiseScale};

 /**
- * Perform the first part of BAOAB integration: velocity half step, then position half step.
+ * Perform the first part of integration: velocity step.
 */

-extern "C" __global__ void integrateBAOABPart1(int numAtoms, int paddedNumAtoms, mixed4* __restrict__ velm, const long long* __restrict__ force, mixed4* __restrict__ posDelta,
-        mixed4* __restrict__ oldDelta, const mixed2* __restrict__ dt) {
-    mixed halfdt = 0.5*dt[0].y;
-    mixed fscale = halfdt/(mixed) 0x100000000;
-    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
+KERNEL void integrateLangevinMiddlePart1(int numAtoms, int paddedNumAtoms, GLOBAL mixed4* RESTRICT velm, GLOBAL const mm_long* RESTRICT force,
+        GLOBAL const mixed2* RESTRICT dt) {
+    mixed fscale = dt[0].y/(mixed) 0x100000000;
+    for (int index = GLOBAL_ID; index < numAtoms; index += GLOBAL_SIZE) {
        mixed4 velocity = velm[index];
        if (velocity.w != 0.0) {
            velocity.x += fscale*velocity.w*force[index];
            velocity.y += fscale*velocity.w*force[index+paddedNumAtoms];
            velocity.z += fscale*velocity.w*force[index+paddedNumAtoms*2];
            velm[index] = velocity;
-            mixed4 delta = make_mixed4(halfdt*velocity.x, halfdt*velocity.y, halfdt*velocity.z, 0);
-            posDelta[index] = delta;
-            oldDelta[index] = delta;
        }
    }
 }

 /**
- * Perform the second part of BAOAB integration: apply constraint forces to velocities, then interact with heat bath,
- * then position half step.
+ * Perform the second part of integration: position half step, then interact with heat bath,
+ * then another position half step.
 */

-extern "C" __global__ void integrateBAOABPart2(int numAtoms, real4* __restrict__ posq, real4* __restrict__ posqCorrection, mixed4* __restrict__ velm, mixed4* __restrict__ posDelta,
-        mixed4* __restrict__ oldDelta, const mixed* __restrict__ paramBuffer, const mixed2* __restrict__ dt, const float4* __restrict__ random, unsigned int randomIndex) {
+KERNEL void integrateLangevinMiddlePart2(int numAtoms, GLOBAL mixed4* RESTRICT velm, GLOBAL mixed4* RESTRICT posDelta,
+        GLOBAL mixed4* RESTRICT oldDelta, GLOBAL const mixed* RESTRICT paramBuffer, GLOBAL const mixed2* RESTRICT dt, GLOBAL const float4* RESTRICT random, unsigned int randomIndex
+        ) {
    mixed vscale = paramBuffer[VelScale];
    mixed noisescale = paramBuffer[NoiseScale];
    mixed halfdt = 0.5*dt[0].y;
-    mixed invHalfdt = 1/halfdt;
-    int index = blockIdx.x*blockDim.x+threadIdx.x;
+    int index = GLOBAL_ID;
    randomIndex += index;
    while (index < numAtoms) {
        mixed4 velocity = velm[index];
        if (velocity.w != 0.0) {
-            mixed4 delta = posDelta[index];
+            mixed4 delta = make_mixed4(halfdt*velocity.x, halfdt*velocity.y, halfdt*velocity.z, 0);
            mixed sqrtInvMass = SQRT(velocity.w);
-            velocity.x += (delta.x-oldDelta[index].x)*invHalfdt;
-            velocity.y += (delta.y-oldDelta[index].y)*invHalfdt;
-            velocity.z += (delta.z-oldDelta[index].z)*invHalfdt;
            velocity.x = vscale*velocity.x + noisescale*sqrtInvMass*random[randomIndex].x;
            velocity.y = vscale*velocity.y + noisescale*sqrtInvMass*random[randomIndex].y;
            velocity.z = vscale*velocity.z + noisescale*sqrtInvMass*random[randomIndex].z;
            velm[index] = velocity;
-#ifdef USE_MIXED_PRECISION
-            real4 pos1 = posq[index];
-            real4 pos2 = posqCorrection[index];
-            mixed4 pos = make_mixed4(pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
-#else
-            real4 pos = posq[index];
-#endif
-            pos.x += delta.x;
-            pos.y += delta.y;
-            pos.z += delta.z;
-#ifdef USE_MIXED_PRECISION
-            posq[index] = make_real4((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w);
-            posqCorrection[index] = make_real4(pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
-#else
-            posq[index] = pos;
-#endif
-            delta = make_mixed4(halfdt*velocity.x, halfdt*velocity.y, halfdt*velocity.z, 0);
+            delta += make_mixed4(halfdt*velocity.x, halfdt*velocity.y, halfdt*velocity.z, 0);
            posDelta[index] = delta;
            oldDelta[index] = delta;
        }
-        randomIndex += blockDim.x*gridDim.x;
-        index += blockDim.x*gridDim.x;
+        randomIndex += GLOBAL_SIZE;
+        index += GLOBAL_SIZE;
    }
 }

 /**
- * Perform the third part of BAOAB integration: apply constraint forces to velocities, then record
- * the constrained positions in preparation for computing forces.
+ * Perform the third part of integration: apply constraint forces to velocities, then record
+ * the constrained positions.
 */

-extern "C" __global__ void integrateBAOABPart3(int numAtoms, real4* __restrict__ posq, real4* __restrict__ posqCorrection, mixed4* __restrict__ velm,
-        mixed4* __restrict__ posDelta, mixed4* __restrict__ oldDelta, const mixed2* __restrict__ dt) {
-    mixed halfdt = 0.5*dt[0].y;
-    mixed invHalfdt = 1/halfdt;
-    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
+KERNEL void integrateLangevinMiddlePart3(int numAtoms, GLOBAL real4* RESTRICT posq, GLOBAL mixed4* RESTRICT velm,
+         GLOBAL mixed4* RESTRICT posDelta, GLOBAL mixed4* RESTRICT oldDelta, GLOBAL const mixed2* RESTRICT dt
+#ifdef USE_MIXED_PRECISION
+        , GLOBAL real4* RESTRICT posqCorrection
+#endif
+        ) {
+    mixed invDt = 1/dt[0].y;
+    for (int index = GLOBAL_ID; index < numAtoms; index += GLOBAL_SIZE) {
        mixed4 velocity = velm[index];
        if (velocity.w != 0.0) {
            mixed4 delta = posDelta[index];
-            velocity.x += (delta.x-oldDelta[index].x)*invHalfdt;
-            velocity.y += (delta.y-oldDelta[index].y)*invHalfdt;
-            velocity.z += (delta.z-oldDelta[index].z)*invHalfdt;
+            velocity.x += (delta.x-oldDelta[index].x)*invDt;
+            velocity.y += (delta.y-oldDelta[index].y)*invDt;
+            velocity.z += (delta.z-oldDelta[index].z)*invDt;
            velm[index] = velocity;
 #ifdef USE_MIXED_PRECISION
            real4 pos1 = posq[index];
@@ -108,22 +88,3 @@ extern "C" __global__ void integrateBAOABPart3(int numAtoms, real4* __restrict__
        }
    }
 }
-
-/**
- * Perform the fourth part of BAOAB integration: velocity half step.
- */
-
-extern "C" __global__ void integrateBAOABPart4(int numAtoms, int paddedNumAtoms, mixed4* __restrict__ velm,
-        const long long* __restrict__ force, const mixed2* __restrict__ dt) {
-    mixed halfdt = 0.5*dt[0].y;
-    mixed fscale = halfdt/(mixed) 0x100000000;
-    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
-        mixed4 velocity = velm[index];
-        if (velocity.w != 0.0) {
-            velocity.x += fscale*velocity.w*force[index];
-            velocity.y += fscale*velocity.w*force[index+paddedNumAtoms];
-            velocity.z += fscale*velocity.w*force[index+paddedNumAtoms*2];
-            velm[index] = velocity;
-        }
-    }
-}
--- a/platforms/cuda/src/kernels/periodicTorsionForce.cu
+++ b/platforms/cuda/src/kernels/periodicTorsionForce.cu
--- a/platforms/cuda/src/kernels/rbTorsionForce.cu
+++ b/platforms/cuda/src/kernels/rbTorsionForce.cu
--- a/platforms/opencl/src/kernels/removeCM.cl
+++ b/platforms/opencl/src/kernels/removeCM.cl
@@ -2,111 +2,79 @@
 * Calculate the center of mass momentum.
 */

-__kernel void calcCenterOfMassMomentum(int numAtoms, __global const mixed4* restrict velm, __global float4* restrict cmMomentum, __local volatile float4* restrict temp) {
-    int index = get_global_id(0);
-    float4 cm = 0.0f;
-    while (index < numAtoms) {
+KERNEL void calcCenterOfMassMomentum(int numAtoms, GLOBAL const mixed4* RESTRICT velm, GLOBAL float3* RESTRICT cmMomentum) {
+    LOCAL float3 temp[64];
+    float3 cm = make_float3(0, 0, 0);
+    for (int index = GLOBAL_ID; index < numAtoms; index += GLOBAL_SIZE) {
        mixed4 velocity = velm[index];
        if (velocity.w != 0) {
-            cm.x += velocity.x/velocity.w;
-            cm.y += velocity.y/velocity.w;
-            cm.z += velocity.z/velocity.w;
+            mixed mass = RECIP(velocity.w);
+            cm.x += (float) (velocity.x*mass);
+            cm.y += (float) (velocity.y*mass);
+            cm.z += (float) (velocity.z*mass);
        }
-        index += get_global_size(0);
    }

    // Sum the threads in this group.

-    int thread = get_local_id(0);
+    int thread = LOCAL_ID;
    temp[thread] = cm;
-    barrier(CLK_LOCAL_MEM_FENCE);
-#ifdef WARPS_ARE_ATOMIC
-    if (thread < 32) {
-        temp[thread] += temp[thread+32];
-        if (thread < 16)
-            temp[thread] += temp[thread+16];
-        if (thread < 8)
-            temp[thread] += temp[thread+8];
-        if (thread < 4)
-            temp[thread] += temp[thread+4];
-        if (thread < 2)
-            temp[thread] += temp[thread+2];
-    }
-#else
+    SYNC_THREADS;
    if (thread < 32)
        temp[thread] += temp[thread+32];
-    barrier(CLK_LOCAL_MEM_FENCE);
+    SYNC_THREADS;
    if (thread < 16)
        temp[thread] += temp[thread+16];
-    barrier(CLK_LOCAL_MEM_FENCE);
+    SYNC_THREADS;
    if (thread < 8)
        temp[thread] += temp[thread+8];
-    barrier(CLK_LOCAL_MEM_FENCE);
+    SYNC_THREADS;
    if (thread < 4)
        temp[thread] += temp[thread+4];
-    barrier(CLK_LOCAL_MEM_FENCE);
+    SYNC_THREADS;
    if (thread < 2)
        temp[thread] += temp[thread+2];
-    barrier(CLK_LOCAL_MEM_FENCE);
-#endif
+    SYNC_THREADS;
    if (thread == 0)
-        cmMomentum[get_group_id(0)] = temp[thread]+temp[thread+1];
+        cmMomentum[GROUP_ID] = temp[thread]+temp[thread+1];
 }

 /**
 * Remove center of mass motion.
 */

-__kernel void removeCenterOfMassMomentum(unsigned int numAtoms, __global mixed4* restrict velm, __global const float4* restrict cmMomentum, __local volatile float4* restrict temp) {
+KERNEL void removeCenterOfMassMomentum(int numAtoms, GLOBAL mixed4* RESTRICT velm, GLOBAL const float3* RESTRICT cmMomentum) {
    // First sum all of the momenta that were calculated by individual groups.

-    unsigned int index = get_local_id(0);
-    float4 cm = 0.0f;
-    while (index < get_num_groups(0)) {
+    LOCAL float3 temp[64];
+    float3 cm = make_float3(0, 0, 0);
+    for (int index = LOCAL_ID; index < NUM_GROUPS; index += LOCAL_SIZE)
        cm += cmMomentum[index];
-        index += get_local_size(0);
-    }
-    int thread = get_local_id(0);
+    int thread = LOCAL_ID;
    temp[thread] = cm;
-    barrier(CLK_LOCAL_MEM_FENCE);
-#ifdef WARPS_ARE_ATOMIC
-    if (thread < 32) {
-        temp[thread] += temp[thread+32];
-        if (thread < 16)
-            temp[thread] += temp[thread+16];
-        if (thread < 8)
-            temp[thread] += temp[thread+8];
-        if (thread < 4)
-            temp[thread] += temp[thread+4];
-        if (thread < 2)
-            temp[thread] += temp[thread+2];
-    }
-#else
+    SYNC_THREADS;
    if (thread < 32)
        temp[thread] += temp[thread+32];
-    barrier(CLK_LOCAL_MEM_FENCE);
+    SYNC_THREADS;
    if (thread < 16)
        temp[thread] += temp[thread+16];
-    barrier(CLK_LOCAL_MEM_FENCE);
+    SYNC_THREADS;
    if (thread < 8)
        temp[thread] += temp[thread+8];
-    barrier(CLK_LOCAL_MEM_FENCE);
+    SYNC_THREADS;
    if (thread < 4)
        temp[thread] += temp[thread+4];
-    barrier(CLK_LOCAL_MEM_FENCE);
+    SYNC_THREADS;
    if (thread < 2)
        temp[thread] += temp[thread+2];
-#endif
-    barrier(CLK_LOCAL_MEM_FENCE);
-    cm = (float4) (INVERSE_TOTAL_MASS*(temp[0].x+temp[1].x), INVERSE_TOTAL_MASS*(temp[0].y+temp[1].y), INVERSE_TOTAL_MASS*(temp[0].z+temp[1].z), 0);
+    SYNC_THREADS;
+    cm = make_float3(INVERSE_TOTAL_MASS*(temp[0].x+temp[1].x), INVERSE_TOTAL_MASS*(temp[0].y+temp[1].y), INVERSE_TOTAL_MASS*(temp[0].z+temp[1].z));

    // Now remove the center of mass velocity from each atom.

-    index = get_global_id(0);
-    while (index < numAtoms) {
+    for (int index = GLOBAL_ID; index < numAtoms; index += GLOBAL_SIZE) {
        velm[index].x -= cm.x;
        velm[index].y -= cm.y;
        velm[index].z -= cm.z;
-        index += get_global_size(0);
    }
 }
--- a/platforms/cuda/src/kernels/rmsd.cu
+++ b/platforms/cuda/src/kernels/rmsd.cu
@@ -4,20 +4,20 @@
 /**
 * Sum a value over all threads.
 */
-__device__ real reduceValue(real value, volatile real* temp) {
-    const int thread = threadIdx.x;
-    __syncthreads();
+DEVICE real reduceValue(real value, LOCAL_ARG volatile real* temp) {
+    const int thread = LOCAL_ID;
+    SYNC_THREADS;
    temp[thread] = value;
-    __syncthreads();
-    for (unsigned int step = 1; step < 32; step *= 2) {
-        if (thread+step < blockDim.x && thread%(2*step) == 0)
+    SYNC_THREADS;
+    for (int step = 1; step < 32; step *= 2) {
+        if (thread+step < LOCAL_SIZE && thread%(2*step) == 0)
            temp[thread] = temp[thread] + temp[thread+step];
-        SYNC_WARPS
+        SYNC_WARPS;
    }
-    for (unsigned int step = 32; step < blockDim.x; step *= 2) {
-        if (thread+step < blockDim.x && thread%(2*step) == 0)
+    for (int step = 32; step < LOCAL_SIZE; step *= 2) {
+        if (thread+step < LOCAL_SIZE && thread%(2*step) == 0)
            temp[thread] = temp[thread] + temp[thread+step];
-        __syncthreads();
+        SYNC_THREADS;
    }
    return temp[0];
 }
@@ -25,14 +25,14 @@ __device__ real reduceValue(real value, volatile real* temp) {
 /**
 * Perform the first step of computing the RMSD.  This is executed as a single work group.
 */
-extern "C" __global__ void computeRMSDPart1(int numParticles, const real4* __restrict__ posq, const real4* __restrict__ referencePos,
-         const int* __restrict__ particles, real* buffer) {
-    extern __shared__ volatile real temp[];
+KERNEL void computeRMSDPart1(int numParticles, GLOBAL const real4* RESTRICT posq, GLOBAL const real4* RESTRICT referencePos,
+        GLOBAL const int* RESTRICT particles, GLOBAL real* buffer) {
+    LOCAL volatile real temp[THREAD_BLOCK_SIZE];

    // Compute the center of the particle positions.
    
    real3 center = make_real3(0);
-    for (int i = threadIdx.x; i < numParticles; i += blockDim.x)
+    for (int i = LOCAL_ID; i < numParticles; i += LOCAL_SIZE)
        center += trimTo3(posq[particles[i]]);
    center.x = reduceValue(center.x, temp)/numParticles;
    center.y = reduceValue(center.y, temp)/numParticles;
@@ -42,7 +42,7 @@ extern "C" __global__ void computeRMSDPart1(int numParticles, const real4* __res
    
    real R[3][3] = {{0, 0, 0}, {0, 0, 0}, {0, 0, 0}};
    real sum = 0;
-    for (int i = threadIdx.x; i < numParticles; i += blockDim.x) {
+    for (int i = LOCAL_ID; i < numParticles; i += LOCAL_SIZE) {
        int index = particles[i];
        real3 pos = trimTo3(posq[index]) - center;
        real3 refPos = trimTo3(referencePos[index]);
@@ -64,7 +64,7 @@ extern "C" __global__ void computeRMSDPart1(int numParticles, const real4* __res

    // Copy everything into the output buffer to send back to the host.
    
-    if (threadIdx.x == 0) {
+    if (LOCAL_ID == 0) {
        for (int i = 0; i < 3; i++)
            for (int j = 0; j < 3; j++)
                buffer[3*i+j] = R[i][j];
@@ -78,11 +78,11 @@ extern "C" __global__ void computeRMSDPart1(int numParticles, const real4* __res
 /**
 * Apply forces based on the RMSD.
 */
-extern "C" __global__ void computeRMSDForces(int numParticles, int paddedNumAtoms, const real4* __restrict__ posq, const real4* __restrict__ referencePos,
-         const int* __restrict__ particles, const real* buffer, unsigned long long* __restrict__ forceBuffers) {
+KERNEL void computeRMSDForces(int numParticles, int paddedNumAtoms, GLOBAL const real4* RESTRICT posq, GLOBAL const real4* RESTRICT referencePos,
+        GLOBAL const int* RESTRICT particles, GLOBAL const real* buffer, GLOBAL mm_long* RESTRICT forceBuffers) {
    real3 center = make_real3(buffer[10], buffer[11], buffer[12]);
    real scale = 1 / (real) (buffer[9]*numParticles);
-    for (int i = blockDim.x*blockIdx.x+threadIdx.x; i < numParticles; i += blockDim.x*gridDim.x) {
+    for (int i = GLOBAL_ID; i < numParticles; i += GLOBAL_SIZE) {
        int index = particles[i];
        real3 pos = trimTo3(posq[index]) - center;
        real3 refPos = trimTo3(referencePos[index]);
@@ -90,8 +90,8 @@ extern "C" __global__ void computeRMSDForces(int numParticles, int paddedNumAtom
                                      buffer[1]*refPos.x + buffer[4]*refPos.y + buffer[7]*refPos.z,
                                      buffer[2]*refPos.x + buffer[5]*refPos.y + buffer[8]*refPos.z);
        real3 force = (rotatedRef-pos)*scale;
-        atomicAdd(&forceBuffers[index], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
-        atomicAdd(&forceBuffers[index+paddedNumAtoms], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
-        atomicAdd(&forceBuffers[index+2*paddedNumAtoms], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
+        forceBuffers[index] += (mm_long) (force.x*0x100000000);
+        forceBuffers[index+paddedNumAtoms] += (mm_long) (force.y*0x100000000);
+        forceBuffers[index+2*paddedNumAtoms] += (mm_long) (force.z*0x100000000);
    }
 }
--- a/platforms/common/src/kernels/torsionForce.cc
+++ b/platforms/common/src/kernels/torsionForce.cc
+const real PI = (real) 3.14159265358979323846;
+real3 v0 = make_real3(pos1.x-pos2.x, pos1.y-pos2.y, pos1.z-pos2.z);
+real3 v1 = make_real3(pos3.x-pos2.x, pos3.y-pos2.y, pos3.z-pos2.z);
+real3 v2 = make_real3(pos3.x-pos4.x, pos3.y-pos4.y, pos3.z-pos4.z);
+#if APPLY_PERIODIC
+APPLY_PERIODIC_TO_DELTA(v0)
+APPLY_PERIODIC_TO_DELTA(v1)
+APPLY_PERIODIC_TO_DELTA(v2)
+#endif
+real3 cp0 = cross(v0, v1);
+real3 cp1 = cross(v1, v2);
+real cosangle = dot(normalize(cp0), normalize(cp1));
+real theta;
+if (cosangle > 0.99f || cosangle < -0.99f) {
+    // We're close to the singularity in acos(), so take the cross product and use asin() instead.
+
+    real3 cross_prod = cross(cp0, cp1);
+    real scale = dot(cp0, cp0)*dot(cp1, cp1);
+    theta = ASIN(SQRT(dot(cross_prod, cross_prod)/scale));
+    if (cosangle < 0)
+        theta = PI-theta;
+}
+else
+   theta = ACOS(cosangle);
+theta = (dot(v0, cp1) >= 0 ? theta : -theta);
+COMPUTE_FORCE
+real normCross1 = dot(cp0, cp0);
+real normSqrBC = dot(v1, v1);
+real normBC = SQRT(normSqrBC);
+real normCross2 = dot(cp1, cp1);
+real dp = RECIP(normSqrBC);
+real4 ff = make_real4((-dEdAngle*normBC)/normCross1, dot(v0, v1)*dp, dot(v2, v1)*dp, (dEdAngle*normBC)/normCross2);
+real3 force1 = ff.x*cp0;
+real3 force4 = ff.w*cp1;
+real3 s = ff.y*force1 - ff.z*force4;
+real3 force2 = s-force1;
+real3 force3 = -s-force4;
--- a/platforms/cuda/src/kernels/verlet.cu
+++ b/platforms/cuda/src/kernels/verlet.cu
@@ -2,13 +2,17 @@
 * Perform the first step of Verlet integration.
 */

-extern "C" __global__ void integrateVerletPart1(int numAtoms, int paddedNumAtoms, const mixed2* __restrict__ dt, const real4* __restrict__ posq,
-        const real4* __restrict__ posqCorrection, mixed4* __restrict__ velm, const long long* __restrict__ force, mixed4* __restrict__ posDelta) {
+KERNEL void integrateVerletPart1(int numAtoms, int paddedNumAtoms, GLOBAL const mixed2* RESTRICT dt, GLOBAL const real4* RESTRICT posq,
+        GLOBAL mixed4* RESTRICT velm, GLOBAL const mm_long* RESTRICT force, GLOBAL mixed4* RESTRICT posDelta
+#ifdef USE_MIXED_PRECISION
+        , GLOBAL const real4* RESTRICT posqCorrection
+#endif
+    ) {
    const mixed2 stepSize = dt[0];
    const mixed dtPos = stepSize.y;
    const mixed dtVel = 0.5f*(stepSize.x+stepSize.y);
    const mixed scale = dtVel/(mixed) 0x100000000;
-    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
+    for (int index = GLOBAL_ID; index < numAtoms; index += GLOBAL_SIZE) {
        mixed4 velocity = velm[index];
        if (velocity.w != 0.0) {
 #ifdef USE_MIXED_PRECISION
@@ -34,19 +38,24 @@ extern "C" __global__ void integrateVerletPart1(int numAtoms, int paddedNumAtoms
 * Perform the second step of Verlet integration.
 */

-extern "C" __global__ void integrateVerletPart2(int numAtoms, mixed2* __restrict__ dt, real4* __restrict__ posq,
-        real4* __restrict__ posqCorrection, mixed4* __restrict__ velm, const mixed4* __restrict__ posDelta) {
+KERNEL void integrateVerletPart2(int numAtoms, GLOBAL mixed2* RESTRICT dt, GLOBAL real4* RESTRICT posq,
+        GLOBAL mixed4* RESTRICT velm, GLOBAL const mixed4* RESTRICT posDelta
+#ifdef USE_MIXED_PRECISION
+        , GLOBAL real4* RESTRICT posqCorrection
+#endif
+    ) {
    mixed2 stepSize = dt[0];
-#if __CUDA_ARCH__ >= 130
+#ifdef SUPPORTS_DOUBLE_PRECISION
    double oneOverDt = 1.0/stepSize.y;
 #else
    float oneOverDt = 1.0f/stepSize.y;
    float correction = (1.0f-oneOverDt*stepSize.y)/stepSize.y;
 #endif
-    int index = blockIdx.x*blockDim.x+threadIdx.x;
-    if (index == 0)
+    if (GLOBAL_ID == 0)
        dt[0].x = stepSize.y;
-    for (; index < numAtoms; index += blockDim.x*gridDim.x) {
+    SYNC_THREADS;
+    int index = GLOBAL_ID;
+    for (; index < numAtoms; index += GLOBAL_SIZE) {
        mixed4 velocity = velm[index];
        if (velocity.w != 0.0) {
 #ifdef USE_MIXED_PRECISION
@@ -60,7 +69,7 @@ extern "C" __global__ void integrateVerletPart2(int numAtoms, mixed2* __restrict
            pos.x += delta.x;
            pos.y += delta.y;
            pos.z += delta.z;
-#if __CUDA_ARCH__ >= 130
+#ifdef SUPPORTS_DOUBLE_PRECISION
            velocity = make_mixed4((mixed) (delta.x*oneOverDt), (mixed) (delta.y*oneOverDt), (mixed) (delta.z*oneOverDt), velocity.w);
 #else
            velocity = make_mixed4((mixed) (delta.x*oneOverDt+delta.x*correction), (mixed) (delta.y*oneOverDt+delta.y*correction), (mixed) (delta.z*oneOverDt+delta.z*correction), velocity.w);
@@ -80,28 +89,28 @@ extern "C" __global__ void integrateVerletPart2(int numAtoms, mixed2* __restrict
 * Select the step size to use for the next step.
 */

-extern "C" __global__ void selectVerletStepSize(int numAtoms, int paddedNumAtoms, mixed maxStepSize, mixed errorTol, mixed2* __restrict__ dt, const mixed4* __restrict__ velm, const long long* __restrict__ force) {
+KERNEL void selectVerletStepSize(int numAtoms, int paddedNumAtoms, mixed maxStepSize, mixed errorTol, GLOBAL mixed2* RESTRICT dt, GLOBAL const mixed4* RESTRICT velm, GLOBAL const mm_long* RESTRICT force) {
    // Calculate the error.

-    extern __shared__ mixed error[];
-    mixed err = 0.0f;
+    LOCAL mixed error[256];
+    mixed err = 0;
    const mixed scale = RECIP((mixed) 0x100000000);
-    for (int index = threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
+    for (int index = LOCAL_ID; index < numAtoms; index += LOCAL_SIZE) {
        mixed3 f = make_mixed3(scale*force[index], scale*force[index+paddedNumAtoms], scale*force[index+paddedNumAtoms*2]);
        mixed invMass = velm[index].w;
        err += (f.x*f.x + f.y*f.y + f.z*f.z)*invMass*invMass;
    }
-    error[threadIdx.x] = err;
-    __syncthreads();
+    error[LOCAL_ID] = err;
+    SYNC_THREADS;

    // Sum the errors from all threads.

-    for (unsigned int offset = 1; offset < blockDim.x; offset *= 2) {
-        if (threadIdx.x+offset < blockDim.x && (threadIdx.x&(2*offset-1)) == 0)
-            error[threadIdx.x] += error[threadIdx.x+offset];
-        __syncthreads();
+    for (unsigned int offset = 1; offset < LOCAL_SIZE; offset *= 2) {
+        if (LOCAL_ID+offset < LOCAL_SIZE && (LOCAL_ID&(2*offset-1)) == 0)
+            error[LOCAL_ID] += error[LOCAL_ID+offset];
+        SYNC_THREADS;
    }
-    if (threadIdx.x == 0) {
+    if (LOCAL_ID == 0) {
        mixed totalError = SQRT(error[0]/(numAtoms*3));
        mixed newStepSize = SQRT(errorTol/totalError);
        mixed oldStepSize = dt[0].y;

--- a/platforms/cpu/include/CpuKernels.h
+++ b/platforms/cpu/include/CpuKernels.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2013-2019 Stanford University and the Authors.      *
+ * Portions copyright (c) 2013-2020 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -32,7 +32,6 @@
 * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
 * -------------------------------------------------------------------------- */

-#include "CpuBAOABDynamics.h"
 #include "CpuBondForce.h"
 #include "CpuCustomGBForce.h"
 #include "CpuCustomManyParticleForce.h"
@@ -40,6 +39,7 @@
 #include "CpuGayBerneForce.h"
 #include "CpuGBSAOBCForce.h"
 #include "CpuLangevinDynamics.h"
+#include "CpuLangevinMiddleDynamics.h"
 #include "CpuNeighborList.h"
 #include "CpuNonbondedForce.h"
 #include "CpuPlatform.h"
@@ -274,7 +274,7 @@ private:
    std::vector<std::vector<double> > bonded14ParamArray;
    double nonbondedCutoff, switchingDistance, rfDielectric, ewaldAlpha, ewaldDispersionAlpha, ewaldSelfEnergy, dispersionCoefficient;
    int kmax[3], gridSize[3], dispersionGridSize[3];
-    bool useSwitchingFunction, useOptimizedPme, hasInitializedPme, hasInitializedDispersionPme, hasParticleOffsets, hasExceptionOffsets;
+    bool useSwitchingFunction, exceptionsArePeriodic, useOptimizedPme, hasInitializedPme, hasInitializedDispersionPme, hasParticleOffsets, hasExceptionOffsets;
    std::vector<std::set<int> > exclusions;
    std::vector<std::pair<float, float> > particleParams;
    std::vector<float> C6params;
@@ -538,42 +538,38 @@ private:
 };

 /**
- * This kernel is invoked by BAOABLangevinIntegrator to take one time step.
+ * This kernel is invoked by LangevinMiddleIntegrator to take one time step.
 */
-class CpuIntegrateBAOABStepKernel : public IntegrateBAOABStepKernel {
+class CpuIntegrateLangevinMiddleStepKernel : public IntegrateLangevinMiddleStepKernel {
 public:
-    CpuIntegrateBAOABStepKernel(std::string name, const Platform& platform, CpuPlatform::PlatformData& data) : IntegrateBAOABStepKernel(name, platform),
+    CpuIntegrateLangevinMiddleStepKernel(std::string name, const Platform& platform, CpuPlatform::PlatformData& data) : IntegrateLangevinMiddleStepKernel(name, platform),
            data(data), dynamics(0) {
    }
-    ~CpuIntegrateBAOABStepKernel();
+    ~CpuIntegrateLangevinMiddleStepKernel();
    /**
     * Initialize the kernel, setting up the particle masses.
     * 
     * @param system     the System this kernel will be applied to
-     * @param integrator the BAOABLangevinIntegrator this kernel will be used for
+     * @param integrator the LangevinMiddleIntegrator this kernel will be used for
     */
-    void initialize(const System& system, const BAOABLangevinIntegrator& integrator);
+    void initialize(const System& system, const LangevinMiddleIntegrator& integrator);
    /**
     * Execute the kernel.
     * 
     * @param context    the context in which to execute this kernel
-     * @param integrator the BAOABLangevinIntegrator this kernel is being used for
-     * @param forcesAreValid if the context has been modified since the last time step, this will be
-     *                       false to show that cached forces are invalid and must be recalculated.
-     *                       On exit, this should specify whether the cached forces are valid at the
-     *                       end of the step.
+     * @param integrator the LangevinMiddleIntegrator this kernel is being used for
     */
-    void execute(ContextImpl& context, const BAOABLangevinIntegrator& integrator, bool& forcesAreValid);
+    void execute(ContextImpl& context, const LangevinMiddleIntegrator& integrator);
    /**
     * Compute the kinetic energy.
     * 
     * @param context    the context in which to execute this kernel
-     * @param integrator the BAOABLangevinIntegrator this kernel is being used for
+     * @param integrator the LangevinMiddleIntegrator this kernel is being used for
     */
-    double computeKineticEnergy(ContextImpl& context, const BAOABLangevinIntegrator& integrator);
+    double computeKineticEnergy(ContextImpl& context, const LangevinMiddleIntegrator& integrator);
 private:
    CpuPlatform::PlatformData& data;
-    CpuBAOABDynamics* dynamics;
+    CpuLangevinMiddleDynamics* dynamics;
    std::vector<double> masses;
    double prevTemp, prevFriction, prevStepSize;
 };

--- a/platforms/cpu/include/CpuBAOABDynamics.h
+++ b/platforms/cpu/include/CpuBAOABDynamics.h

-/* Portions copyright (c) 2013-2019 Stanford University and Simbios.
+/* Portions copyright (c) 2013-2020 Stanford University and Simbios.
 * Authors: Peter Eastman
 * Contributors: 
 *
@@ -23,17 +23,17 @@
 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

-#ifndef __CPU_BAOAB_DYNAMICS_H__
-#define __CPU_BAOAB_DYNAMICS_H__
+#ifndef __CPU_LANGEVIN_MIDDLE_DYNAMICS_H__
+#define __CPU_LANGEVIN_MIDDLE_DYNAMICS_H__

-#include "ReferenceBAOABDynamics.h"
+#include "ReferenceLangevinMiddleDynamics.h"
 #include "CpuRandom.h"
 #include "openmm/internal/ThreadPool.h"
 #include "sfmt/SFMT.h"

 namespace OpenMM {

-class CpuBAOABDynamics : public ReferenceBAOABDynamics {
+class CpuLangevinMiddleDynamics : public ReferenceLangevinMiddleDynamics {
 public:
    /**
     * Constructor.
@@ -45,25 +45,22 @@ public:
     * @param threads        thread pool for parallelizing computation
     * @param random         random number generator
     */
-    CpuBAOABDynamics(int numberOfAtoms, double deltaT, double friction, double temperature, OpenMM::ThreadPool& threads, OpenMM::CpuRandom& random);
+    CpuLangevinMiddleDynamics(int numberOfAtoms, double deltaT, double friction, double temperature, OpenMM::ThreadPool& threads, OpenMM::CpuRandom& random);

    /**
     * Destructor.
     */
-    ~CpuBAOABDynamics();
+    ~CpuLangevinMiddleDynamics();

    /**
     * First update step.
     * 
     * @param numberOfAtoms       number of atoms
-     * @param atomCoordinates     atom coordinates
     * @param velocities          velocities
     * @param forces              forces
     * @param inverseMasses       inverse atom masses
-     * @param xPrime              xPrime
     */
-    void updatePart1(int numberOfAtoms, std::vector<OpenMM::Vec3>& atomCoordinates, std::vector<OpenMM::Vec3>& velocities,
-                     std::vector<OpenMM::Vec3>& forces, std::vector<double>& inverseMasses, std::vector<OpenMM::Vec3>& xPrime);
+    void updatePart1(int numberOfAtoms, std::vector<OpenMM::Vec3>& velocities,  std::vector<OpenMM::Vec3>& forces, std::vector<double>& inverseMasses);
      
    /**
     * Second update step.
@@ -94,7 +91,6 @@ private:
    void threadUpdate1(int threadIndex);
    void threadUpdate2(int threadIndex);
    void threadUpdate3(int threadIndex);
-    void threadUpdate4(int threadIndex);
    OpenMM::ThreadPool& threads;
    OpenMM::CpuRandom& random;
    std::vector<OpenMM_SFMT::SFMT> threadRandom;
@@ -109,4 +105,4 @@ private:

 } // namespace OpenMM

-#endif // __CPU_BAOAB_DYNAMICS_H__
+#endif // __CPU_LANGEVIN_MIDDLE_DYNAMICS_H__