Merge https://github.com/openmm/openmm

5a06df78 · tic20 · 8dd60914 · a9223eea · 5a06df78 · 5a06df78
Commit 5a06df78 authored Mar 04, 2020 by tic20
20 changed files
--- a/platforms/cuda/src/kernels/andersenThermostat.cu
+++ b/platforms/cuda/src/kernels/andersenThermostat.cu
@@ -2,11 +2,11 @@
 * Apply the Andersen thermostat to adjust particle velocities.
 */

-extern "C" __global__ void applyAndersenThermostat(int numAtoms, float collisionFrequency, float kT, mixed4* velm, const mixed4* __restrict__ stepSize, const float4* __restrict__ random,
-        unsigned int randomIndex, const int* __restrict__ atomGroups) {
-    float collisionProbability = 1.0f-expf(-(float) (collisionFrequency*stepSize[0].y));
-    float randomRange = erff(collisionProbability/sqrtf(2.0f));
-    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
+KERNEL void applyAndersenThermostat(int numAtoms, float collisionFrequency, float kT, GLOBAL mixed4* velm, real stepSize, GLOBAL const float4* RESTRICT random,
+        unsigned int randomIndex, GLOBAL const int* RESTRICT atomGroups) {
+    float collisionProbability = (float) (1-EXP(-collisionFrequency*stepSize));
+    float randomRange = (float) erf(collisionProbability/SQRT(2.0f));
+    for (int index = GLOBAL_ID; index < numAtoms; index += GLOBAL_SIZE) {
        mixed4 velocity = velm[index];
        float4 selectRand = random[randomIndex+atomGroups[index]];
        float4 velRand = random[randomIndex+index];

--- a/platforms/common/src/kernels/angleForce.cc
+++ b/platforms/common/src/kernels/angleForce.cc
+real3 v0 = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z); // v0 = pos2 - pos1
+real3 v1 = make_real3(pos2.x-pos3.x, pos2.y-pos3.y, pos2.z-pos3.z); // v1 = pos2 - pos3; pos2 is the endpoint shared by both vectors
+#if APPLY_PERIODIC
+APPLY_PERIODIC_TO_DELTA(v0)
+APPLY_PERIODIC_TO_DELTA(v1)
+#endif
+real3 cp = cross(v0, v1); // cross product of the two difference vectors
+real rp = cp.x*cp.x + cp.y*cp.y + cp.z*cp.z; // squared length of cp
+rp = max(SQRT(rp), (real) 1.0e-06f); // rp becomes |cp|, floored at 1e-6 because it is used as a divisor for force1 and force3
+real r21 = v0.x*v0.x + v0.y*v0.y + v0.z*v0.z; // squared length of v0
+real r23 = v1.x*v1.x + v1.y*v1.y + v1.z*v1.z; // squared length of v1
+real dot = v0.x*v1.x + v0.y*v1.y + v0.z*v1.z; // v0 . v1
+real cosine = min(max(dot*RSQRT(r21*r23), (real) -1), (real) 1); // dot/(|v0||v1|), clamped to [-1, 1] before it is passed to ACOS
+real theta = ACOS(cosine); // angle between v0 and v1; presumably consumed by COMPUTE_FORCE - TODO confirm
+COMPUTE_FORCE
+real3 force1 = cross(v0, cp)*(dEdAngle/(r21*rp)); // dEdAngle is not declared in this fragment; presumably supplied by COMPUTE_FORCE - TODO confirm
+real3 force3 = cross(cp, v1)*(dEdAngle/(r23*rp)); // same construction for the pos3 side, scaled by 1/(|v1|^2 * |cp|)
+real3 force2 = -force1-force3; // chosen so that force1 + force2 + force3 = 0
--- a/platforms/common/src/kernels/bondForce.cc
+++ b/platforms/common/src/kernels/bondForce.cc
+real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z); // delta = pos2 - pos1
+#if APPLY_PERIODIC
+APPLY_PERIODIC_TO_DELTA(delta)
+#endif
+real r = SQRT(delta.x*delta.x + delta.y*delta.y + delta.z*delta.z); // length of delta
+COMPUTE_FORCE
+dEdR = (r > 0) ? (dEdR / r) : 0; // dEdR is not declared in this fragment (presumably supplied by COMPUTE_FORCE - TODO confirm); r == 0 yields 0 instead of dividing by zero
+delta *= dEdR; // delta now holds (dE/dr) * (pos2 - pos1) / r
+real3 force1 = delta; // the two forces are equal and opposite
+real3 force2 = -delta;
--- a/platforms/cuda/src/kernels/brownian.cu
+++ b/platforms/cuda/src/kernels/brownian.cu
@@ -2,18 +2,18 @@
 * Perform the first step of Brownian integration.
 */

-extern "C" __global__ void integrateBrownianPart1(int numAtoms, int paddedNumAtoms, mixed tauDeltaT, mixed noiseAmplitude, const long long* __restrict__ force,
-        mixed4* __restrict__ posDelta, const mixed4* __restrict__ velm, const float4* __restrict__ random, unsigned int randomIndex) {
-    randomIndex += blockIdx.x*blockDim.x+threadIdx.x;
+KERNEL void integrateBrownianPart1(int numAtoms, int paddedNumAtoms, mixed tauDeltaT, mixed noiseAmplitude, GLOBAL const mm_long* RESTRICT force,
+        GLOBAL mixed4* RESTRICT posDelta, GLOBAL const mixed4* RESTRICT velm, GLOBAL const float4* RESTRICT random, unsigned int randomIndex) {
+    randomIndex += GLOBAL_ID; // offset the caller-supplied base index by this thread's global id (GLOBAL_ID replaces blockIdx.x*blockDim.x+threadIdx.x)
    const mixed fscale = tauDeltaT/(mixed) 0x100000000;
-    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
+    for (int index = GLOBAL_ID; index < numAtoms; index += GLOBAL_SIZE) { // each thread handles atoms GLOBAL_ID, GLOBAL_ID+GLOBAL_SIZE, ... below numAtoms
        mixed invMass = velm[index].w;
        if (invMass != 0) {
            posDelta[index].x = fscale*invMass*force[index] + noiseAmplitude*SQRT(invMass)*random[randomIndex].x;
            posDelta[index].y = fscale*invMass*force[index+paddedNumAtoms] + noiseAmplitude*SQRT(invMass)*random[randomIndex].y;
            posDelta[index].z = fscale*invMass*force[index+paddedNumAtoms*2] + noiseAmplitude*SQRT(invMass)*random[randomIndex].z;
        }
-        randomIndex += blockDim.x*gridDim.x;
+        randomIndex += GLOBAL_SIZE; // advanced even when the atom is skipped, so randomIndex always equals (caller's randomIndex + index)
    }
 }

@@ -21,9 +21,12 @@ extern "C" __global__ void integrateBrownianPart1(int numAtoms, int paddedNumAto
 * Perform the second step of Brownian integration.
 */

-extern "C" __global__ void integrateBrownianPart2(int numAtoms, mixed deltaT, real4* posq, real4* __restrict__ posqCorrection, mixed4* velm, const mixed4* __restrict__ posDelta) {
-    const mixed oneOverDeltaT = RECIP(deltaT);
-    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
+KERNEL void integrateBrownianPart2(int numAtoms, mixed oneOverDeltaT, GLOBAL real4* posq, GLOBAL mixed4* velm, GLOBAL const mixed4* RESTRICT posDelta
+#ifdef USE_MIXED_PRECISION
+        , GLOBAL real4* RESTRICT posqCorrection
+#endif
+        ) {
+    for (int index = GLOBAL_ID; index < numAtoms; index += GLOBAL_SIZE) {
        if (velm[index].w != 0) {
            mixed4 delta = posDelta[index];
            velm[index].x = oneOverDeltaT*delta.x;

--- a/platforms/cuda/src/kernels/cmapTorsionForce.cu
+++ b/platforms/cuda/src/kernels/cmapTorsionForce.cu
--- a/platforms/opencl/src/kernels/customCentroidBond.cl
+++ b/platforms/opencl/src/kernels/customCentroidBond.cl
-#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
-
 /**
 * Compute the center of each group.
 */
-__kernel void computeGroupCenters(__global const real4* restrict posq, __global const int* restrict groupParticles,
-        __global const real* restrict groupWeights, __global const int* restrict groupOffsets, __global real4* restrict centerPositions) {
-    __local volatile real3 temp[64];
-    for (int group = get_group_id(0); group < NUM_GROUPS; group += get_num_groups(0)) {
+KERNEL void computeGroupCenters(int numParticleGroups, GLOBAL const real4* RESTRICT posq, GLOBAL const int* RESTRICT groupParticles,
+        GLOBAL const real* RESTRICT groupWeights, GLOBAL const int* RESTRICT groupOffsets, GLOBAL real4* RESTRICT centerPositions) {
+    LOCAL volatile real3 temp[64];
+    for (int group = GROUP_ID; group < numParticleGroups; group += NUM_GROUPS) {
        // The threads in this block work together to compute the center of one group.

        int firstIndex = groupOffsets[group];
        int lastIndex = groupOffsets[group+1];
-        real3 center = (real3) 0;
-        for (int index = get_local_id(0); index < lastIndex-firstIndex; index += get_local_size(0)) {
+        real3 center = make_real3(0);
+        for (int index = LOCAL_ID; index < lastIndex-firstIndex; index += LOCAL_SIZE) {
            int atom = groupParticles[firstIndex+index];
            real weight = groupWeights[firstIndex+index];
            real4 pos = posq[atom];
@@ -23,18 +21,16 @@ __kernel void computeGroupCenters(__global const real4* restrict posq, __global

        // Sum the values.

-        int thread = get_local_id(0);
+        int thread = LOCAL_ID;
        temp[thread].x = center.x;
        temp[thread].y = center.y;
        temp[thread].z = center.z;
-
-        barrier(CLK_LOCAL_MEM_FENCE);
+        SYNC_THREADS;
        if (thread < 32) {
            temp[thread].x += temp[thread+32].x;
            temp[thread].y += temp[thread+32].y;
            temp[thread].z += temp[thread+32].z;
        }
-
        SYNC_WARPS;
        if (thread < 16) {
            temp[thread].x += temp[thread+16].x;
@@ -47,7 +43,6 @@ __kernel void computeGroupCenters(__global const real4* restrict posq, __global
            temp[thread].y += temp[thread+8].y;
            temp[thread].z += temp[thread+8].z;
        }
-
        SYNC_WARPS;
        if (thread < 4) {
            temp[thread].x += temp[thread+4].x;
@@ -60,19 +55,18 @@ __kernel void computeGroupCenters(__global const real4* restrict posq, __global
            temp[thread].y += temp[thread+2].y;
            temp[thread].z += temp[thread+2].z;
        }
-
        SYNC_WARPS;
        if (thread == 0)
-            centerPositions[group] = (real4) (temp[0].x+temp[1].x, temp[0].y+temp[1].y, temp[0].z+temp[1].z, 0);
+            centerPositions[group] = make_real4(temp[0].x+temp[1].x, temp[0].y+temp[1].y, temp[0].z+temp[1].z, 0);
    }
 }

 /**
 * Compute the difference between two vectors, setting the fourth component to the squared magnitude.
 */
-real4 delta(real4 vec1, real4 vec2, bool periodic, real4 periodicBoxSize, real4 invPeriodicBoxSize, 
+DEVICE real4 delta(real4 vec1, real4 vec2, bool periodic, real4 periodicBoxSize, real4 invPeriodicBoxSize, 
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) {
-    real4 result = (real4) (vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0);
+    real4 result = make_real4(vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0);
    if (periodic)
        APPLY_PERIODIC_TO_DELTA(result);
    result.w = result.x*result.x + result.y*result.y + result.z*result.z;
@@ -82,65 +76,64 @@ real4 delta(real4 vec1, real4 vec2, bool periodic, real4 periodicBoxSize, real4
 /**
 * Compute the angle between two vectors.  The w component of each vector should contain the squared magnitude.
 */
-real computeAngle(real4 vec1, real4 vec2) {
+DEVICE real computeAngle(real4 vec1, real4 vec2) { // vec1.w and vec2.w are read as the squared magnitudes of the xyz parts
    real dotProduct = vec1.x*vec2.x + vec1.y*vec2.y + vec1.z*vec2.z;
    real cosine = dotProduct*RSQRT(vec1.w*vec2.w);
    real angle;
    if (cosine > 0.99f || cosine < -0.99f) {
        // We're close to the singularity in acos(), so take the cross product and use asin() instead.

-        real4 crossProduct = cross(vec1, vec2);
+        real3 crossProduct = cross(trimTo3(vec1), trimTo3(vec2)); // cross() now takes real3 operands; trimTo3 presumably drops the w component - TODO confirm
        real scale = vec1.w*vec2.w;
-        angle = asin(SQRT(dot(crossProduct, crossProduct)/scale));
+        angle = ASIN(SQRT(dot(crossProduct, crossProduct)/scale)); // |vec1 x vec2|^2 / (|vec1|^2 |vec2|^2) is sin^2 of the angle, so this gives the angle folded into [0, pi/2]
        if (cosine < 0)
            angle = M_PI-angle;
    }
    else
-       angle = acos(cosine);
+       angle = ACOS(cosine); // |cosine| <= 0.99 here, so ACOS is used directly
    return angle;
 }

 /**
 * Compute the cross product of two vectors, setting the fourth component to the squared magnitude.
 */
-real4 computeCross(real4 vec1, real4 vec2) {
-    real4 result = cross(vec1, vec2);
-    result.w = result.x*result.x + result.y*result.y + result.z*result.z;
-    return result;
+DEVICE real4 computeCross(real4 vec1, real4 vec2) {
+    real3 cp = cross(trimTo3(vec1), trimTo3(vec2)); // cross() is applied to real3 values; trimTo3 presumably keeps only x, y, z - TODO confirm
+    return make_real4(cp.x, cp.y, cp.z, cp.x*cp.x+cp.y*cp.y+cp.z*cp.z); // xyz = the cross product, w = its squared magnitude
 }

 /**
 * Compute the forces on groups based on the bonds.
 */
-__kernel void computeGroupForces(__global long* restrict groupForce, __global mixed* restrict energyBuffer, __global const real4* restrict centerPositions,
-        __global const int* restrict bondGroups, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
+KERNEL void computeGroupForces(int numParticleGroups, GLOBAL mm_ulong* RESTRICT groupForce, GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT centerPositions, // numParticleGroups is not referenced in the visible body; presumably used by the COMPUTE_FORCE expansion - TODO confirm
+        GLOBAL const int* RESTRICT bondGroups, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
        EXTRA_ARGS) {
    mixed energy = 0;
    INIT_PARAM_DERIVS
-    for (int index = get_global_id(0); index < NUM_BONDS; index += get_global_size(0)) {
+    for (int index = GLOBAL_ID; index < NUM_BONDS; index += GLOBAL_SIZE) { // each thread handles bonds GLOBAL_ID, GLOBAL_ID+GLOBAL_SIZE, ... below NUM_BONDS
        COMPUTE_FORCE
    }
-    energyBuffer[get_global_id(0)] += energy;
+    energyBuffer[GLOBAL_ID] += energy; // each thread adds its local energy total into the slot indexed by its own global id
    SAVE_PARAM_DERIVS
 }

 /**
 * Apply the forces from the group centers to the individual atoms.
 */
-__kernel void applyForcesToAtoms(__global const int* restrict groupParticles, __global const real* restrict groupWeights, __global const int* restrict groupOffsets,
-        __global const long* restrict groupForce, __global long* restrict atomForce) {
-    for (int group = get_group_id(0); group < NUM_GROUPS; group += get_num_groups(0)) {
-        long fx = groupForce[group];
-        long fy = groupForce[group+NUM_GROUPS];
-        long fz = groupForce[group+NUM_GROUPS*2];
+KERNEL void applyForcesToAtoms(int numParticleGroups, GLOBAL const int* RESTRICT groupParticles, GLOBAL const real* RESTRICT groupWeights, GLOBAL const int* RESTRICT groupOffsets,
+        GLOBAL const mm_long* RESTRICT groupForce, GLOBAL mm_ulong* RESTRICT atomForce) {
+    for (int group = GROUP_ID; group < numParticleGroups; group += NUM_GROUPS) { // NUM_GROUPS now stands where get_num_groups(0) was; the particle-group count is the numParticleGroups argument
+        mm_long fx = groupForce[group]; // groupForce is read as three runs of numParticleGroups entries: x, then y, then z
+        mm_long fy = groupForce[group+numParticleGroups];
+        mm_long fz = groupForce[group+numParticleGroups*2];
        int firstIndex = groupOffsets[group];
        int lastIndex = groupOffsets[group+1];
-        for (int index = get_local_id(0); index < lastIndex-firstIndex; index += get_local_size(0)) {
+        for (int index = LOCAL_ID; index < lastIndex-firstIndex; index += LOCAL_SIZE) { // threads stride over the group's entries in groupParticles/groupWeights by LOCAL_SIZE
            int atom = groupParticles[firstIndex+index];
            real weight = groupWeights[firstIndex+index];
-            atom_add(&atomForce[atom], (long) (fx*weight));
-            atom_add(&atomForce[atom+PADDED_NUM_ATOMS], (long) (fy*weight));
-            atom_add(&atomForce[atom+2*PADDED_NUM_ATOMS], (long) (fz*weight));
+            ATOMIC_ADD(&atomForce[atom], (mm_ulong) ((mm_long) (fx*weight))); // scale by the atom's weight, convert to mm_long, then cast to mm_ulong to match atomForce's element type
+            ATOMIC_ADD(&atomForce[atom+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (fy*weight))); // y components start PADDED_NUM_ATOMS entries after x
+            ATOMIC_ADD(&atomForce[atom+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (fz*weight))); // z components start 2*PADDED_NUM_ATOMS entries after x
        }
    }
 }
--- a/platforms/cuda/src/kernels/customCompoundBond.cu
+++ b/platforms/cuda/src/kernels/customCompoundBond.cu
-/**
- * Convert a real4 to a real3 by removing its last element.
- */
-inline __device__ real3 ccb_trim(real4 v) {
-    return make_real3(v.x, v.y, v.z);
-}
-
 /**
 * Compute the difference between two vectors, setting the fourth component to the squared magnitude.
 */
-inline __device__ real4 ccb_delta(real4 vec1, real4 vec2, bool periodic, real4 periodicBoxSize, real4 invPeriodicBoxSize, 
+DEVICE real4 ccb_delta(real4 vec1, real4 vec2, bool periodic, real4 periodicBoxSize, real4 invPeriodicBoxSize, 
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) {
    real4 result = make_real4(vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0);
    if (periodic)
@@ -20,17 +13,17 @@ inline __device__ real4 ccb_delta(real4 vec1, real4 vec2, bool periodic, real4 p
 /**
 * Compute the angle between two vectors.  The w component of each vector should contain the squared magnitude.
 */
-__device__ real ccb_computeAngle(real4 vec1, real4 vec2) {
+DEVICE real ccb_computeAngle(real4 vec1, real4 vec2) {
    real dotProduct = vec1.x*vec2.x + vec1.y*vec2.y + vec1.z*vec2.z;
    real cosine = dotProduct*RSQRT(vec1.w*vec2.w);
    real angle;
    if (cosine > 0.99f || cosine < -0.99f) {
        // We're close to the singularity in acos(), so take the cross product and use asin() instead.

-        real3 crossProduct = cross(vec1, vec2);
+        real3 crossProduct = cross(trimTo3(vec1), trimTo3(vec2));
        real scale = vec1.w*vec2.w;
        angle = ASIN(SQRT(dot(crossProduct, crossProduct)/scale));
-        if (cosine < 0.0f)
+        if (cosine < 0)
            angle = M_PI-angle;
    }
    else
@@ -41,7 +34,8 @@ __device__ real ccb_computeAngle(real4 vec1, real4 vec2) {
 /**
 * Compute the cross product of two vectors, setting the fourth component to the squared magnitude.
 */
-inline __device__ real4 ccb_computeCross(real4 vec1, real4 vec2) {
-    real3 cp = cross(vec1, vec2);
+DEVICE real4 ccb_computeCross(real4 vec1, real4 vec2) {
+    real3 cp = cross(trimTo3(vec1), trimTo3(vec2)); // operands are converted to real3 explicitly; trimTo3 presumably plays the role of the removed ccb_trim helper - TODO confirm
    return make_real4(cp.x, cp.y, cp.z, cp.x*cp.x+cp.y*cp.y+cp.z*cp.z);
 }
+
--- a/platforms/cuda/src/kernels/customExternalForce.cu
+++ b/platforms/cuda/src/kernels/customExternalForce.cu
--- a/platforms/cuda/src/kernels/customGBChainRule.cu
+++ b/platforms/cuda/src/kernels/customGBChainRule.cu
--- a/platforms/opencl/src/kernels/customGBEnergyN2.cl
+++ b/platforms/opencl/src/kernels/customGBEnergyN2.cl
 #ifdef SUPPORTS_64_BIT_ATOMICS
-#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
-#define STORE_DERIVATIVE_1(INDEX) atom_add(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (long) (deriv##INDEX##_1*0x100000000));
-#define STORE_DERIVATIVE_2(INDEX) atom_add(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (long) (local_deriv##INDEX[get_local_id(0)]*0x100000000));
+#define STORE_DERIVATIVE_1(INDEX) ATOMIC_ADD(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (deriv##INDEX##_1*0x100000000)));
+#define STORE_DERIVATIVE_2(INDEX) ATOMIC_ADD(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (local_deriv##INDEX[LOCAL_ID]*0x100000000)));
 #else
 #define STORE_DERIVATIVE_1(INDEX) derivBuffers##INDEX[offset] += deriv##INDEX##_1;
-#define STORE_DERIVATIVE_2(INDEX) derivBuffers##INDEX[offset] += local_deriv##INDEX[get_local_id(0)];
+#define STORE_DERIVATIVE_2(INDEX) derivBuffers##INDEX[offset] += local_deriv##INDEX[LOCAL_ID];
 #endif

 /**
 * Compute a force based on pair interactions.
 */
-__kernel void computeN2Energy(
+KERNEL void computeN2Energy(
 #ifdef SUPPORTS_64_BIT_ATOMICS
-        __global long* restrict forceBuffers,
+        GLOBAL mm_ulong* RESTRICT forceBuffers,
 #else
-        __global real4* restrict forceBuffers,
+        GLOBAL real4* RESTRICT forceBuffers,
 #endif
-        __global mixed* restrict energyBuffer, __local real4* restrict local_force,
-        __global const real4* restrict posq, __local real4* restrict local_posq, __global const unsigned int* restrict exclusions,
-        __global const ushort2* exclusionTiles, int needEnergy,
+        GLOBAL mixed* RESTRICT energyBuffer,
+        GLOBAL const real4* RESTRICT posq, GLOBAL const unsigned int* RESTRICT exclusions,
+        GLOBAL const ushort2* exclusionTiles, int needEnergy,
 #ifdef USE_CUTOFF
-        __global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
-        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, __global const real4* restrict blockCenter,
-        __global const real4* restrict blockSize, __global const int* restrict interactingAtoms
+        GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
+        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, GLOBAL const real4* RESTRICT blockCenter,
+        GLOBAL const real4* RESTRICT blockSize, GLOBAL const int* RESTRICT interactingAtoms
 #else
        unsigned int numTiles
 #endif
        PARAMETER_ARGUMENTS) {
-    const unsigned int totalWarps = get_global_size(0)/TILE_SIZE;
-    const unsigned int warp = get_global_id(0)/TILE_SIZE;
-    const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
-    const unsigned int tbx = get_local_id(0) - tgx;
+    const unsigned int totalWarps = GLOBAL_SIZE/TILE_SIZE;
+    const unsigned int warp = GLOBAL_ID/TILE_SIZE;
+    const unsigned int tgx = LOCAL_ID & (TILE_SIZE-1);
+    const unsigned int tbx = LOCAL_ID - tgx;
    mixed energy = 0;
    INIT_PARAM_DERIVS
+    LOCAL real3 local_pos[LOCAL_BUFFER_SIZE];
+    LOCAL real3 local_force[LOCAL_BUFFER_SIZE];
+    ATOM_PARAMETER_DATA

    // First loop: process tiles that contain exclusions.
    
-    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
-    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
+    const int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
+    const int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
        const ushort2 tileIndices = exclusionTiles[pos];
        const unsigned int x = tileIndices.x;
        const unsigned int y = tileIndices.y;
-        real4 force = 0;
+        real3 force = make_real3(0);
        DECLARE_ATOM1_DERIVATIVES
        unsigned int atom1 = x*TILE_SIZE + tgx;
-        real4 posq1 = posq[atom1];
+        real3 pos1 = trimTo3(posq[atom1]);
        LOAD_ATOM1_PARAMETERS
 #ifdef USE_EXCLUSIONS
        unsigned int excl = exclusions[pos*TILE_SIZE+tgx];
@@ -53,14 +55,14 @@ __kernel void computeN2Energy(
        if (x == y) {
            // This tile is on the diagonal.

-            const unsigned int localAtomIndex = get_local_id(0);
-            local_posq[localAtomIndex] = posq1;
+            const unsigned int localAtomIndex = LOCAL_ID;
+            local_pos[localAtomIndex] = pos1;
            LOAD_LOCAL_PARAMETERS_FROM_1
            SYNC_WARPS;
            for (unsigned int j = 0; j < TILE_SIZE; j++) {
                int atom2 = tbx+j;
-                real4 posq2 = local_posq[atom2];
-                real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
+                real3 pos2 = local_pos[atom2];
+                real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
 #ifdef USE_PERIODIC
                APPLY_PERIODIC_TO_DELTA(delta)
 #endif
@@ -84,8 +86,10 @@ __kernel void computeN2Energy(
                    }
                    if (needEnergy)
                        energy += 0.5f*tempEnergy;
-                    delta.xyz *= dEdR;
-                    force.xyz -= delta.xyz;
+                    delta *= dEdR;
+                    force.x -= delta.x;
+                    force.y -= delta.y;
+                    force.z -= delta.z;
 #ifdef USE_CUTOFF
                }
 #endif
@@ -98,11 +102,11 @@ __kernel void computeN2Energy(
        else {
            // This is an off-diagonal tile.

-            const unsigned int localAtomIndex = get_local_id(0);
+            const unsigned int localAtomIndex = LOCAL_ID;
            unsigned int j = y*TILE_SIZE + tgx;
-            local_posq[localAtomIndex] = posq[j];
+            local_pos[localAtomIndex] = trimTo3(posq[j]);
            LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
-            local_force[localAtomIndex] = 0;
+            local_force[localAtomIndex] = make_real3(0);
            CLEAR_LOCAL_DERIVATIVES
            SYNC_WARPS;
 #ifdef USE_EXCLUSIONS
@@ -111,8 +115,8 @@ __kernel void computeN2Energy(
            unsigned int tj = tgx;
            for (j = 0; j < TILE_SIZE; j++) {
                int atom2 = tbx+tj;
-                real4 posq2 = local_posq[atom2];
-                real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
+                real3 pos2 = local_pos[atom2];
+                real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
 #ifdef USE_PERIODIC
                APPLY_PERIODIC_TO_DELTA(delta)
 #endif
@@ -126,7 +130,7 @@ __kernel void computeN2Energy(
                    atom2 = y*TILE_SIZE+tj;
                    real dEdR = 0;
                    real tempEnergy = 0;
-                    const real interactionScale = 1.0f;
+                    const real interactionScale = 1;
 #ifdef USE_EXCLUSIONS
                    bool isExcluded = !(excl & 0x1);
 #endif
@@ -136,10 +140,12 @@ __kernel void computeN2Energy(
                    }
                    if (needEnergy)
                        energy += tempEnergy;
-                    delta.xyz *= dEdR;
-                    force.xyz -= delta.xyz;
+                    delta *= dEdR;
+                    force.x -= delta.x;
+                    force.y -= delta.y;
+                    force.z -= delta.z;
                    atom2 = tbx+tj;
-                    local_force[atom2].xyz += delta.xyz;
+                    local_force[atom2] += delta;
                    RECORD_DERIVATIVE_2
 #ifdef USE_CUTOFF
                }
@@ -151,20 +157,20 @@ __kernel void computeN2Energy(
                SYNC_WARPS;
            }
        }
-        
+
        // Write results.
-        
+
 #ifdef SUPPORTS_64_BIT_ATOMICS
        unsigned int offset = x*TILE_SIZE + tgx;
-        atom_add(&forceBuffers[offset], (long) (force.x*0x100000000));
-        atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
-        atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
+        ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) ((mm_long) (force.x*0x100000000)));
+        ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.y*0x100000000)));
+        ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.z*0x100000000)));
        STORE_DERIVATIVES_1
        if (x != y) {
            offset = y*TILE_SIZE + tgx;
-            atom_add(&forceBuffers[offset], (long) (local_force[get_local_id(0)].x*0x100000000));
-            atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (local_force[get_local_id(0)].y*0x100000000));
-            atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (local_force[get_local_id(0)].z*0x100000000));
+            ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) ((mm_long) (local_force[LOCAL_ID].x*0x100000000)));
+            ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (local_force[LOCAL_ID].y*0x100000000)));
+            ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (local_force[LOCAL_ID].z*0x100000000)));
            STORE_DERIVATIVES_2
        }
 #else
@@ -175,7 +181,7 @@ __kernel void computeN2Energy(
        STORE_DERIVATIVES_1
        if (x != y) {
            offset = offset2;
-            forceBuffers[offset2] += (real4) (local_force[get_local_id(0)].x, local_force[get_local_id(0)].y, local_force[get_local_id(0)].z, 0.0f);
+            forceBuffers[offset2] += (real4) (local_force[LOCAL_ID].x, local_force[LOCAL_ID].y, local_force[LOCAL_ID].z, 0.0f);
            STORE_DERIVATIVES_2
        }
 #endif
@@ -188,21 +194,21 @@ __kernel void computeN2Energy(
    unsigned int numTiles = interactionCount[0];
    if (numTiles > maxTiles)
        return; // There wasn't enough memory for the neighbor list.
-    int pos = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
-    int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
+    int pos = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : (mm_long)numTiles)/totalWarps);
+    int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : (mm_long)numTiles)/totalWarps);
 #else
-    int pos = (int) (warp*(long)numTiles/totalWarps);
-    int end = (int) ((warp+1)*(long)numTiles/totalWarps);
+    int pos = (int) (warp*(mm_long)numTiles/totalWarps);
+    int end = (int) ((warp+1)*(mm_long)numTiles/totalWarps);
 #endif
    int skipBase = 0;
    int currentSkipIndex = tbx;
-    __local int atomIndices[FORCE_WORK_GROUP_SIZE];
-    __local volatile int skipTiles[FORCE_WORK_GROUP_SIZE];
-    skipTiles[get_local_id(0)] = -1;
+    LOCAL int atomIndices[LOCAL_BUFFER_SIZE];
+    LOCAL volatile int skipTiles[LOCAL_BUFFER_SIZE];
+    skipTiles[LOCAL_ID] = -1;

    while (pos < end) {
        const bool isExcluded = false;
-        real4 force = 0;
+        real3 force = make_real3(0);
        DECLARE_ATOM1_DERIVATIVES
        bool includeTile = true;

@@ -231,10 +237,10 @@ __kernel void computeN2Energy(
            SYNC_WARPS;
            if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
                ushort2 tile = exclusionTiles[skipBase+tgx];
-                skipTiles[get_local_id(0)] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
+                skipTiles[LOCAL_ID] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
            }
            else
-                skipTiles[get_local_id(0)] = end;
+                skipTiles[LOCAL_ID] = end;
            skipBase += TILE_SIZE;            
            currentSkipIndex = tbx;
            SYNC_WARPS;
@@ -247,20 +253,20 @@ __kernel void computeN2Energy(
            unsigned int atom1 = x*TILE_SIZE + tgx;

            // Load atom data for this tile.
-            
-            real4 posq1 = posq[atom1];
+
+            real3 pos1 = trimTo3(posq[atom1]);
            LOAD_ATOM1_PARAMETERS
-            const unsigned int localAtomIndex = get_local_id(0);
+            const unsigned int localAtomIndex = LOCAL_ID;
 #ifdef USE_CUTOFF
            unsigned int j = interactingAtoms[pos*TILE_SIZE+tgx];
 #else
            unsigned int j = y*TILE_SIZE + tgx;
 #endif
-            atomIndices[get_local_id(0)] = j;
+            atomIndices[LOCAL_ID] = j;
            if (j < PADDED_NUM_ATOMS) {
-                local_posq[localAtomIndex] = posq[j];
+                local_pos[localAtomIndex] = trimTo3(posq[j]);
                LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
-                local_force[localAtomIndex] = 0;
+                local_force[localAtomIndex] = make_real3(0);
                CLEAR_LOCAL_DERIVATIVES
            }
            SYNC_WARPS;
@@ -270,14 +276,14 @@ __kernel void computeN2Energy(
                // box, then skip having to apply periodic boundary conditions later.

                real4 blockCenterX = blockCenter[x];
-                APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX)
-                APPLY_PERIODIC_TO_POS_WITH_CENTER(local_posq[get_local_id(0)], blockCenterX)
+                APPLY_PERIODIC_TO_POS_WITH_CENTER(pos1, blockCenterX)
+                APPLY_PERIODIC_TO_POS_WITH_CENTER(local_pos[LOCAL_ID], blockCenterX)
                SYNC_WARPS;
                unsigned int tj = tgx;
                for (j = 0; j < TILE_SIZE; j++) {
                    int atom2 = tbx+tj;
-                    real4 posq2 = local_posq[atom2];
-                    real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
+                    real3 pos2 = local_pos[atom2];
+                    real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                    if (r2 < CUTOFF_SQUARED) {
                        real invR = RSQRT(r2);
@@ -286,17 +292,19 @@ __kernel void computeN2Energy(
                        atom2 = atomIndices[tbx+tj];
                        real dEdR = 0;
                        real tempEnergy = 0;
-                        const real interactionScale = 1.0f;
+                        const real interactionScale = 1;
                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
                            COMPUTE_INTERACTION
                            dEdR /= -r;
                        }
                        if (needEnergy)
                            energy += tempEnergy;
-                        delta.xyz *= dEdR;
-                        force.xyz -= delta.xyz;
+                        delta *= dEdR;
+                        force.x -= delta.x;
+                        force.y -= delta.y;
+                        force.z -= delta.z;
                        atom2 = tbx+tj;
-                        local_force[atom2].xyz += delta.xyz;
+                        local_force[atom2] += delta;
                        RECORD_DERIVATIVE_2
                    }
                    tj = (tj + 1) & (TILE_SIZE - 1);
@@ -311,8 +319,8 @@ __kernel void computeN2Energy(
                unsigned int tj = tgx;
                for (j = 0; j < TILE_SIZE; j++) {
                    int atom2 = tbx+tj;
-                    real4 posq2 = local_posq[atom2];
-                    real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
+                    real3 pos2 = local_pos[atom2];
+                    real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
 #ifdef USE_PERIODIC
                    APPLY_PERIODIC_TO_DELTA(delta)
 #endif
@@ -326,17 +334,19 @@ __kernel void computeN2Energy(
                        atom2 = atomIndices[tbx+tj];
                        real dEdR = 0;
                        real tempEnergy = 0;
-                        const real interactionScale = 1.0f;
+                        const real interactionScale = 1;
                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
                            COMPUTE_INTERACTION
                            dEdR /= -r;
                        }
                        if (needEnergy)
                            energy += tempEnergy;
-                        delta.xyz *= dEdR;
-                        force.xyz -= delta.xyz;
+                        delta *= dEdR;
+                        force.x -= delta.x;
+                        force.y -= delta.y;
+                        force.z -= delta.z;
                        atom2 = tbx+tj;
-                        local_force[atom2].xyz += delta.xyz;
+                        local_force[atom2] += delta;
                        RECORD_DERIVATIVE_2
 #ifdef USE_CUTOFF
                    }
@@ -347,22 +357,22 @@ __kernel void computeN2Energy(
            }
        
            // Write results.
-        
+
 #ifdef USE_CUTOFF
-            unsigned int atom2 = atomIndices[get_local_id(0)];
+            unsigned int atom2 = atomIndices[LOCAL_ID];
 #else
            unsigned int atom2 = y*TILE_SIZE + tgx;
 #endif
 #ifdef SUPPORTS_64_BIT_ATOMICS
-            atom_add(&forceBuffers[atom1], (long) (force.x*0x100000000));
-            atom_add(&forceBuffers[atom1+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
-            atom_add(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
+            ATOMIC_ADD(&forceBuffers[atom1], (mm_ulong) ((mm_long) (force.x*0x100000000)));
+            ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long)  (force.y*0x100000000)));
+            ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.z*0x100000000)));
            unsigned int offset = atom1;
            STORE_DERIVATIVES_1
            if (atom2 < PADDED_NUM_ATOMS) {
-                atom_add(&forceBuffers[atom2], (long) (local_force[get_local_id(0)].x*0x100000000));
-                atom_add(&forceBuffers[atom2+PADDED_NUM_ATOMS], (long) (local_force[get_local_id(0)].y*0x100000000));
-                atom_add(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (long) (local_force[get_local_id(0)].z*0x100000000));
+                ATOMIC_ADD(&forceBuffers[atom2], (mm_ulong) ((mm_long) (local_force[LOCAL_ID].x*0x100000000)));
+                ATOMIC_ADD(&forceBuffers[atom2+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (local_force[LOCAL_ID].y*0x100000000)));
+                ATOMIC_ADD(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (local_force[LOCAL_ID].z*0x100000000)));
                offset = atom2;
                STORE_DERIVATIVES_2
            }
@@ -373,7 +383,7 @@ __kernel void computeN2Energy(
            unsigned int offset = offset1;
            STORE_DERIVATIVES_1
            if (atom2 < PADDED_NUM_ATOMS) {
-                forceBuffers[offset2] += (real4) (local_force[get_local_id(0)].x, local_force[get_local_id(0)].y, local_force[get_local_id(0)].z, 0.0f);
+                forceBuffers[offset2] += (real4) (local_force[LOCAL_ID].x, local_force[LOCAL_ID].y, local_force[LOCAL_ID].z, 0.0f);
                offset = offset2;
                STORE_DERIVATIVES_2
            }
@@ -381,6 +391,6 @@ __kernel void computeN2Energy(
        }
        pos++;
    }
-    energyBuffer[get_global_id(0)] += energy;
+    energyBuffer[GLOBAL_ID] += energy;
    SAVE_PARAM_DERIVS
 }
--- a/platforms/opencl/src/kernels/customGBEnergyN2_cpu.cl
+++ b/platforms/opencl/src/kernels/customGBEnergyN2_cpu.cl
 #ifdef SUPPORTS_64_BIT_ATOMICS
-#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
-#define STORE_DERIVATIVE_1(INDEX) atom_add(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (long) (deriv##INDEX##_1*0x100000000));
-#define STORE_DERIVATIVE_2(INDEX) atom_add(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (long) (local_deriv##INDEX[tgx]*0x100000000));
+#define STORE_DERIVATIVE_1(INDEX) ATOMIC_ADD(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (deriv##INDEX##_1*0x100000000)));
+#define STORE_DERIVATIVE_2(INDEX) ATOMIC_ADD(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (local_deriv##INDEX[tgx]*0x100000000)));
 #else
 #define STORE_DERIVATIVE_1(INDEX) derivBuffers##INDEX[offset] += deriv##INDEX##_1;
 #define STORE_DERIVATIVE_2(INDEX) derivBuffers##INDEX[offset] += local_deriv##INDEX[tgx];
@@ -10,30 +9,33 @@
 /**
 * Compute a force based on pair interactions.
 */
-__kernel void computeN2Energy(
+KERNEL void computeN2Energy(
 #ifdef SUPPORTS_64_BIT_ATOMICS
-        __global long* restrict forceBuffers,
+        GLOBAL mm_ulong* RESTRICT forceBuffers,
 #else
-        __global real4* restrict forceBuffers,
+        GLOBAL real4* RESTRICT forceBuffers,
 #endif
-        __global mixed* restrict energyBuffer, __local real4* restrict local_force,
-        __global const real4* restrict posq, __local real4* restrict local_posq, __global const unsigned int* restrict exclusions,
-        __global const ushort2* exclusionTiles, int needEnergy,
+        GLOBAL mixed* RESTRICT energyBuffer,
+        GLOBAL const real4* RESTRICT posq, GLOBAL const unsigned int* RESTRICT exclusions,
+        GLOBAL const ushort2* exclusionTiles, int needEnergy,
 #ifdef USE_CUTOFF
-        __global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
-        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, __global const real4* restrict blockCenter,
-        __global const real4* restrict blockSize, __global const int* restrict interactingAtoms
+        GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
+        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, GLOBAL const real4* RESTRICT blockCenter,
+        GLOBAL const real4* RESTRICT blockSize, GLOBAL const int* RESTRICT interactingAtoms
 #else
        unsigned int numTiles
 #endif
        PARAMETER_ARGUMENTS) {
    mixed energy = 0;
    INIT_PARAM_DERIVS
+    LOCAL real3 local_pos[LOCAL_BUFFER_SIZE];
+    LOCAL real3 local_force[LOCAL_BUFFER_SIZE];
+    ATOM_PARAMETER_DATA

    // First loop: process tiles that contain exclusions.
    
-    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+get_group_id(0)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0);
-    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(get_group_id(0)+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0);
+    const int firstExclusionTile = FIRST_EXCLUSION_TILE+GROUP_ID*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/NUM_GROUPS;
+    const int lastExclusionTile = FIRST_EXCLUSION_TILE+(GROUP_ID+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/NUM_GROUPS;
    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
        const ushort2 tileIndices = exclusionTiles[pos];
        const unsigned int x = tileIndices.x;
@@ -43,7 +45,7 @@ __kernel void computeN2Energy(

        for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
            unsigned int j = y*TILE_SIZE + localAtomIndex;
-            local_posq[localAtomIndex] = posq[j];
+            local_pos[localAtomIndex] = trimTo3(posq[j]);
            LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
        }
        if (x == y) {
@@ -56,15 +58,15 @@ __kernel void computeN2Energy(
                unsigned int atom1 = x*TILE_SIZE+tgx;
                real4 force = 0;
                DECLARE_ATOM1_DERIVATIVES
-                real4 posq1 = posq[atom1];
+                real3 pos1 = trimTo3(posq[atom1]);
                LOAD_ATOM1_PARAMETERS
                for (unsigned int j = 0; j < TILE_SIZE; j++) {
-                    real4 posq2 = local_posq[j];
-                    real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
+                    real3 pos2 = local_pos[j];
+                    real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
 #ifdef USE_PERIODIC
                    APPLY_PERIODIC_TO_DELTA(delta)
 #endif
-                    real r2 = dot(delta.xyz, delta.xyz);
+                    real r2 = dot(delta, delta);
 #ifdef USE_CUTOFF
                    if (r2 < CUTOFF_SQUARED) {
 #endif
@@ -84,8 +86,10 @@ __kernel void computeN2Energy(
                            dEdR /= -r;
                        }
                        energy += 0.5f*tempEnergy;
-                        delta.xyz *= dEdR;
-                        force.xyz -= delta.xyz;
+                        delta *= dEdR;
+                        force.x -= delta.x;
+                        force.y -= delta.y;
+                        force.z -= delta.z;
 #ifdef USE_CUTOFF
                    }
 #endif
@@ -98,12 +102,12 @@ __kernel void computeN2Energy(

 #ifdef SUPPORTS_64_BIT_ATOMICS
                unsigned int offset = atom1;
-                atom_add(&forceBuffers[offset], (long) (force.x*0x100000000));
-                atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
-                atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
+                ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) ((mm_long) (force.x*0x100000000)));
+                ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.y*0x100000000)));
+                ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.z*0x100000000)));
                STORE_DERIVATIVES_1
 #else
-                unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
+                unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
                forceBuffers[offset].xyz += force.xyz;
                STORE_DERIVATIVES_1
 #endif
@@ -123,11 +127,11 @@ __kernel void computeN2Energy(
                unsigned int atom1 = x*TILE_SIZE+tgx;
                real4 force = 0;
                DECLARE_ATOM1_DERIVATIVES
-                real4 posq1 = posq[atom1];
+                real3 pos1 = trimTo3(posq[atom1]);
                LOAD_ATOM1_PARAMETERS
                for (unsigned int j = 0; j < TILE_SIZE; j++) {
-                    real4 posq2 = local_posq[j];
-                    real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
+                    real3 pos2 = local_pos[j];
+                    real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
 #ifdef USE_PERIODIC
                    APPLY_PERIODIC_TO_DELTA(delta)
 #endif
@@ -153,8 +157,10 @@ __kernel void computeN2Energy(
                            dEdR /= -r;
                        }
                        energy += tempEnergy;
-                        delta.xyz *= dEdR;
-                        force.xyz -= delta.xyz;
+                        delta *= dEdR;
+                        force.x -= delta.x;
+                        force.y -= delta.y;
+                        force.z -= delta.z;
                        atom2 = j;
                        local_force[atom2].xyz += delta.xyz;
                        RECORD_DERIVATIVE_2
@@ -170,12 +176,12 @@ __kernel void computeN2Energy(

 #ifdef SUPPORTS_64_BIT_ATOMICS
                unsigned int offset = atom1;
-                atom_add(&forceBuffers[offset], (long) (force.x*0x100000000));
-                atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
-                atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
+                ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) ((mm_long) (force.x*0x100000000)));
+                ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.y*0x100000000)));
+                ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.z*0x100000000)));
                STORE_DERIVATIVES_1
 #else
-                unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
+                unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
                forceBuffers[offset].xyz += force.xyz;
                STORE_DERIVATIVES_1
 #endif
@@ -186,12 +192,12 @@ __kernel void computeN2Energy(
            for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
 #ifdef SUPPORTS_64_BIT_ATOMICS
                unsigned int offset = y*TILE_SIZE+tgx;
-                atom_add(&forceBuffers[offset], (long) (local_force[tgx].x*0x100000000));
-                atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (local_force[tgx].y*0x100000000));
-                atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (local_force[tgx].z*0x100000000));
+                ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) ((mm_long) (local_force[tgx].x*0x100000000)));
+                ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (local_force[tgx].y*0x100000000)));
+                ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (local_force[tgx].z*0x100000000)));
                STORE_DERIVATIVES_2
 #else
-                unsigned int offset = y*TILE_SIZE+tgx + get_group_id(0)*PADDED_NUM_ATOMS;
+                unsigned int offset = y*TILE_SIZE+tgx + GROUP_ID*PADDED_NUM_ATOMS;
                forceBuffers[offset].xyz += local_force[tgx].xyz;
                STORE_DERIVATIVES_2
 #endif
@@ -206,15 +212,15 @@ __kernel void computeN2Energy(
    const unsigned int numTiles = interactionCount[0];
    if (numTiles > maxTiles)
        return; // There wasn't enough memory for the neighbor list.
-    int pos = (int) (get_group_id(0)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0));
-    int end = (int) ((get_group_id(0)+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0));
+    int pos = (int) (GROUP_ID*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : numTiles)/NUM_GROUPS);
+    int end = (int) ((GROUP_ID+1)*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : numTiles)/NUM_GROUPS);
 #else
-    int pos = (int) (get_group_id(0)*(long)numTiles/get_num_groups(0));
-    int end = (int) ((get_group_id(0)+1)*(long)numTiles/get_num_groups(0));
+    int pos = (int) (GROUP_ID*(mm_long)numTiles/NUM_GROUPS);
+    int end = (int) ((GROUP_ID+1)*(mm_long)numTiles/NUM_GROUPS);
 #endif
    int nextToSkip = -1;
    int currentSkipIndex = 0;
-    __local int atomIndices[TILE_SIZE];
+    LOCAL int atomIndices[TILE_SIZE];

    while (pos < end) {
        const bool isExcluded = false;
@@ -261,7 +267,7 @@ __kernel void computeN2Energy(
 #endif
                atomIndices[localAtomIndex] = j;
                if (j < PADDED_NUM_ATOMS) {
-                    local_posq[localAtomIndex] = posq[j];
+                    local_pos[localAtomIndex] = trimTo3(posq[j]);
                    LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
                    local_force[localAtomIndex] = 0;
                    CLEAR_LOCAL_DERIVATIVES
@@ -274,17 +280,17 @@ __kernel void computeN2Energy(

                real4 blockCenterX = blockCenter[x];
                for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++)
-                    APPLY_PERIODIC_TO_POS_WITH_CENTER(local_posq[tgx], blockCenterX)
+                    APPLY_PERIODIC_TO_POS_WITH_CENTER(local_pos[tgx], blockCenterX)
                for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
                    unsigned int atom1 = x*TILE_SIZE+tgx;
                    real4 force = 0;
                    DECLARE_ATOM1_DERIVATIVES
-                    real4 posq1 = posq[atom1];
-                    APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX)
+                    real3 pos1 = trimTo3(posq[atom1]);
+                    APPLY_PERIODIC_TO_POS_WITH_CENTER(pos1, blockCenterX)
                    LOAD_ATOM1_PARAMETERS
                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
-                        real4 posq2 = local_posq[j];
-                        real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
+                        real3 pos2 = local_pos[j];
+                        real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
                        real r2 = dot(delta.xyz, delta.xyz);
                        if (atom1 < NUM_ATOMS && atomIndices[j] < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
                            real invR = RSQRT(r2);
@@ -298,8 +304,10 @@ __kernel void computeN2Energy(
                            COMPUTE_INTERACTION
                            dEdR /= -r;
                            energy += tempEnergy;
-                            delta.xyz *= dEdR;
-                            force.xyz -= delta.xyz;
+                            delta *= dEdR;
+                            force.x -= delta.x;
+                            force.y -= delta.y;
+                            force.z -= delta.z;
                            atom2 = j;
                            local_force[atom2].xyz += delta.xyz;
                            RECORD_DERIVATIVE_2
@@ -310,12 +318,12 @@ __kernel void computeN2Energy(

 #ifdef SUPPORTS_64_BIT_ATOMICS
                    unsigned int offset = atom1;
-                    atom_add(&forceBuffers[offset], (long) (force.x*0x100000000));
-                    atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
-                    atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
+                    ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) ((mm_long) (force.x*0x100000000)));
+                    ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.y*0x100000000)));
+                    ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.z*0x100000000)));
                    STORE_DERIVATIVES_1
 #else
-                    unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
+                    unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
                    forceBuffers[offset].xyz += force.xyz;
                    STORE_DERIVATIVES_1
 #endif
@@ -330,11 +338,11 @@ __kernel void computeN2Energy(
                    unsigned int atom1 = x*TILE_SIZE+tgx;
                    real4 force = 0;
                    DECLARE_ATOM1_DERIVATIVES
-                    real4 posq1 = posq[atom1];
+                    real3 pos1 = trimTo3(posq[atom1]);
                    LOAD_ATOM1_PARAMETERS
                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
-                        real4 posq2 = local_posq[j];
-                        real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
+                        real3 pos2 = local_pos[j];
+                        real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
 #ifdef USE_PERIODIC
                        APPLY_PERIODIC_TO_DELTA(delta)
 #endif
@@ -355,10 +363,12 @@ __kernel void computeN2Energy(
                            COMPUTE_INTERACTION
                            dEdR /= -r;
                            energy += tempEnergy;
-                            delta.xyz *= dEdR;
-                            force.xyz -= delta.xyz;
+                            delta *= dEdR;
+                            force.x -= delta.x;
+                            force.y -= delta.y;
+                            force.z -= delta.z;
                            atom2 = j;
-                            local_force[atom2].xyz += delta.xyz;
+                            local_force[atom2] += delta;
                            RECORD_DERIVATIVE_2
                        }
                    }
@@ -367,12 +377,12 @@ __kernel void computeN2Energy(

 #ifdef SUPPORTS_64_BIT_ATOMICS
                    unsigned int offset = atom1;
-                    atom_add(&forceBuffers[offset], (long) (force.x*0x100000000));
-                    atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
-                    atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
+                    ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) ((mm_long) (force.x*0x100000000)));
+                    ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.y*0x100000000)));
+                    ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.z*0x100000000)));
                    STORE_DERIVATIVES_1
 #else
-                    unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
+                    unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
                    forceBuffers[offset].xyz += force.xyz;
                    STORE_DERIVATIVES_1
 #endif
@@ -389,13 +399,13 @@ __kernel void computeN2Energy(
 #endif
                if (atom2 < PADDED_NUM_ATOMS) {
 #ifdef SUPPORTS_64_BIT_ATOMICS
-                    atom_add(&forceBuffers[atom2], (long) (local_force[tgx].x*0x100000000));
-                    atom_add(&forceBuffers[atom2+PADDED_NUM_ATOMS], (long) (local_force[tgx].y*0x100000000));
-                    atom_add(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (long) (local_force[tgx].z*0x100000000));
+                    ATOMIC_ADD(&forceBuffers[atom2], (mm_ulong) ((mm_long) (local_force[tgx].x*0x100000000)));
+                    ATOMIC_ADD(&forceBuffers[atom2+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (local_force[tgx].y*0x100000000)));
+                    ATOMIC_ADD(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (local_force[tgx].z*0x100000000)));
                    unsigned int offset = atom2;
                    STORE_DERIVATIVES_2
 #else
-                    unsigned int offset = atom2 + get_group_id(0)*PADDED_NUM_ATOMS;
+                    unsigned int offset = atom2 + GROUP_ID*PADDED_NUM_ATOMS;
                    forceBuffers[offset].xyz += local_force[tgx].xyz;
                    STORE_DERIVATIVES_2
 #endif
@@ -404,6 +414,6 @@ __kernel void computeN2Energy(
        }
        pos++;
    }
-    energyBuffer[get_global_id(0)] += energy;
+    energyBuffer[GLOBAL_ID] += energy;
    SAVE_PARAM_DERIVS
 }
--- a/platforms/opencl/src/kernels/customGBEnergyPerParticle.cl
+++ b/platforms/opencl/src/kernels/customGBEnergyPerParticle.cl
@@ -9,24 +9,29 @@
 * Reduce the derivatives computed in the N^2 energy kernel, and compute all per-particle energy terms.
 */

-__kernel void computePerParticleEnergy(int bufferSize, int numBuffers, __global real4* restrict forceBuffers, __global mixed* restrict energyBuffer, __global const real4* restrict posq
+KERNEL void computePerParticleEnergy(GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT posq,
+#ifdef SUPPORTS_64_BIT_ATOMICS
+        GLOBAL mm_long* RESTRICT forceBuffers
+#else
+        GLOBAL real4* RESTRICT forceBuffers, int bufferSize, int numBuffers
+#endif
        PARAMETER_ARGUMENTS) {
    mixed energy = 0;
    INIT_PARAM_DERIVS
-    unsigned int index = get_global_id(0);
-    while (index < NUM_ATOMS) {
+    for (int index = GLOBAL_ID; index < NUM_ATOMS; index += GLOBAL_SIZE) {
        // Reduce the derivatives

+#ifndef SUPPORTS_64_BIT_ATOMICS
        int totalSize = bufferSize*numBuffers;
+#endif
        REDUCE_DERIVATIVES

        // Now calculate the per-particle energy terms.

        real4 pos = posq[index];
-        real4 force = (real4) 0;
+        real3 force = make_real3(0, 0, 0);
        COMPUTE_ENERGY
-        index += get_global_size(0);
    }
-    energyBuffer[get_global_id(0)] += energy;
+    energyBuffer[GLOBAL_ID] += energy;
    SAVE_PARAM_DERIVS
 }
--- a/platforms/cuda/src/kernels/customGBGradientChainRule.cu
+++ b/platforms/cuda/src/kernels/customGBGradientChainRule.cu
@@ -2,17 +2,30 @@
 * Compute chain rule terms for computed values that depend explicitly on particle coordinates.
 */

-extern "C" __global__ void computeGradientChainRuleTerms(long long* __restrict__ forceBuffers, const real4* __restrict__ posq
+KERNEL void computeGradientChainRuleTerms(GLOBAL const real4* RESTRICT posq,
+#ifdef SUPPORTS_64_BIT_ATOMICS
+    GLOBAL mm_long* RESTRICT forceBuffers
+#else
+    GLOBAL real4* RESTRICT forceBuffers
+#endif
        PARAMETER_ARGUMENTS) {
    INIT_PARAM_DERIVS
    const real scale = RECIP((real) 0x100000000);
-    for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
+    for (int index = GLOBAL_ID; index < NUM_ATOMS; index += GLOBAL_SIZE) {
        real4 pos = posq[index];
+#ifdef SUPPORTS_64_BIT_ATOMICS
        real3 force = make_real3(scale*forceBuffers[index], scale*forceBuffers[index+PADDED_NUM_ATOMS], scale*forceBuffers[index+PADDED_NUM_ATOMS*2]);
+#else
+        real3 force = trimTo3(forceBuffers[index]);
+#endif
        COMPUTE_FORCES
-        forceBuffers[index] = (long long) (force.x*0x100000000);
-        forceBuffers[index+PADDED_NUM_ATOMS] = (long long) (force.y*0x100000000);
-        forceBuffers[index+PADDED_NUM_ATOMS*2] = (long long) (force.z*0x100000000);
+#ifdef SUPPORTS_64_BIT_ATOMICS
+        forceBuffers[index] = (mm_long) (force.x*0x100000000);
+        forceBuffers[index+PADDED_NUM_ATOMS] = (mm_long) (force.y*0x100000000);
+        forceBuffers[index+PADDED_NUM_ATOMS*2] = (mm_long) (force.z*0x100000000);
+#else
+        forceBuffers[index] = make_real4(force.x, force.y, force.z, 0);
+#endif
    }
    SAVE_PARAM_DERIVS
 }
--- a/platforms/opencl/src/kernels/customGBValueN2.cl
+++ b/platforms/opencl/src/kernels/customGBValueN2.cl
-#ifdef SUPPORTS_64_BIT_ATOMICS
-#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
-#endif
-
 /**
 * Compute a value based on pair interactions.
 */
-__kernel void computeN2Value(__global const real4* restrict posq, __local real4* restrict local_posq, __global const unsigned int* restrict exclusions,
-        __global const ushort2* exclusionTiles,
+KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsigned int* RESTRICT exclusions,
+        GLOBAL const ushort2* exclusionTiles,
 #ifdef SUPPORTS_64_BIT_ATOMICS
-        __global long* restrict global_value,
+        GLOBAL mm_ulong* RESTRICT global_value,
 #else
-        __global real* restrict global_value,
+        GLOBAL real* RESTRICT global_value,
 #endif
-        __local real* restrict local_value,
 #ifdef USE_CUTOFF
-        __global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
-        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, __global const real4* restrict blockCenter,
-        __global const real4* restrict blockSize, __global const int* restrict interactingAtoms
+        GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
+        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, GLOBAL const real4* RESTRICT blockCenter,
+        GLOBAL const real4* RESTRICT blockSize, GLOBAL const int* RESTRICT interactingAtoms
 #else
        unsigned int numTiles
 #endif
        PARAMETER_ARGUMENTS) {
-    const unsigned int totalWarps = get_global_size(0)/TILE_SIZE;
-    const unsigned int warp = get_global_id(0)/TILE_SIZE;
-    const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
-    const unsigned int tbx = get_local_id(0) - tgx;
+    const unsigned int totalWarps = GLOBAL_SIZE/TILE_SIZE;
+    const unsigned int warp = GLOBAL_ID/TILE_SIZE;
+    const unsigned int tgx = LOCAL_ID & (TILE_SIZE-1);
+    const unsigned int tbx = LOCAL_ID - tgx;
+    LOCAL real3 local_pos[LOCAL_BUFFER_SIZE];
+    LOCAL real local_value[LOCAL_BUFFER_SIZE];
+    ATOM_PARAMETER_DATA

    // First loop: process tiles that contain exclusions.
    
-    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
-    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
+    const int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
+    const int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
        const ushort2 tileIndices = exclusionTiles[pos];
        const unsigned int x = tileIndices.x;
        const unsigned int y = tileIndices.y;
        real value = 0;
        unsigned int atom1 = x*TILE_SIZE + tgx;
-        real4 posq1 = posq[atom1];
+        real3 pos1 = trimTo3(posq[atom1]);
        LOAD_ATOM1_PARAMETERS
 #ifdef USE_EXCLUSIONS
        unsigned int excl = exclusions[pos*TILE_SIZE+tgx];
@@ -44,14 +42,14 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
        if (x == y) {
            // This tile is on the diagonal.

-            const unsigned int localAtomIndex = get_local_id(0);
-            local_posq[localAtomIndex] = posq1;
+            const unsigned int localAtomIndex = LOCAL_ID;
+            local_pos[localAtomIndex] = pos1;
            LOAD_LOCAL_PARAMETERS_FROM_1
            SYNC_WARPS;
            for (unsigned int j = 0; j < TILE_SIZE; j++) {
                int atom2 = tbx+j;
-                real4 posq2 = local_posq[atom2];
-                real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
+                real3 pos2 = local_pos[atom2];
+                real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
 #ifdef USE_PERIODIC
                APPLY_PERIODIC_TO_DELTA(delta)
 #endif
@@ -87,9 +85,9 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
        else {
            // This is an off-diagonal tile.

-            const unsigned int localAtomIndex = get_local_id(0);
+            const unsigned int localAtomIndex = LOCAL_ID;
            unsigned int j = y*TILE_SIZE + tgx;
-            local_posq[localAtomIndex] = posq[j];
+            local_pos[localAtomIndex] = trimTo3(posq[j]);
            LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
            local_value[localAtomIndex] = 0;
            SYNC_WARPS;
@@ -99,8 +97,8 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
            unsigned int tj = tgx;
            for (j = 0; j < TILE_SIZE; j++) {
                int atom2 = tbx+tj;
-                real4 posq2 = local_posq[atom2];
-                real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
+                real3 pos2 = local_pos[atom2];
+                real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
 #ifdef USE_PERIODIC
                APPLY_PERIODIC_TO_DELTA(delta)
 #endif
@@ -141,11 +139,11 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*

 #ifdef SUPPORTS_64_BIT_ATOMICS
        unsigned int offset1 = x*TILE_SIZE + tgx;
-        atom_add(&global_value[offset1], (long) (value*0x100000000));
+        ATOMIC_ADD(&global_value[offset1], (mm_ulong) ((mm_long) (value*0x100000000)));
        STORE_PARAM_DERIVS1
        if (x != y) {
            unsigned int offset2 = y*TILE_SIZE + tgx;
-            atom_add(&global_value[offset2], (long) (local_value[get_local_id(0)]*0x100000000));
+            ATOMIC_ADD(&global_value[offset2], (mm_ulong) ((mm_long) (local_value[LOCAL_ID]*0x100000000)));
            STORE_PARAM_DERIVS2
        }
 #else
@@ -154,7 +152,7 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
        global_value[offset1] += value;
        STORE_PARAM_DERIVS1
        if (x != y) {
-            global_value[offset2] += local_value[get_local_id(0)];
+            global_value[offset2] += local_value[LOCAL_ID];
            STORE_PARAM_DERIVS2
        }
 #endif
@@ -167,17 +165,17 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
    unsigned int numTiles = interactionCount[0];
    if (numTiles > maxTiles)
        return; // There wasn't enough memory for the neighbor list.
-    int pos = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
-    int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
+    int pos = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : (mm_long)numTiles)/totalWarps);
+    int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : (mm_long)numTiles)/totalWarps);
 #else
-    int pos = (int) (warp*(long)numTiles/totalWarps);
-    int end = (int) ((warp+1)*(long)numTiles/totalWarps);
+    int pos = (int) (warp*(mm_long)numTiles/totalWarps);
+    int end = (int) ((warp+1)*(mm_long)numTiles/totalWarps);
 #endif
    int skipBase = 0;
    int currentSkipIndex = tbx;
-    __local int atomIndices[FORCE_WORK_GROUP_SIZE];
-    __local volatile int skipTiles[FORCE_WORK_GROUP_SIZE];
-    skipTiles[get_local_id(0)] = -1;
+    LOCAL int atomIndices[LOCAL_BUFFER_SIZE];
+    LOCAL volatile int skipTiles[LOCAL_BUFFER_SIZE];
+    skipTiles[LOCAL_ID] = -1;

    while (pos < end) {
        real value = 0;
@@ -208,10 +206,10 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
            SYNC_WARPS;
            if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
                ushort2 tile = exclusionTiles[skipBase+tgx];
-                skipTiles[get_local_id(0)] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
+                skipTiles[LOCAL_ID] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
            }
            else
-                skipTiles[get_local_id(0)] = end;
+                skipTiles[LOCAL_ID] = end;
            skipBase += TILE_SIZE;            
            currentSkipIndex = tbx;
            SYNC_WARPS;
@@ -225,17 +223,17 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*

            // Load atom data for this tile.
            
-            real4 posq1 = posq[atom1];
+            real3 pos1 = trimTo3(posq[atom1]);
            LOAD_ATOM1_PARAMETERS
-            const unsigned int localAtomIndex = get_local_id(0);
+            const unsigned int localAtomIndex = LOCAL_ID;
 #ifdef USE_CUTOFF
            unsigned int j = interactingAtoms[pos*TILE_SIZE+tgx];
 #else
            unsigned int j = y*TILE_SIZE + tgx;
 #endif
-            atomIndices[get_local_id(0)] = j;
+            atomIndices[LOCAL_ID] = j;
            if (j < PADDED_NUM_ATOMS) {
-                local_posq[localAtomIndex] = posq[j];
+                local_pos[localAtomIndex] = trimTo3(posq[j]);
                LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
                local_value[localAtomIndex] = 0;
            }
@@ -246,14 +244,14 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
                // box, then skip having to apply periodic boundary conditions later.

                real4 blockCenterX = blockCenter[x];
-                APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX)
-                APPLY_PERIODIC_TO_POS_WITH_CENTER(local_posq[get_local_id(0)], blockCenterX)
+                APPLY_PERIODIC_TO_POS_WITH_CENTER(pos1, blockCenterX)
+                APPLY_PERIODIC_TO_POS_WITH_CENTER(local_pos[LOCAL_ID], blockCenterX)
                SYNC_WARPS;
                unsigned int tj = tgx;
                for (j = 0; j < TILE_SIZE; j++) {
                    int atom2 = tbx+tj;
-                    real4 posq2 = local_posq[atom2];
-                    real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
+                    real3 pos2 = local_pos[atom2];
+                    real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                    if (r2 < CUTOFF_SQUARED) {
                        real invR = RSQRT(r2);
@@ -278,12 +276,12 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
 #endif
            {
                // We need to apply periodic boundary conditions separately for each interaction.
-                
+
                unsigned int tj = tgx;
                for (j = 0; j < TILE_SIZE; j++) {
                    int atom2 = tbx+tj;
-                    real4 posq2 = local_posq[atom2];
-                    real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
+                    real3 pos2 = local_pos[atom2];
+                    real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
 #ifdef USE_PERIODIC
                    APPLY_PERIODIC_TO_DELTA(delta)
 #endif
@@ -313,19 +311,19 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
            }
        
            // Write results.
-        
+
 #ifdef USE_CUTOFF
-            unsigned int atom2 = atomIndices[get_local_id(0)];
+            unsigned int atom2 = atomIndices[LOCAL_ID];
 #else
            unsigned int atom2 = y*TILE_SIZE + tgx;
 #endif
 #ifdef SUPPORTS_64_BIT_ATOMICS
            unsigned int offset1 = atom1;
-            atom_add(&global_value[offset1], (long) (value*0x100000000));
+            ATOMIC_ADD(&global_value[offset1], (mm_ulong) ((mm_long) (value*0x100000000)));
            STORE_PARAM_DERIVS1
            if (atom2 < PADDED_NUM_ATOMS) {
                unsigned int offset2 = atom2;
-                atom_add(&global_value[offset2], (long) (local_value[get_local_id(0)]*0x100000000));
+                ATOMIC_ADD(&global_value[offset2], (mm_ulong) ((mm_long) (local_value[LOCAL_ID]*0x100000000)));
                STORE_PARAM_DERIVS2
            }
 #else
@@ -334,7 +332,7 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
            STORE_PARAM_DERIVS1
            if (atom2 < PADDED_NUM_ATOMS) {
                unsigned int offset2 = atom2 + warp*PADDED_NUM_ATOMS;
-                global_value[offset2] += local_value[get_local_id(0)];
+                global_value[offset2] += local_value[LOCAL_ID];
                STORE_PARAM_DERIVS2
            }
 #endif

--- a/platforms/opencl/src/kernels/customGBValueN2_cpu.cl
+++ b/platforms/opencl/src/kernels/customGBValueN2_cpu.cl
-#ifdef SUPPORTS_64_BIT_ATOMICS
-#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
-#endif
-
 /**
 * Compute a value based on pair interactions.
 */
-__kernel void computeN2Value(__global const real4* restrict posq, __local real4* restrict local_posq, __global const unsigned int* restrict exclusions,
-        __global const ushort2* exclusionTiles,
+KERNEL void computeN2Value(GLOBAL const real4* RESTRICT posq, GLOBAL const unsigned int* RESTRICT exclusions,
+        GLOBAL const ushort2* exclusionTiles,
 #ifdef SUPPORTS_64_BIT_ATOMICS
-        __global long* restrict global_value,
+        GLOBAL mm_ulong* RESTRICT global_value,
 #else
-        __global real* restrict global_value,
+        GLOBAL real* RESTRICT global_value,
 #endif
-        __local real* restrict local_value,
 #ifdef USE_CUTOFF
-        __global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
-        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, __global const real4* restrict blockCenter,
-        __global const real4* restrict blockSize, __global const int* restrict interactingAtoms
+        GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
+        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, GLOBAL const real4* RESTRICT blockCenter,
+        GLOBAL const real4* RESTRICT blockSize, GLOBAL const int* RESTRICT interactingAtoms
 #else
        unsigned int numTiles
 #endif
        PARAMETER_ARGUMENTS) {
+    LOCAL real3 local_pos[LOCAL_BUFFER_SIZE];
+    LOCAL real local_value[LOCAL_BUFFER_SIZE];
+    ATOM_PARAMETER_DATA

    // First loop: process tiles that contain exclusions.
    
-    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+get_group_id(0)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0);
-    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(get_group_id(0)+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0);
+    const int firstExclusionTile = FIRST_EXCLUSION_TILE+get_group_id(0)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0);
+    const int lastExclusionTile = FIRST_EXCLUSION_TILE+(get_group_id(0)+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0);
    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
        const ushort2 tileIndices = exclusionTiles[pos];
        const unsigned int x = tileIndices.x;
@@ -35,7 +33,7 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*

        for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
            unsigned int j = y*TILE_SIZE + localAtomIndex;
-            local_posq[localAtomIndex] = posq[j];
+            local_pos[localAtomIndex] = trimTo3(posq[j]);
            LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
        }
        if (x == y) {
@@ -47,11 +45,11 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
 #endif
                unsigned int atom1 = x*TILE_SIZE+tgx;
                real value = 0;
-                real4 posq1 = posq[atom1];
+                real3 pos1 = trimTo3(posq[atom1]);
                LOAD_ATOM1_PARAMETERS
                for (unsigned int j = 0; j < TILE_SIZE; j++) {
-                    real4 posq2 = local_posq[j];
-                    real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
+                    real3 pos2 = local_pos[j];
+                    real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
 #ifdef USE_PERIODIC
                    APPLY_PERIODIC_TO_DELTA(delta)
 #endif
@@ -88,7 +86,7 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*

 #ifdef SUPPORTS_64_BIT_ATOMICS
                unsigned int offset1 = atom1;
-                atom_add(&global_value[offset1], (long) (value*0x100000000));
+                ATOMIC_ADD(&global_value[offset1], (mm_ulong) ((mm_long) (value*0x100000000)));
 #else
                unsigned int offset1 = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
                global_value[offset1] += value;
@@ -107,11 +105,11 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
 #endif
                unsigned int atom1 = x*TILE_SIZE+tgx;
                real value = 0;
-                real4 posq1 = posq[atom1];
+                real3 pos1 = trimTo3(posq[atom1]);
                LOAD_ATOM1_PARAMETERS
                for (unsigned int j = 0; j < TILE_SIZE; j++) {
-                    real4 posq2 = local_posq[j];
-                    real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
+                    real3 pos2 = local_pos[j];
+                    real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
 #ifdef USE_PERIODIC
                    APPLY_PERIODIC_TO_DELTA(delta)
 #endif
@@ -150,7 +148,7 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*

 #ifdef SUPPORTS_64_BIT_ATOMICS
                unsigned int offset1 = atom1;
-                atom_add(&global_value[offset1], (long) (value*0x100000000));
+                ATOMIC_ADD(&global_value[offset1], (mm_ulong) ((mm_long) (value*0x100000000)));
 #else
                unsigned int offset1 = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
                global_value[offset1] += value;
@@ -163,7 +161,7 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
            for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
 #ifdef SUPPORTS_64_BIT_ATOMICS
                unsigned int offset2 = y*TILE_SIZE+tgx;
-                atom_add(&global_value[offset2], (long) (local_value[tgx]*0x100000000));
+                ATOMIC_ADD(&global_value[offset2], (mm_ulong) ((mm_long) (local_value[tgx]*0x100000000)));
 #else
                unsigned int offset2 = y*TILE_SIZE+tgx + get_group_id(0)*PADDED_NUM_ATOMS;
                global_value[offset2] += local_value[tgx];
@@ -180,15 +178,15 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
    const unsigned int numTiles = interactionCount[0];
    if (numTiles > maxTiles)
        return; // There wasn't enough memory for the neighbor list.
-    int pos = (int) (get_group_id(0)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0));
-    int end = (int) ((get_group_id(0)+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0));
+    int pos = (int) (get_group_id(0)*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0));
+    int end = (int) ((get_group_id(0)+1)*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0));
 #else
-    int pos = (int) (get_group_id(0)*(long)numTiles/get_num_groups(0));
-    int end = (int) ((get_group_id(0)+1)*(long)numTiles/get_num_groups(0));
+    int pos = (int) (get_group_id(0)*(mm_long)numTiles/get_num_groups(0));
+    int end = (int) ((get_group_id(0)+1)*(mm_long)numTiles/get_num_groups(0));
 #endif
    int nextToSkip = -1;
    int currentSkipIndex = 0;
-    __local int atomIndices[TILE_SIZE];
+    LOCAL int atomIndices[TILE_SIZE];

    while (pos < end) {
        bool includeTile = true;
@@ -234,7 +232,7 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
 #endif
                atomIndices[localAtomIndex] = j;
                if (j < PADDED_NUM_ATOMS) {
-                    local_posq[localAtomIndex] = posq[j];
+                    local_pos[localAtomIndex] = trimTo3(posq[j]);
                    LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
                    local_value[localAtomIndex] = 0;
                }
@@ -246,16 +244,16 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*

                real4 blockCenterX = blockCenter[x];
                for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++)
-                    APPLY_PERIODIC_TO_POS_WITH_CENTER(local_posq[tgx], blockCenterX)
+                    APPLY_PERIODIC_TO_POS_WITH_CENTER(local_pos[tgx], blockCenterX)
                for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
                    unsigned int atom1 = x*TILE_SIZE+tgx;
                    real value = 0;
-                    real4 posq1 = posq[atom1];
-                    APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX)
+                    real3 pos1 = trimTo3(posq[atom1]);
+                    APPLY_PERIODIC_TO_POS_WITH_CENTER(pos1, blockCenterX)
                    LOAD_ATOM1_PARAMETERS
                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
-                        real4 posq2 = local_posq[j];
-                        real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
+                        real3 pos2 = local_pos[j];
+                        real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
                        real r2 = dot(delta.xyz, delta.xyz);
                        if (atom1 < NUM_ATOMS && atomIndices[j] < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
                            real invR = RSQRT(r2);
@@ -277,7 +275,7 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*

 #ifdef SUPPORTS_64_BIT_ATOMICS
                    unsigned int offset1 = atom1;
-                    atom_add(&global_value[offset1], (long) (value*0x100000000));
+                    ATOMIC_ADD(&global_value[offset1], (mm_ulong) ((mm_long) (value*0x100000000)));
 #else
                    unsigned int offset1 = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
                    global_value[offset1] += value;
@@ -293,11 +291,11 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
                for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
                    unsigned int atom1 = x*TILE_SIZE+tgx;
                    real value = 0;
-                    real4 posq1 = posq[atom1];
+                    real3 pos1 = trimTo3(posq[atom1]);
                    LOAD_ATOM1_PARAMETERS
                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
-                        real4 posq2 = local_posq[j];
-                        real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
+                        real3 pos2 = local_pos[j];
+                        real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
 #ifdef USE_PERIODIC
                        APPLY_PERIODIC_TO_DELTA(delta)
 #endif
@@ -326,7 +324,7 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*

 #ifdef SUPPORTS_64_BIT_ATOMICS
                    unsigned int offset1 = atom1;
-                    atom_add(&global_value[offset1], (long) (value*0x100000000));
+                    ATOMIC_ADD(&global_value[offset1], (mm_ulong) ((mm_long) (value*0x100000000)));
 #else
                    unsigned int offset1 = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
                    global_value[offset1] += value;
@@ -346,7 +344,7 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
                if (atom2 < PADDED_NUM_ATOMS) {
 #ifdef SUPPORTS_64_BIT_ATOMICS
                    unsigned int offset2 = atom2;
-                    atom_add(&global_value[offset2], (long) (local_value[tgx]*0x100000000));
+                    ATOMIC_ADD(&global_value[offset2], (mm_ulong) ((mm_long) (local_value[tgx]*0x100000000)));
 #else
                    unsigned int offset2 = atom2 + get_group_id(0)*PADDED_NUM_ATOMS;
                    global_value[offset2] += local_value[tgx];

--- a/platforms/opencl/src/kernels/customGBValuePerParticle.cl
+++ b/platforms/opencl/src/kernels/customGBValuePerParticle.cl
@@ -2,19 +2,18 @@
 * Reduce a pairwise computed value, and compute per-particle values.
 */

-__kernel void computePerParticleValues(int bufferSize, int numBuffers, __global real4* posq,
+KERNEL void computePerParticleValues(GLOBAL real4* posq,
 #ifdef SUPPORTS_64_BIT_ATOMICS
-        __global long* valueBuffers
+        GLOBAL mm_long* valueBuffers
 #else
-        __global real* valueBuffers
+        GLOBAL real* valueBuffers, int bufferSize, int numBuffers
 #endif
        PARAMETER_ARGUMENTS) {
-    unsigned int index = get_global_id(0);
-    while (index < NUM_ATOMS) {
+    for (int index = GLOBAL_ID; index < NUM_ATOMS; index += GLOBAL_SIZE) {
        // Reduce the pairwise value

 #ifdef SUPPORTS_64_BIT_ATOMICS
-        real sum = (1.0f/0x100000000)*valueBuffers[index];
+        real sum = valueBuffers[index]/(real) 0x100000000;
 #else
        int totalSize = bufferSize*numBuffers;
        real sum = valueBuffers[index];
@@ -27,6 +26,5 @@ __kernel void computePerParticleValues(int bufferSize, int numBuffers, __global

        real4 pos = posq[index];
        COMPUTE_VALUES
-        index += get_global_size(0);
    }
 }
--- a/platforms/opencl/src/kernels/customHbondForce.cl
+++ b/platforms/opencl/src/kernels/customHbondForce.cl
@@ -2,8 +2,8 @@
 * Compute the difference between two vectors, optionally taking periodic boundary conditions into account
 * and setting the fourth component to the squared magnitude.
 */
-real4 delta(real4 vec1, real4 vec2, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) {
-    real4 result = (real4) (vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0);
+inline DEVICE real4 delta(real4 vec1, real4 vec2, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) {
+    real4 result = make_real4(vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0);
 #ifdef USE_PERIODIC
    APPLY_PERIODIC_TO_DELTA(result)
 #endif
@@ -14,73 +14,79 @@ real4 delta(real4 vec1, real4 vec2, real4 periodicBoxSize, real4 invPeriodicBoxS
 /**
 * Compute the angle between two vectors.  The w component of each vector should contain the squared magnitude.
 */
-real computeAngle(real4 vec1, real4 vec2) {
+inline DEVICE real computeAngle(real4 vec1, real4 vec2) {  // returns the angle between the xyz parts of vec1 and vec2; each .w is read as that vector's squared length
    real dotProduct = vec1.x*vec2.x + vec1.y*vec2.y + vec1.z*vec2.z;
    real cosine = dotProduct*RSQRT(vec1.w*vec2.w);
    real angle;
    if (cosine > 0.99f || cosine < -0.99f) {
        // We're close to the singularity in acos(), so take the cross product and use asin() instead.

-        real4 crossProduct = cross(vec1, vec2);
+        real3 crossProduct = cross(trimTo3(vec1), trimTo3(vec2));  // only the trimTo3() results are crossed (presumably xyz with w dropped - confirm against trimTo3)
        real scale = vec1.w*vec2.w;
-        angle = asin(SQRT(dot(crossProduct, crossProduct)/scale));
-        if (cosine < 0.0f)
-            angle = PI-angle;
+        angle = ASIN(SQRT(dot(crossProduct, crossProduct)/scale));  // |v1 x v2|^2 / (|v1|^2*|v2|^2) is sin^2 of the angle, so this recovers the angle from its sine
+        if (cosine < 0)  // negative cosine: the vectors point more than a quarter turn apart
+            angle = M_PI-angle;  // assuming ASIN is the ordinary arcsine, the value above is the supplement of the real angle in this case
    }
    else
-       angle = acos(cosine);
+       angle = ACOS(cosine);  // away from +/-1 the arccosine of the cosine is used directly
    return angle;
 }

 /**
 * Compute the cross product of two vectors, setting the fourth component to the squared magnitude.
 */
-real4 computeCross(real4 vec1, real4 vec2) {
-    real4 result = cross(vec1, vec2);
-    result.w = result.x*result.x + result.y*result.y + result.z*result.z;
-    return result;
+inline DEVICE real4 computeCross(real4 vec1, real4 vec2) {  // returns vec1 x vec2 in xyz with the squared length of that product in w
+    real3 cp = cross(trimTo3(vec1), trimTo3(vec2));  // only the trimTo3() results enter the product; the inputs' w values are not used here
+    return make_real4(cp.x, cp.y, cp.z, cp.x*cp.x+cp.y*cp.y+cp.z*cp.z);  // w = cp.x^2 + cp.y^2 + cp.z^2, same quantity the removed code stored in result.w
 }

 /**
 * Compute forces on donors.
 */
-__kernel void computeDonorForces(__global real4* restrict forceBuffers, __global mixed* restrict energyBuffer, __global const real4* restrict posq, __global const int4* restrict exclusions,
-        __global const int4* restrict donorAtoms, __global const int4* restrict acceptorAtoms, __global const int4* restrict donorBufferIndices, __local real4* posBuffer, real4 periodicBoxSize, real4 invPeriodicBoxSize,
+KERNEL void computeDonorForces(
+#ifdef SUPPORTS_64_BIT_ATOMICS
+	GLOBAL mm_ulong* RESTRICT force,
+#else
+	GLOBAL real4* RESTRICT forceBuffers, GLOBAL const int4* RESTRICT donorBufferIndices,
+#endif
+	GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT posq, GLOBAL const int4* RESTRICT exclusions,
+        GLOBAL const int4* RESTRICT donorAtoms, GLOBAL const int4* RESTRICT acceptorAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize,
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
        PARAMETER_ARGUMENTS) {
+    LOCAL real4 posBuffer[3*THREAD_BLOCK_SIZE];
    mixed energy = 0;
-    real4 f1 = (real4) 0;
-    real4 f2 = (real4) 0;
-    real4 f3 = (real4) 0;
-    for (int donorStart = 0; donorStart < NUM_DONORS; donorStart += get_global_size(0)) {
+    real3 f1 = make_real3(0);
+    real3 f2 = make_real3(0);
+    real3 f3 = make_real3(0);
+    for (int donorStart = 0; donorStart < NUM_DONORS; donorStart += GLOBAL_SIZE) {
        // Load information about the donor this thread will compute forces on.

-        int donorIndex = donorStart+get_global_id(0);
+        int donorIndex = donorStart+GLOBAL_ID;
        int4 atoms, exclusionIndices;
        real4 d1, d2, d3;
        if (donorIndex < NUM_DONORS) {
            atoms = donorAtoms[donorIndex];
-            d1 = (atoms.x > -1 ? posq[atoms.x] : (real4) 0);
-            d2 = (atoms.y > -1 ? posq[atoms.y] : (real4) 0);
-            d3 = (atoms.z > -1 ? posq[atoms.z] : (real4) 0);
+            d1 = (atoms.x > -1 ? posq[atoms.x] : make_real4(0));
+            d2 = (atoms.y > -1 ? posq[atoms.y] : make_real4(0));
+            d3 = (atoms.z > -1 ? posq[atoms.z] : make_real4(0));
 #ifdef USE_EXCLUSIONS
            exclusionIndices = exclusions[donorIndex];
 #endif
        }
        else
-            atoms = (int4) (-1, -1, -1, -1);
-        for (int acceptorStart = 0; acceptorStart < NUM_ACCEPTORS; acceptorStart += get_local_size(0)) {
+            atoms = make_int4(-1, -1, -1, -1);
+        for (int acceptorStart = 0; acceptorStart < NUM_ACCEPTORS; acceptorStart += LOCAL_SIZE) {
            // Load the next block of acceptors into local memory.

-            barrier(CLK_LOCAL_MEM_FENCE);
-            int blockSize = min((int) get_local_size(0), NUM_ACCEPTORS-acceptorStart);
-            if (get_local_id(0) < blockSize) {
-                int4 atoms2 = acceptorAtoms[acceptorStart+get_local_id(0)];
-                posBuffer[3*get_local_id(0)] = (atoms2.x > -1 ? posq[atoms2.x] : (real4) 0);
-                posBuffer[3*get_local_id(0)+1] = (atoms2.y > -1 ? posq[atoms2.y] : (real4) 0);
-                posBuffer[3*get_local_id(0)+2] = (atoms2.z > -1 ? posq[atoms2.z] : (real4) 0);
-            }
-            barrier(CLK_LOCAL_MEM_FENCE);
+            SYNC_THREADS;
+            int blockSize = min((int) LOCAL_SIZE, NUM_ACCEPTORS-acceptorStart);
+            if (LOCAL_ID < blockSize) {
+                int4 atoms2 = acceptorAtoms[acceptorStart+LOCAL_ID];
+                posBuffer[3*LOCAL_ID] = (atoms2.x > -1 ? posq[atoms2.x] : make_real4(0));
+                posBuffer[3*LOCAL_ID+1] = (atoms2.y > -1 ? posq[atoms2.y] : make_real4(0));
+                posBuffer[3*LOCAL_ID+2] = (atoms2.z > -1 ? posq[atoms2.z] : make_real4(0));
+            }
+            SYNC_THREADS;
            if (donorIndex < NUM_DONORS) {
                for (int index = 0; index < blockSize; index++) {
                    int acceptorIndex = acceptorStart+index;
@@ -108,6 +114,26 @@ __kernel void computeDonorForces(__global real4* restrict forceBuffers, __global
        // Write results

        if (donorIndex < NUM_DONORS) {
+#ifdef SUPPORTS_64_BIT_ATOMICS
+            if (atoms.x > -1) {
+                ATOMIC_ADD(&force[atoms.x], (mm_ulong) ((mm_long) (f1.x*0x100000000)));
+                ATOMIC_ADD(&force[atoms.x+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (f1.y*0x100000000)));
+                ATOMIC_ADD(&force[atoms.x+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (f1.z*0x100000000)));
+                MEM_FENCE;
+            }
+            if (atoms.y > -1) {
+                ATOMIC_ADD(&force[atoms.y], (mm_ulong) ((mm_long) (f2.x*0x100000000)));
+                ATOMIC_ADD(&force[atoms.y+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (f2.y*0x100000000)));
+                ATOMIC_ADD(&force[atoms.y+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (f2.z*0x100000000)));
+                MEM_FENCE;
+            }
+            if (atoms.z > -1) {
+                ATOMIC_ADD(&force[atoms.z], (mm_ulong) ((mm_long) (f3.x*0x100000000)));
+                ATOMIC_ADD(&force[atoms.z+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (f3.y*0x100000000)));
+                ATOMIC_ADD(&force[atoms.z+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (f3.z*0x100000000)));
+                MEM_FENCE;
+            }
+#else
            int4 bufferIndices = donorBufferIndices[donorIndex];
            if (atoms.x > -1) {
                unsigned int offset = atoms.x+bufferIndices.x*PADDED_NUM_ATOMS;
@@ -127,49 +153,57 @@ __kernel void computeDonorForces(__global real4* restrict forceBuffers, __global
                force.xyz += f3.xyz;
                forceBuffers[offset] = force;
            }
+#endif
        }
    }
-    energyBuffer[get_global_id(0)] += energy;
+    energyBuffer[GLOBAL_ID] += energy;
 }
 /**
 * Compute forces on acceptors.
 */
-__kernel void computeAcceptorForces(__global real4* restrict forceBuffers, __global mixed* restrict energyBuffer, __global const real4* restrict posq, __global const int4* restrict exclusions,
-        __global const int4* restrict donorAtoms, __global const int4* restrict acceptorAtoms, __global const int4* restrict acceptorBufferIndices, __local real4* restrict posBuffer, real4 periodicBoxSize, real4 invPeriodicBoxSize,
+KERNEL void computeAcceptorForces(
+#ifdef SUPPORTS_64_BIT_ATOMICS
+	GLOBAL mm_ulong* RESTRICT force,
+#else
+	GLOBAL real4* RESTRICT forceBuffers, GLOBAL const int4* RESTRICT acceptorBufferIndices,
+#endif
+        GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT posq, GLOBAL const int4* RESTRICT exclusions,
+        GLOBAL const int4* RESTRICT donorAtoms, GLOBAL const int4* RESTRICT acceptorAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize,
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
        PARAMETER_ARGUMENTS) {
-    real4 f1 = (real4) 0;
-    real4 f2 = (real4) 0;
-    real4 f3 = (real4) 0;
-    for (int acceptorStart = 0; acceptorStart < NUM_ACCEPTORS; acceptorStart += get_global_size(0)) {
+    LOCAL real4 posBuffer[3*THREAD_BLOCK_SIZE];
+    real3 f1 = make_real3(0);
+    real3 f2 = make_real3(0);
+    real3 f3 = make_real3(0);
+    for (int acceptorStart = 0; acceptorStart < NUM_ACCEPTORS; acceptorStart += GLOBAL_SIZE) {
        // Load information about the acceptor this thread will compute forces on.

-        int acceptorIndex = acceptorStart+get_global_id(0);
+        int acceptorIndex = acceptorStart+GLOBAL_ID;
        int4 atoms, exclusionIndices;
        real4 a1, a2, a3;
        if (acceptorIndex < NUM_ACCEPTORS) {
            atoms = acceptorAtoms[acceptorIndex];
-            a1 = (atoms.x > -1 ? posq[atoms.x] : (real4) 0);
-            a2 = (atoms.y > -1 ? posq[atoms.y] : (real4) 0);
-            a3 = (atoms.z > -1 ? posq[atoms.z] : (real4) 0);
+            a1 = (atoms.x > -1 ? posq[atoms.x] : make_real4(0));
+            a2 = (atoms.y > -1 ? posq[atoms.y] : make_real4(0));
+            a3 = (atoms.z > -1 ? posq[atoms.z] : make_real4(0));
 #ifdef USE_EXCLUSIONS
            exclusionIndices = exclusions[acceptorIndex];
 #endif
        }
        else
-            atoms = (int4) (-1, -1, -1, -1);
-        for (int donorStart = 0; donorStart < NUM_DONORS; donorStart += get_local_size(0)) {
+            atoms = make_int4(-1, -1, -1, -1);
+        for (int donorStart = 0; donorStart < NUM_DONORS; donorStart += LOCAL_SIZE) {
            // Load the next block of donors into local memory.

-            barrier(CLK_LOCAL_MEM_FENCE);
-            int blockSize = min((int) get_local_size(0), NUM_DONORS-donorStart);
-            if (get_local_id(0) < blockSize) {
-                int4 atoms2 = donorAtoms[donorStart+get_local_id(0)];
-                posBuffer[3*get_local_id(0)] = (atoms2.x > -1 ? posq[atoms2.x] : (real4) 0);
-                posBuffer[3*get_local_id(0)+1] = (atoms2.y > -1 ? posq[atoms2.y] : (real4) 0);
-                posBuffer[3*get_local_id(0)+2] = (atoms2.z > -1 ? posq[atoms2.z] : (real4) 0);
-            }
-            barrier(CLK_LOCAL_MEM_FENCE);
+            SYNC_THREADS;
+            int blockSize = min((int) LOCAL_SIZE, NUM_DONORS-donorStart);
+            if (LOCAL_ID < blockSize) {
+                int4 atoms2 = donorAtoms[donorStart+LOCAL_ID];
+                posBuffer[3*LOCAL_ID] = (atoms2.x > -1 ? posq[atoms2.x] : make_real4(0));
+                posBuffer[3*LOCAL_ID+1] = (atoms2.y > -1 ? posq[atoms2.y] : make_real4(0));
+                posBuffer[3*LOCAL_ID+2] = (atoms2.z > -1 ? posq[atoms2.z] : make_real4(0));
+            }
+            SYNC_THREADS;
            if (acceptorIndex < NUM_ACCEPTORS) {
                for (int index = 0; index < blockSize; index++) {
                    int donorIndex = donorStart+index;
@@ -197,6 +231,26 @@ __kernel void computeAcceptorForces(__global real4* restrict forceBuffers, __glo
        // Write results

        if (acceptorIndex < NUM_ACCEPTORS) {
+#ifdef SUPPORTS_64_BIT_ATOMICS
+            if (atoms.x > -1) {
+                ATOMIC_ADD(&force[atoms.x], (mm_ulong) ((mm_long) (f1.x*0x100000000)));
+                ATOMIC_ADD(&force[atoms.x+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (f1.y*0x100000000)));
+                ATOMIC_ADD(&force[atoms.x+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (f1.z*0x100000000)));
+                MEM_FENCE;
+            }
+            if (atoms.y > -1) {
+                ATOMIC_ADD(&force[atoms.y], (mm_ulong) ((mm_long) (f2.x*0x100000000)));
+                ATOMIC_ADD(&force[atoms.y+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (f2.y*0x100000000)));
+                ATOMIC_ADD(&force[atoms.y+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (f2.z*0x100000000)));
+                MEM_FENCE;
+            }
+            if (atoms.z > -1) {
+                ATOMIC_ADD(&force[atoms.z], (mm_ulong) ((mm_long) (f3.x*0x100000000)));
+                ATOMIC_ADD(&force[atoms.z+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (f3.y*0x100000000)));
+                ATOMIC_ADD(&force[atoms.z+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (f3.z*0x100000000)));
+                MEM_FENCE;
+            }
+#else
            int4 bufferIndices = acceptorBufferIndices[acceptorIndex];
            if (atoms.x > -1) {
                unsigned int offset = atoms.x+bufferIndices.x*PADDED_NUM_ATOMS;
@@ -216,6 +270,7 @@ __kernel void computeAcceptorForces(__global real4* restrict forceBuffers, __glo
                force.xyz += f3.xyz;
                forceBuffers[offset] = force;
            }
+#endif
        }
    }
 }
--- a/platforms/cuda/src/kernels/customIntegrator.cu
+++ b/platforms/cuda/src/kernels/customIntegrator.cu
-extern "C" __global__ void computeFloatSum(const float* __restrict__ sumBuffer, float* result) {
-    __shared__ float tempBuffer[WORK_GROUP_SIZE];
-    const unsigned int thread = threadIdx.x;
+KERNEL void computeFloatSum(GLOBAL const float* RESTRICT sumBuffer, GLOBAL float* result, int bufferSize) {
+    LOCAL float tempBuffer[WORK_GROUP_SIZE];
+    const unsigned int thread = LOCAL_ID;
    float sum = 0;
-    for (unsigned int index = thread; index < SUM_BUFFER_SIZE; index += blockDim.x)
+    for (unsigned int index = thread; index < bufferSize; index += LOCAL_SIZE)
        sum += sumBuffer[index];
    tempBuffer[thread] = sum;
    for (int i = 1; i < WORK_GROUP_SIZE; i *= 2) {
-        __syncthreads();
+        SYNC_THREADS;
        if (thread%(i*2) == 0 && thread+i < WORK_GROUP_SIZE)
            tempBuffer[thread] += tempBuffer[thread+i];
    }
@@ -14,24 +14,26 @@ extern "C" __global__ void computeFloatSum(const float* __restrict__ sumBuffer,
        *result = tempBuffer[0];
 }

-extern "C" __global__ void computeDoubleSum(const double* __restrict__ sumBuffer, double* result) {
-    __shared__ double tempBuffer[WORK_GROUP_SIZE];
-    const unsigned int thread = threadIdx.x;
+#ifdef SUPPORTS_DOUBLE_PRECISION // The double precision sum is only compiled when this macro is defined.
+KERNEL void computeDoubleSum(GLOBAL const double* RESTRICT sumBuffer, GLOBAL double* result, int bufferSize) { // Adds up sumBuffer[0..bufferSize) and stores the total in *result; the length is now an argument instead of the SUM_BUFFER_SIZE macro.
+    LOCAL double tempBuffer[WORK_GROUP_SIZE]; // One partial sum per thread; indexed by LOCAL_ID, so LOCAL_SIZE is presumably <= WORK_GROUP_SIZE (confirm at the launch site).
+    const unsigned int thread = LOCAL_ID;
    double sum = 0;
-    for (unsigned int index = thread; index < SUM_BUFFER_SIZE; index += blockDim.x)
+    for (unsigned int index = thread; index < bufferSize; index += LOCAL_SIZE) // Each thread accumulates every LOCAL_SIZE-th element, starting at its own id.
        sum += sumBuffer[index];
    tempBuffer[thread] = sum;
    for (int i = 1; i < WORK_GROUP_SIZE; i *= 2) {
-        __syncthreads();
+        SYNC_THREADS; // Barrier (replaces __syncthreads()): the previous round's writes to tempBuffer must be complete before they are read.
        if (thread%(i*2) == 0 && thread+i < WORK_GROUP_SIZE)
            tempBuffer[thread] += tempBuffer[thread+i];
    }
    if (thread == 0)
        *result = tempBuffer[0];
 }
+#endif // SUPPORTS_DOUBLE_PRECISION

-extern "C" __global__ void applyPositionDeltas(real4* __restrict__ posq, real4* __restrict__ posqCorrection, mixed4* __restrict__ posDelta) {
-    for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
+KERNEL void applyPositionDeltas(GLOBAL real4* RESTRICT posq, GLOBAL real4* RESTRICT posqCorrection, GLOBAL mixed4* RESTRICT posDelta) {
+    for (unsigned int index = GLOBAL_ID; index < NUM_ATOMS; index += GLOBAL_SIZE) {
 #ifdef USE_MIXED_PRECISION
        real4 pos1 = posq[index];
        real4 pos2 = posqCorrection[index];
@@ -48,14 +50,14 @@ extern "C" __global__ void applyPositionDeltas(real4* __restrict__ posq, real4*
 #else
        posq[index] = pos;
 #endif
-        posDelta[index] = make_mixed4(0, 0, 0, 0);
+        posDelta[index] = make_mixed4(0);
    }
 }

-extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restrict__ random, uint4* __restrict__ seed) {
-    uint4 state = seed[blockIdx.x*blockDim.x+threadIdx.x];
+KERNEL void generateRandomNumbers(int numValues, GLOBAL float4* RESTRICT random, GLOBAL uint4* RESTRICT seed) {
+    uint4 state = seed[GLOBAL_ID];
    unsigned int carry = 0;
-    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numValues; index += blockDim.x*gridDim.x) {
+    for (int index = GLOBAL_ID; index < numValues; index += GLOBAL_SIZE) {
        // Generate three uniform random numbers.

        state.x = state.x * 69069 + 1;
@@ -93,5 +95,5 @@ extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restri

        random[index] = make_float4(x1, x2, x3, 0.0f);
    }
-    seed[blockIdx.x*blockDim.x+threadIdx.x] = state;
+    seed[GLOBAL_ID] = state;
 }
--- a/platforms/common/src/kernels/customIntegratorPerDof.cc
+++ b/platforms/common/src/kernels/customIntegratorPerDof.cc
+#ifdef SUPPORTS_DOUBLE_PRECISION // TempType* are the working types used by the code below; double is chosen whenever this macro is defined.
+typedef double TempType;
+typedef double3 TempType3;
+typedef double4 TempType4;
+
+#define make_TempType3(a...) make_double3(a) // Named variadic parameter ("a..."): forwards all arguments unchanged to make_double3.
+#define make_TempType4(a...) make_double4(a)
+#define convertToTempType3(a) make_double3((a).x, (a).y, (a).z) // Component-wise conversion; note the argument expression is expanded once per component.
+#define convertToTempType4(a) make_double4((a).x, (a).y, (a).z, (a).w)
+
+inline DEVICE mixed4 convertFromDouble4(double4 a) { // Converts a double4 to mixed4; only defined in the double precision branch.
+    return make_mixed4(a.x, a.y, a.z, a.w);
+}
+#else // SUPPORTS_DOUBLE_PRECISION not defined: the working types fall back to float.
+typedef float TempType;
+typedef float3 TempType3;
+typedef float4 TempType4;
+
+#define make_TempType3(a...) make_float3(a) // Same forwarding macros as above, targeting the float constructors.
+#define make_TempType4(a...) make_float4(a)
+#define convertToTempType3(a) make_float3((a).x, (a).y, (a).z)
+#define convertToTempType4(a) make_float4((a).x, (a).y, (a).z, (a).w)
+#endif // SUPPORTS_DOUBLE_PRECISION
+
+/**
+ * Read the position of particle "index" as a TempType4.
+ *
+ * With USE_MIXED_PRECISION the x, y and z components are the sum of the posq entry and the
+ * matching posqCorrection entry (the correction is widened to "mixed" before adding); w is taken
+ * from posq alone.  Without it, posq[index] is simply converted component by component.
+ */
+inline DEVICE TempType4 loadPos(GLOBAL const real4* RESTRICT posq, GLOBAL const real4* RESTRICT posqCorrection, int index) {
+#ifndef USE_MIXED_PRECISION
+    return convertToTempType4(posq[index]);
+#else
+    const real4 base = posq[index];
+    const real4 correction = posqCorrection[index];
+    return make_TempType4(base.x+(mixed)correction.x, base.y+(mixed)correction.y, base.z+(mixed)correction.z, base.w);
+#endif
+}
+
+/**
+ * Write the position of particle "index" back to global memory.
+ *
+ * With USE_MIXED_PRECISION each of x, y and z is split in two: the value rounded to "real" goes
+ * to posq, and what is left over after that rounding goes to posqCorrection (whose w is set
+ * to 0).  Without it, pos is stored directly in posq and posqCorrection is not touched.
+ */
+inline DEVICE void storePos(GLOBAL real4* RESTRICT posq, GLOBAL real4* RESTRICT posqCorrection, int index, TempType4 pos) {
+#ifdef USE_MIXED_PRECISION
+    // Rounded ("high") parts of the three coordinates.
+    const real hiX = (real) pos.x;
+    const real hiY = (real) pos.y;
+    const real hiZ = (real) pos.z;
+    posq[index] = make_real4(hiX, hiY, hiZ, (real) pos.w);
+    // The remainders are what the rounding above discarded.
+    posqCorrection[index] = make_real4(pos.x-hiX, pos.y-hiY, pos.z-hiZ, 0);
+#else
+    posq[index] = make_real4(pos.x, pos.y, pos.z, pos.w);
+#endif
+}
+
+/**
+ * Run the per-degree-of-freedom step of a custom integrator for every particle.
+ *
+ * Each thread starts at particle GLOBAL_ID and advances by GLOBAL_SIZE until it passes NUM_ATOMS.
+ * For every particle whose velm.w is nonzero, the code in COMPUTE_STEP is executed with these
+ * locals in scope: position, velocity, f (the force), mass, stepSize, gaussianIndex and
+ * uniformIndex.  COMPUTE_STEP and PARAMETER_ARGUMENTS are not defined in this file; presumably
+ * they are substituted by the host code that builds the kernel (confirm there).
+ *
+ * force holds three consecutive arrays of PADDED_NUM_ATOMS fixed point values: all x components,
+ * then all y components, then all z components.
+ */
+KERNEL void computePerDof(GLOBAL real4* RESTRICT posq, GLOBAL real4* RESTRICT posqCorrection, GLOBAL mixed4* RESTRICT posDelta,
+        GLOBAL mixed4* RESTRICT velm, GLOBAL const mm_long* RESTRICT force, GLOBAL const mixed2* RESTRICT dt, GLOBAL const mixed* RESTRICT globals,
+        GLOBAL mixed* RESTRICT sum, GLOBAL const float4* RESTRICT gaussianValues, unsigned int gaussianBaseIndex, GLOBAL const float4* RESTRICT uniformValues,
+        const mixed energy, GLOBAL mixed* RESTRICT energyParamDerivs
+        PARAMETER_ARGUMENTS) {
+    TempType3 stepSize = make_TempType3(dt[0].y);
+    int index = GLOBAL_ID;
+    // Forces are accumulated as 64 bit fixed point values scaled by 0x100000000 (2^32), as in
+    // storeForce(), so that is the factor to divide by when converting back.  Dividing by
+    // 0xFFFFFFFF (2^32-1) instead would overestimate every force by about one part in 2^32.
+    const TempType forceScale = ((TempType) 1)/0x100000000;
+    while (index < NUM_ATOMS) {
+#ifdef LOAD_POS_AS_DELTA
+        // In this mode the working position includes the pending displacement in posDelta.
+        TempType4 position = loadPos(posq, posqCorrection, index) + convertToTempType4(posDelta[index]);
+#else
+        TempType4 position = loadPos(posq, posqCorrection, index);
+#endif
+        TempType4 velocity = convertToTempType4(velm[index]);
+        TempType3 f = make_TempType3(forceScale*force[index], forceScale*force[index+PADDED_NUM_ATOMS], forceScale*force[index+PADDED_NUM_ATOMS*2]);
+        // mass is the reciprocal of velocity.w.  It is evaluated before the zero check below, but
+        // nothing in this kernel reads it unless velocity.w is nonzero.
+        TempType3 mass = make_TempType3(RECIP(velocity.w));
+        if (velocity.w != 0.0) {
+            int gaussianIndex = gaussianBaseIndex;
+            int uniformIndex = 0;
+            COMPUTE_STEP
+        }
+        index += GLOBAL_SIZE;
+    }
+}
--- a/platforms/opencl/src/kernels/customManyParticle.cl
+++ b/platforms/opencl/src/kernels/customManyParticle.cl
-#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
-#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
-
 /**
 * Record the force on an atom to global memory.
+ *
+ * Each component of force is multiplied by 0x100000000, converted to a 64 bit integer and
+ * added atomically to forceBuffers.  The x, y and z components for "atom" are stored at
+ * forceBuffers[atom], forceBuffers[atom+PADDED_NUM_ATOMS] and forceBuffers[atom+2*PADDED_NUM_ATOMS].
 */
-inline void storeForce(int atom, real4 force, __global long* restrict forceBuffers) {
-    atom_add(&forceBuffers[atom], (long) (force.x*0x100000000));
-    atom_add(&forceBuffers[atom+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
-    atom_add(&forceBuffers[atom+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
+inline DEVICE void storeForce(int atom, real3 force, GLOBAL mm_ulong* RESTRICT forceBuffers) { // force is now a real3 (previously real4) and the buffer element type is mm_ulong (previously long).
+    ATOMIC_ADD(&forceBuffers[atom], (mm_ulong) ((mm_long) (force.x*0x100000000))); // Converted to mm_long first, then cast to mm_ulong to match the buffer type (presumably so negative values wrap; confirm against the mm_long/mm_ulong definitions).
+    ATOMIC_ADD(&forceBuffers[atom+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.y*0x100000000)));
+    ATOMIC_ADD(&forceBuffers[atom+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.z*0x100000000)));
 }

 /**
 * Compute the difference between two vectors, taking periodic boundary conditions into account
 * and setting the fourth component to the squared magnitude.
 */
-inline real4 delta(real4 vec1, real4 vec2, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) {
-    real4 result = (real4) (vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0.0f);
+inline DEVICE real4 delta(real3 vec1, real3 vec2, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) {
+    real4 result = make_real4(vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0.0f);
 #ifdef USE_PERIODIC
    APPLY_PERIODIC_TO_DELTA(result)
 #endif
@@ -26,36 +23,36 @@ inline real4 delta(real4 vec1, real4 vec2, real4 periodicBoxSize, real4 invPerio
 /**
 * Compute the angle between two vectors.  The w component of each vector should contain the squared magnitude.
+ *
+ * The cosine is the dot product of the x, y, z components divided by sqrt(vec1.w*vec2.w).  When
+ * its magnitude is at most 0.99 the result is the arc cosine of that value.  Otherwise the angle
+ * is computed from the arc sine of the normalized cross product magnitude, and replaced by
+ * M_PI minus that value when the cosine is negative.
 */
-real computeAngle(real4 vec1, real4 vec2) {
+DEVICE real computeAngle(real4 vec1, real4 vec2) {
    real dotProduct = vec1.x*vec2.x + vec1.y*vec2.y + vec1.z*vec2.z;
    real cosine = dotProduct*RSQRT(vec1.w*vec2.w);
    real angle;
    if (cosine > 0.99f || cosine < -0.99f) {
        // We're close to the singularity in acos(), so take the cross product and use asin() instead.

-        real4 crossProduct = cross(vec1, vec2);
+        real3 crossProduct = trimTo3(cross(vec1, vec2)); // The cross product is reduced to three components before dot() is applied to it (trimTo3 is defined elsewhere).
        real scale = vec1.w*vec2.w;
-        angle = asin(SQRT(dot(crossProduct, crossProduct)/scale));
+        angle = ASIN(SQRT(dot(crossProduct, crossProduct)/scale)); // |vec1 x vec2|^2 divided by |vec1|^2*|vec2|^2, then square root: the sine of the angle.
        if (cosine < 0.0f)
            angle = M_PI-angle;
    }
    else
-       angle = acos(cosine);
+       angle = ACOS(cosine); // ACOS replaces the direct acos() call; the macro is defined elsewhere.
    return angle;
 }

 /**
 * Compute the cross product of two vectors, setting the fourth component to the squared magnitude.
+ *
+ * The w component of the result is cp.x*cp.x+cp.y*cp.y+cp.z*cp.z, which is the form
+ * computeAngle() expects for the w component of its arguments.
 */
-inline real4 computeCross(real4 vec1, real4 vec2) {
-    real4 cp = cross(vec1, vec2);
-    return (real4) (cp.x, cp.y, cp.z, cp.x*cp.x+cp.y*cp.y+cp.z*cp.z);
+inline DEVICE real4 computeCross(real4 vec1, real4 vec2) {
+    real3 cp = trimTo3(cross(vec1, vec2)); // cross() is applied to the real4 inputs and its result reduced to three components (trimTo3 is defined elsewhere).
+    return make_real4(cp.x, cp.y, cp.z, cp.x*cp.x+cp.y*cp.y+cp.z*cp.z); // make_real4() replaces the OpenCL vector literal used before.
 }

 /**
 * Determine whether a particular interaction is in the list of exclusions.
 */
-inline bool isInteractionExcluded(int atom1, int atom2, __global const int* restrict exclusions, __global const int* restrict exclusionStartIndex) {
+inline DEVICE bool isInteractionExcluded(int atom1, int atom2, GLOBAL const int* RESTRICT exclusions, GLOBAL const int* RESTRICT exclusionStartIndex) {
    if (atom1 > atom2) {
        int temp = atom1;
        atom1 = atom2;
@@ -76,24 +73,24 @@ inline bool isInteractionExcluded(int atom1, int atom2, __global const int* rest
 /**
 * Compute the interaction.
 */
-__kernel void computeInteraction(
-        __global long* restrict forceBuffers, __global mixed* restrict energyBuffer, __global const real4* restrict posq,
+KERNEL void computeInteraction(
+        GLOBAL mm_ulong* RESTRICT forceBuffers, GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT posq,
        real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
 #ifdef USE_CUTOFF
-        , __global const int* restrict neighbors, __global const int* restrict neighborStartIndex
+        , GLOBAL const int* RESTRICT neighbors, GLOBAL const int* RESTRICT neighborStartIndex
 #endif
 #ifdef USE_FILTERS
-        , __global int* restrict particleTypes, __global int* restrict orderIndex, __global int* restrict particleOrder
+        , GLOBAL int* RESTRICT particleTypes, GLOBAL int* RESTRICT orderIndex, GLOBAL int* RESTRICT particleOrder
 #endif
 #ifdef USE_EXCLUSIONS
-        , __global int* restrict exclusions, __global int* restrict exclusionStartIndex
+        , GLOBAL int* RESTRICT exclusions, GLOBAL int* RESTRICT exclusionStartIndex
 #endif
        PARAMETER_ARGUMENTS) {
    mixed energy = 0;
    
    // Loop over particles to be the first one in the set.
    
-    for (int p1 = get_group_id(0); p1 < NUM_ATOMS; p1 += get_num_groups(0)) {
+    for (int p1 = GROUP_ID; p1 < NUM_ATOMS; p1 += NUM_GROUPS) {
 #ifdef USE_CENTRAL_PARTICLE
        const int a1 = p1;
 #else
@@ -110,7 +107,7 @@ __kernel void computeInteraction(
  #endif
 #endif
        int numCombinations = NUM_CANDIDATE_COMBINATIONS;
-        for (int index = get_local_id(0); index < numCombinations; index += get_local_size(0)) {
+        for (int index = LOCAL_ID; index < numCombinations; index += LOCAL_SIZE) {
            FIND_ATOMS_FOR_COMBINATION_INDEX;
            bool includeInteraction = IS_VALID_COMBINATION;
 #ifdef USE_CUTOFF
@@ -135,15 +132,15 @@ __kernel void computeInteraction(
            }
        }
    }
-    energyBuffer[get_global_id(0)] += energy;
+    energyBuffer[GLOBAL_ID] += energy;
 }

 /**
 * Find a bounding box for the atoms in each block.
 */
-__kernel void findBlockBounds(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
-        __global const real4* restrict posq, __global real4* restrict blockCenter, __global real4* restrict blockBoundingBox, __global int* restrict numNeighborPairs) {
-    int index = get_global_id(0);
+KERNEL void findBlockBounds(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
+        GLOBAL const real4* RESTRICT posq, GLOBAL real4* RESTRICT blockCenter, GLOBAL real4* RESTRICT blockBoundingBox, GLOBAL int* RESTRICT numNeighborPairs) {
+    int index = GLOBAL_ID;
    int base = index*TILE_SIZE;
    while (base < NUM_ATOMS) {
        real4 pos = posq[base];
@@ -159,37 +156,39 @@ __kernel void findBlockBounds(real4 periodicBoxSize, real4 invPeriodicBoxSize, r
            real4 center = 0.5f*(maxPos+minPos);
            APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, center)
 #endif
-            minPos = (real4) (min(minPos.x,pos.x), min(minPos.y,pos.y), min(minPos.z,pos.z), 0);
-            maxPos = (real4) (max(maxPos.x,pos.x), max(maxPos.y,pos.y), max(maxPos.z,pos.z), 0);
+            minPos = make_real4(min(minPos.x,pos.x), min(minPos.y,pos.y), min(minPos.z,pos.z), 0);
+            maxPos = make_real4(max(maxPos.x,pos.x), max(maxPos.y,pos.y), max(maxPos.z,pos.z), 0);
        }
        real4 blockSize = 0.5f*(maxPos-minPos);
        blockBoundingBox[index] = blockSize;
        blockCenter[index] = 0.5f*(maxPos+minPos);
-        index += get_global_size(0);
+        index += GLOBAL_SIZE;
        base = index*TILE_SIZE;
    }
-    if (get_group_id(0) == 0 && get_local_id(0) == 0)
+    if (GROUP_ID == 0 && LOCAL_ID == 0)
        *numNeighborPairs = 0;
 }

 /**
 * Find a list of neighbors for each atom.
 */
-__kernel void findNeighbors(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
-        __global const real4* restrict posq, __global const real4* restrict blockCenter, __global const real4* restrict blockBoundingBox, __global int2* restrict neighborPairs,
-        __global int* restrict numNeighborPairs, __global int* restrict numNeighborsForAtom, int maxNeighborPairs
+KERNEL void findNeighbors(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
+        GLOBAL const real4* RESTRICT posq, GLOBAL const real4* RESTRICT blockCenter, GLOBAL const real4* RESTRICT blockBoundingBox, GLOBAL int2* RESTRICT neighborPairs,
+        GLOBAL int* RESTRICT numNeighborPairs, GLOBAL int* RESTRICT numNeighborsForAtom, int maxNeighborPairs
 #ifdef USE_EXCLUSIONS
-        , __global const int* restrict exclusions, __global const int* restrict exclusionStartIndex
+        , GLOBAL const int* RESTRICT exclusions, GLOBAL const int* RESTRICT exclusionStartIndex
 #endif
        ) {
-    __local real4 positionCache[FIND_NEIGHBORS_WORKGROUP_SIZE];
-    __local bool includeBlockFlags[FIND_NEIGHBORS_WORKGROUP_SIZE];
-    int indexInWarp = get_local_id(0)%32;
-    int warpStart = get_local_id(0)-indexInWarp;
-    for (int atom1 = get_global_id(0); atom1 < PADDED_NUM_ATOMS; atom1 += get_global_size(0)) {
+    LOCAL real3 positionCache[FIND_NEIGHBORS_WORKGROUP_SIZE];
+    int indexInWarp = LOCAL_ID%32;
+#ifndef __CUDA_ARCH__
+    LOCAL bool includeBlockFlags[FIND_NEIGHBORS_WORKGROUP_SIZE];
+    int warpStart = LOCAL_ID-indexInWarp;
+#endif
+    for (int atom1 = GLOBAL_ID; atom1 < PADDED_NUM_ATOMS; atom1 += GLOBAL_SIZE) {
        // Load data for this atom.  Note that all threads in a warp are processing atoms from the same block.
        
-        real4 pos1 = posq[atom1];
+        real3 pos1 = trimTo3(posq[atom1]);
        int block1 = atom1/TILE_SIZE;
        real4 blockCenter1 = blockCenter[block1];
        real4 blockSize1 = blockBoundingBox[block1];
@@ -221,10 +220,18 @@ __kernel void findNeighbors(real4 periodicBoxSize, real4 invPeriodicBoxSize, rea
            
            // Loop over any blocks we identified as potentially containing neighbors.
            
-            includeBlockFlags[get_local_id(0)] = includeBlock2;
+#ifdef __CUDA_ARCH__
+            int includeBlockFlags = BALLOT(includeBlock2);
+            while (includeBlockFlags != 0) {
+                int i = __ffs(includeBlockFlags)-1;
+                includeBlockFlags &= includeBlockFlags-1;
+                {
+#else
+            includeBlockFlags[LOCAL_ID] = includeBlock2;
            SYNC_WARPS;
            for (int i = 0; i < TILE_SIZE; i++) {
                if (includeBlockFlags[warpStart+i]) {
+#endif
                    int block2 = block2Base+i;

                    // Loop over atoms in this block.
@@ -233,12 +240,12 @@ __kernel void findNeighbors(real4 periodicBoxSize, real4 invPeriodicBoxSize, rea
                    int included[TILE_SIZE];
                    int numIncluded = 0;
                    SYNC_WARPS;
-                    positionCache[get_local_id(0)] = posq[start+indexInWarp];
+                    positionCache[LOCAL_ID] = trimTo3(posq[start+indexInWarp]);
                    SYNC_WARPS;
                    if (atom1 < NUM_ATOMS) {
                        for (int j = 0; j < 32; j++) {
                            int atom2 = start+j;
-                            real4 pos2 = positionCache[get_local_id(0)-indexInWarp+j];
+                            real3 pos2 = positionCache[LOCAL_ID-indexInWarp+j];

                            // Decide whether to include this atom pair in the neighbor list.

@@ -260,10 +267,10 @@ __kernel void findNeighbors(real4 periodicBoxSize, real4 invPeriodicBoxSize, rea
                    // If we found any neighbors, store them to the neighbor list.

                    if (numIncluded > 0) {
-                        int baseIndex = atom_add(numNeighborPairs, numIncluded);
+                        int baseIndex = ATOMIC_ADD(numNeighborPairs, numIncluded);
                        if (baseIndex+numIncluded <= maxNeighborPairs)
                            for (int j = 0; j < numIncluded; j++)
-                                neighborPairs[baseIndex+j] = (int2) (atom1, included[j]);
+                                neighborPairs[baseIndex+j] = make_int2(atom1, included[j]);
                        totalNeighborsForAtom1 += numIncluded;
                    }
                }
@@ -279,59 +286,59 @@ __kernel void findNeighbors(real4 periodicBoxSize, real4 invPeriodicBoxSize, rea
 * Sum the neighbor counts to compute the start position of each atom.  This kernel
 * is executed as a single work group.
 */
-__kernel void computeNeighborStartIndices(__global int* restrict numNeighborsForAtom, __global int* restrict neighborStartIndex,
-            __global int* restrict numNeighborPairs, int maxNeighborPairs) {
-    __local unsigned int posBuffer[256];
+KERNEL void computeNeighborStartIndices(GLOBAL int* RESTRICT numNeighborsForAtom, GLOBAL int* RESTRICT neighborStartIndex, // Turns per-atom neighbor counts into start offsets: neighborStartIndex[i+1] is the running total through atom i.
+            GLOBAL int* RESTRICT numNeighborPairs, int maxNeighborPairs) {
+    LOCAL unsigned int posBuffer[256]; // Scratch space for the prefix sum; indexed by LOCAL_ID, so this presumably requires LOCAL_SIZE <= 256 (confirm at the launch site).
    if (*numNeighborPairs > maxNeighborPairs) {
        // There wasn't enough memory for the neighbor list, so we'll need to rebuild it.  Set the neighbor start
        // indices to indicate no neighbors for any atom.
        
-        for (int i = get_local_id(0); i <= NUM_ATOMS; i += get_local_size(0))
+        for (int i = LOCAL_ID; i <= NUM_ATOMS; i += LOCAL_SIZE) // "<=" because neighborStartIndex also has an entry at index NUM_ATOMS (see the globalIndex+1 write below).
            neighborStartIndex[i] = 0;
        return;
    }
    unsigned int globalOffset = 0;
-    for (unsigned int startAtom = 0; startAtom < NUM_ATOMS; startAtom += get_local_size(0)) {
+    for (unsigned int startAtom = 0; startAtom < NUM_ATOMS; startAtom += LOCAL_SIZE) { // Atoms are processed in chunks of LOCAL_SIZE, one atom per thread.
        // Load the neighbor counts into local memory.

-        unsigned int globalIndex = startAtom+get_local_id(0);
-        posBuffer[get_local_id(0)] = (globalIndex < NUM_ATOMS ? numNeighborsForAtom[globalIndex] : 0);
-        barrier(CLK_LOCAL_MEM_FENCE);
+        unsigned int globalIndex = startAtom+LOCAL_ID;
+        posBuffer[LOCAL_ID] = (globalIndex < NUM_ATOMS ? numNeighborsForAtom[globalIndex] : 0); // Threads past the last atom contribute 0 so they do not disturb the sum.
+        SYNC_THREADS;

        // Perform a parallel prefix sum.

-        for (unsigned int step = 1; step < get_local_size(0); step *= 2) {
-            unsigned int add = (get_local_id(0) >= step ? posBuffer[get_local_id(0)-step] : 0);
-            barrier(CLK_LOCAL_MEM_FENCE);
-            posBuffer[get_local_id(0)] += add;
-            barrier(CLK_LOCAL_MEM_FENCE);
+        for (unsigned int step = 1; step < LOCAL_SIZE; step *= 2) { // Each round adds in the entry "step" positions earlier; the distance doubles every round.
+            unsigned int add = (LOCAL_ID >= step ? posBuffer[LOCAL_ID-step] : 0);
+            SYNC_THREADS; // Every thread must have read its old value before any entry is overwritten.
+            posBuffer[LOCAL_ID] += add;
+            SYNC_THREADS; // The updated entries must be complete before the next round reads them.
        }

        // Write the results back to global memory.

        if (globalIndex < NUM_ATOMS) {
-            neighborStartIndex[globalIndex+1] = posBuffer[get_local_id(0)]+globalOffset;
+            neighborStartIndex[globalIndex+1] = posBuffer[LOCAL_ID]+globalOffset; // Running total through this atom, including all earlier chunks.
            numNeighborsForAtom[globalIndex] = 0; // Clear this so the next kernel can use it as a counter
        }
-        globalOffset += posBuffer[get_local_size(0)-1];
-        barrier(CLK_LOCAL_MEM_FENCE);
+        globalOffset += posBuffer[LOCAL_SIZE-1]; // The last entry is this chunk's total; carry it into the next chunk.
+        SYNC_THREADS; // Keep posBuffer intact until every thread has read the chunk total.
    }
-    if (get_local_id(0) == 0)
+    if (LOCAL_ID == 0)
        neighborStartIndex[0] = 0;
 }

 /**
 * Assemble the final neighbor list.
+ *
+ * For each recorded pair (x, y), y is written into the segment of "neighbors" that starts at
+ * neighborStartIndex[x].  The slot within the segment is claimed by atomically incrementing
+ * numNeighborsForAtom[x], so the order of entries within a segment presumably depends on thread
+ * scheduling.  Nothing is written if more pairs were found than maxNeighborPairs allows.
 */
-__kernel void copyPairsToNeighborList(__global const int2* restrict neighborPairs, __global int* restrict neighbors, __global int* restrict numNeighborPairs,
-            int maxNeighborPairs, __global int* restrict numNeighborsForAtom, __global const int* restrict neighborStartIndex) {
+KERNEL void copyPairsToNeighborList(GLOBAL const int2* RESTRICT neighborPairs, GLOBAL int* RESTRICT neighbors, GLOBAL int* RESTRICT numNeighborPairs,
+            int maxNeighborPairs, GLOBAL int* RESTRICT numNeighborsForAtom, GLOBAL const int* RESTRICT neighborStartIndex) {
    int actualPairs = *numNeighborPairs;
    if (actualPairs > maxNeighborPairs)
        return; // There wasn't enough memory for the neighbor list, so we'll need to rebuild it.
-    for (unsigned int index = get_global_id(0); index < actualPairs; index += get_global_size(0)) {
+    for (unsigned int index = GLOBAL_ID; index < actualPairs; index += GLOBAL_SIZE) { // Each thread handles every GLOBAL_SIZE-th pair, starting at its own global id.
        int2 pair = neighborPairs[index];
        int startIndex = neighborStartIndex[pair.x];
-        int offset = atom_add(numNeighborsForAtom+pair.x, 1);
+        int offset = ATOMIC_ADD(numNeighborsForAtom+pair.x, 1); // The value returned by the atomic add is used as this pair's slot within the atom's segment.
        neighbors[startIndex+offset] = pair.y;
    }
 }