Common compute framework to unify CUDA and OpenCL code (#2488)

* Began creating common compute framework to unify code between CUDA and OpenCL * Began OpenCL implementation of common compute framework * Common implementation of CMMotionRemover * CUDA implementation of common compute interface * Converted HarmonicBondForce to common compute API * Converted standard bonded forces to common compute API * Converted ExpressionUtilities to common compute API * Created ComputeParameterSet * Converted custom bonded forces to common compute API * Converted CustomCentroidBondForce to common compute API * Converted CustomManyParticleForce to common compute API * Moved lots of duplicate code from CudaContext and OpenCLContext to ComputeContext * Converted GayBerneForce to common compute API * Removed obsolete kernels * Converted verlet integrators to common compute API * Converted Langevin and Brownian integrators to common compute API * Converted CustomIntegrator to common compute API * Converted CustomNonbondedForce to common compute API * Removed uses of a deprecated API * Fixed failing test cases * Converted GBSAOBCForce to common compute API * Began converting CustomGBForce to common compute API * Finished converting CustomGBForce to common compute API * Merged duplicated code in CudaIntegrationUtilities and OpenCLIntegrationUtilities * Converted RMSDForce and AndersenThermostat to common compute API * Converted CustomHbondForce to common compute API * Merged scripts for encoding kernel sources * Converted Drude plugin to common compute API * Fixed errors in CMake scripts * Attempt at fixing errors on Windows * Added discussion of common compute API to developer guide * Added Windows export macro for common classes * Fixed error in CMMotionRemover * Ubdated travis to newer Ubuntu version * Fixed errors on CPU OpenCL * Fixed Windows linking errors * Added missing pragma for 32 bit atomics * Replaced long long with mm_long * More fixes to Windows linking * Bug fix

Common compute framework to unify CUDA and OpenCL code (#2488)
* Began creating common compute framework to unify code between CUDA and OpenCL * Began OpenCL implementation of common compute framework * Common implementation of CMMotionRemover * CUDA implementation of common compute interface * Converted HarmonicBondForce to common compute API * Converted standard bonded forces to common compute API * Converted ExpressionUtilities to common compute API * Created ComputeParameterSet * Converted custom bonded forces to common compute API * Converted CustomCentroidBondForce to common compute API * Converted CustomManyParticleForce to common compute API * Moved lots of duplicate code from CudaContext and OpenCLContext to ComputeContext * Converted GayBerneForce to common compute API * Removed obsolete kernels * Converted verlet integrators to common compute API * Converted Langevin and Brownian integrators to common compute API * Converted CustomIntegrator to common compute API * Converted CustomNonbondedForce to common compute API * Removed uses of a deprecated API * Fixed failing test cases * Converted GBSAOBCForce to common compute API * Began converting CustomGBForce to common compute API * Finished converting CustomGBForce to common compute API * Merged duplicated code in CudaIntegrationUtilities and OpenCLIntegrationUtilities * Converted RMSDForce and AndersenThermostat to common compute API * Converted CustomHbondForce to common compute API * Merged scripts for encoding kernel sources * Converted Drude plugin to common compute API * Fixed errors in CMake scripts * Attempt at fixing errors on Windows * Added discussion of common compute API to developer guide * Added Windows export macro for common classes * Fixed error in CMMotionRemover * Ubdated travis to newer Ubuntu version * Fixed errors on CPU OpenCL * Fixed Windows linking errors * Added missing pragma for 32 bit atomics * Replaced long long with mm_long * More fixes to Windows linking * Bug fix
edbc8407 · peastman · GitHub · 38beeefe · 38beeefe · 38beeefe
Unverified Commit edbc8407 authored Jan 08, 2020 by peastman Committed by GitHub Jan 08, 2020
20 changed files
--- a/platforms/opencl/src/kernels/customGBChainRule.cl
+++ b/platforms/opencl/src/kernels/customGBChainRule.cl
-#ifdef USE_CUTOFF
-if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2 && r2 < CUTOFF_SQUARED) {
-#else
-if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
-#endif
-#ifdef USE_SYMMETRIC
-    real tempForce = 0.0f;
-#else
-    real4 tempForce1 = (real4) 0;
-    real4 tempForce2 = (real4) 0;
-#endif
-    COMPUTE_FORCE
-#ifdef USE_SYMMETRIC
-    dEdR += tempForce*invR;
-#else
-    dEdR1 += tempForce1;
-    dEdR2 += tempForce2;
-#endif
-}
--- a/platforms/opencl/src/kernels/customGBGradientChainRule.cl
+++ b/platforms/opencl/src/kernels/customGBGradientChainRule.cl
-/**
- * Compute chain rule terms for computed values that depend explicitly on particle coordinates.
- */
-
-__kernel void computeGradientChainRuleTerms(__global real4* restrict forceBuffers, __global const real4* restrict posq
-        PARAMETER_ARGUMENTS) {
-    INIT_PARAM_DERIVS
-    unsigned int index = get_global_id(0);
-    while (index < NUM_ATOMS) {
-        real4 pos = posq[index];
-        real4 force = forceBuffers[index];
-        COMPUTE_FORCES
-        forceBuffers[index] = force;
-        index += get_global_size(0);
-    }
-    SAVE_PARAM_DERIVS
-}
--- a/platforms/opencl/src/kernels/customIntegrator.cl
+++ b/platforms/opencl/src/kernels/customIntegrator.cl
-__kernel void computeFloatSum(__global const float* restrict sumBuffer, __global float* result, int bufferSize) {
-    __local float tempBuffer[WORK_GROUP_SIZE];
-    const unsigned int thread = get_local_id(0);
-    float sum = 0;
-    for (unsigned int index = thread; index < bufferSize; index += get_local_size(0))
-        sum += sumBuffer[index];
-    tempBuffer[thread] = sum;
-    for (int i = 1; i < WORK_GROUP_SIZE; i *= 2) {
-        barrier(CLK_LOCAL_MEM_FENCE);
-        if (thread%(i*2) == 0 && thread+i < WORK_GROUP_SIZE)
-            tempBuffer[thread] += tempBuffer[thread+i];
-    }
-    if (thread == 0)
-        *result = tempBuffer[0];
-}
-
-#ifdef SUPPORTS_DOUBLE_PRECISION
-__kernel void computeDoubleSum(__global const double* restrict sumBuffer, __global double* result, int bufferSize) {
-    __local double tempBuffer[WORK_GROUP_SIZE];
-    const unsigned int thread = get_local_id(0);
-    double sum = 0;
-    for (unsigned int index = thread; index < bufferSize; index += get_local_size(0))
-        sum += sumBuffer[index];
-    tempBuffer[thread] = sum;
-    for (int i = 1; i < WORK_GROUP_SIZE; i *= 2) {
-        barrier(CLK_LOCAL_MEM_FENCE);
-        if (thread%(i*2) == 0 && thread+i < WORK_GROUP_SIZE)
-            tempBuffer[thread] += tempBuffer[thread+i];
-    }
-    if (thread == 0)
-        *result = tempBuffer[0];
-}
-#endif
-
-__kernel void applyPositionDeltas(__global real4* restrict posq, __global real4* restrict posqCorrection, __global mixed4* restrict posDelta) {
-    for (unsigned int index = get_global_id(0); index < NUM_ATOMS; index += get_global_size(0)) {
-#ifdef USE_MIXED_PRECISION
-        real4 pos1 = posq[index];
-        real4 pos2 = posqCorrection[index];
-        mixed4 pos = (mixed4) (pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
-#else
-        real4 pos = posq[index];
-#endif
-        pos.xyz += posDelta[index].xyz;
-#ifdef USE_MIXED_PRECISION
-        posq[index] = (real4) ((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w);
-        posqCorrection[index] = (real4) (pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
-#else
-        posq[index] = pos;
-#endif
-        posDelta[index] = (mixed4) 0;
-    }
-}
-
-__kernel void generateRandomNumbers(int numValues, __global float4* restrict random, __global uint4* restrict seed) {
-    uint4 state = seed[get_global_id(0)];
-    unsigned int carry = 0;
-    for (int index = get_global_id(0); index < numValues; index += get_global_size(0)) {
-        // Generate three uniform random numbers.
-
-        state.x = state.x * 69069 + 1;
-        state.y ^= state.y << 13;
-        state.y ^= state.y >> 17;
-        state.y ^= state.y << 5;
-        unsigned int k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
-        unsigned int m = state.w + state.w + state.z + carry;
-        state.z = state.w;
-        state.w = m;
-        carry = k >> 30;
-        float x1 = (float)max(state.x + state.y + state.w, 0x00000001u) / (float)0xffffffff;
-        state.x = state.x * 69069 + 1;
-        state.y ^= state.y << 13;
-        state.y ^= state.y >> 17;
-        state.y ^= state.y << 5;
-        k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
-        m = state.w + state.w + state.z + carry;
-        state.z = state.w;
-        state.w = m;
-        carry = k >> 30;
-        float x2 = (float)max(state.x + state.y + state.w, 0x00000001u) / (float)0xffffffff;
-        state.x = state.x * 69069 + 1;
-        state.y ^= state.y << 13;
-        state.y ^= state.y >> 17;
-        state.y ^= state.y << 5;
-        k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
-        m = state.w + state.w + state.z + carry;
-        state.z = state.w;
-        state.w = m;
-        carry = k >> 30;
-        float x3 = (float)max(state.x + state.y + state.w, 0x00000001u) / (float)0xffffffff;
-
-        // Record the values.
-
-        random[index] = (float4) (x1, x2, x3, 0.0f);
-    }
-    seed[get_global_id(0)] = state;
-}
--- a/platforms/opencl/src/kernels/customIntegratorPerDof.cl
+++ b/platforms/opencl/src/kernels/customIntegratorPerDof.cl
-/**
- * Load the position of a particle.
- */
-mixed4 loadPos(__global const real4* restrict posq, __global const real4* restrict posqCorrection, int index) {
-#ifdef USE_MIXED_PRECISION
-    real4 pos1 = posq[index];
-    real4 pos2 = posqCorrection[index];
-    return (mixed4) (pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
-#else
-    return posq[index];
-#endif
-}
-
-/**
- * Store the position of a particle.
- */
-void storePos(__global real4* restrict posq, __global real4* restrict posqCorrection, int index, mixed4 pos) {
-#ifdef USE_MIXED_PRECISION
-    posq[index] = (real4) ((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w);
-    posqCorrection[index] = (real4) (pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
-#else
-    posq[index] = pos;
-#endif
-}
-
-__kernel void computePerDof(__global real4* restrict posq, __global real4* restrict posqCorrection, __global mixed4* restrict posDelta,
-        __global mixed4* restrict velm, __global const real4* restrict force, __global const mixed2* restrict dt, __global const mixed* restrict globals,
-        __global mixed* restrict sum, __global const float4* restrict gaussianValues, unsigned int gaussianBaseIndex, __global const float4* restrict uniformValues,
-        const mixed energy, __global mixed* restrict energyParamDerivs
-        PARAMETER_ARGUMENTS) {
-    mixed stepSize = dt[0].y;
-    int index = get_global_id(0);
-    while (index < NUM_ATOMS) {
-#ifdef LOAD_POS_AS_DELTA
-        mixed4 position = loadPos(posq, posqCorrection, index)+posDelta[index];
-#else
-        mixed4 position = loadPos(posq, posqCorrection, index);
-#endif
-        mixed4 velocity = velm[index];
-        mixed4 f = convert_mixed4(force[index]);
-        mixed mass = 1/velocity.w;
-        if (velocity.w != 0.0) {
-            int gaussianIndex = gaussianBaseIndex;
-            int uniformIndex = 0;
-            COMPUTE_STEP
-        }
-        index += get_global_size(0);
-    }
-}
--- a/platforms/opencl/src/kernels/customNonbonded.cl
+++ b/platforms/opencl/src/kernels/customNonbonded.cl
-#ifdef USE_CUTOFF
-if (!isExcluded && r2 < CUTOFF_SQUARED) {
-#else
-if (!isExcluded) {
-#endif
-    real tempForce = 0.0f;
-    real switchValue = 1, switchDeriv = 0;
-#if USE_SWITCH
-    if (r > SWITCH_CUTOFF) {
-        real x = r-SWITCH_CUTOFF;
-        switchValue = 1+x*x*x*(SWITCH_C3+x*(SWITCH_C4+x*SWITCH_C5));
-        switchDeriv = x*x*(3*SWITCH_C3+x*(4*SWITCH_C4+x*5*SWITCH_C5));
-    }
-#endif
-    COMPUTE_FORCE
-#if USE_SWITCH
-    tempForce = tempForce*switchValue - customEnergy*switchDeriv;
-    tempEnergy += customEnergy*switchValue;
-#else
-    tempEnergy += customEnergy;
-#endif
-    dEdR += tempForce*invR;
-}
--- a/platforms/opencl/src/kernels/gayBerne.cl
+++ b/platforms/opencl/src/kernels/gayBerne.cl
-#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
-#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
-
-#define TILE_SIZE 32
-#define NEIGHBOR_BLOCK_SIZE 32
-
-/**
- * Calculate the ellipsoid coordinate frames and associated matrices.
- */
-__kernel void computeEllipsoidFrames(int numParticles, __global const real4* restrict posq, __global int2* const restrict axisParticleIndices,
-        __global const float4* restrict sigParams, __global const float4* restrict scale, __global real* restrict aMatrix,
-        __global real* restrict bMatrix, __global real* restrict gMatrix, __global const int* sortedParticles) {
-    for (int sortedIndex = get_global_id(0); sortedIndex < numParticles; sortedIndex += get_global_size(0)) {
-        // Compute the local coordinate system of the ellipsoid;
-
-        int originalIndex = sortedParticles[sortedIndex];
-        real3 pos = posq[originalIndex].xyz;
-        int2 axisParticles = axisParticleIndices[originalIndex];
-        real3 xdir, ydir, zdir;
-        if (axisParticles.x == -1) {
-            xdir = (real3) (1, 0, 0);
-            ydir = (real3) (0, 1, 0);
-        }
-        else {
-            xdir = pos-posq[axisParticles.x].xyz;
-            xdir = normalize(xdir);
-            if (axisParticles.y == -1) {
-                if (xdir.y > -0.5f && xdir.y < 0.5f)
-                    ydir = (real3) (0, 1, 0);
-                else
-                    ydir = (real3) (1, 0, 0);
-            }
-            else
-                ydir = pos-posq[axisParticles.y].xyz;
-            ydir -= xdir*dot(xdir, ydir);
-            ydir = normalize(ydir);
-        }
-        zdir = cross(xdir, ydir);
-
-        // Compute matrices we will need later.
-
-        __global real (*a)[3] = (__global real (*)[3]) (aMatrix+sortedIndex*9);
-        __global real (*b)[3] = (__global real (*)[3]) (bMatrix+sortedIndex*9);
-        __global real (*g)[3] = (__global real (*)[3]) (gMatrix+sortedIndex*9);
-        a[0][0] = xdir.x;
-        a[0][1] = xdir.y;
-        a[0][2] = xdir.z;
-        a[1][0] = ydir.x;
-        a[1][1] = ydir.y;
-        a[1][2] = ydir.z;
-        a[2][0] = zdir.x;
-        a[2][1] = zdir.y;
-        a[2][2] = zdir.z;
-        float4 sig = sigParams[originalIndex];
-        float3 r2 = sig.yzw;
-        float3 e2 = scale[originalIndex].xyz;
-        for (int i = 0; i < 3; i++)
-            for (int j = 0; j < 3; j++) {
-                b[i][j] = a[0][i]*e2.x*a[0][j] + a[1][i]*e2.y*a[1][j] + a[2][i]*e2.z*a[2][j];
-                g[i][j] = a[0][i]*r2.x*a[0][j] + a[1][i]*r2.y*a[1][j] + a[2][i]*r2.z*a[2][j];
-            }
-    }
-}
-
-/**
- * Find a bounding box for the atoms in each block.
- */
-__kernel void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
-        __global const int* sortedAtoms, __global const real4* restrict posq, __global real4* restrict sortedPos, __global real4* restrict blockCenter,
-        __global real4* restrict blockBoundingBox, __global int* restrict neighborBlockCount) {
-    int index = get_global_id(0);
-    int base = index*TILE_SIZE;
-    while (base < numAtoms) {
-        real4 pos = posq[sortedAtoms[base]];
-        sortedPos[base] = pos;
-#ifdef USE_PERIODIC
-        APPLY_PERIODIC_TO_POS(pos)
-#endif
-        real4 minPos = pos;
-        real4 maxPos = pos;
-        int last = min(base+TILE_SIZE, numAtoms);
-        for (int i = base+1; i < last; i++) {
-            pos = posq[sortedAtoms[i]];
-            sortedPos[i] = pos;
-#ifdef USE_PERIODIC
-            real4 center = 0.5f*(maxPos+minPos);
-            APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, center)
-#endif
-            minPos = min(minPos, pos);
-            maxPos = max(maxPos, pos);
-        }
-        real4 blockSize = 0.5f*(maxPos-minPos);
-        blockBoundingBox[index] = blockSize;
-        blockCenter[index] = 0.5f*(maxPos+minPos);
-        index += get_global_size(0);
-        base = index*TILE_SIZE;
-    }
-    if (get_global_id(0) == 0)
-        *neighborBlockCount = 0;
-}
-
-/**
- * This is called by findNeighbors() to write a block to the neighbor list.
- */
-void storeNeighbors(int atom1, int* neighborBuffer, int numAtomsInBuffer, int maxNeighborBlocks, __global int* restrict neighbors,
-        __global int* restrict neighborIndex, __global int* restrict neighborBlockCount) {
-    int blockIndex = atom_add(neighborBlockCount, 1);
-    if (blockIndex >= maxNeighborBlocks)
-        return; // We don't have enough room for the neighbor list.
-    neighborIndex[blockIndex] = atom1;
-    int baseIndex = blockIndex*NEIGHBOR_BLOCK_SIZE;
-    for (int i = 0; i < numAtomsInBuffer; i++)
-        neighbors[baseIndex+i] = neighborBuffer[i];
-    for (int i = numAtomsInBuffer; i < NEIGHBOR_BLOCK_SIZE; i++)
-        neighbors[baseIndex+i] = -1;
-}
-
-/**
- * Build a list of neighbors for each atom.
- */
-__kernel void findNeighbors(int numAtoms, int maxNeighborBlocks, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
-        __global real4* restrict sortedPos, __global real4* restrict blockCenter, __global real4* restrict blockBoundingBox, __global int* restrict neighbors,
-        __global int* restrict neighborIndex, __global int* restrict neighborBlockCount, __global const int* restrict exclusions, __global const int* restrict exclusionStartIndex) {
-    const int numBlocks = (numAtoms+TILE_SIZE-1)/TILE_SIZE;
-    int neighborBuffer[NEIGHBOR_BLOCK_SIZE];
-    for (int atom1 = get_global_id(0); atom1 < numAtoms; atom1 += get_global_size(0)) {
-        int nextExclusion = exclusionStartIndex[atom1];
-        int lastExclusion = exclusionStartIndex[atom1+1];
-        real4 pos = sortedPos[atom1];
-        int nextBufferIndex = 0;
-        
-        // Loop over atom blocks and compute the distance of this atom from each one's bounding box.
-        
-        for (int block = (atom1+1)/TILE_SIZE; block < numBlocks; block++) {
-            real4 center = blockCenter[block];
-            real4 blockSize = blockBoundingBox[block];
-            real4 blockDelta = center-pos;
-#ifdef USE_PERIODIC
-            APPLY_PERIODIC_TO_DELTA(blockDelta)
-#endif
-            blockDelta.x = max((real) 0, fabs(blockDelta.x)-blockSize.x);
-            blockDelta.y = max((real) 0, fabs(blockDelta.y)-blockSize.y);
-            blockDelta.z = max((real) 0, fabs(blockDelta.z)-blockSize.z);
-            if (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z >= CUTOFF_SQUARED)
-                continue;
-            
-            // Loop over atoms within this block.
-            
-            int first = max(block*TILE_SIZE, atom1+1);
-            int last = min((block+1)*TILE_SIZE, numAtoms);
-            for (int atom2 = first; atom2 < last; atom2++) {
-                // Skip over excluded interactions.
-
-                if (nextExclusion < lastExclusion && exclusions[nextExclusion] >= atom2) {
-                    nextExclusion++;
-                    continue;
-                }
-                real4 delta = pos-sortedPos[atom2];
-#ifdef USE_PERIODIC
-                APPLY_PERIODIC_TO_DELTA(delta)
-#endif
-                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-                if (r2 < CUTOFF_SQUARED) {
-                    neighborBuffer[nextBufferIndex++] = atom2;
-                    if (nextBufferIndex == NEIGHBOR_BLOCK_SIZE) {
-                        storeNeighbors(atom1, neighborBuffer, nextBufferIndex, maxNeighborBlocks, neighbors, neighborIndex, neighborBlockCount);
-                        nextBufferIndex = 0;
-                    }
-                }
-            }
-        }
-        if (nextBufferIndex > 0)
-            storeNeighbors(atom1, neighborBuffer, nextBufferIndex, maxNeighborBlocks, neighbors, neighborIndex, neighborBlockCount);
-    }
-}
-
-typedef struct {
-    float4 sig;
-    float2 eps;
-    real3 pos;
-    real a[3][3], b[3][3], g[3][3];
-} AtomData;
-
-void loadAtomData(AtomData* data, int sortedIndex, int originalIndex, __global const real4* restrict pos, __global const float4* restrict sigParams,
-        __global const float2* restrict epsParams, __global const real* restrict aMatrix, __global const real* restrict bMatrix, __global const real* restrict gMatrix) {
-    data->sig = sigParams[originalIndex];
-    data->eps = epsParams[originalIndex];
-    data->pos = pos[sortedIndex].xyz;
-    for (int i = 0; i < 3; i++)
-        for (int j = 0; j < 3; j++) {
-            int k = 9*sortedIndex+3*i+j;
-            data->a[i][j] = aMatrix[k];
-            data->b[i][j] = bMatrix[k];
-            data->g[i][j] = gMatrix[k];
-        }
-}
-
-real3 matrixVectorProduct(real (*m)[3], real3 v) {
-    return (real3) (m[0][0]*v.x + m[0][1]*v.y + m[0][2]*v.z,
-                    m[1][0]*v.x + m[1][1]*v.y + m[1][2]*v.z,
-                    m[2][0]*v.x + m[2][1]*v.y + m[2][2]*v.z);
-}
-
-real3 vectorMatrixProduct(real3 v, real (*m)[3]) {
-    return (real3) (m[0][0]*v.x + m[1][0]*v.y + m[2][0]*v.z,
-                    m[0][1]*v.x + m[1][1]*v.y + m[2][1]*v.z,
-                    m[0][2]*v.x + m[1][2]*v.y + m[2][2]*v.z);
-}
-
-
-void matrixSum(real (*result)[3], real (*a)[3], real (*b)[3]) {
-    result[0][0] = a[0][0]+b[0][0];
-    result[0][1] = a[0][1]+b[0][1];
-    result[0][2] = a[0][2]+b[0][2];
-    result[1][0] = a[1][0]+b[1][0];
-    result[1][1] = a[1][1]+b[1][1];
-    result[1][2] = a[1][2]+b[1][2];
-    result[2][0] = a[2][0]+b[2][0];
-    result[2][1] = a[2][1]+b[2][1];
-    result[2][2] = a[2][2]+b[2][2];
-}
-
-real determinant(real (*m)[3]) {
-    return (m[0][0]*m[1][1]*m[2][2] + m[0][1]*m[1][2]*m[2][0] + m[0][2]*m[1][0]*m[2][1] -
-            m[0][0]*m[1][2]*m[2][1] - m[0][1]*m[1][0]*m[2][2] - m[0][2]*m[1][1]*m[2][0]);
-}
-
-
-void matrixInverse(real (*result)[3], real (*m)[3]) {
-    real invDet = RECIP(determinant(m));
-    result[0][0] = invDet*(m[1][1]*m[2][2] - m[1][2]*m[2][1]);
-    result[1][0] = -invDet*(m[1][0]*m[2][2] - m[1][2]*m[2][0]);
-    result[2][0] = invDet*(m[1][0]*m[2][1] - m[1][1]*m[2][0]);
-    result[0][1] = -invDet*(m[0][1]*m[2][2] - m[0][2]*m[2][1]);
-    result[1][1] = invDet*(m[0][0]*m[2][2] - m[0][2]*m[2][0]);
-    result[2][1] = -invDet*(m[0][0]*m[2][1] - m[0][1]*m[2][0]);
-    result[0][2] = invDet*(m[0][1]*m[1][2] - m[0][2]*m[1][1]);
-    result[1][2] = -invDet*(m[0][0]*m[1][2] - m[0][2]*m[1][0]);
-    result[2][2] = invDet*(m[0][0]*m[1][1] - m[0][1]*m[1][0]);
-}
-
-void computeOneInteraction(AtomData* data1, AtomData* data2, real sigma, real epsilon, real3 dr, real r2, real3* force1, real3* force2, real3* torque1, real3* torque2, mixed *totalEnergy) {
-    real rInv = RSQRT(r2);
-    real r = r2*rInv;
-    real3 drUnit = dr*rInv;
-    
-    // Compute the switching function.
-
-    real switchValue = 1, switchDeriv = 0;
-    #if USE_SWITCH
-    if (r > SWITCH_CUTOFF) {
-        real x = r-SWITCH_CUTOFF;
-        switchValue = 1+x*x*x*(SWITCH_C3+x*(SWITCH_C4+x*SWITCH_C5));
-        switchDeriv = x*x*(3*SWITCH_C3+x*(4*SWITCH_C4+x*5*SWITCH_C5));
-    }
-    #endif
-
-    // Compute vectors and matrices we'll be needing.
-
-    real B12[3][3], G12[3][3], B12inv[3][3], G12inv[3][3];
-    matrixSum(B12, data1->b, data2->b);
-    matrixSum(G12, data1->g, data2->g);
-    matrixInverse(B12inv, B12);
-    matrixInverse(G12inv, G12);
-    real detG12 = determinant(G12);
-
-    // Estimate the distance between the ellipsoids and compute the first terms needed for the energy.
-
-    real sigma12 = 1/SQRT(0.5f*dot(drUnit, matrixVectorProduct(G12inv, drUnit)));
-    real h12 = r - sigma12;
-    real rho = sigma/(h12+sigma);
-    real rho2 = rho*rho;
-    real rho6 = rho2*rho2*rho2;
-    real u = 4*epsilon*(rho6*rho6-rho6);
-    real eta = SQRT(2*data1->eps.y*data2->eps.y/detG12);
-    real chi = 2*dot(drUnit, matrixVectorProduct(B12inv, drUnit));
-    chi *= chi;
-    real energy = u*eta*chi;
-    
-    // Compute the terms needed for the force.
-
-    real3 kappa = matrixVectorProduct(G12inv, dr);
-    real3 iota = matrixVectorProduct(B12inv, dr);
-    real rInv2 = rInv*rInv;
-    real dUSLJdr = 24*epsilon*(2*rho6-1)*rho6*rho/sigma;
-    real temp = 0.5f*sigma12*sigma12*sigma12*rInv2;
-    real3 dudr = (drUnit + (kappa-drUnit*dot(kappa, drUnit))*temp)*dUSLJdr;
-    real3 dchidr = (iota-drUnit*dot(iota, drUnit))*(-8*rInv2*SQRT(chi));
-    real3 force = (dchidr*u + dudr*chi)*(eta*switchValue) - drUnit*(energy*switchDeriv);
-    *force1 += force;
-    *force2 -= force;
-
-    // Compute the terms needed for the torque.
-
-    for (int j = 0; j < 2; j++) {
-        real (*a)[3] = (j == 0 ? data1->a : data2->a);
-        real (*b)[3] = (j == 0 ? data1->b : data2->b);
-        real (*g)[3] = (j == 0 ? data1->g : data2->g);
-        float4 sig = (j == 0 ? data1->sig : data2->sig);
-        real3 dudq = cross(vectorMatrixProduct(kappa, g), kappa*(temp*dUSLJdr));
-        real3 dchidq = cross(vectorMatrixProduct(iota, b), iota)*(-4*rInv2);
-        real3 scale = (real3) (sig.y, sig.z, sig.w)*(-0.5f*eta/detG12);
-        real d[3][3];
-        d[0][0] = scale.x*(2*a[0][0]*(G12[1][1]*G12[2][2] - G12[1][2]*G12[2][1]) +
-                             a[0][2]*(G12[1][2]*G12[0][1] + G12[1][0]*G12[2][1] - G12[1][1]*(G12[0][2] + G12[2][0])) +
-                             a[0][1]*(G12[0][2]*G12[2][1] + G12[2][0]*G12[1][2] - G12[2][2]*(G12[0][1] + G12[1][0])));
-        d[0][1] = scale.x*(  a[0][0]*(G12[0][2]*G12[2][1] + G12[2][0]*G12[1][2] - G12[2][2]*(G12[0][1] + G12[1][0])) +
-                           2*a[0][1]*(G12[0][0]*G12[2][2] - G12[2][0]*G12[0][2]) +
-                             a[0][2]*(G12[1][0]*G12[0][2] + G12[2][0]*G12[0][1] - G12[0][0]*(G12[1][2] + G12[2][1])));
-        d[0][2] = scale.x*(  a[0][0]*(G12[0][1]*G12[1][2] + G12[1][0]*G12[2][1] - G12[1][1]*(G12[0][2] + G12[2][0])) +
-                             a[0][1]*(G12[1][0]*G12[0][2] + G12[2][0]*G12[0][1] - G12[0][0]*(G12[1][2] + G12[2][1])) +
-                           2*a[0][2]*(G12[1][1]*G12[0][0] - G12[1][0]*G12[0][1]));
-        d[1][0] = scale.y*(2*a[1][0]*(G12[1][1]*G12[2][2] - G12[1][2]*G12[2][1]) +
-                             a[1][1]*(G12[0][2]*G12[2][1] + G12[2][0]*G12[1][2] - G12[2][2]*(G12[0][1] + G12[1][0])) +
-                             a[1][2]*(G12[1][2]*G12[0][1] + G12[1][0]*G12[2][1] - G12[1][1]*(G12[0][2] + G12[2][0])));
-        d[1][1] = scale.y*(  a[1][0]*(G12[0][2]*G12[2][1] + G12[2][0]*G12[1][2] - G12[2][2]*(G12[0][1] + G12[1][0])) +
-                           2*a[1][1]*(G12[2][2]*G12[0][0] - G12[2][0]*G12[0][2]) +
-                             a[1][2]*(G12[1][0]*G12[0][2] + G12[0][1]*G12[2][0] - G12[0][0]*(G12[1][2] + G12[2][1])));
-        d[1][2] = scale.y*(  a[1][0]*(G12[0][1]*G12[1][2] + G12[1][0]*G12[2][1] - G12[1][1]*(G12[0][2] + G12[2][0])) +
-                             a[1][1]*(G12[1][0]*G12[0][2] + G12[0][1]*G12[2][0] - G12[0][0]*(G12[1][2] + G12[2][1])) +
-                           2*a[1][2]*(G12[1][1]*G12[0][0] - G12[1][0]*G12[0][1]));
-        d[2][0] = scale.z*(2*a[2][0]*(G12[1][1]*G12[2][2] - G12[2][1]*G12[1][2]) +
-                             a[2][1]*(G12[0][2]*G12[2][1] + G12[1][2]*G12[2][0] - G12[2][2]*(G12[0][1] + G12[1][0])) +
-                             a[2][2]*(G12[0][1]*G12[1][2] + G12[2][1]*G12[1][0] - G12[1][1]*(G12[0][2] + G12[2][0])));
-        d[2][1] = scale.z*(  a[2][0]*(G12[0][2]*G12[2][1] + G12[1][2]*G12[2][0] - G12[2][2]*(G12[0][1] + G12[1][0])) +
-                           2*a[2][1]*(G12[0][0]*G12[2][2] - G12[0][2]*G12[2][0]) +
-                             a[2][2]*(G12[1][0]*G12[0][2] + G12[0][1]*G12[2][0] - G12[0][0]*(G12[1][2] + G12[2][1])));
-        d[2][2] = scale.z*(  a[2][0]*(G12[0][1]*G12[1][2] + G12[2][1]*G12[1][0] - G12[1][1]*(G12[0][2] + G12[2][0])) +
-                             a[2][1]*(G12[1][0]*G12[0][2] + G12[2][0]*G12[0][1] - G12[0][0]*(G12[1][2] + G12[2][1])) +
-                           2*a[2][2]*(G12[1][1]*G12[0][0] - G12[1][0]*G12[0][1]));
-        real3 detadq = 0;
-        for (int i = 0; i < 3; i++)
-            detadq += cross((real3) (a[i][0], a[i][1], a[i][2]), (real3) (d[i][0], d[i][1], d[i][2]));
-        real3 torque = (dchidq*(u*eta) + detadq*(u*chi) + dudq*(eta*chi))*switchValue;
-        *(j == 0 ? torque1 : torque2) -= torque;
-    }
-    *totalEnergy += switchValue*energy;
-}
-
-/**
- * Compute the interactions.
- */
-__kernel void computeForce(
-        __global long* restrict forceBuffers, __global long* restrict torqueBuffers,
-        int numAtoms, int numExceptions, __global mixed* restrict energyBuffer, __global const real4* restrict pos,
-        __global const float4* restrict sigParams, __global const float2* restrict epsParams, __global const int* restrict sortedAtoms,
-        __global const real* restrict aMatrix, __global const real* restrict bMatrix, __global const real* restrict gMatrix,
-        __global const int* restrict exclusions, __global const int* restrict exclusionStartIndex,
-        __global const int4* restrict exceptionParticles, __global const float2* restrict exceptionParams
-#ifdef USE_CUTOFF
-        , int maxNeighborBlocks, __global int* restrict neighbors, __global int* restrict neighborIndex, __global int* restrict neighborBlockCount,
-        real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
-#endif
-        ) {
-    const unsigned int warp = get_global_id(0)/TILE_SIZE;
-    mixed energy = 0;
-#ifdef USE_CUTOFF
-    const int numBlocks = *neighborBlockCount;
-    if (numBlocks > maxNeighborBlocks)
-        return; // There wasn't enough memory for the neighbor list.
-    for (int block = get_global_id(0); block < numBlocks; block += get_global_size(0)) {
-        // Load parameters for atom1.
-        
-        int atom1 = neighborIndex[block];
-        int index1 = sortedAtoms[atom1];
-        AtomData data1;
-        loadAtomData(&data1, atom1, index1, pos, sigParams, epsParams, aMatrix, bMatrix, gMatrix);
-        real3 force1 = 0.0f;
-        real3 torque1 = 0.0f;
-        for (int indexInBlock = 0; indexInBlock < NEIGHBOR_BLOCK_SIZE; indexInBlock++) {
-            // Load parameters for atom2.
-            
-            int atom2 = neighbors[NEIGHBOR_BLOCK_SIZE*block+indexInBlock];
-            if (atom2 == -1)
-                continue;
-            int index2 = sortedAtoms[atom2];
-            AtomData data2;
-            loadAtomData(&data2, atom2, index2, pos, sigParams, epsParams, aMatrix, bMatrix, gMatrix);
-            real3 force2 = 0.0f;
-            real3 torque2 = 0.0f;
-            
-            // Compute the interaction.
-            
-            real3 delta = data1.pos-data2.pos;
-#ifdef USE_PERIODIC
-            APPLY_PERIODIC_TO_DELTA(delta)
-#endif
-            real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-            real sigma = data1.sig.x+data2.sig.x;
-            real epsilon = data1.eps.x*data2.eps.x;
-            computeOneInteraction(&data1, &data2, sigma, epsilon, delta, r2, &force1, &force2, &torque1, &torque2, &energy);
-            atom_add(&forceBuffers[index2], (long) (force2.x*0x100000000));
-            atom_add(&forceBuffers[index2+PADDED_NUM_ATOMS], (long) (force2.y*0x100000000));
-            atom_add(&forceBuffers[index2+2*PADDED_NUM_ATOMS], (long) (force2.z*0x100000000));
-            atom_add(&torqueBuffers[index2], (long) (torque2.x*0x100000000));
-            atom_add(&torqueBuffers[index2+PADDED_NUM_ATOMS], (long) (torque2.y*0x100000000));
-            atom_add(&torqueBuffers[index2+2*PADDED_NUM_ATOMS], (long) (torque2.z*0x100000000));
-        }
-        atom_add(&forceBuffers[index1], (long) (force1.x*0x100000000));
-        atom_add(&forceBuffers[index1+PADDED_NUM_ATOMS], (long) (force1.y*0x100000000));
-        atom_add(&forceBuffers[index1+2*PADDED_NUM_ATOMS], (long) (force1.z*0x100000000));
-        atom_add(&torqueBuffers[index1], (long) (torque1.x*0x100000000));
-        atom_add(&torqueBuffers[index1+PADDED_NUM_ATOMS], (long) (torque1.y*0x100000000));
-        atom_add(&torqueBuffers[index1+2*PADDED_NUM_ATOMS], (long) (torque1.z*0x100000000));
-    }
-#else
-    for (int atom1 = get_global_id(0); atom1 < numAtoms; atom1 += get_global_size(0)) {
-        // Load parameters for atom1.
-        
-        int index1 = sortedAtoms[atom1];
-        AtomData data1;
-        loadAtomData(&data1, atom1, index1, pos, sigParams, epsParams, aMatrix, bMatrix, gMatrix);
-        real3 force1 = 0.0f;
-        real3 torque1 = 0.0f;
-        int nextExclusion = exclusionStartIndex[atom1];
-        int lastExclusion = exclusionStartIndex[atom1+1];
-        for (int atom2 = atom1+1; atom2 < numAtoms; atom2++) {
-            // Skip over excluded interactions.
-            
-            if (nextExclusion < lastExclusion && exclusions[nextExclusion] == atom2) {
-                nextExclusion++;
-                continue;
-            }
-            
-            // Load parameters for atom2.
-            
-            int index2 = sortedAtoms[atom2];
-            AtomData data2;
-            loadAtomData(&data2, atom2, index2, pos, sigParams, epsParams, aMatrix, bMatrix, gMatrix);
-            real3 force2 = 0.0f;
-            real3 torque2 = 0.0f;
-            
-            // Compute the interaction.
-            
-            real3 delta = data1.pos-data2.pos;
-            real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-            real sigma = data1.sig.x+data2.sig.x;
-            real epsilon = data1.eps.x*data2.eps.x;
-            computeOneInteraction(&data1, &data2, sigma, epsilon, delta, r2, &force1, &force2, &torque1, &torque2, &energy);
-            atom_add(&forceBuffers[index2], (long) (force2.x*0x100000000));
-            atom_add(&forceBuffers[index2+PADDED_NUM_ATOMS], (long) (force2.y*0x100000000));
-            atom_add(&forceBuffers[index2+2*PADDED_NUM_ATOMS], (long) (force2.z*0x100000000));
-            atom_add(&torqueBuffers[index2], (long) (torque2.x*0x100000000));
-            atom_add(&torqueBuffers[index2+PADDED_NUM_ATOMS], (long) (torque2.y*0x100000000));
-            atom_add(&torqueBuffers[index2+2*PADDED_NUM_ATOMS], (long) (torque2.z*0x100000000));
-        }
-        atom_add(&forceBuffers[index1], (long) (force1.x*0x100000000));
-        atom_add(&forceBuffers[index1+PADDED_NUM_ATOMS], (long) (force1.y*0x100000000));
-        atom_add(&forceBuffers[index1+2*PADDED_NUM_ATOMS], (long) (force1.z*0x100000000));
-        atom_add(&torqueBuffers[index1], (long) (torque1.x*0x100000000));
-        atom_add(&torqueBuffers[index1+PADDED_NUM_ATOMS], (long) (torque1.y*0x100000000));
-        atom_add(&torqueBuffers[index1+2*PADDED_NUM_ATOMS], (long) (torque1.z*0x100000000));
-    }
-#endif
-    
-    // Now compute exceptions.
-    
-    for (int index = get_global_id(0); index < numExceptions; index += get_global_size(0)) {
-        int4 atomIndices = exceptionParticles[index];
-        float2 params = exceptionParams[index];
-        int index1 = atomIndices.x, index2 = atomIndices.y;
-        int atom1 = atomIndices.z, atom2 = atomIndices.w;
-        AtomData data1, data2;
-        loadAtomData(&data1, atom1, index1, pos, sigParams, epsParams, aMatrix, bMatrix, gMatrix);
-        loadAtomData(&data2, atom2, index2, pos, sigParams, epsParams, aMatrix, bMatrix, gMatrix);
-        real3 force1 = 0, force2 = 0;
-        real3 torque1 = 0, torque2 = 0;
-        real3 delta = data1.pos-data2.pos;
-        real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-#ifdef USE_CUTOFF
-        if (r2 < CUTOFF_SQUARED) {
-#endif
-            computeOneInteraction(&data1, &data2, params.x, params.y, delta, r2, &force1, &force2, &torque1, &torque2, &energy);
-            atom_add(&forceBuffers[index1], (long) (force1.x*0x100000000));
-            atom_add(&forceBuffers[index1+PADDED_NUM_ATOMS], (long) (force1.y*0x100000000));
-            atom_add(&forceBuffers[index1+2*PADDED_NUM_ATOMS], (long) (force1.z*0x100000000));
-            atom_add(&forceBuffers[index2], (long) (force2.x*0x100000000));
-            atom_add(&forceBuffers[index2+PADDED_NUM_ATOMS], (long) (force2.y*0x100000000));
-            atom_add(&forceBuffers[index2+2*PADDED_NUM_ATOMS], (long) (force2.z*0x100000000));
-            atom_add(&torqueBuffers[index1], (long) (torque1.x*0x100000000));
-            atom_add(&torqueBuffers[index1+PADDED_NUM_ATOMS], (long) (torque1.y*0x100000000));
-            atom_add(&torqueBuffers[index1+2*PADDED_NUM_ATOMS], (long) (torque1.z*0x100000000));
-            atom_add(&torqueBuffers[index2], (long) (torque2.x*0x100000000));
-            atom_add(&torqueBuffers[index2+PADDED_NUM_ATOMS], (long) (torque2.y*0x100000000));
-            atom_add(&torqueBuffers[index2+2*PADDED_NUM_ATOMS], (long) (torque2.z*0x100000000));
-#ifdef USE_CUTOFF
-        }
-#endif
-    }
-    energyBuffer[get_global_id(0)] += energy;
-}
-
-/**
- * Convert the torques to forces on the connected particles.
- */
-__kernel void applyTorques(
-        __global long* restrict forceBuffers, __global long* restrict torqueBuffers,
-        int numParticles, __global const real4* restrict posq, __global int2* const restrict axisParticleIndices,
-        __global const int* sortedParticles) {
-    const unsigned int warp = get_global_id(0)/TILE_SIZE;
-    for (int sortedIndex = get_global_id(0); sortedIndex < numParticles; sortedIndex += get_global_size(0)) {
-        int originalIndex = sortedParticles[sortedIndex];
-        real3 pos = posq[originalIndex].xyz;
-        int2 axisParticles = axisParticleIndices[originalIndex];
-        if (axisParticles.x != -1) {
-            // Load the torque.
-
-            real scale = 1/(real) 0x100000000;
-            real3 torque = (real3) (scale*torqueBuffers[originalIndex], scale*torqueBuffers[originalIndex+PADDED_NUM_ATOMS], scale*torqueBuffers[originalIndex+2*PADDED_NUM_ATOMS]);
-            real3 force = 0, xforce = 0, yforce = 0;
-
-            // Apply a force to the x particle.
-            
-            real3 dx = posq[axisParticles.x].xyz-pos;
-            real dx2 = dot(dx, dx);
-            real3 f = cross(torque, dx)/dx2;
-            xforce += f;
-            force -= f;
-            if (axisParticles.y != -1) {
-                // Apply a force to the y particle.  This is based on the component of the torque
-                // that was not already applied to the x particle.
-                
-                real3 dy = posq[axisParticles.y].xyz-pos;
-                real dy2 = dot(dy, dy);
-                real3 torque2 = dx*dot(torque, dx)/dx2;
-                f = cross(torque2, dy)/dy2;
-                yforce += f;
-                force -= f;
-            }
-            atom_add(&forceBuffers[originalIndex], (long) (force.x*0x100000000));
-            atom_add(&forceBuffers[originalIndex+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
-            atom_add(&forceBuffers[originalIndex+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
-            atom_add(&forceBuffers[axisParticles.x], (long) (xforce.x*0x100000000));
-            atom_add(&forceBuffers[axisParticles.x+PADDED_NUM_ATOMS], (long) (xforce.y*0x100000000));
-            atom_add(&forceBuffers[axisParticles.x+2*PADDED_NUM_ATOMS], (long) (xforce.z*0x100000000));
-            if (axisParticles.y != -1) {
-                atom_add(&forceBuffers[axisParticles.y], (long) (yforce.x*0x100000000));
-                atom_add(&forceBuffers[axisParticles.y+PADDED_NUM_ATOMS], (long) (yforce.y*0x100000000));
-                atom_add(&forceBuffers[axisParticles.y+2*PADDED_NUM_ATOMS], (long) (yforce.z*0x100000000));
-            }
-        }
-    }
-}
--- a/platforms/opencl/src/kernels/gbsaObc2.cl
+++ b/platforms/opencl/src/kernels/gbsaObc2.cl
-{
-    real invRSquaredOver4 = 0.25f*invR*invR;
-    real rScaledRadiusJ = r+OBC_PARAMS2.y;
-    real rScaledRadiusI = r+OBC_PARAMS1.y;
-    real l_ijJ = RECIP(max((real) OBC_PARAMS1.x, fabs(r-OBC_PARAMS2.y)));
-    real l_ijI = RECIP(max((real) OBC_PARAMS2.x, fabs(r-OBC_PARAMS1.y)));
-    real u_ijJ = RECIP(rScaledRadiusJ);
-    real u_ijI = RECIP(rScaledRadiusI);
-    real l_ij2J = l_ijJ*l_ijJ;
-    real l_ij2I = l_ijI*l_ijI;
-    real u_ij2J = u_ijJ*u_ijJ;
-    real u_ij2I = u_ijI*u_ijI;
-    real t1J = LOG(u_ijJ*RECIP(l_ijJ));
-    real t1I = LOG(u_ijI*RECIP(l_ijI));
-    real t2J = (l_ij2J-u_ij2J);
-    real t2I = (l_ij2I-u_ij2I);
-    real term1 = (0.5f*(0.25f+OBC_PARAMS2.y*OBC_PARAMS2.y*invRSquaredOver4)*t2J + t1J*invRSquaredOver4)*invR;
-    real term2 = (0.5f*(0.25f+OBC_PARAMS1.y*OBC_PARAMS1.y*invRSquaredOver4)*t2I + t1I*invRSquaredOver4)*invR;
-    real tempdEdR = (OBC_PARAMS1.x < rScaledRadiusJ ? BORN_FORCE1*term1 : (real) 0);
-    tempdEdR += (OBC_PARAMS2.x < rScaledRadiusI ? BORN_FORCE2*term2 : (real) 0);
-#ifdef USE_CUTOFF
-    bool includeInteraction = (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2 && r2 < CUTOFF_SQUARED);
-#else
-    bool includeInteraction = (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2);
-#endif
-    dEdR += (includeInteraction ? tempdEdR : (real) 0);
-}
--- a/platforms/opencl/src/kernels/harmonicAngleForce.cl
+++ b/platforms/opencl/src/kernels/harmonicAngleForce.cl
-float2 angleParams = PARAMS[index];
-real deltaIdeal = theta-angleParams.x;
-energy += 0.5f*angleParams.y*deltaIdeal*deltaIdeal;
-real dEdAngle = angleParams.y*deltaIdeal;
--- a/platforms/opencl/src/kernels/harmonicBondForce.cl
+++ b/platforms/opencl/src/kernels/harmonicBondForce.cl
-float2 bondParams = PARAMS[index];
-real deltaIdeal = r-bondParams.x;
-energy += 0.5f * bondParams.y*deltaIdeal*deltaIdeal;
-real dEdR = bondParams.y * deltaIdeal;
--- a/platforms/opencl/src/kernels/integrationUtilities.cl
+++ b/platforms/opencl/src/kernels/integrationUtilities.cl
-/**
- * Apply a time shift to the velocities before computing kinetic energy.
- */
-__kernel void timeShiftVelocities(__global mixed4* restrict velm, __global const real4* restrict force, real timeShift) {
-    for (int index = get_global_id(0); index < NUM_ATOMS; index += get_global_size(0)) {
-        mixed4 velocity = velm[index];
-        if (velocity.w != 0.0) {
-            mixed4 f = convert_mixed4(force[index]);
-            velocity.xyz += timeShift*f.xyz*velocity.w;
-            velm[index] = velocity;
-        }
-    }
-}
\ No newline at end of file
--- a/platforms/opencl/src/kernels/langevin.cl
+++ b/platforms/opencl/src/kernels/langevin.cl
-enum {VelScale, ForceScale, NoiseScale, MaxParams};
-
-/**
- * Perform the first step of Langevin integration.
- */
-
-__kernel void integrateLangevinPart1(__global mixed4* restrict velm, __global const real4* restrict force, __global mixed4* restrict posDelta,
-        __global const mixed* restrict paramBuffer, __global const mixed2* restrict dt, __global const float4* restrict random, unsigned int randomIndex) {
-    mixed vscale = paramBuffer[VelScale];
-    mixed fscale = paramBuffer[ForceScale];
-    mixed noisescale = paramBuffer[NoiseScale];
-    mixed stepSize = dt[0].y;
-    int index = get_global_id(0);
-    randomIndex += index;
-    while (index < NUM_ATOMS) {
-        mixed4 velocity = velm[index];
-        if (velocity.w != 0.0) {
-            mixed sqrtInvMass = sqrt(velocity.w);
-            velocity.x = vscale*velocity.x + fscale*velocity.w*force[index].x + noisescale*sqrtInvMass*random[randomIndex].x;
-            velocity.y = vscale*velocity.y + fscale*velocity.w*force[index].y + noisescale*sqrtInvMass*random[randomIndex].y;
-            velocity.z = vscale*velocity.z + fscale*velocity.w*force[index].z + noisescale*sqrtInvMass*random[randomIndex].z;
-            velm[index] = velocity;
-            posDelta[index] = stepSize*velocity;
-        }
-        randomIndex += get_global_size(0);
-        index += get_global_size(0);
-    }
-}
-
-/**
- * Perform the second step of Langevin integration.
- */
-
-__kernel void integrateLangevinPart2(__global real4* restrict posq, __global real4* restrict posqCorrection, __global const mixed4* restrict posDelta, __global mixed4* restrict velm, __global const mixed2* restrict dt) {
-#ifdef SUPPORTS_DOUBLE_PRECISION
-    double invStepSize = 1.0/dt[0].y;
-#else
-    float invStepSize = 1.0f/dt[0].y;
-    float correction = (1.0f-invStepSize*dt[0].y)/dt[0].y;
-#endif
-    int index = get_global_id(0);
-    while (index < NUM_ATOMS) {
-        mixed4 vel = velm[index];
-        if (vel.w != 0.0) {
-#ifdef USE_MIXED_PRECISION
-            real4 pos1 = posq[index];
-            real4 pos2 = posqCorrection[index];
-            mixed4 pos = (mixed4) (pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
-#else
-            real4 pos = posq[index];
-#endif
-            mixed4 delta = posDelta[index];
-            pos.xyz += delta.xyz;
-#ifdef SUPPORTS_DOUBLE_PRECISION
-            vel.xyz = convert_mixed4(invStepSize*convert_double4(delta)).xyz;
-#else
-            vel.xyz = invStepSize*delta.xyz + correction*delta.xyz;
-#endif
-#ifdef USE_MIXED_PRECISION
-            posq[index] = convert_real4(pos);
-            posqCorrection[index] = (real4) (pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
-#else
-            posq[index] = pos;
-#endif
-            velm[index] = vel;
-        }
-        index += get_global_size(0);
-    }
-}
-
-/**
- * Select the step size to use for the next step.
- */
-
-__kernel void selectLangevinStepSize(mixed maxStepSize, mixed errorTol, mixed friction, mixed kT, __global mixed2* restrict dt,
-        __global const mixed4* restrict velm, __global const real4* restrict force, __global mixed* restrict paramBuffer, __local mixed* restrict params, __local mixed* restrict error) {
-    // Calculate the error.
-
-    mixed err = 0.0f;
-    unsigned int index = get_local_id(0);
-    while (index < NUM_ATOMS) {
-        real4 f = force[index];
-        mixed invMass = velm[index].w;
-        err += (f.x*f.x + f.y*f.y + f.z*f.z)*invMass*invMass;
-        index += get_global_size(0);
-    }
-    error[get_local_id(0)] = err;
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    // Sum the errors from all threads.
-
-    for (unsigned int offset = 1; offset < get_local_size(0); offset *= 2) {
-        if (get_local_id(0)+offset < get_local_size(0) && (get_local_id(0)&(2*offset-1)) == 0)
-            error[get_local_id(0)] += error[get_local_id(0)+offset];
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-    if (get_global_id(0) == 0) {
-        // Select the new step size.
-
-        mixed totalError = sqrt(error[0]/(NUM_ATOMS*3));
-        mixed newStepSize = sqrt(errorTol/totalError);
-        mixed oldStepSize = dt[0].y;
-        if (oldStepSize > 0.0f)
-            newStepSize = min(newStepSize, oldStepSize*2.0f); // For safety, limit how quickly dt can increase.
-        if (newStepSize > oldStepSize && newStepSize < 1.1f*oldStepSize)
-            newStepSize = oldStepSize; // Keeping dt constant between steps improves the behavior of the integrator.
-        if (newStepSize > maxStepSize)
-            newStepSize = maxStepSize;
-        dt[0].y = newStepSize;
-
-        // Recalculate the integration parameters.
-
-        mixed vscale = exp(-newStepSize*friction);
-        mixed fscale = (friction == 0 ? newStepSize : (1-vscale)/friction);
-        mixed noisescale = sqrt(kT*(1-vscale*vscale));
-        params[VelScale] = vscale;
-        params[ForceScale] = fscale;
-        params[NoiseScale] = noisescale;
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (get_local_id(0) < MaxParams)
-        paramBuffer[get_local_id(0)] = params[get_local_id(0)];
-}
--- a/platforms/opencl/src/kernels/periodicTorsionForce.cl
+++ b/platforms/opencl/src/kernels/periodicTorsionForce.cl
-float4 torsionParams = PARAMS[index];
-real deltaAngle = torsionParams.z*theta-torsionParams.y;
-energy += torsionParams.x*(1.0f+cos(deltaAngle));
-real sinDeltaAngle = sin(deltaAngle);
-real dEdAngle = -torsionParams.x*torsionParams.z*sinDeltaAngle;
--- a/platforms/opencl/src/kernels/random.cl
+++ b/platforms/opencl/src/kernels/random.cl
-/**
- * Generate random numbers
- */
-
-__kernel void generateRandomNumbers(int numValues, __global float4* restrict random, __global uint4* restrict seed) {
-    int index = get_global_id(0);
-    uint4 state = seed[index];
-    unsigned int carry = 0;
-    while (index < numValues) {
-        float4 value;
-
-        // Generate first two values.
-
-        state.x = state.x * 69069 + 1;
-        state.y ^= state.y << 13;
-        state.y ^= state.y >> 17;
-        state.y ^= state.y << 5;
-        unsigned int k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
-        unsigned int m = state.w + state.w + state.z + carry;
-        state.z = state.w;
-        state.w = m;
-        carry = k >> 30;
-        float x1 = (float)max(state.x + state.y + state.w, 0x00000001u) / (float)0xffffffff;
-        state.x = state.x * 69069 + 1;
-        state.y ^= state.y << 13;
-        state.y ^= state.y >> 17;
-        state.y ^= state.y << 5;
-        x1 = SQRT(-2.0f * LOG(x1));
-        k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
-        m = state.w + state.w + state.z + carry;
-        state.z = state.w;
-        state.w = m;
-        carry = k >> 30;
-        float x2 = (float)(state.x + state.y + state.w) / (float)0xffffffff;
-        value.x = x1 * cos(2.0f * 3.14159265f * x2);
-        value.y = x1 * sin(2.0f * 3.14159265f * x2);
-
-        // Generate next two values.
-
-        state.x = state.x * 69069 + 1;
-        state.y ^= state.y << 13;
-        state.y ^= state.y >> 17;
-        state.y ^= state.y << 5;
-        k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
-        m = state.w + state.w + state.z + carry;
-        state.z = state.w;
-        state.w = m;
-        carry = k >> 30;
-        float x3 = (float)max(state.x + state.y + state.w, 0x00000001u) / (float)0xffffffff;
-        state.x = state.x * 69069 + 1;
-        state.y ^= state.y << 13;
-        state.y ^= state.y >> 17;
-        state.y ^= state.y << 5;
-        x3 = SQRT(-2.0f * LOG(x3));
-        k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
-        m = state.w + state.w + state.z + carry;
-        state.z = state.w;
-        state.w = m;
-        carry = k >> 30;
-        float x4 = (float)(state.x + state.y + state.w) / (float)0xffffffff;
-        value.z = x3 * cos(2.0f * 3.14159265f * x4);
-        value.w = x3 * sin(2.0f * 3.14159265f * x4);
-
-        // Record the values.
-
-        random[index] = value;
-        index += get_global_size(0);
-    }
-    seed[get_global_id(0)] = state;
-}
--- a/platforms/opencl/src/kernels/rbTorsionForce.cl
+++ b/platforms/opencl/src/kernels/rbTorsionForce.cl
-float8 torsionParams = PARAMS[index];
-if (theta < 0.0f)
-    theta += PI;
-else
-    theta -= PI;
-cosangle = -cosangle;
-real cosFactor = cosangle;
-real dEdAngle = -torsionParams.s1;
-real rbEnergy = torsionParams.s0;
-rbEnergy += torsionParams.s1*cosFactor;
-dEdAngle -= 2.0f*torsionParams.s2*cosFactor;
-cosFactor *= cosangle;
-dEdAngle -= 3.0f*torsionParams.s3*cosFactor;
-rbEnergy += torsionParams.s2*cosFactor;
-cosFactor *= cosangle;
-dEdAngle -= 4.0f*torsionParams.s4*cosFactor;
-rbEnergy += torsionParams.s3*cosFactor;
-cosFactor *= cosangle;
-dEdAngle -= 5.0f*torsionParams.s5*cosFactor;
-rbEnergy += torsionParams.s4*cosFactor;
-rbEnergy += torsionParams.s5*cosFactor*cosangle;
-energy += rbEnergy;
-dEdAngle *= sin(theta);
--- a/platforms/opencl/src/kernels/rmsd.cl
+++ b/platforms/opencl/src/kernels/rmsd.cl
-// This file contains kernels to compute the RMSD and its gradient using the algorithm described
-// in Coutsias et al, "Using quaternions to calculate RMSD" (doi: 10.1002/jcc.20110).
-
-/**
- * Sum a value over all threads.
- */
-real reduceValue(real value, __local volatile real* temp) {
-    const int thread = get_local_id(0);
-    barrier(CLK_LOCAL_MEM_FENCE);
-    temp[thread] = value;
-    barrier(CLK_LOCAL_MEM_FENCE);
-    for (uint step = 1; step < 32; step *= 2) {
-        if (thread+step < get_local_size(0) && thread%(2*step) == 0)
-            temp[thread] = temp[thread] + temp[thread+step];
-        SYNC_WARPS;
-    }
-    for (uint step = 32; step < get_local_size(0); step *= 2) {
-        if (thread+step < get_local_size(0) && thread%(2*step) == 0)
-            temp[thread] = temp[thread] + temp[thread+step];
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-    return temp[0];
-}
-
-/**
- * Perform the first step of computing the RMSD.  This is executed as a single work group.
- */
-__kernel void computeRMSDPart1(int numParticles, __global const real4* restrict posq, __global const real4* restrict referencePos,
-        __global const int* restrict particles, __global real* buffer, __local volatile real* restrict temp) {
-    // Compute the center of the particle positions.
-    
-    real3 center = (real3) 0;
-    for (int i = get_local_id(0); i < numParticles; i += get_local_size(0))
-        center += posq[particles[i]].xyz;
-    center.x = reduceValue(center.x, temp)/numParticles;
-    center.y = reduceValue(center.y, temp)/numParticles;
-    center.z = reduceValue(center.z, temp)/numParticles;
-    
-    // Compute the correlation matrix.
-    
-    real R[3][3] = {{0, 0, 0}, {0, 0, 0}, {0, 0, 0}};
-    real sum = 0;
-    for (int i = get_local_id(0); i < numParticles; i += get_local_size(0)) {
-        int index = particles[i];
-        real3 pos = posq[index].xyz - center;
-        real3 refPos = referencePos[index].xyz;
-        R[0][0] += pos.x*refPos.x;
-        R[0][1] += pos.x*refPos.y;
-        R[0][2] += pos.x*refPos.z;
-        R[1][0] += pos.y*refPos.x;
-        R[1][1] += pos.y*refPos.y;
-        R[1][2] += pos.y*refPos.z;
-        R[2][0] += pos.z*refPos.x;
-        R[2][1] += pos.z*refPos.y;
-        R[2][2] += pos.z*refPos.z;
-        sum += dot(pos, pos);
-    }
-    for (int i = 0; i < 3; i++)
-        for (int j = 0; j < 3; j++)
-            R[i][j] = reduceValue(R[i][j], temp);
-    sum = reduceValue(sum, temp);
-
-    // Copy everything into the output buffer to send back to the host.
-    
-    if (get_local_id(0) == 0) {
-        for (int i = 0; i < 3; i++)
-            for (int j = 0; j < 3; j++)
-                buffer[3*i+j] = R[i][j];
-        buffer[9] = sum;
-        buffer[10] = center.x;
-        buffer[11] = center.y;
-        buffer[12] = center.z;
-    }
-}
-
-/**
- * Apply forces based on the RMSD.
- */
-__kernel void computeRMSDForces(int numParticles, __global const real4* restrict posq, __global const real4* restrict referencePos,
-        __global const int* restrict particles, __global const real* buffer, __global real4* restrict forceBuffers) {
-    real3 center = (real3) (buffer[10], buffer[11], buffer[12]);
-    real scale = 1 / (real) (buffer[9]*numParticles);
-    for (int i = get_global_id(0); i < numParticles; i += get_global_size(0)) {
-        int index = particles[i];
-        real3 pos = posq[index].xyz - center;
-        real3 refPos = referencePos[index].xyz;
-        real3 rotatedRef = (real3) (buffer[0]*refPos.x + buffer[3]*refPos.y + buffer[6]*refPos.z,
-                                    buffer[1]*refPos.x + buffer[4]*refPos.y + buffer[7]*refPos.z,
-                                    buffer[2]*refPos.x + buffer[5]*refPos.y + buffer[8]*refPos.z);
-        forceBuffers[index].xyz -= (pos-rotatedRef)*scale;
-    }
-}
--- a/platforms/opencl/src/kernels/settle.cl
+++ b/platforms/opencl/src/kernels/settle.cl
-mixed4 loadPos(__global const real4* restrict posq, __global const real4* restrict posqCorrection, int index) {
-#ifdef USE_MIXED_PRECISION
-    real4 pos1 = posq[index];
-    real4 pos2 = posqCorrection[index];
-    return (mixed4) (pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
-#else
-    return posq[index];
-#endif
-}
-
-/**
- * Enforce constraints on SETTLE clusters
- */
-
-__kernel void applySettle(int numClusters, mixed tol, __global const real4* restrict oldPos, __global const real4* restrict posCorrection, __global mixed4* restrict posDelta, __global const mixed4* restrict velm, __global const int4* restrict clusterAtoms, __global const float2* restrict clusterParams) {
-    int index = get_global_id(0);
-    while (index < numClusters) {
-        // Load the data for this cluster.
-
-        int4 atoms = clusterAtoms[index];
-        float2 params = clusterParams[index];
-        mixed4 apos0 = loadPos(oldPos, posCorrection, atoms.x);
-        mixed4 xp0 = posDelta[atoms.x];
-        mixed4 apos1 = loadPos(oldPos, posCorrection, atoms.y);
-        mixed4 xp1 = posDelta[atoms.y];
-        mixed4 apos2 = loadPos(oldPos, posCorrection, atoms.z);
-        mixed4 xp2 = posDelta[atoms.z];
-        mixed m0 = 1/velm[atoms.x].w;
-        mixed m1 = 1/velm[atoms.y].w;
-        mixed m2 = 1/velm[atoms.z].w;
-
-        // Apply the SETTLE algorithm.
-
-        mixed xb0 = apos1.x-apos0.x;
-        mixed yb0 = apos1.y-apos0.y;
-        mixed zb0 = apos1.z-apos0.z;
-        mixed xc0 = apos2.x-apos0.x;
-        mixed yc0 = apos2.y-apos0.y;
-        mixed zc0 = apos2.z-apos0.z;
-
-        mixed invTotalMass = 1.0f/(m0+m1+m2);
-        mixed xcom = (xp0.x*m0 + (xb0+xp1.x)*m1 + (xc0+xp2.x)*m2) * invTotalMass;
-        mixed ycom = (xp0.y*m0 + (yb0+xp1.y)*m1 + (yc0+xp2.y)*m2) * invTotalMass;
-        mixed zcom = (xp0.z*m0 + (zb0+xp1.z)*m1 + (zc0+xp2.z)*m2) * invTotalMass;
-
-        mixed xa1 = xp0.x - xcom;
-        mixed ya1 = xp0.y - ycom;
-        mixed za1 = xp0.z - zcom;
-        mixed xb1 = xb0 + xp1.x - xcom;
-        mixed yb1 = yb0 + xp1.y - ycom;
-        mixed zb1 = zb0 + xp1.z - zcom;
-        mixed xc1 = xc0 + xp2.x - xcom;
-        mixed yc1 = yc0 + xp2.y - ycom;
-        mixed zc1 = zc0 + xp2.z - zcom;
-
-        mixed xaksZd = yb0*zc0 - zb0*yc0;
-        mixed yaksZd = zb0*xc0 - xb0*zc0;
-        mixed zaksZd = xb0*yc0 - yb0*xc0;
-        mixed xaksXd = ya1*zaksZd - za1*yaksZd;
-        mixed yaksXd = za1*xaksZd - xa1*zaksZd;
-        mixed zaksXd = xa1*yaksZd - ya1*xaksZd;
-        mixed xaksYd = yaksZd*zaksXd - zaksZd*yaksXd;
-        mixed yaksYd = zaksZd*xaksXd - xaksZd*zaksXd;
-        mixed zaksYd = xaksZd*yaksXd - yaksZd*xaksXd;
-
-        mixed axlng = sqrt(xaksXd*xaksXd + yaksXd*yaksXd + zaksXd*zaksXd);
-        mixed aylng = sqrt(xaksYd*xaksYd + yaksYd*yaksYd + zaksYd*zaksYd);
-        mixed azlng = sqrt(xaksZd*xaksZd + yaksZd*yaksZd + zaksZd*zaksZd);
-        mixed trns11 = xaksXd / axlng;
-        mixed trns21 = yaksXd / axlng;
-        mixed trns31 = zaksXd / axlng;
-        mixed trns12 = xaksYd / aylng;
-        mixed trns22 = yaksYd / aylng;
-        mixed trns32 = zaksYd / aylng;
-        mixed trns13 = xaksZd / azlng;
-        mixed trns23 = yaksZd / azlng;
-        mixed trns33 = zaksZd / azlng;
-
-        mixed xb0d = trns11*xb0 + trns21*yb0 + trns31*zb0;
-        mixed yb0d = trns12*xb0 + trns22*yb0 + trns32*zb0;
-        mixed xc0d = trns11*xc0 + trns21*yc0 + trns31*zc0;
-        mixed yc0d = trns12*xc0 + trns22*yc0 + trns32*zc0;
-        mixed za1d = trns13*xa1 + trns23*ya1 + trns33*za1;
-        mixed xb1d = trns11*xb1 + trns21*yb1 + trns31*zb1;
-        mixed yb1d = trns12*xb1 + trns22*yb1 + trns32*zb1;
-        mixed zb1d = trns13*xb1 + trns23*yb1 + trns33*zb1;
-        mixed xc1d = trns11*xc1 + trns21*yc1 + trns31*zc1;
-        mixed yc1d = trns12*xc1 + trns22*yc1 + trns32*zc1;
-        mixed zc1d = trns13*xc1 + trns23*yc1 + trns33*zc1;
-
-        //                                        --- Step2  A2' ---
-
-        float rc = 0.5*params.y;
-        mixed rb = sqrt(params.x*params.x-rc*rc);
-        mixed ra = rb*(m1+m2)*invTotalMass;
-        rb -= ra;
-        mixed sinphi = za1d / ra;
-        mixed cosphi = sqrt(1.0f - sinphi*sinphi);
-        mixed sinpsi = (zb1d - zc1d) / (2*rc*cosphi);
-        mixed cospsi = sqrt(1.0f - sinpsi*sinpsi);
-
-        mixed ya2d =   ra*cosphi;
-        mixed xb2d = - rc*cospsi;
-        mixed yb2d = - rb*cosphi - rc*sinpsi*sinphi;
-        mixed yc2d = - rb*cosphi + rc*sinpsi*sinphi;
-        mixed xb2d2 = xb2d*xb2d;
-        mixed hh2 = 4.0f*xb2d2 + (yb2d-yc2d)*(yb2d-yc2d) + (zb1d-zc1d)*(zb1d-zc1d);
-        mixed deltx = 2.0f*xb2d + sqrt(4.0f*xb2d2 - hh2 + params.y*params.y);
-        xb2d -= deltx*0.5;
-
-        //                                        --- Step3  al,be,ga ---
-
-        mixed alpha = (xb2d*(xb0d-xc0d) + yb0d*yb2d + yc0d*yc2d);
-        mixed beta = (xb2d*(yc0d-yb0d) + xb0d*yb2d + xc0d*yc2d);
-        mixed gamma = xb0d*yb1d - xb1d*yb0d + xc0d*yc1d - xc1d*yc0d;
-
-        mixed al2be2 = alpha*alpha + beta*beta;
-        mixed sintheta = (alpha*gamma - beta*sqrt(al2be2 - gamma*gamma)) / al2be2;
-
-        //                                        --- Step4  A3' ---
-
-        mixed costheta = sqrt(1.0f - sintheta*sintheta);
-        mixed xa3d = - ya2d*sintheta;
-        mixed ya3d =   ya2d*costheta;
-        mixed za3d = za1d;
-        mixed xb3d =   xb2d*costheta - yb2d*sintheta;
-        mixed yb3d =   xb2d*sintheta + yb2d*costheta;
-        mixed zb3d = zb1d;
-        mixed xc3d = - xb2d*costheta - yc2d*sintheta;
-        mixed yc3d = - xb2d*sintheta + yc2d*costheta;
-        mixed zc3d = zc1d;
-
-        //                                        --- Step5  A3 ---
-
-        mixed xa3 = trns11*xa3d + trns12*ya3d + trns13*za3d;
-        mixed ya3 = trns21*xa3d + trns22*ya3d + trns23*za3d;
-        mixed za3 = trns31*xa3d + trns32*ya3d + trns33*za3d;
-        mixed xb3 = trns11*xb3d + trns12*yb3d + trns13*zb3d;
-        mixed yb3 = trns21*xb3d + trns22*yb3d + trns23*zb3d;
-        mixed zb3 = trns31*xb3d + trns32*yb3d + trns33*zb3d;
-        mixed xc3 = trns11*xc3d + trns12*yc3d + trns13*zc3d;
-        mixed yc3 = trns21*xc3d + trns22*yc3d + trns23*zc3d;
-        mixed zc3 = trns31*xc3d + trns32*yc3d + trns33*zc3d;
-
-        xp0.x = xcom + xa3;
-        xp0.y = ycom + ya3;
-        xp0.z = zcom + za3;
-        xp1.x = xcom + xb3 - xb0;
-        xp1.y = ycom + yb3 - yb0;
-        xp1.z = zcom + zb3 - zb0;
-        xp2.x = xcom + xc3 - xc0;
-        xp2.y = ycom + yc3 - yc0;
-        xp2.z = zcom + zc3 - zc0;
-
-        // Record the new positions.
-
-        posDelta[atoms.x] = xp0;
-        posDelta[atoms.y] = xp1;
-        posDelta[atoms.z] = xp2;
-        index += get_global_size(0);
-    }
-}
-
-/**
- * Enforce velocity constraints on SETTLE clusters
- */
-
-__kernel void constrainVelocities(int numClusters, mixed tol, __global const real4* restrict oldPos, __global const real4* restrict posCorrection, __global mixed4* restrict posDelta, __global mixed4* restrict velm, __global const int4* restrict clusterAtoms, __global const float2* restrict clusterParams) {
-    for (int index = get_global_id(0); index < numClusters; index += get_global_size(0)) {
-        // Load the data for this cluster.
-
-        int4 atoms = clusterAtoms[index];
-        mixed4 apos0 = loadPos(oldPos, posCorrection, atoms.x);
-        mixed4 apos1 = loadPos(oldPos, posCorrection, atoms.y);
-        mixed4 apos2 = loadPos(oldPos, posCorrection, atoms.z);
-        mixed4 v0 = velm[atoms.x];
-        mixed4 v1 = velm[atoms.y];
-        mixed4 v2 = velm[atoms.z];
-        
-        // Compute intermediate quantities: the atom masses, the bond directions, the relative velocities,
-        // and the angle cosines and sines.
-        
-        mixed mA = 1/v0.w;
-        mixed mB = 1/v1.w;
-        mixed mC = 1/v2.w;
-        mixed4 eAB = apos1-apos0;
-        mixed4 eBC = apos2-apos1;
-        mixed4 eCA = apos0-apos2;
-        eAB.xyz /= sqrt(eAB.x*eAB.x + eAB.y*eAB.y + eAB.z*eAB.z);
-        eBC.xyz /= sqrt(eBC.x*eBC.x + eBC.y*eBC.y + eBC.z*eBC.z);
-        eCA.xyz /= sqrt(eCA.x*eCA.x + eCA.y*eCA.y + eCA.z*eCA.z);
-        mixed vAB = (v1.x-v0.x)*eAB.x + (v1.y-v0.y)*eAB.y + (v1.z-v0.z)*eAB.z;
-        mixed vBC = (v2.x-v1.x)*eBC.x + (v2.y-v1.y)*eBC.y + (v2.z-v1.z)*eBC.z;
-        mixed vCA = (v0.x-v2.x)*eCA.x + (v0.y-v2.y)*eCA.y + (v0.z-v2.z)*eCA.z;
-        mixed cA = -(eAB.x*eCA.x + eAB.y*eCA.y + eAB.z*eCA.z);
-        mixed cB = -(eAB.x*eBC.x + eAB.y*eBC.y + eAB.z*eBC.z);
-        mixed cC = -(eBC.x*eCA.x + eBC.y*eCA.y + eBC.z*eCA.z);
-        mixed s2A = 1-cA*cA;
-        mixed s2B = 1-cB*cB;
-        mixed s2C = 1-cC*cC;
-        
-        // Solve the equations.  These are different from those in the SETTLE paper (JCC 13(8), pp. 952-962, 1992), because
-        // in going from equations B1 to B2, they make the assumption that mB=mC (but don't bother to mention they're
-        // making that assumption).  We allow all three atoms to have different masses.
-        
-        mixed mABCinv = 1/(mA*mB*mC);
-        mixed denom = (((s2A*mB+s2B*mA)*mC+(s2A*mB*mB+2*(cA*cB*cC+1)*mA*mB+s2B*mA*mA))*mC+s2C*mA*mB*(mA+mB))*mABCinv;
-        mixed tab = ((cB*cC*mA-cA*mB-cA*mC)*vCA + (cA*cC*mB-cB*mC-cB*mA)*vBC + (s2C*mA*mA*mB*mB*mABCinv+(mA+mB+mC))*vAB)/denom;
-        mixed tbc = ((cA*cB*mC-cC*mB-cC*mA)*vCA + (s2A*mB*mB*mC*mC*mABCinv+(mA+mB+mC))*vBC + (cA*cC*mB-cB*mA-cB*mC)*vAB)/denom;
-        mixed tca = ((s2B*mA*mA*mC*mC*mABCinv+(mA+mB+mC))*vCA + (cA*cB*mC-cC*mB-cC*mA)*vBC + (cB*cC*mA-cA*mB-cA*mC)*vAB)/denom;
-        v0.xyz += (tab*eAB.xyz - tca*eCA.xyz)*v0.w;
-        v1.xyz += (tbc*eBC.xyz - tab*eAB.xyz)*v1.w;
-        v2.xyz += (tca*eCA.xyz - tbc*eBC.xyz)*v2.w;
-        velm[atoms.x] = v0;
-        velm[atoms.y] = v1;
-        velm[atoms.z] = v2;
-    }
-}
\ No newline at end of file
--- a/platforms/opencl/src/kernels/shakeHydrogens.cl
+++ b/platforms/opencl/src/kernels/shakeHydrogens.cl
-mixed4 loadPos(__global const real4* restrict posq, __global const real4* restrict posqCorrection, int index) {
-#ifdef USE_MIXED_PRECISION
-    real4 pos1 = posq[index];
-    real4 pos2 = posqCorrection[index];
-    return (mixed4) (pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
-#else
-    return posq[index];
-#endif
-}
-
-/**
- * Enforce constraints on SHAKE clusters
- */
-
-__kernel void applyShakeToHydrogens(int numClusters, mixed tol, __global const real4* restrict oldPos, __global const real4* restrict posCorrection, __global mixed4* restrict posDelta, __global const int4* restrict clusterAtoms, __global const float4* restrict clusterParams) {
-    int index = get_global_id(0);
-    while (index < numClusters) {
-        // Load the data for this cluster.
-
-        int4 atoms = clusterAtoms[index];
-        float4 params = clusterParams[index];
-        mixed4 pos = loadPos(oldPos, posCorrection, atoms.x);
-        mixed4 xpi = posDelta[atoms.x];
-        mixed4 pos1 = loadPos(oldPos, posCorrection, atoms.y);
-        mixed4 xpj1 = posDelta[atoms.y];
-        mixed4 pos2 = {0.0f, 0.0f, 0.0f, 0.0f};
-        mixed4 xpj2 = {0.0f, 0.0f, 0.0f, 0.0f};
-        float invMassCentral = params.x;
-        float avgMass = params.y;
-        float d2 = params.z;
-        float invMassPeripheral = params.w;
-        if (atoms.z != -1) {
-            pos2 = loadPos(oldPos, posCorrection, atoms.z);
-            xpj2 = posDelta[atoms.z];
-        }
-        mixed4 pos3 = {0.0f, 0.0f, 0.0f, 0.0f};
-        mixed4 xpj3 = {0.0f, 0.0f, 0.0f, 0.0f};
-        if (atoms.w != -1) {
-            pos3 = loadPos(oldPos, posCorrection, atoms.w);
-            xpj3 = posDelta[atoms.w];
-        }
-
-        // Precompute quantities.
-
-        mixed4 rij1 = pos-pos1;
-        mixed4 rij2 = pos-pos2;
-        mixed4 rij3 = pos-pos3;
-        mixed rij1sq = rij1.x*rij1.x + rij1.y*rij1.y + rij1.z*rij1.z;
-        mixed rij2sq = rij2.x*rij2.x + rij2.y*rij2.y + rij2.z*rij2.z;
-        mixed rij3sq = rij3.x*rij3.x + rij3.y*rij3.y + rij3.z*rij3.z;
-        mixed ld1 = d2-rij1sq;
-        mixed ld2 = d2-rij2sq;
-        mixed ld3 = d2-rij3sq;
-
-        // Iterate until convergence.
-
-        int converged = false;
-        int iteration = 0;
-        while (iteration < 15 && !converged) {
-            converged = true;
-#ifdef CONSTRAIN_VELOCITIES
-            mixed4 rpij = xpi-xpj1;
-            mixed rrpr = rpij.x*rij1.x + rpij.y*rij1.y + rpij.z*rij1.z;
-            mixed delta = -2.0f*avgMass*rrpr/rij1sq;
-            mixed4 dr = rij1*delta;
-            xpi.xyz += dr.xyz*invMassCentral;
-            xpj1.xyz -= dr.xyz*invMassPeripheral;
-            if (fabs(delta) > tol)
-                converged = false;
-            if (atoms.z != -1) {
-                rpij = xpi-xpj2;
-                rrpr = rpij.x*rij2.x + rpij.y*rij2.y + rpij.z*rij2.z;
-                delta = -2.0f*avgMass*rrpr/rij2sq;
-                dr = rij2*delta;
-                xpi.xyz += dr.xyz*invMassCentral;
-                xpj2.xyz -= dr.xyz*invMassPeripheral;
-                if (fabs(delta) > tol)
-                    converged = false;
-            }
-            if (atoms.w != -1) {
-                rpij = xpi-xpj3;
-                rrpr = rpij.x*rij3.x + rpij.y*rij3.y + rpij.z*rij3.z;
-                delta = -2.0f*avgMass*rrpr/rij3sq;
-                dr = rij3*delta;
-                xpi.xyz += dr.xyz*invMassCentral;
-                xpj3.xyz -= dr.xyz*invMassPeripheral;
-                if (fabs(delta) > tol)
-                    converged = false;
-            }
-#else
-            mixed4 rpij = xpi-xpj1;
-            mixed rpsqij = rpij.x*rpij.x + rpij.y*rpij.y + rpij.z*rpij.z;
-            mixed rrpr = rij1.x*rpij.x + rij1.y*rpij.y + rij1.z*rpij.z;
-            mixed diff = fabs(ld1-2.0f*rrpr-rpsqij) / (d2*tol);
-            if (diff >= 1.0f) {
-                mixed acor  = (ld1-2.0f*rrpr-rpsqij)*avgMass / (rrpr+rij1sq);
-                mixed4 dr = rij1*acor;
-                xpi.xyz += dr.xyz*invMassCentral;
-                xpj1.xyz -= dr.xyz*invMassPeripheral;
-                converged = false;
-            }
-            if (atoms.z != -1) {
-                rpij.xyz = xpi.xyz-xpj2.xyz;
-                rpsqij = rpij.x*rpij.x + rpij.y*rpij.y + rpij.z*rpij.z;
-                rrpr = rij2.x*rpij.x + rij2.y*rpij.y + rij2.z*rpij.z;
-                diff = fabs(ld2-2.0f*rrpr-rpsqij) / (d2*tol);
-                if (diff >= 1.0f) {
-                    mixed acor  = (ld2 - 2.0f*rrpr - rpsqij)*avgMass / (rrpr + rij2sq);
-                    mixed4 dr = rij2*acor;
-                    xpi.xyz += dr.xyz*invMassCentral;
-                    xpj2.xyz -= dr.xyz*invMassPeripheral;
-                    converged = false;
-                }
-            }
-            if (atoms.w != -1) {
-                rpij.xyz = xpi.xyz-xpj3.xyz;
-                rpsqij = rpij.x*rpij.x + rpij.y*rpij.y + rpij.z*rpij.z;
-                rrpr = rij3.x*rpij.x + rij3.y*rpij.y + rij3.z*rpij.z;
-                diff = fabs(ld3 - 2.0f*rrpr - rpsqij) / (d2*tol);
-                if (diff >= 1.0f) {
-                    mixed acor  = (ld3-2.0f*rrpr-rpsqij)*avgMass / (rrpr+rij3sq);
-                    mixed4 dr = rij3*acor;
-                    xpi.xyz += dr.xyz*invMassCentral;
-                    xpj3.xyz -= dr.xyz*invMassPeripheral;
-                    converged = false;
-                }
-            }
-#endif
-            iteration++;
-        }
-
-        // Record the new positions.
-
-        posDelta[atoms.x] = xpi;
-        posDelta[atoms.y] = xpj1;
-        if (atoms.z != -1)
-            posDelta[atoms.z] = xpj2;
-        if (atoms.w != -1)
-            posDelta[atoms.w] = xpj3;
-        index += get_global_size(0);
-    }
-}
--- a/platforms/opencl/src/kernels/utilities.cl
+++ b/platforms/opencl/src/kernels/utilities.cl
@@ -81,21 +81,26 @@ __kernel void reduceReal4Buffer(__global real4* restrict buffer, int bufferSize,
    }
 }

-#ifdef SUPPORTS_64_BIT_ATOMICS
 /**
 * Sum the various buffers containing forces.
 */
-__kernel void reduceForces(__global const long* restrict longBuffer, __global real4* restrict buffer, int bufferSize, int numBuffers) {
+__kernel void reduceForces(__global long* restrict longBuffer, __global real4* restrict buffer, int bufferSize, int numBuffers) {
    int totalSize = bufferSize*numBuffers;
    real scale = 1/(real) 0x100000000;
    for (int index = get_global_id(0); index < bufferSize; index += get_global_size(0)) {
+#ifdef SUPPORTS_64_BIT_ATOMICS
        real4 sum = (real4) (scale*longBuffer[index], scale*longBuffer[index+bufferSize], scale*longBuffer[index+2*bufferSize], 0);
+#else
+        real4 sum = (real4) 0;
+#endif
        for (int i = index; i < totalSize; i += bufferSize)
            sum += buffer[i];
        buffer[index] = sum;
+        longBuffer[index] = (long) (sum.x*0x100000000);
+        longBuffer[index+bufferSize] = (long) (sum.y*0x100000000);
+        longBuffer[index+2*bufferSize] = (long) (sum.z*0x100000000);
    }
 }
-#endif

 /**
 * Sum the energy buffer.

--- a/platforms/opencl/src/kernels/verlet.cl
+++ b/platforms/opencl/src/kernels/verlet.cl
-/**
- * Perform the first step of verlet integration.
- */
-
-__kernel void integrateVerletPart1(int numAtoms, __global const mixed2* restrict dt, __global const real4* restrict posq, __global const real4* restrict posqCorrection, __global mixed4* restrict velm, __global const real4* restrict force, __global mixed4* restrict posDelta) {
-    mixed2 stepSize = dt[0];
-    mixed dtPos = stepSize.y;
-    mixed dtVel = 0.5f*(stepSize.x+stepSize.y);
-    int index = get_global_id(0);
-    while (index < numAtoms) {
-        mixed4 velocity = velm[index];
-        if (velocity.w != 0.0) {
-#ifdef USE_MIXED_PRECISION
-            real4 pos1 = posq[index];
-            real4 pos2 = posqCorrection[index];
-            mixed4 pos = (mixed4) (pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
-#else
-            real4 pos = posq[index];
-#endif
-            velocity.x += force[index].x*dtVel*velocity.w;
-            velocity.y += force[index].y*dtVel*velocity.w;
-            velocity.z += force[index].z*dtVel*velocity.w;
-            pos.xyz = velocity.xyz*dtPos;
-            posDelta[index] = pos;
-            velm[index] = velocity;
-        }
-        index += get_global_size(0);
-    }
-}
-
-/**
- * Perform the second step of verlet integration.
- */
-
-__kernel void integrateVerletPart2(int numAtoms, __global mixed2* restrict dt, __global real4* restrict posq, __global real4* restrict posqCorrection, __global mixed4* restrict velm, __global const mixed4* restrict posDelta) {
-    mixed2 stepSize = dt[0];
-#ifdef SUPPORTS_DOUBLE_PRECISION
-    double oneOverDt = 1.0/stepSize.y;
-#else
-    float oneOverDt = 1.0f/stepSize.y;
-    float correction = (1.0f-oneOverDt*stepSize.y)/stepSize.y;
-#endif
-    if (get_global_id(0) == 0)
-        dt[0].x = stepSize.y;
-    barrier(CLK_LOCAL_MEM_FENCE);
-    int index = get_global_id(0);
-    while (index < numAtoms) {
-        mixed4 velocity = velm[index];
-        if (velocity.w != 0.0) {
-#ifdef USE_MIXED_PRECISION
-            real4 pos1 = posq[index];
-            real4 pos2 = posqCorrection[index];
-            mixed4 pos = (mixed4) (pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
-#else
-            real4 pos = posq[index];
-#endif
-            mixed4 delta = posDelta[index];
-            pos.xyz += delta.xyz;
-#ifdef SUPPORTS_DOUBLE_PRECISION
-            velocity.xyz = convert_mixed4(convert_double4(delta)*oneOverDt).xyz;
-#else
-            velocity.xyz = delta.xyz*oneOverDt + delta.xyz*correction;
-#endif
-#ifdef USE_MIXED_PRECISION
-            posq[index] = convert_real4(pos);
-            posqCorrection[index] = (real4) (pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
-#else
-            posq[index] = pos;
-#endif
-            velm[index] = velocity;
-        }
-        index += get_global_size(0);
-    }
-}
-
-/**
- * Select the step size to use for the next step.
- */
-
-__kernel void selectVerletStepSize(int numAtoms, mixed maxStepSize, mixed errorTol, __global mixed2* restrict dt, __global const mixed4* restrict velm, __global const real4* restrict force, __local mixed* restrict error) {
-    // Calculate the error.
-
-    mixed err = 0;
-    int index = get_local_id(0);
-    while (index < numAtoms) {
-        real4 f = force[index];
-        mixed invMass = velm[index].w;
-        err += (f.x*f.x + f.y*f.y + f.z*f.z)*invMass*invMass;
-        index += get_global_size(0);
-    }
-    error[get_local_id(0)] = err;
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    // Sum the errors from all threads.
-
-    for (unsigned int offset = 1; offset < get_local_size(0); offset *= 2) {
-        if (get_local_id(0)+offset < get_local_size(0) && (get_local_id(0)&(2*offset-1)) == 0)
-            error[get_local_id(0)] += error[get_local_id(0)+offset];
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-    if (get_local_id(0) == 0) {
-        mixed totalError = sqrt(error[0]/(numAtoms*3));
-        mixed newStepSize = sqrt(errorTol/totalError);
-        mixed oldStepSize = dt[0].y;
-        if (oldStepSize > 0.0f)
-            newStepSize = min(newStepSize, oldStepSize*2.0f); // For safety, limit how quickly dt can increase.
-        if (newStepSize > oldStepSize && newStepSize < 1.1f*oldStepSize)
-            newStepSize = oldStepSize; // Keeping dt constant between steps improves the behavior of the integrator.
-        if (newStepSize > maxStepSize)
-            newStepSize = maxStepSize;
-        dt[0].y = newStepSize;
-    }
-}
--- a/platforms/opencl/src/kernels/virtualSites.cl
+++ b/platforms/opencl/src/kernels/virtualSites.cl
-/**
- * Load the position of a particle.
- */
-mixed4 loadPos(__global const real4* restrict posq, __global const real4* restrict posqCorrection, int index) {
-#ifdef USE_MIXED_PRECISION
-    real4 pos1 = posq[index];
-    real4 pos2 = posqCorrection[index];
-    return (mixed4) (pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
-#else
-    return posq[index];
-#endif
-}
-
-/**
- * Store the position of a particle.
- */
-void storePos(__global real4* restrict posq, __global real4* restrict posqCorrection, int index, mixed4 pos) {
-#ifdef USE_MIXED_PRECISION
-    posq[index] = (real4) ((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w);
-    posqCorrection[index] = (real4) (pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
-#else
-    posq[index] = pos;
-#endif
-}
-
-/**
- * Compute the positions of virtual sites
- */
-__kernel void computeVirtualSites(__global real4* restrict posq,
-#ifdef USE_MIXED_PRECISION
-        __global real4* restrict posqCorrection,
-#endif
-        __global const int4* restrict avg2Atoms, __global const real2* restrict avg2Weights,
-        __global const int4* restrict avg3Atoms, __global const real4* restrict avg3Weights,
-        __global const int4* restrict outOfPlaneAtoms, __global const real4* restrict outOfPlaneWeights,
-        __global const int* restrict localCoordsIndex, __global const int* restrict localCoordsAtoms,
-        __global const real* restrict localCoordsWeights, __global const real4* restrict localCoordsPos,
-        __global const int* restrict localCoordsStartIndex) {
-#ifndef USE_MIXED_PRECISION
-        __global real4* posqCorrection = 0;
-#endif
-    
-    // Two particle average sites.
-    
-    for (int index = get_global_id(0); index < NUM_2_AVERAGE; index += get_global_size(0)) {
-        int4 atoms = avg2Atoms[index];
-        real2 weights = avg2Weights[index];
-        mixed4 pos = loadPos(posq, posqCorrection, atoms.x);
-        mixed4 pos1 = loadPos(posq, posqCorrection, atoms.y);
-        mixed4 pos2 = loadPos(posq, posqCorrection, atoms.z);
-        pos.xyz = pos1.xyz*weights.x + pos2.xyz*weights.y;
-        storePos(posq, posqCorrection, atoms.x, pos);
-    }
-    
-    // Three particle average sites.
-    
-    for (int index = get_global_id(0); index < NUM_3_AVERAGE; index += get_global_size(0)) {
-        int4 atoms = avg3Atoms[index];
-        real4 weights = avg3Weights[index];
-        mixed4 pos = loadPos(posq, posqCorrection, atoms.x);
-        mixed4 pos1 = loadPos(posq, posqCorrection, atoms.y);
-        mixed4 pos2 = loadPos(posq, posqCorrection, atoms.z);
-        mixed4 pos3 = loadPos(posq, posqCorrection, atoms.w);
-        pos.xyz = pos1.xyz*weights.x + pos2.xyz*weights.y + pos3.xyz*weights.z;
-        storePos(posq, posqCorrection, atoms.x, pos);
-    }
-    
-    // Out of plane sites.
-    
-    for (int index = get_global_id(0); index < NUM_OUT_OF_PLANE; index += get_global_size(0)) {
-        int4 atoms = outOfPlaneAtoms[index];
-        real4 weights = outOfPlaneWeights[index];
-        mixed4 pos = loadPos(posq, posqCorrection, atoms.x);
-        mixed4 pos1 = loadPos(posq, posqCorrection, atoms.y);
-        mixed4 pos2 = loadPos(posq, posqCorrection, atoms.z);
-        mixed4 pos3 = loadPos(posq, posqCorrection, atoms.w);
-        mixed4 v12 = pos2-pos1;
-        mixed4 v13 = pos3-pos1;
-        pos.xyz = pos1.xyz + v12.xyz*weights.x + v13.xyz*weights.y + cross(v12, v13).xyz*weights.z;
-        storePos(posq, posqCorrection, atoms.x, pos);
-    }
-    
-    // Local coordinates sites.
-    
-    for (int index = get_global_id(0); index < NUM_LOCAL_COORDS; index += get_global_size(0)) {
-        int siteAtomIndex = localCoordsIndex[index];
-        int start = localCoordsStartIndex[index];
-        int end = localCoordsStartIndex[index+1];
-        mixed3 origin = 0, xdir = 0, ydir = 0;
-        for (int j = start; j < end; j++) {
-            mixed3 pos = loadPos(posq, posqCorrection, localCoordsAtoms[j]).xyz;
-            origin += pos*localCoordsWeights[3*j];
-            xdir += pos*localCoordsWeights[3*j+1];
-            ydir += pos*localCoordsWeights[3*j+2];
-        }
-        mixed3 zdir = cross(xdir, ydir);
-        mixed normXdir = sqrt(xdir.x*xdir.x+xdir.y*xdir.y+xdir.z*xdir.z);
-        mixed normZdir = sqrt(zdir.x*zdir.x+zdir.y*zdir.y+zdir.z*zdir.z);
-        mixed invNormXdir = (normXdir > 0 ? 1/normXdir : 0);
-        mixed invNormZdir = (normZdir > 0 ? 1/normZdir : 0);
-        xdir *= invNormXdir;
-        zdir *= invNormZdir;
-        ydir = cross(zdir, xdir);
-        mixed3 localPosition = convert_mixed4(localCoordsPos[index]).xyz;
-        mixed4 pos = loadPos(posq, posqCorrection, siteAtomIndex);
-        pos.x = origin.x + xdir.x*localPosition.x + ydir.x*localPosition.y + zdir.x*localPosition.z;
-        pos.y = origin.y + xdir.y*localPosition.x + ydir.y*localPosition.y + zdir.y*localPosition.z;
-        pos.z = origin.z + xdir.z*localPosition.x + ydir.z*localPosition.y + zdir.z*localPosition.z;
-        storePos(posq, posqCorrection, siteAtomIndex, pos);
-    }
-}
-
-#ifdef HAS_OVERLAPPING_VSITES
-    #ifdef SUPPORTS_64_BIT_ATOMICS
-        // We will use 64 bit atomics for force redistribution.
-
-        #define ADD_FORCE(index, f) addForce(index, f, longForce);
-
-        #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
-
-        void addForce(int index, real4 f,  __global long* longForce) {
-            atom_add(&longForce[index], (long) (f.x*0x100000000));
-            atom_add(&longForce[index+PADDED_NUM_ATOMS], (long) (f.y*0x100000000));
-            atom_add(&longForce[index+2*PADDED_NUM_ATOMS], (long) (f.z*0x100000000));
-        }
-
-        __kernel void addDistributedForces(__global const long* restrict longForces, __global real4* restrict forces) {
-            real scale = 1/(real) 0x100000000;
-            for (int index = get_global_id(0); index < NUM_ATOMS; index += get_global_size(0)) {
-                real4 f = (real4) (scale*longForces[index], scale*longForces[index+PADDED_NUM_ATOMS], scale*longForces[index+2*PADDED_NUM_ATOMS], 0);
-                forces[index] += f;
-            }
-        }
-    #else
-        // 64 bit atomics aren't supported, so we have to use atomic_cmpxchg() for force redistribution.
-        
-        #define ADD_FORCE(index, f) addForce(index, f, force);
-
-        void atomicAddFloat(__global float* p, float v) {
-            __global int* ip = (__global int*) p;
-            while (true) {
-                union {
-                    float f;
-                    int i;
-                } oldval, newval;
-                oldval.f = *p;
-                newval.f = oldval.f+v;
-                if (atomic_cmpxchg(ip, oldval.i, newval.i) == oldval.i)
-                    return;
-            }
-        }
-
-        void addForce(int index, float4 f, __global float4* force) {
-            __global float* components = (__global float*) force;
-            atomicAddFloat(&components[4*index], f.x);
-            atomicAddFloat(&components[4*index+1], f.y);
-            atomicAddFloat(&components[4*index+2], f.z);
-        }
-    #endif
-#else
-    // There are no overlapping virtual sites, so we can just store forces directly.
-
-    #define ADD_FORCE(index, f) force[index].xyz += (f).xyz;
-#endif
-
-/**
- * Distribute forces from virtual sites to the atoms they are based on.
- */
-__kernel void distributeForces(__global const real4* restrict posq, __global real4* restrict force,
-#ifdef SUPPORTS_64_BIT_ATOMICS
-        __global long* restrict longForce,
-#endif
-#ifdef USE_MIXED_PRECISION
-        __global real4* restrict posqCorrection,
-#endif
-        __global const int4* restrict avg2Atoms, __global const real2* restrict avg2Weights,
-        __global const int4* restrict avg3Atoms, __global const real4* restrict avg3Weights,
-        __global const int4* restrict outOfPlaneAtoms, __global const real4* restrict outOfPlaneWeights,
-        __global const int* restrict localCoordsIndex, __global const int* restrict localCoordsAtoms,
-        __global const real* restrict localCoordsWeights, __global const real4* restrict localCoordsPos,
-        __global const int* restrict localCoordsStartIndex) {
-#ifndef USE_MIXED_PRECISION
-        __global real4* posqCorrection = 0;
-#endif
-
-    // Two particle average sites.
-    
-    for (int index = get_global_id(0); index < NUM_2_AVERAGE; index += get_global_size(0)) {
-        int4 atoms = avg2Atoms[index];
-        real2 weights = avg2Weights[index];
-        real4 f = force[atoms.x];
-        ADD_FORCE(atoms.y, f*weights.x);
-        ADD_FORCE(atoms.z, f*weights.y);
-    }
-    
-    // Three particle average sites.
-    
-    for (int index = get_global_id(0); index < NUM_3_AVERAGE; index += get_global_size(0)) {
-        int4 atoms = avg3Atoms[index];
-        real4 weights = avg3Weights[index];
-        real4 f = force[atoms.x];
-        ADD_FORCE(atoms.y, f*weights.x);
-        ADD_FORCE(atoms.z, f*weights.y);
-        ADD_FORCE(atoms.w, f*weights.z);
-    }
-    
-    // Out of plane sites.
-    
-    for (int index = get_global_id(0); index < NUM_OUT_OF_PLANE; index += get_global_size(0)) {
-        int4 atoms = outOfPlaneAtoms[index];
-        real4 weights = outOfPlaneWeights[index];
-        mixed4 pos1 = loadPos(posq, posqCorrection, atoms.y);
-        mixed4 pos2 = loadPos(posq, posqCorrection, atoms.z);
-        mixed4 pos3 = loadPos(posq, posqCorrection, atoms.w);
-        mixed4 v12 = pos2-pos1;
-        mixed4 v13 = pos3-pos1;
-        real4 f = force[atoms.x];
-        real4 fp2 = (real4) (weights.x*f.x - weights.z*v13.z*f.y + weights.z*v13.y*f.z,
-                   weights.z*v13.z*f.x + weights.x*f.y - weights.z*v13.x*f.z,
-                  -weights.z*v13.y*f.x + weights.z*v13.x*f.y + weights.x*f.z, 0.0f);
-        real4 fp3 = (real4) (weights.y*f.x + weights.z*v12.z*f.y - weights.z*v12.y*f.z,
-                  -weights.z*v12.z*f.x + weights.y*f.y + weights.z*v12.x*f.z,
-                   weights.z*v12.y*f.x - weights.z*v12.x*f.y + weights.y*f.z, 0.0f);
-        ADD_FORCE(atoms.y, f-fp2-fp3);
-        ADD_FORCE(atoms.z, fp2);
-        ADD_FORCE(atoms.w, fp3);
-    }
-    
-    // Local coordinates sites.
-    
-    for (int index = get_global_id(0); index < NUM_LOCAL_COORDS; index += get_global_size(0)) {
-        int siteAtomIndex = localCoordsIndex[index];
-        int start = localCoordsStartIndex[index];
-        int end = localCoordsStartIndex[index+1];
-        mixed3 origin = 0, xdir = 0, ydir = 0;
-        for (int j = start; j < end; j++) {
-            mixed3 pos = loadPos(posq, posqCorrection, localCoordsAtoms[j]).xyz;
-            origin += pos*localCoordsWeights[3*j];
-            xdir += pos*localCoordsWeights[3*j+1];
-            ydir += pos*localCoordsWeights[3*j+2];
-        }
-        mixed3 zdir = cross(xdir, ydir);
-        mixed normXdir = sqrt(xdir.x*xdir.x+xdir.y*xdir.y+xdir.z*xdir.z);
-        mixed normZdir = sqrt(zdir.x*zdir.x+zdir.y*zdir.y+zdir.z*zdir.z);
-        mixed invNormXdir = (normXdir > 0 ? 1/normXdir : 0);
-        mixed invNormZdir = (normZdir > 0 ? 1/normZdir : 0);
-        mixed3 dx = xdir*invNormXdir;
-        mixed3 dz = zdir*invNormZdir;
-        mixed3 dy = cross(dz, dx);
-        mixed3 localPosition = convert_mixed4(localCoordsPos[index]).xyz;
-
-        // The derivatives for this case are very complicated.  They were computed with SymPy then simplified by hand.
-
-        real4 f = force[siteAtomIndex];
-        mixed3 fp1 = localPosition*f.x;
-        mixed3 fp2 = localPosition*f.y;
-        mixed3 fp3 = localPosition*f.z;
-        for (int j = start; j < end; j++) {
-            real originWeight = localCoordsWeights[3*j];
-            real wx = localCoordsWeights[3*j+1];
-            real wy = localCoordsWeights[3*j+2];
-            mixed wxScaled = wx*invNormXdir;
-            mixed t1 = (wx*ydir.x-wy*xdir.x)*invNormZdir;
-            mixed t2 = (wx*ydir.y-wy*xdir.y)*invNormZdir;
-            mixed t3 = (wx*ydir.z-wy*xdir.z)*invNormZdir;
-            mixed sx = t3*dz.y-t2*dz.z;
-            mixed sy = t1*dz.z-t3*dz.x;
-            mixed sz = t2*dz.x-t1*dz.y;
-            real4 fresult = 0;
-            fresult.x += fp1.x*wxScaled*(1-dx.x*dx.x) + fp1.z*(dz.x*sx   ) + fp1.y*((-dx.x*dy.x     )*wxScaled + dy.x*sx - dx.y*t2 - dx.z*t3) + f.x*originWeight;
-            fresult.y += fp1.x*wxScaled*( -dx.x*dx.y) + fp1.z*(dz.x*sy+t3) + fp1.y*((-dx.y*dy.x-dz.z)*wxScaled + dy.x*sy + dx.y*t1);
-            fresult.z += fp1.x*wxScaled*( -dx.x*dx.z) + fp1.z*(dz.x*sz-t2) + fp1.y*((-dx.z*dy.x+dz.y)*wxScaled + dy.x*sz + dx.z*t1);
-            fresult.x += fp2.x*wxScaled*( -dx.y*dx.x) + fp2.z*(dz.y*sx-t3) - fp2.y*(( dx.x*dy.y-dz.z)*wxScaled - dy.y*sx - dx.x*t2);
-            fresult.y += fp2.x*wxScaled*(1-dx.y*dx.y) + fp2.z*(dz.y*sy   ) - fp2.y*(( dx.y*dy.y     )*wxScaled - dy.y*sy + dx.x*t1 + dx.z*t3) + f.y*originWeight;
-            fresult.z += fp2.x*wxScaled*( -dx.y*dx.z) + fp2.z*(dz.y*sz+t1) - fp2.y*(( dx.z*dy.y+dz.x)*wxScaled - dy.y*sz - dx.z*t2);
-            fresult.x += fp3.x*wxScaled*( -dx.z*dx.x) + fp3.z*(dz.z*sx+t2) + fp3.y*((-dx.x*dy.z-dz.y)*wxScaled + dy.z*sx + dx.x*t3);
-            fresult.y += fp3.x*wxScaled*( -dx.z*dx.y) + fp3.z*(dz.z*sy-t1) + fp3.y*((-dx.y*dy.z+dz.x)*wxScaled + dy.z*sy + dx.y*t3);
-            fresult.z += fp3.x*wxScaled*(1-dx.z*dx.z) + fp3.z*(dz.z*sz   ) + fp3.y*((-dx.z*dy.z     )*wxScaled + dy.z*sz - dx.x*t1 - dx.y*t2) + f.z*originWeight;
-            ADD_FORCE(localCoordsAtoms[j], fresult);
-        }
-    }
-}