Merge https://github.com/openmm/openmm

5a06df78 · tic20 · 8dd60914 · a9223eea · 8dd60914 · 5a06df78
Commit 5a06df78 authored Mar 04, 2020 by tic20
20 changed files
--- a/platforms/opencl/src/kernels/cmapTorsionForce.cl
+++ b/platforms/opencl/src/kernels/cmapTorsionForce.cl
-const real PI = 3.14159265358979323846f;
-// Compute the first angle.
-real4 v0a = (real4) (pos1.xyz-pos2.xyz, 0.0f);
-real4 v1a = (real4) (pos3.xyz-pos2.xyz, 0.0f);
-real4 v2a = (real4) (pos3.xyz-pos4.xyz, 0.0f);
-#if APPLY_PERIODIC
-APPLY_PERIODIC_TO_DELTA(v0a)
-APPLY_PERIODIC_TO_DELTA(v1a)
-APPLY_PERIODIC_TO_DELTA(v2a)
-#endif
-real4 cp0a = cross(v0a, v1a);
-real4 cp1a = cross(v1a, v2a);
-real cosangle = dot(normalize(cp0a), normalize(cp1a));
-real angleA;
-if (cosangle > 0.99f || cosangle < -0.99f) {
-    // We're close to the singularity in acos(), so take the cross product and use asin() instead.
-    real4 cross_prod = cross(cp0a, cp1a);
-    real scale = dot(cp0a, cp0a)*dot(cp1a, cp1a);
-    angleA = asin(SQRT(dot(cross_prod, cross_prod)/scale));
-    if (cosangle < 0.0f)
-        angleA = PI-angleA;
-}
-else
-   angleA = acos(cosangle);
-angleA = (dot(v0a, cp1a) >= 0 ? angleA : -angleA);
-angleA = fmod(angleA+2.0f*PI, 2.0f*PI);
-// Compute the second angle.
-real4 v0b = (real4) (pos5.xyz-pos6.xyz, 0.0f);
-real4 v1b = (real4) (pos7.xyz-pos6.xyz, 0.0f);
-real4 v2b = (real4) (pos7.xyz-pos8.xyz, 0.0f);
-#if APPLY_PERIODIC
-APPLY_PERIODIC_TO_DELTA(v0b)
-APPLY_PERIODIC_TO_DELTA(v1b)
-APPLY_PERIODIC_TO_DELTA(v2b)
-#endif
-real4 cp0b = cross(v0b, v1b);
-real4 cp1b = cross(v1b, v2b);
-cosangle = dot(normalize(cp0b), normalize(cp1b));
-real angleB;
-if (cosangle > 0.99f || cosangle < -0.99f) {
-    // We're close to the singularity in acos(), so take the cross product and use asin() instead.
-    real4 cross_prod = cross(cp0b, cp1b);
-    real scale = dot(cp0b, cp0b)*dot(cp1b, cp1b);
-    angleB = asin(SQRT(dot(cross_prod, cross_prod)/scale));
-    if (cosangle < 0.0f)
-        angleB = PI-angleB;
-}
-else
-   angleB = acos(cosangle);
-angleB = (dot(v0b, cp1b) >= 0 ? angleB : -angleB);
-angleB = fmod(angleB+2.0f*PI, 2.0f*PI);
-// Identify which patch this is in.
-int2 pos = MAP_POS[MAPS[index]];
-int size = pos.y;
-real delta = 2*PI/size;
-int s = (int) (angleA/delta);
-int t = (int) (angleB/delta);
-float4 c[4];
-int coeffIndex = pos.x+4*(s+size*t);
-c[0] = COEFF[coeffIndex];
-c[1] = COEFF[coeffIndex+1];
-c[2] = COEFF[coeffIndex+2];
-c[3] = COEFF[coeffIndex+3];
-real da = angleA/delta-s;
-real db = angleB/delta-t;
-// Evaluate the spline to determine the energy and gradients.
-real torsionEnergy = 0.0f;
-real dEdA = 0.0f;
-real dEdB = 0.0f;
-torsionEnergy = da*torsionEnergy + ((c[3].w*db + c[3].z)*db + c[3].y)*db + c[3].x;
-dEdA = db*dEdA + (3.0f*c[3].w*da + 2.0f*c[2].w)*da + c[1].w;
-dEdB = da*dEdB + (3.0f*c[3].w*db + 2.0f*c[3].z)*db + c[3].y;
-torsionEnergy = da*torsionEnergy + ((c[2].w*db + c[2].z)*db + c[2].y)*db + c[2].x;
-dEdA = db*dEdA + (3.0f*c[3].z*da + 2.0f*c[2].z)*da + c[1].z;
-dEdB = da*dEdB + (3.0f*c[2].w*db + 2.0f*c[2].z)*db + c[2].y;
-torsionEnergy = da*torsionEnergy + ((c[1].w*db + c[1].z)*db + c[1].y)*db + c[1].x;
-dEdA = db*dEdA + (3.0f*c[3].y*da + 2.0f*c[2].y)*da + c[1].y;
-dEdB = da*dEdB + (3.0f*c[1].w*db + 2.0f*c[1].z)*db + c[1].y;
-torsionEnergy = da*torsionEnergy + ((c[0].w*db + c[0].z)*db + c[0].y)*db + c[0].x;
-dEdA = db*dEdA + (3.0f*c[3].x*da + 2.0f*c[2].x)*da + c[1].x;
-dEdB = da*dEdB + (3.0f*c[0].w*db + 2.0f*c[0].z)*db + c[0].y;
-dEdA /= delta;
-dEdB /= delta;
-energy += torsionEnergy;
-// Apply the force to the first torsion.
-real normCross1 = dot(cp0a, cp0a);
-real normSqrBC = dot(v1a, v1a);
-real normBC = SQRT(normSqrBC);
-real normCross2 = dot(cp1a, cp1a);
-real dp = 1.0f/normSqrBC;
-real4 ff = (real4) ((-dEdA*normBC)/normCross1, dot(v0a, v1a)*dp, dot(v2a, v1a)*dp, (dEdA*normBC)/normCross2);
-real4 force1 = ff.x*cp0a;
-real4 force4 = ff.w*cp1a;
-real4 d = ff.y*force1 - ff.z*force4;
-real4 force2 = d-force1;
-real4 force3 = -d-force4;
-// Apply the force to the second torsion.
-normCross1 = dot(cp0b, cp0b);
-normSqrBC = dot(v1b, v1b);
-normBC = SQRT(normSqrBC);
-normCross2 = dot(cp1b, cp1b);
-dp = 1.0f/normSqrBC;
-ff = (real4) ((-dEdB*normBC)/normCross1, dot(v0b, v1b)*dp, dot(v2b, v1b)*dp, (dEdB*normBC)/normCross2);
-real4 force5 = ff.x*cp0b;
-real4 force8 = ff.w*cp1b;
-d = ff.y*force5 - ff.z*force8;
-real4 force6 = d-force5;
-real4 force7 = -d-force8;
--- a/platforms/opencl/src/kernels/common.cl
+++ b/platforms/opencl/src/kernels/common.cl
+/**
+ * This file contains OpenCL definitions for the macros and functions needed for the
+ * common compute framework.
+ *
+ * What is defined below:
+ *  - The 32 bit global atomics extension is always enabled; the 64 bit atomics
+ *    extension is enabled only when SUPPORTS_64_BIT_ATOMICS is defined.
+ *  - KERNEL, LOCAL, LOCAL_ARG, GLOBAL and RESTRICT expand to the corresponding OpenCL
+ *    qualifiers.  DEVICE expands to nothing.
+ *  - LOCAL_ID, LOCAL_SIZE, GLOBAL_ID, GLOBAL_SIZE, GROUP_ID and NUM_GROUPS expand to
+ *    the matching get_*() query for dimension 0.
+ *  - SYNC_THREADS and MEM_FENCE expand to a barrier()/mem_fence() call on both the
+ *    local and global fence flags.  Note that both expansions already end with a
+ *    semicolon.
+ *  - ATOMIC_ADD(dest, value) forwards to atom_add().
+ *  - mm_long and mm_ulong are typedefs for long and unsigned long.
+ *  - make_<type>N(...) expands to a parenthesized (<type>N)(...) expression built from
+ *    the argument list, and trimTo3(v) selects the .xyz components of v.
+ */
+#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
+#ifdef SUPPORTS_64_BIT_ATOMICS
+#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
+#endif
+#define KERNEL __kernel
+#define DEVICE
+#define LOCAL __local
+#define LOCAL_ARG __local
+#define GLOBAL __global
+#define RESTRICT restrict
+#define LOCAL_ID get_local_id(0)
+#define LOCAL_SIZE get_local_size(0)
+#define GLOBAL_ID get_global_id(0)
+#define GLOBAL_SIZE get_global_size(0)
+#define GROUP_ID get_group_id(0)
+#define NUM_GROUPS get_num_groups(0)
+#define SYNC_THREADS barrier(CLK_LOCAL_MEM_FENCE+CLK_GLOBAL_MEM_FENCE);
+#define MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE+CLK_GLOBAL_MEM_FENCE);
+#define ATOMIC_ADD(dest, value) atom_add(dest, value)
+typedef long mm_long;
+typedef unsigned long mm_ulong;
+#define make_short2(x...) ((short2) (x))
+#define make_short3(x...) ((short3) (x))
+#define make_short4(x...) ((short4) (x))
+#define make_int2(x...) ((int2) (x))
+#define make_int3(x...) ((int3) (x))
+#define make_int4(x...) ((int4) (x))
+#define make_float2(x...) ((float2) (x))
+#define make_float3(x...) ((float3) (x))
+#define make_float4(x...) ((float4) (x))
+#define make_double2(x...) ((double2) (x))
+#define make_double3(x...) ((double3) (x))
+#define make_double4(x...) ((double4) (x))
+#define trimTo3(v) (v).xyz
+// OpenCL has overloaded versions of standard math functions for single and double
+// precision arguments.  CUDA has separate functions.  To allow them to be called
+// consistently, we define the "single precision" functions to just be synonyms
+// for the standard ones.
+//
+// Each synonym must accept exactly as many arguments as the function it forwards
+// to: pow() and atan2() take two arguments, every other function here takes one.
+// A mismatch makes every call of the synonym fail in the preprocessor.
+#define sqrtf(x) sqrt(x)
+#define rsqrtf(x) rsqrt(x)
+#define expf(x) exp(x)
+#define logf(x) log(x)
+#define powf(x, y) pow(x, y)
+#define cosf(x) cos(x)
+#define sinf(x) sin(x)
+#define tanf(x) tan(x)
+#define acosf(x) acos(x)
+#define asinf(x) asin(x)
+#define atanf(x) atan(x)
+// Parameters are named in atan2's own order: atan2(y, x) is the angle of the point (x, y).
+#define atan2f(y, x) atan2(y, x)
--- a/platforms/opencl/src/kernels/customCompoundBond.cl
+++ b/platforms/opencl/src/kernels/customCompoundBond.cl
-/**
- * Compute the difference between two vectors, setting the fourth component to the squared magnitude.
- */
-real4 ccb_delta(real4 vec1, real4 vec2, bool periodic, real4 periodicBoxSize, real4 invPeriodicBoxSize, 
-        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) {
-    real4 result = (real4) (vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0);
-    if (periodic)
-        APPLY_PERIODIC_TO_DELTA(result);
-    result.w = result.x*result.x + result.y*result.y + result.z*result.z;
-    return result;
-}
-/**
- * Compute the angle between two vectors.  The w component of each vector should contain the squared magnitude.
- */
-real ccb_computeAngle(real4 vec1, real4 vec2) {
-    real dotProduct = vec1.x*vec2.x + vec1.y*vec2.y + vec1.z*vec2.z;
-    real cosine = dotProduct*RSQRT(vec1.w*vec2.w);
-    real angle;
-    if (cosine > 0.99f || cosine < -0.99f) {
-        // We're close to the singularity in acos(), so take the cross product and use asin() instead.
-        real4 crossProduct = cross(vec1, vec2);
-        real scale = vec1.w*vec2.w;
-        angle = asin(SQRT(dot(crossProduct, crossProduct)/scale));
-        if (cosine < 0)
-            angle = M_PI-angle;
-    }
-    else
-       angle = acos(cosine);
-    return angle;
-}
-/**
- * Compute the cross product of two vectors, setting the fourth component to the squared magnitude.
- */
-real4 ccb_computeCross(real4 vec1, real4 vec2) {
-    real4 result = cross(vec1, vec2);
-    result.w = result.x*result.x + result.y*result.y + result.z*result.z;
-    return result;
-}
--- a/platforms/opencl/src/kernels/customExternalForce.cl
+++ b/platforms/opencl/src/kernels/customExternalForce.cl
-COMPUTE_FORCE
-real4 force1 = (real4) (-dEdX, -dEdY, -dEdZ, 0);
--- a/platforms/opencl/src/kernels/customGBChainRule.cl
+++ b/platforms/opencl/src/kernels/customGBChainRule.cl
-#ifdef USE_CUTOFF
-if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2 && r2 < CUTOFF_SQUARED) {
-#else
-if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
-#endif
-#ifdef USE_SYMMETRIC
-    real tempForce = 0.0f;
-#else
-    real4 tempForce1 = (real4) 0;
-    real4 tempForce2 = (real4) 0;
-#endif
-    COMPUTE_FORCE
-#ifdef USE_SYMMETRIC
-    dEdR += tempForce*invR;
-#else
-    dEdR1 += tempForce1;
-    dEdR2 += tempForce2;
-#endif
-}
--- a/platforms/opencl/src/kernels/customGBGradientChainRule.cl
+++ b/platforms/opencl/src/kernels/customGBGradientChainRule.cl
-/**
- * Compute chain rule terms for computed values that depend explicitly on particle coordinates.
- */
-__kernel void computeGradientChainRuleTerms(__global real4* restrict forceBuffers, __global const real4* restrict posq
-        PARAMETER_ARGUMENTS) {
-    INIT_PARAM_DERIVS
-    unsigned int index = get_global_id(0);
-    while (index < NUM_ATOMS) {
-        real4 pos = posq[index];
-        real4 force = forceBuffers[index];
-        COMPUTE_FORCES
-        forceBuffers[index] = force;
-        index += get_global_size(0);
-    }
-    SAVE_PARAM_DERIVS
-}
--- a/platforms/opencl/src/kernels/customIntegrator.cl
+++ b/platforms/opencl/src/kernels/customIntegrator.cl
-__kernel void computeFloatSum(__global const float* restrict sumBuffer, __global float* result, int bufferSize) {
-    __local float tempBuffer[WORK_GROUP_SIZE];
-    const unsigned int thread = get_local_id(0);
-    float sum = 0;
-    for (unsigned int index = thread; index < bufferSize; index += get_local_size(0))
-        sum += sumBuffer[index];
-    tempBuffer[thread] = sum;
-    for (int i = 1; i < WORK_GROUP_SIZE; i *= 2) {
-        barrier(CLK_LOCAL_MEM_FENCE);
-        if (thread%(i*2) == 0 && thread+i < WORK_GROUP_SIZE)
-            tempBuffer[thread] += tempBuffer[thread+i];
-    }
-    if (thread == 0)
-        *result = tempBuffer[0];
-}
-#ifdef SUPPORTS_DOUBLE_PRECISION
-__kernel void computeDoubleSum(__global const double* restrict sumBuffer, __global double* result, int bufferSize) {
-    __local double tempBuffer[WORK_GROUP_SIZE];
-    const unsigned int thread = get_local_id(0);
-    double sum = 0;
-    for (unsigned int index = thread; index < bufferSize; index += get_local_size(0))
-        sum += sumBuffer[index];
-    tempBuffer[thread] = sum;
-    for (int i = 1; i < WORK_GROUP_SIZE; i *= 2) {
-        barrier(CLK_LOCAL_MEM_FENCE);
-        if (thread%(i*2) == 0 && thread+i < WORK_GROUP_SIZE)
-            tempBuffer[thread] += tempBuffer[thread+i];
-    }
-    if (thread == 0)
-        *result = tempBuffer[0];
-}
-#endif
-__kernel void applyPositionDeltas(__global real4* restrict posq, __global real4* restrict posqCorrection, __global mixed4* restrict posDelta) {
-    for (unsigned int index = get_global_id(0); index < NUM_ATOMS; index += get_global_size(0)) {
-#ifdef USE_MIXED_PRECISION
-        real4 pos1 = posq[index];
-        real4 pos2 = posqCorrection[index];
-        mixed4 pos = (mixed4) (pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
-#else
-        real4 pos = posq[index];
-#endif
-        pos.xyz += posDelta[index].xyz;
-#ifdef USE_MIXED_PRECISION
-        posq[index] = (real4) ((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w);
-        posqCorrection[index] = (real4) (pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
-#else
-        posq[index] = pos;
-#endif
-        posDelta[index] = (mixed4) 0;
-    }
-}
-__kernel void generateRandomNumbers(int numValues, __global float4* restrict random, __global uint4* restrict seed) {
-    uint4 state = seed[get_global_id(0)];
-    unsigned int carry = 0;
-    for (int index = get_global_id(0); index < numValues; index += get_global_size(0)) {
-        // Generate three uniform random numbers.
-        state.x = state.x * 69069 + 1;
-        state.y ^= state.y << 13;
-        state.y ^= state.y >> 17;
-        state.y ^= state.y << 5;
-        unsigned int k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
-        unsigned int m = state.w + state.w + state.z + carry;
-        state.z = state.w;
-        state.w = m;
-        carry = k >> 30;
-        float x1 = (float)max(state.x + state.y + state.w, 0x00000001u) / (float)0xffffffff;
-        state.x = state.x * 69069 + 1;
-        state.y ^= state.y << 13;
-        state.y ^= state.y >> 17;
-        state.y ^= state.y << 5;
-        k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
-        m = state.w + state.w + state.z + carry;
-        state.z = state.w;
-        state.w = m;
-        carry = k >> 30;
-        float x2 = (float)max(state.x + state.y + state.w, 0x00000001u) / (float)0xffffffff;
-        state.x = state.x * 69069 + 1;
-        state.y ^= state.y << 13;
-        state.y ^= state.y >> 17;
-        state.y ^= state.y << 5;
-        k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
-        m = state.w + state.w + state.z + carry;
-        state.z = state.w;
-        state.w = m;
-        carry = k >> 30;
-        float x3 = (float)max(state.x + state.y + state.w, 0x00000001u) / (float)0xffffffff;
-        // Record the values.
-        random[index] = (float4) (x1, x2, x3, 0.0f);
-    }
-    seed[get_global_id(0)] = state;
-}
--- a/platforms/opencl/src/kernels/customIntegratorPerDof.cl
+++ b/platforms/opencl/src/kernels/customIntegratorPerDof.cl
-/**
- * Load the position of a particle.
- */
-mixed4 loadPos(__global const real4* restrict posq, __global const real4* restrict posqCorrection, int index) {
-#ifdef USE_MIXED_PRECISION
-    real4 pos1 = posq[index];
-    real4 pos2 = posqCorrection[index];
-    return (mixed4) (pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
-#else
-    return posq[index];
-#endif
-}
-/**
- * Store the position of a particle.
- */
-void storePos(__global real4* restrict posq, __global real4* restrict posqCorrection, int index, mixed4 pos) {
-#ifdef USE_MIXED_PRECISION
-    posq[index] = (real4) ((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w);
-    posqCorrection[index] = (real4) (pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
-#else
-    posq[index] = pos;
-#endif
-}
-__kernel void computePerDof(__global real4* restrict posq, __global real4* restrict posqCorrection, __global mixed4* restrict posDelta,
-        __global mixed4* restrict velm, __global const real4* restrict force, __global const mixed2* restrict dt, __global const mixed* restrict globals,
-        __global mixed* restrict sum, __global const float4* restrict gaussianValues, unsigned int gaussianBaseIndex, __global const float4* restrict uniformValues,
-        const mixed energy, __global mixed* restrict energyParamDerivs
-        PARAMETER_ARGUMENTS) {
-    mixed stepSize = dt[0].y;
-    int index = get_global_id(0);
-    while (index < NUM_ATOMS) {
-#ifdef LOAD_POS_AS_DELTA
-        mixed4 position = loadPos(posq, posqCorrection, index)+posDelta[index];
-#else
-        mixed4 position = loadPos(posq, posqCorrection, index);
-#endif
-        mixed4 velocity = velm[index];
-        mixed4 f = convert_mixed4(force[index]);
-        mixed mass = 1/velocity.w;
-        if (velocity.w != 0.0) {
-            int gaussianIndex = gaussianBaseIndex;
-            int uniformIndex = 0;
-            COMPUTE_STEP
-        }
-        index += get_global_size(0);
-    }
-}
--- a/platforms/opencl/src/kernels/customNonbonded.cl
+++ b/platforms/opencl/src/kernels/customNonbonded.cl
-#ifdef USE_CUTOFF
-if (!isExcluded && r2 < CUTOFF_SQUARED) {
-#else
-if (!isExcluded) {
-#endif
-    real tempForce = 0.0f;
-    real switchValue = 1, switchDeriv = 0;
-#if USE_SWITCH
-    if (r > SWITCH_CUTOFF) {
-        real x = r-SWITCH_CUTOFF;
-        switchValue = 1+x*x*x*(SWITCH_C3+x*(SWITCH_C4+x*SWITCH_C5));
-        switchDeriv = x*x*(3*SWITCH_C3+x*(4*SWITCH_C4+x*5*SWITCH_C5));
-    }
-#endif
-    COMPUTE_FORCE
-#if USE_SWITCH
-    tempForce = tempForce*switchValue - customEnergy*switchDeriv;
-    tempEnergy += customEnergy*switchValue;
-#else
-    tempEnergy += customEnergy;
-#endif
-    dEdR += tempForce*invR;
-}
--- a/platforms/opencl/src/kernels/gayBerne.cl
+++ b/platforms/opencl/src/kernels/gayBerne.cl
-#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
-#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
-#define TILE_SIZE 32
-#define NEIGHBOR_BLOCK_SIZE 32
-/**
- * Calculate the ellipsoid coordinate frames and associated matrices.
- */
-__kernel void computeEllipsoidFrames(int numParticles, __global const real4* restrict posq, __global int2* const restrict axisParticleIndices,
-        __global const float4* restrict sigParams, __global const float4* restrict scale, __global real* restrict aMatrix,
-        __global real* restrict bMatrix, __global real* restrict gMatrix, __global const int* sortedParticles) {
-    for (int sortedIndex = get_global_id(0); sortedIndex < numParticles; sortedIndex += get_global_size(0)) {
-        // Compute the local coordinate system of the ellipsoid;
-        int originalIndex = sortedParticles[sortedIndex];
-        real3 pos = posq[originalIndex].xyz;
-        int2 axisParticles = axisParticleIndices[originalIndex];
-        real3 xdir, ydir, zdir;
-        if (axisParticles.x == -1) {
-            xdir = (real3) (1, 0, 0);
-            ydir = (real3) (0, 1, 0);
-        }
-        else {
-            xdir = pos-posq[axisParticles.x].xyz;
-            xdir = normalize(xdir);
-            if (axisParticles.y == -1) {
-                if (xdir.y > -0.5f && xdir.y < 0.5f)
-                    ydir = (real3) (0, 1, 0);
-                else
-                    ydir = (real3) (1, 0, 0);
-            }
-            else
-                ydir = pos-posq[axisParticles.y].xyz;
-            ydir -= xdir*dot(xdir, ydir);
-            ydir = normalize(ydir);
-        }
-        zdir = cross(xdir, ydir);
-        // Compute matrices we will need later.
-        __global real (*a)[3] = (__global real (*)[3]) (aMatrix+sortedIndex*9);
-        __global real (*b)[3] = (__global real (*)[3]) (bMatrix+sortedIndex*9);
-        __global real (*g)[3] = (__global real (*)[3]) (gMatrix+sortedIndex*9);
-        a[0][0] = xdir.x;
-        a[0][1] = xdir.y;
-        a[0][2] = xdir.z;
-        a[1][0] = ydir.x;
-        a[1][1] = ydir.y;
-        a[1][2] = ydir.z;
-        a[2][0] = zdir.x;
-        a[2][1] = zdir.y;
-        a[2][2] = zdir.z;
-        float4 sig = sigParams[originalIndex];
-        float3 r2 = sig.yzw;
-        float3 e2 = scale[originalIndex].xyz;
-        for (int i = 0; i < 3; i++)
-            for (int j = 0; j < 3; j++) {
-                b[i][j] = a[0][i]*e2.x*a[0][j] + a[1][i]*e2.y*a[1][j] + a[2][i]*e2.z*a[2][j];
-                g[i][j] = a[0][i]*r2.x*a[0][j] + a[1][i]*r2.y*a[1][j] + a[2][i]*r2.z*a[2][j];
-            }
-    }
-}
-/**
- * Find a bounding box for the atoms in each block.
- */
-__kernel void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
-        __global const int* sortedAtoms, __global const real4* restrict posq, __global real4* restrict sortedPos, __global real4* restrict blockCenter,
-        __global real4* restrict blockBoundingBox, __global int* restrict neighborBlockCount) {
-    int index = get_global_id(0);
-    int base = index*TILE_SIZE;
-    while (base < numAtoms) {
-        real4 pos = posq[sortedAtoms[base]];
-        sortedPos[base] = pos;
-#ifdef USE_PERIODIC
-        APPLY_PERIODIC_TO_POS(pos)
-#endif
-        real4 minPos = pos;
-        real4 maxPos = pos;
-        int last = min(base+TILE_SIZE, numAtoms);
-        for (int i = base+1; i < last; i++) {
-            pos = posq[sortedAtoms[i]];
-            sortedPos[i] = pos;
-#ifdef USE_PERIODIC
-            real4 center = 0.5f*(maxPos+minPos);
-            APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, center)
-#endif
-            minPos = min(minPos, pos);
-            maxPos = max(maxPos, pos);
-        }
-        real4 blockSize = 0.5f*(maxPos-minPos);
-        blockBoundingBox[index] = blockSize;
-        blockCenter[index] = 0.5f*(maxPos+minPos);
-        index += get_global_size(0);
-        base = index*TILE_SIZE;
-    }
-    if (get_global_id(0) == 0)
-        *neighborBlockCount = 0;
-}
-/**
- * This is called by findNeighbors() to write a block to the neighbor list.
- */
-void storeNeighbors(int atom1, int* neighborBuffer, int numAtomsInBuffer, int maxNeighborBlocks, __global int* restrict neighbors,
-        __global int* restrict neighborIndex, __global int* restrict neighborBlockCount) {
-    int blockIndex = atom_add(neighborBlockCount, 1);
-    if (blockIndex >= maxNeighborBlocks)
-        return; // We don't have enough room for the neighbor list.
-    neighborIndex[blockIndex] = atom1;
-    int baseIndex = blockIndex*NEIGHBOR_BLOCK_SIZE;
-    for (int i = 0; i < numAtomsInBuffer; i++)
-        neighbors[baseIndex+i] = neighborBuffer[i];
-    for (int i = numAtomsInBuffer; i < NEIGHBOR_BLOCK_SIZE; i++)
-        neighbors[baseIndex+i] = -1;
-}
-/**
- * Build a list of neighbors for each atom.
- */
-__kernel void findNeighbors(int numAtoms, int maxNeighborBlocks, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
-        __global real4* restrict sortedPos, __global real4* restrict blockCenter, __global real4* restrict blockBoundingBox, __global int* restrict neighbors,
-        __global int* restrict neighborIndex, __global int* restrict neighborBlockCount, __global const int* restrict exclusions, __global const int* restrict exclusionStartIndex) {
-    const int numBlocks = (numAtoms+TILE_SIZE-1)/TILE_SIZE;
-    int neighborBuffer[NEIGHBOR_BLOCK_SIZE];
-    for (int atom1 = get_global_id(0); atom1 < numAtoms; atom1 += get_global_size(0)) {
-        int nextExclusion = exclusionStartIndex[atom1];
-        int lastExclusion = exclusionStartIndex[atom1+1];
-        real4 pos = sortedPos[atom1];
-        int nextBufferIndex = 0;
-        // Loop over atom blocks and compute the distance of this atom from each one's bounding box.
-        for (int block = (atom1+1)/TILE_SIZE; block < numBlocks; block++) {
-            real4 center = blockCenter[block];
-            real4 blockSize = blockBoundingBox[block];
-            real4 blockDelta = center-pos;
-#ifdef USE_PERIODIC
-            APPLY_PERIODIC_TO_DELTA(blockDelta)
-#endif
-            blockDelta.x = max((real) 0, fabs(blockDelta.x)-blockSize.x);
-            blockDelta.y = max((real) 0, fabs(blockDelta.y)-blockSize.y);
-            blockDelta.z = max((real) 0, fabs(blockDelta.z)-blockSize.z);
-            if (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z >= CUTOFF_SQUARED)
-                continue;
-            // Loop over atoms within this block.
-            int first = max(block*TILE_SIZE, atom1+1);
-            int last = min((block+1)*TILE_SIZE, numAtoms);
-            for (int atom2 = first; atom2 < last; atom2++) {
-                // Skip over excluded interactions.
-                if (nextExclusion < lastExclusion && exclusions[nextExclusion] >= atom2) {
-                    nextExclusion++;
-                    continue;
-                }
-                real4 delta = pos-sortedPos[atom2];
-#ifdef USE_PERIODIC
-                APPLY_PERIODIC_TO_DELTA(delta)
-#endif
-                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-                if (r2 < CUTOFF_SQUARED) {
-                    neighborBuffer[nextBufferIndex++] = atom2;
-                    if (nextBufferIndex == NEIGHBOR_BLOCK_SIZE) {
-                        storeNeighbors(atom1, neighborBuffer, nextBufferIndex, maxNeighborBlocks, neighbors, neighborIndex, neighborBlockCount);
-                        nextBufferIndex = 0;
-                    }
-                }
-            }
-        }
-        if (nextBufferIndex > 0)
-            storeNeighbors(atom1, neighborBuffer, nextBufferIndex, maxNeighborBlocks, neighbors, neighborIndex, neighborBlockCount);
-    }
-}
-typedef struct {
-    float4 sig;
-    float2 eps;
-    real3 pos;
-    real a[3][3], b[3][3], g[3][3];
-} AtomData;
-void loadAtomData(AtomData* data, int sortedIndex, int originalIndex, __global const real4* restrict pos, __global const float4* restrict sigParams,
-        __global const float2* restrict epsParams, __global const real* restrict aMatrix, __global const real* restrict bMatrix, __global const real* restrict gMatrix) {
-    data->sig = sigParams[originalIndex];
-    data->eps = epsParams[originalIndex];
-    data->pos = pos[sortedIndex].xyz;
-    for (int i = 0; i < 3; i++)
-        for (int j = 0; j < 3; j++) {
-            int k = 9*sortedIndex+3*i+j;
-            data->a[i][j] = aMatrix[k];
-            data->b[i][j] = bMatrix[k];
-            data->g[i][j] = gMatrix[k];
-        }
-}
-real3 matrixVectorProduct(real (*m)[3], real3 v) {
-    return (real3) (m[0][0]*v.x + m[0][1]*v.y + m[0][2]*v.z,
-                    m[1][0]*v.x + m[1][1]*v.y + m[1][2]*v.z,
-                    m[2][0]*v.x + m[2][1]*v.y + m[2][2]*v.z);
-}
-real3 vectorMatrixProduct(real3 v, real (*m)[3]) {
-    return (real3) (m[0][0]*v.x + m[1][0]*v.y + m[2][0]*v.z,
-                    m[0][1]*v.x + m[1][1]*v.y + m[2][1]*v.z,
-                    m[0][2]*v.x + m[1][2]*v.y + m[2][2]*v.z);
-}
-void matrixSum(real (*result)[3], real (*a)[3], real (*b)[3]) {
-    result[0][0] = a[0][0]+b[0][0];
-    result[0][1] = a[0][1]+b[0][1];
-    result[0][2] = a[0][2]+b[0][2];
-    result[1][0] = a[1][0]+b[1][0];
-    result[1][1] = a[1][1]+b[1][1];
-    result[1][2] = a[1][2]+b[1][2];
-    result[2][0] = a[2][0]+b[2][0];
-    result[2][1] = a[2][1]+b[2][1];
-    result[2][2] = a[2][2]+b[2][2];
-}
-real determinant(real (*m)[3]) {
-    return (m[0][0]*m[1][1]*m[2][2] + m[0][1]*m[1][2]*m[2][0] + m[0][2]*m[1][0]*m[2][1] -
-            m[0][0]*m[1][2]*m[2][1] - m[0][1]*m[1][0]*m[2][2] - m[0][2]*m[1][1]*m[2][0]);
-}
-void matrixInverse(real (*result)[3], real (*m)[3]) {
-    real invDet = RECIP(determinant(m));
-    result[0][0] = invDet*(m[1][1]*m[2][2] - m[1][2]*m[2][1]);
-    result[1][0] = -invDet*(m[1][0]*m[2][2] - m[1][2]*m[2][0]);
-    result[2][0] = invDet*(m[1][0]*m[2][1] - m[1][1]*m[2][0]);
-    result[0][1] = -invDet*(m[0][1]*m[2][2] - m[0][2]*m[2][1]);
-    result[1][1] = invDet*(m[0][0]*m[2][2] - m[0][2]*m[2][0]);
-    result[2][1] = -invDet*(m[0][0]*m[2][1] - m[0][1]*m[2][0]);
-    result[0][2] = invDet*(m[0][1]*m[1][2] - m[0][2]*m[1][1]);
-    result[1][2] = -invDet*(m[0][0]*m[1][2] - m[0][2]*m[1][0]);
-    result[2][2] = invDet*(m[0][0]*m[1][1] - m[0][1]*m[1][0]);
-}
-void computeOneInteraction(AtomData* data1, AtomData* data2, real sigma, real epsilon, real3 dr, real r2, real3* force1, real3* force2, real3* torque1, real3* torque2, mixed *totalEnergy) {
-    real rInv = RSQRT(r2);
-    real r = r2*rInv;
-    real3 drUnit = dr*rInv;
-    // Compute the switching function.
-    real switchValue = 1, switchDeriv = 0;
-    #if USE_SWITCH
-    if (r > SWITCH_CUTOFF) {
-        real x = r-SWITCH_CUTOFF;
-        switchValue = 1+x*x*x*(SWITCH_C3+x*(SWITCH_C4+x*SWITCH_C5));
-        switchDeriv = x*x*(3*SWITCH_C3+x*(4*SWITCH_C4+x*5*SWITCH_C5));
-    }
-    #endif
-    // Compute vectors and matrices we'll be needing.
-    real B12[3][3], G12[3][3], B12inv[3][3], G12inv[3][3];
-    matrixSum(B12, data1->b, data2->b);
-    matrixSum(G12, data1->g, data2->g);
-    matrixInverse(B12inv, B12);
-    matrixInverse(G12inv, G12);
-    real detG12 = determinant(G12);
-    // Estimate the distance between the ellipsoids and compute the first terms needed for the energy.
-    real sigma12 = 1/SQRT(0.5f*dot(drUnit, matrixVectorProduct(G12inv, drUnit)));
-    real h12 = r - sigma12;
-    real rho = sigma/(h12+sigma);
-    real rho2 = rho*rho;
-    real rho6 = rho2*rho2*rho2;
-    real u = 4*epsilon*(rho6*rho6-rho6);
-    real eta = SQRT(2*data1->eps.y*data2->eps.y/detG12);
-    real chi = 2*dot(drUnit, matrixVectorProduct(B12inv, drUnit));
-    chi *= chi;
-    real energy = u*eta*chi;
-    // Compute the terms needed for the force.
-    real3 kappa = matrixVectorProduct(G12inv, dr);
-    real3 iota = matrixVectorProduct(B12inv, dr);
-    real rInv2 = rInv*rInv;
-    real dUSLJdr = 24*epsilon*(2*rho6-1)*rho6*rho/sigma;
-    real temp = 0.5f*sigma12*sigma12*sigma12*rInv2;
-    real3 dudr = (drUnit + (kappa-drUnit*dot(kappa, drUnit))*temp)*dUSLJdr;
-    real3 dchidr = (iota-drUnit*dot(iota, drUnit))*(-8*rInv2*SQRT(chi));
-    real3 force = (dchidr*u + dudr*chi)*(eta*switchValue) - drUnit*(energy*switchDeriv);
-    *force1 += force;
-    *force2 -= force;
-    // Compute the terms needed for the torque.
-    for (int j = 0; j < 2; j++) {
-        real (*a)[3] = (j == 0 ? data1->a : data2->a);
-        real (*b)[3] = (j == 0 ? data1->b : data2->b);
-        real (*g)[3] = (j == 0 ? data1->g : data2->g);
-        float4 sig = (j == 0 ? data1->sig : data2->sig);
-        real3 dudq = cross(vectorMatrixProduct(kappa, g), kappa*(temp*dUSLJdr));
-        real3 dchidq = cross(vectorMatrixProduct(iota, b), iota)*(-4*rInv2);
-        real3 scale = (real3) (sig.y, sig.z, sig.w)*(-0.5f*eta/detG12);
-        real d[3][3];
-        d[0][0] = scale.x*(2*a[0][0]*(G12[1][1]*G12[2][2] - G12[1][2]*G12[2][1]) +
-                             a[0][2]*(G12[1][2]*G12[0][1] + G12[1][0]*G12[2][1] - G12[1][1]*(G12[0][2] + G12[2][0])) +
-                             a[0][1]*(G12[0][2]*G12[2][1] + G12[2][0]*G12[1][2] - G12[2][2]*(G12[0][1] + G12[1][0])));
-        d[0][1] = scale.x*(  a[0][0]*(G12[0][2]*G12[2][1] + G12[2][0]*G12[1][2] - G12[2][2]*(G12[0][1] + G12[1][0])) +
-                           2*a[0][1]*(G12[0][0]*G12[2][2] - G12[2][0]*G12[0][2]) +
-                             a[0][2]*(G12[1][0]*G12[0][2] + G12[2][0]*G12[0][1] - G12[0][0]*(G12[1][2] + G12[2][1])));
-        d[0][2] = scale.x*(  a[0][0]*(G12[0][1]*G12[1][2] + G12[1][0]*G12[2][1] - G12[1][1]*(G12[0][2] + G12[2][0])) +
-                             a[0][1]*(G12[1][0]*G12[0][2] + G12[2][0]*G12[0][1] - G12[0][0]*(G12[1][2] + G12[2][1])) +
-                           2*a[0][2]*(G12[1][1]*G12[0][0] - G12[1][0]*G12[0][1]));
-        d[1][0] = scale.y*(2*a[1][0]*(G12[1][1]*G12[2][2] - G12[1][2]*G12[2][1]) +
-                             a[1][1]*(G12[0][2]*G12[2][1] + G12[2][0]*G12[1][2] - G12[2][2]*(G12[0][1] + G12[1][0])) +
-                             a[1][2]*(G12[1][2]*G12[0][1] + G12[1][0]*G12[2][1] - G12[1][1]*(G12[0][2] + G12[2][0])));
-        d[1][1] = scale.y*(  a[1][0]*(G12[0][2]*G12[2][1] + G12[2][0]*G12[1][2] - G12[2][2]*(G12[0][1] + G12[1][0])) +
-                           2*a[1][1]*(G12[2][2]*G12[0][0] - G12[2][0]*G12[0][2]) +
-                             a[1][2]*(G12[1][0]*G12[0][2] + G12[0][1]*G12[2][0] - G12[0][0]*(G12[1][2] + G12[2][1])));
-        d[1][2] = scale.y*(  a[1][0]*(G12[0][1]*G12[1][2] + G12[1][0]*G12[2][1] - G12[1][1]*(G12[0][2] + G12[2][0])) +
-                             a[1][1]*(G12[1][0]*G12[0][2] + G12[0][1]*G12[2][0] - G12[0][0]*(G12[1][2] + G12[2][1])) +
-                           2*a[1][2]*(G12[1][1]*G12[0][0] - G12[1][0]*G12[0][1]));
-        d[2][0] = scale.z*(2*a[2][0]*(G12[1][1]*G12[2][2] - G12[2][1]*G12[1][2]) +
-                             a[2][1]*(G12[0][2]*G12[2][1] + G12[1][2]*G12[2][0] - G12[2][2]*(G12[0][1] + G12[1][0])) +
-                             a[2][2]*(G12[0][1]*G12[1][2] + G12[2][1]*G12[1][0] - G12[1][1]*(G12[0][2] + G12[2][0])));
-        d[2][1] = scale.z*(  a[2][0]*(G12[0][2]*G12[2][1] + G12[1][2]*G12[2][0] - G12[2][2]*(G12[0][1] + G12[1][0])) +
-                           2*a[2][1]*(G12[0][0]*G12[2][2] - G12[0][2]*G12[2][0]) +
-                             a[2][2]*(G12[1][0]*G12[0][2] + G12[0][1]*G12[2][0] - G12[0][0]*(G12[1][2] + G12[2][1])));
-        d[2][2] = scale.z*(  a[2][0]*(G12[0][1]*G12[1][2] + G12[2][1]*G12[1][0] - G12[1][1]*(G12[0][2] + G12[2][0])) +
-                             a[2][1]*(G12[1][0]*G12[0][2] + G12[2][0]*G12[0][1] - G12[0][0]*(G12[1][2] + G12[2][1])) +
-                           2*a[2][2]*(G12[1][1]*G12[0][0] - G12[1][0]*G12[0][1]));
-        real3 detadq = 0;
-        for (int i = 0; i < 3; i++)
-            detadq += cross((real3) (a[i][0], a[i][1], a[i][2]), (real3) (d[i][0], d[i][1], d[i][2]));
-        real3 torque = (dchidq*(u*eta) + detadq*(u*chi) + dudq*(eta*chi))*switchValue;
-        *(j == 0 ? torque1 : torque2) -= torque;
-    }
-    *totalEnergy += switchValue*energy;
-}
-/**
- * Add one particle's force and torque to the 64 bit fixed point accumulation buffers.
- * Each component is scaled by 2^32 (0x100000000) before being converted to a long, and
- * the x, y and z components are stored PADDED_NUM_ATOMS elements apart.
- */
-void accumulateForceAndTorque(__global long* restrict forceBuffers, __global long* restrict torqueBuffers, int index, real3 force, real3 torque) {
-    atom_add(&forceBuffers[index], (long) (force.x*0x100000000));
-    atom_add(&forceBuffers[index+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
-    atom_add(&forceBuffers[index+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
-    atom_add(&torqueBuffers[index], (long) (torque.x*0x100000000));
-    atom_add(&torqueBuffers[index+PADDED_NUM_ATOMS], (long) (torque.y*0x100000000));
-    atom_add(&torqueBuffers[index+2*PADDED_NUM_ATOMS], (long) (torque.z*0x100000000));
-}
-/**
- * Compute the interactions.
- *
- * The kernel works in two phases:
- *  1. Ordinary pairs.  With USE_CUTOFF the pairs come from the neighbor list (one block of
- *     NEIGHBOR_BLOCK_SIZE candidate partners per entry of neighborIndex, with -1 marking an
- *     empty slot).  Without it every pair atom2 > atom1 is visited, skipping the entries
- *     listed in exclusions.
- *  2. Exceptions, which use the sigma and epsilon stored in exceptionParams instead of the
- *     values combined from the per-particle parameters.
- *
- * Forces and torques are accumulated atomically in fixed point.  The energy gathered by
- * each work-item is added to its own slot of energyBuffer.
- */
-__kernel void computeForce(
-        __global long* restrict forceBuffers, __global long* restrict torqueBuffers,
-        int numAtoms, int numExceptions, __global mixed* restrict energyBuffer, __global const real4* restrict pos,
-        __global const float4* restrict sigParams, __global const float2* restrict epsParams, __global const int* restrict sortedAtoms,
-        __global const real* restrict aMatrix, __global const real* restrict bMatrix, __global const real* restrict gMatrix,
-        __global const int* restrict exclusions, __global const int* restrict exclusionStartIndex,
-        __global const int4* restrict exceptionParticles, __global const float2* restrict exceptionParams
-#ifdef USE_CUTOFF
-        , int maxNeighborBlocks, __global int* restrict neighbors, __global int* restrict neighborIndex, __global int* restrict neighborBlockCount,
-        real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
-#endif
-        ) {
-    mixed energy = 0;
-#ifdef USE_CUTOFF
-    const int numBlocks = *neighborBlockCount;
-    if (numBlocks > maxNeighborBlocks)
-        return; // There wasn't enough memory for the neighbor list.
-    for (int block = get_global_id(0); block < numBlocks; block += get_global_size(0)) {
-        // Load parameters for atom1.
-        int atom1 = neighborIndex[block];
-        int index1 = sortedAtoms[atom1];
-        AtomData data1;
-        loadAtomData(&data1, atom1, index1, pos, sigParams, epsParams, aMatrix, bMatrix, gMatrix);
-        real3 force1 = 0.0f;
-        real3 torque1 = 0.0f;
-        for (int indexInBlock = 0; indexInBlock < NEIGHBOR_BLOCK_SIZE; indexInBlock++) {
-            // Load parameters for atom2.
-            int atom2 = neighbors[NEIGHBOR_BLOCK_SIZE*block+indexInBlock];
-            if (atom2 == -1)
-                continue;
-            int index2 = sortedAtoms[atom2];
-            AtomData data2;
-            loadAtomData(&data2, atom2, index2, pos, sigParams, epsParams, aMatrix, bMatrix, gMatrix);
-            real3 force2 = 0.0f;
-            real3 torque2 = 0.0f;
-            // Compute the interaction.
-            real3 delta = data1.pos-data2.pos;
-#ifdef USE_PERIODIC
-            APPLY_PERIODIC_TO_DELTA(delta)
-#endif
-            real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-            real sigma = data1.sig.x+data2.sig.x;
-            real epsilon = data1.eps.x*data2.eps.x;
-            computeOneInteraction(&data1, &data2, sigma, epsilon, delta, r2, &force1, &force2, &torque1, &torque2, &energy);
-            accumulateForceAndTorque(forceBuffers, torqueBuffers, index2, force2, torque2);
-        }
-        // atom1's contributions are summed over the whole block and written once.
-        accumulateForceAndTorque(forceBuffers, torqueBuffers, index1, force1, torque1);
-    }
-#else
-    for (int atom1 = get_global_id(0); atom1 < numAtoms; atom1 += get_global_size(0)) {
-        // Load parameters for atom1.
-        int index1 = sortedAtoms[atom1];
-        AtomData data1;
-        loadAtomData(&data1, atom1, index1, pos, sigParams, epsParams, aMatrix, bMatrix, gMatrix);
-        real3 force1 = 0.0f;
-        real3 torque1 = 0.0f;
-        // The exclusions of atom1 occupy [nextExclusion, lastExclusion) and are consumed in order
-        // while atom2 increases (presumably the list is sorted by atom2 -- confirm against the host code).
-        int nextExclusion = exclusionStartIndex[atom1];
-        int lastExclusion = exclusionStartIndex[atom1+1];
-        for (int atom2 = atom1+1; atom2 < numAtoms; atom2++) {
-            // Skip over excluded interactions.
-            if (nextExclusion < lastExclusion && exclusions[nextExclusion] == atom2) {
-                nextExclusion++;
-                continue;
-            }
-            // Load parameters for atom2.
-            int index2 = sortedAtoms[atom2];
-            AtomData data2;
-            loadAtomData(&data2, atom2, index2, pos, sigParams, epsParams, aMatrix, bMatrix, gMatrix);
-            real3 force2 = 0.0f;
-            real3 torque2 = 0.0f;
-            // Compute the interaction.
-            real3 delta = data1.pos-data2.pos;
-            real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-            real sigma = data1.sig.x+data2.sig.x;
-            real epsilon = data1.eps.x*data2.eps.x;
-            computeOneInteraction(&data1, &data2, sigma, epsilon, delta, r2, &force1, &force2, &torque1, &torque2, &energy);
-            accumulateForceAndTorque(forceBuffers, torqueBuffers, index2, force2, torque2);
-        }
-        accumulateForceAndTorque(forceBuffers, torqueBuffers, index1, force1, torque1);
-    }
-#endif
-    // Now compute exceptions.
-    for (int index = get_global_id(0); index < numExceptions; index += get_global_size(0)) {
-        // x and y hold the original particle indices, z and w the corresponding sorted indices.
-        int4 atomIndices = exceptionParticles[index];
-        float2 params = exceptionParams[index];
-        int index1 = atomIndices.x, index2 = atomIndices.y;
-        int atom1 = atomIndices.z, atom2 = atomIndices.w;
-        AtomData data1, data2;
-        loadAtomData(&data1, atom1, index1, pos, sigParams, epsParams, aMatrix, bMatrix, gMatrix);
-        loadAtomData(&data2, atom2, index2, pos, sigParams, epsParams, aMatrix, bMatrix, gMatrix);
-        real3 force1 = 0, force2 = 0;
-        real3 torque1 = 0, torque2 = 0;
-        real3 delta = data1.pos-data2.pos;
-        real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
-#ifdef USE_CUTOFF
-        if (r2 < CUTOFF_SQUARED) {
-#endif
-            computeOneInteraction(&data1, &data2, params.x, params.y, delta, r2, &force1, &force2, &torque1, &torque2, &energy);
-            accumulateForceAndTorque(forceBuffers, torqueBuffers, index1, force1, torque1);
-            accumulateForceAndTorque(forceBuffers, torqueBuffers, index2, force2, torque2);
-#ifdef USE_CUTOFF
-        }
-#endif
-    }
-    energyBuffer[get_global_id(0)] += energy;
-}
-/**
- * Convert the torques to forces on the connected particles.
- *
- * For every particle that has an x axis particle (axisParticles.x != -1) the accumulated
- * torque is replaced by a force on the x particle and, when a y axis particle is present,
- * a force on the y particle.  The opposite of each force is applied to the particle itself,
- * so the forces added by this kernel sum to zero.  Particles without an x axis particle are
- * left untouched.
- */
-__kernel void applyTorques(
-        __global long* restrict forceBuffers, __global long* restrict torqueBuffers,
-        int numParticles, __global const real4* restrict posq, __global int2* const restrict axisParticleIndices,
-        __global const int* sortedParticles) {
-    for (int sortedIndex = get_global_id(0); sortedIndex < numParticles; sortedIndex += get_global_size(0)) {
-        int originalIndex = sortedParticles[sortedIndex];
-        int2 axisParticles = axisParticleIndices[originalIndex];
-        if (axisParticles.x == -1)
-            continue; // No axis is defined, so there is nothing to push against.
-        real3 pos = posq[originalIndex].xyz;
-        // Load the torque, converting it back from 2^32 fixed point.
-        real scale = 1/(real) 0x100000000;
-        real3 torque = (real3) (scale*torqueBuffers[originalIndex], scale*torqueBuffers[originalIndex+PADDED_NUM_ATOMS], scale*torqueBuffers[originalIndex+2*PADDED_NUM_ATOMS]);
-        real3 force = 0, xforce = 0;
-        // Apply a force to the x particle.
-        real3 dx = posq[axisParticles.x].xyz-pos;
-        real dx2 = dot(dx, dx);
-        real3 f = cross(torque, dx)/dx2;
-        xforce += f;
-        force -= f;
-        atom_add(&forceBuffers[axisParticles.x], (long) (xforce.x*0x100000000));
-        atom_add(&forceBuffers[axisParticles.x+PADDED_NUM_ATOMS], (long) (xforce.y*0x100000000));
-        atom_add(&forceBuffers[axisParticles.x+2*PADDED_NUM_ATOMS], (long) (xforce.z*0x100000000));
-        if (axisParticles.y != -1) {
-            // Apply a force to the y particle.  This is based on the component of the torque
-            // that was not already applied to the x particle, i.e. its projection onto dx.
-            real3 dy = posq[axisParticles.y].xyz-pos;
-            real dy2 = dot(dy, dy);
-            real3 torque2 = dx*dot(torque, dx)/dx2;
-            real3 yforce = 0;
-            f = cross(torque2, dy)/dy2;
-            yforce += f;
-            force -= f;
-            atom_add(&forceBuffers[axisParticles.y], (long) (yforce.x*0x100000000));
-            atom_add(&forceBuffers[axisParticles.y+PADDED_NUM_ATOMS], (long) (yforce.y*0x100000000));
-            atom_add(&forceBuffers[axisParticles.y+2*PADDED_NUM_ATOMS], (long) (yforce.z*0x100000000));
-        }
-        // The particle itself receives the reaction to everything applied above.
-        atom_add(&forceBuffers[originalIndex], (long) (force.x*0x100000000));
-        atom_add(&forceBuffers[originalIndex+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
-        atom_add(&forceBuffers[originalIndex+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
-    }
-}
--- a/platforms/opencl/src/kernels/gbsaObc2.cl
+++ b/platforms/opencl/src/kernels/gbsaObc2.cl
-{
-    // Accumulates this pair's contribution to dEdR.  The "J" quantities combine OBC_PARAMS1.x
-    // with OBC_PARAMS2.y and are weighted by BORN_FORCE1; the "I" quantities mirror them.
-    real quarterInvR2 = 0.25f*invR*invR;
-
-    // Direction J.
-    real outerDistJ = r+OBC_PARAMS2.y;
-    real invInnerJ = RECIP(max((real) OBC_PARAMS1.x, fabs(r-OBC_PARAMS2.y)));
-    real invOuterJ = RECIP(outerDistJ);
-    real invInnerSqJ = invInnerJ*invInnerJ;
-    real invOuterSqJ = invOuterJ*invOuterJ;
-    real logRatioJ = LOG(invOuterJ*RECIP(invInnerJ));
-    real sqDiffJ = (invInnerSqJ-invOuterSqJ);
-    real termJ = (0.5f*(0.25f+OBC_PARAMS2.y*OBC_PARAMS2.y*quarterInvR2)*sqDiffJ + logRatioJ*quarterInvR2)*invR;
-
-    // Direction I.
-    real outerDistI = r+OBC_PARAMS1.y;
-    real invInnerI = RECIP(max((real) OBC_PARAMS2.x, fabs(r-OBC_PARAMS1.y)));
-    real invOuterI = RECIP(outerDistI);
-    real invInnerSqI = invInnerI*invInnerI;
-    real invOuterSqI = invOuterI*invOuterI;
-    real logRatioI = LOG(invOuterI*RECIP(invInnerI));
-    real sqDiffI = (invInnerSqI-invOuterSqI);
-    real termI = (0.5f*(0.25f+OBC_PARAMS1.y*OBC_PARAMS1.y*quarterInvR2)*sqDiffI + logRatioI*quarterInvR2)*invR;
-
-    // A direction only contributes when its x parameter is smaller than the outer distance.
-    real pairdEdR = (OBC_PARAMS1.x < outerDistJ ? BORN_FORCE1*termJ : (real) 0);
-    pairdEdR += (OBC_PARAMS2.x < outerDistI ? BORN_FORCE2*termI : (real) 0);
-
-    // Padding atoms and self interactions never count; with a cutoff the pair must also be in range.
-    bool pairIsActive = (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2);
-#ifdef USE_CUTOFF
-    pairIsActive = (pairIsActive && r2 < CUTOFF_SQUARED);
-#endif
-    dEdR += (pairIsActive ? pairdEdR : (real) 0);
-}
--- a/platforms/opencl/src/kernels/harmonicAngleForce.cl
+++ b/platforms/opencl/src/kernels/harmonicAngleForce.cl
-float2 angleParams = PARAMS[index]; // per-angle parameters: x is subtracted from theta, y scales the energy
-real deltaIdeal = theta-angleParams.x; // deviation of the current angle from angleParams.x
-energy += 0.5f*angleParams.y*deltaIdeal*deltaIdeal; // quadratic energy 0.5*y*deltaIdeal^2
-real dEdAngle = angleParams.y*deltaIdeal; // derivative of that energy with respect to theta
--- a/platforms/opencl/src/kernels/harmonicBondForce.cl
+++ b/platforms/opencl/src/kernels/harmonicBondForce.cl
-float2 bondParams = PARAMS[index]; // per-bond parameters: x is subtracted from r, y scales the energy
-real deltaIdeal = r-bondParams.x; // deviation of the current distance from bondParams.x
-energy += 0.5f * bondParams.y*deltaIdeal*deltaIdeal; // quadratic energy 0.5*y*deltaIdeal^2
-real dEdR = bondParams.y * deltaIdeal; // derivative of that energy with respect to r
--- a/platforms/opencl/src/kernels/integrationUtilities.cl
+++ b/platforms/opencl/src/kernels/integrationUtilities.cl
-/**
- * Apply a time shift to the velocities before computing kinetic energy.
- *
- * Each velocity is advanced by timeShift*force*w, where w is the fourth component of the
- * velocity entry.  Entries whose w is zero are left unchanged.
- */
-__kernel void timeShiftVelocities(__global mixed4* restrict velm, __global const real4* restrict force, real timeShift) {
-    const int stride = get_global_size(0);
-    int atom = get_global_id(0);
-    while (atom < NUM_ATOMS) {
-        mixed4 shifted = velm[atom];
-        if (shifted.w != 0.0) {
-            // Convert the force to the precision used for velocities before combining them.
-            const mixed4 atomForce = convert_mixed4(force[atom]);
-            shifted.xyz += timeShift*atomForce.xyz*shifted.w;
-            velm[atom] = shifted;
-        }
-        atom += stride;
-    }
-}
\ No newline at end of file
--- a/platforms/opencl/src/kernels/langevin.cl
+++ b/platforms/opencl/src/kernels/langevin.cl
-enum {VelScale, ForceScale, NoiseScale, MaxParams}; // indices into the integration parameter buffer; MaxParams is the number of entries
-/**
- * Perform the first step of Langevin integration.
- *
- * For every particle whose velm w component is nonzero, the velocity becomes
- *     vscale*v + fscale*w*force + noisescale*sqrt(w)*random
- * and posDelta receives stepSize times the new velocity entry.  The three scale factors are
- * read from paramBuffer and the step size from dt[0].y.  Random values are taken from
- * `random` starting at randomIndex plus the work-item's global id.
- */
-__kernel void integrateLangevinPart1(__global mixed4* restrict velm, __global const real4* restrict force, __global mixed4* restrict posDelta,
-        __global const mixed* restrict paramBuffer, __global const mixed2* restrict dt, __global const float4* restrict random, unsigned int randomIndex) {
-    const mixed vscale = paramBuffer[VelScale];
-    const mixed fscale = paramBuffer[ForceScale];
-    const mixed noisescale = paramBuffer[NoiseScale];
-    const mixed stepSize = dt[0].y;
-    randomIndex += get_global_id(0);
-    // randomIndex advances together with the atom index, also for particles that are skipped.
-    for (int atom = get_global_id(0); atom < NUM_ATOMS; atom += get_global_size(0), randomIndex += get_global_size(0)) {
-        mixed4 velocity = velm[atom];
-        if (velocity.w == 0.0)
-            continue;
-        const mixed sqrtInvMass = sqrt(velocity.w);
-        const real4 atomForce = force[atom];
-        const float4 noise = random[randomIndex];
-        velocity.x = vscale*velocity.x + fscale*velocity.w*atomForce.x + noisescale*sqrtInvMass*noise.x;
-        velocity.y = vscale*velocity.y + fscale*velocity.w*atomForce.y + noisescale*sqrtInvMass*noise.y;
-        velocity.z = vscale*velocity.z + fscale*velocity.w*atomForce.z + noisescale*sqrtInvMass*noise.z;
-        velm[atom] = velocity;
-        posDelta[atom] = stepSize*velocity;
-    }
-}
-/**
- * Perform the second step of Langevin integration.
- *
- * For every particle whose velm w component is nonzero, posDelta is added to the position
- * and the velocity is recomputed as posDelta divided by the step size (dt[0].y).  With
- * USE_MIXED_PRECISION the position is assembled from posq plus posqCorrection, and the part
- * that does not fit into a real is written back to posqCorrection.
- */
-__kernel void integrateLangevinPart2(__global real4* restrict posq, __global real4* restrict posqCorrection, __global const mixed4* restrict posDelta, __global mixed4* restrict velm, __global const mixed2* restrict dt) {
-#ifdef SUPPORTS_DOUBLE_PRECISION
-    double invStepSize = 1.0/dt[0].y;
-#else
-    // Without doubles, add a correction term for the rounding error in the reciprocal.
-    float invStepSize = 1.0f/dt[0].y;
-    float correction = (1.0f-invStepSize*dt[0].y)/dt[0].y;
-#endif
-    for (int atom = get_global_id(0); atom < NUM_ATOMS; atom += get_global_size(0)) {
-        mixed4 velocity = velm[atom];
-        if (velocity.w == 0.0)
-            continue;
-#ifdef USE_MIXED_PRECISION
-        real4 posMain = posq[atom];
-        real4 posExtra = posqCorrection[atom];
-        mixed4 pos = (mixed4) (posMain.x+(mixed)posExtra.x, posMain.y+(mixed)posExtra.y, posMain.z+(mixed)posExtra.z, posMain.w);
-#else
-        real4 pos = posq[atom];
-#endif
-        mixed4 displacement = posDelta[atom];
-        pos.xyz += displacement.xyz;
-#ifdef SUPPORTS_DOUBLE_PRECISION
-        velocity.xyz = convert_mixed4(invStepSize*convert_double4(displacement)).xyz;
-#else
-        velocity.xyz = invStepSize*displacement.xyz + correction*displacement.xyz;
-#endif
-#ifdef USE_MIXED_PRECISION
-        posq[atom] = convert_real4(pos);
-        posqCorrection[atom] = (real4) (pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
-#else
-        posq[atom] = pos;
-#endif
-        velm[atom] = velocity;
-    }
-}
-/**
- * Select the step size to use for the next step.
- *
- * The error estimate is the root mean square of force*w over all particle components, where
- * w is the fourth component of velm.  The new step size sqrt(errorTol/error) is limited to
- * twice the previous one and to maxStepSize, and is stored in dt[0].y.  The velocity, force
- * and noise scale factors matching the new step size are then written to paramBuffer.
- *
- * NOTE(review): the partial sums are combined through local memory only, so this presumably
- * expects to be launched as a single work-group -- confirm against the host code.
- */
-__kernel void selectLangevinStepSize(mixed maxStepSize, mixed errorTol, mixed friction, mixed kT, __global mixed2* restrict dt,
-        __global const mixed4* restrict velm, __global const real4* restrict force, __global mixed* restrict paramBuffer, __local mixed* restrict params, __local mixed* restrict error) {
-    const unsigned int lane = get_local_id(0);
-    const unsigned int laneCount = get_local_size(0);
-
-    // Calculate the error.
-    mixed partialError = 0.0f;
-    for (unsigned int atom = lane; atom < NUM_ATOMS; atom += get_global_size(0)) {
-        const real4 f = force[atom];
-        const mixed invMass = velm[atom].w;
-        partialError += (f.x*f.x + f.y*f.y + f.z*f.z)*invMass*invMass;
-    }
-    error[lane] = partialError;
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    // Sum the errors from all threads.
-    for (unsigned int offset = 1; offset < laneCount; offset *= 2) {
-        const bool isReceiver = ((lane&(2*offset-1)) == 0);
-        if (lane+offset < laneCount && isReceiver)
-            error[lane] += error[lane+offset];
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-
-    if (get_global_id(0) == 0) {
-        // Select the new step size.
-        const mixed totalError = sqrt(error[0]/(NUM_ATOMS*3));
-        const mixed oldStepSize = dt[0].y;
-        mixed newStepSize = sqrt(errorTol/totalError);
-        if (oldStepSize > 0.0f)
-            newStepSize = min(newStepSize, oldStepSize*2.0f); // For safety, limit how quickly dt can increase.
-        if (newStepSize > oldStepSize && newStepSize < 1.1f*oldStepSize)
-            newStepSize = oldStepSize; // Keeping dt constant between steps improves the behavior of the integrator.
-        if (newStepSize > maxStepSize)
-            newStepSize = maxStepSize;
-        dt[0].y = newStepSize;
-
-        // Recalculate the integration parameters.
-        const mixed vscale = exp(-newStepSize*friction);
-        params[VelScale] = vscale;
-        params[ForceScale] = (friction == 0 ? newStepSize : (1-vscale)/friction);
-        params[NoiseScale] = sqrt(kT*(1-vscale*vscale));
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    // Publish the parameters, one entry per work-item.
-    if (lane < MaxParams)
-        paramBuffer[lane] = params[lane];
-}
--- a/platforms/opencl/src/kernels/nonbondedExceptions.cl
+++ b/platforms/opencl/src/kernels/nonbondedExceptions.cl
 float4 exceptionParams = PARAMS[index];
 real4 delta = pos2-pos1;
+#if APPLY_PERIODIC
+APPLY_PERIODIC_TO_DELTA(delta)
+#endif
 real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
 real invR = RSQRT(r2);
 real sig2 = invR*exceptionParams.y;

--- a/platforms/opencl/src/kernels/noseHooverChain.cl
+++ b/platforms/opencl/src/kernels/noseHooverChain.cl
+/**
+ * Propagate one Nose-Hoover chain and compute the factor by which the thermostated
+ * velocities must be scaled.
+ *
+ * chainData holds one (position, velocity) pair per bead; chainMasses and chainForces are
+ * scratch arrays with one entry per bead that are rewritten on every call.  chainType selects
+ * which component is used: 0 reads energySum[0].x and writes scaleFactor[0].x, any other value
+ * uses the .y components.  Nothing is modified when the kinetic energy is below 1e-8.
+ *
+ * BEGIN_YS_LOOP / END_YS_LOOP are defined outside this file; the code between them relies on
+ * them to provide the weight `ys` for each pass.
+ */
+__kernel void propagateNoseHooverChain(__global mixed2* restrict chainData, __global const mixed2 * restrict energySum, __global mixed2* restrict scaleFactor,
+                                                    __global mixed* restrict chainMasses, __global mixed* restrict chainForces, 
+                                                    int chainType, int chainLength, int numMTS, int numDOFs, float timeStep,
+                                                    mixed kT, float frequency){
+    const mixed kineticEnergy = (chainType == 0 ? energySum[0].x : energySum[0].y);
+    if (kineticEnergy < 1e-8)
+        return;
+    const int lastBead = chainLength - 1;
+    mixed scale = 1;
+
+    // Every bead has mass kT/frequency^2; the first one is additionally multiplied by numDOFs.
+    const mixed beadMass = kT / (frequency * frequency);
+    for (int bead = 0; bead < chainLength; ++bead)
+        chainMasses[bead] = beadMass;
+    chainMasses[0] *= numDOFs;
+
+    // Initial forces: the first bead is driven by the kinetic energy, each later bead by the
+    // velocity of the bead before it.
+    const mixed KE2 = 2.0f * kineticEnergy;
+    const mixed timeOverMTS = timeStep / numMTS;
+    chainForces[0] = (KE2 - numDOFs * kT) / chainMasses[0];
+    for (int bead = 1; bead < chainLength; ++bead) {
+        chainForces[bead] = (chainMasses[bead - 1] * chainData[bead - 1].y * chainData[bead - 1].y - kT) / chainMasses[bead];
+    }
+
+    for (int mts = 0; mts < numMTS; ++mts) {
+        BEGIN_YS_LOOP
+            mixed wdt = ys * timeOverMTS;
+            // update thermostat velocities, from the end of the chain towards the first bead
+            chainData[lastBead].y += 0.25f * wdt * chainForces[lastBead];
+            for (int bead = lastBead - 1; bead >= 0; --bead) {
+                mixed damping = EXP(-0.125f * wdt * chainData[bead + 1].y);
+                chainData[bead].y = damping * (chainData[bead].y * damping + 0.25f * wdt * chainForces[bead]);
+            }
+            // update particle velocities
+            mixed velocityScale = EXP(-0.5f * wdt * chainData[0].y);
+            scale *= velocityScale;
+            // update the thermostat positions
+            for (int bead = 0; bead < chainLength; ++bead) {
+                chainData[bead].x += 0.5f * chainData[bead].y * wdt;
+            }
+            // update the forces
+            chainForces[0] = (scale * scale * KE2 - numDOFs * kT) / chainMasses[0];
+            // update thermostat velocities, from the first bead towards the end of the chain
+            for (int bead = 0; bead < lastBead; ++bead) {
+                mixed damping = EXP(-0.125f * wdt * chainData[bead + 1].y);
+                chainData[bead].y = damping * (damping * chainData[bead].y + 0.25f * wdt * chainForces[bead]);
+                chainForces[bead + 1] = (chainMasses[bead] * chainData[bead].y * chainData[bead].y - kT) / chainMasses[bead + 1];
+            }
+            chainData[lastBead].y += 0.25f * wdt * chainForces[lastBead];
+        END_YS_LOOP
+    } // MTS loop
+
+    if (chainType == 0)
+        scaleFactor[0].x = scale;
+    else
+        scaleFactor[0].y = scale;
+}
+/**
+ * Compute total (potential + kinetic) energy of the Nose-Hoover beads
+ *
+ * For each bead the kinetic term 0.5*mass*velocity^2 and the potential term
+ * prefac*kT*position are added to heatBathEnergy[0], where prefac is numDOFs for the first
+ * bead and 1 for all others, and mass is prefac*kT/frequency^2.
+ */
+__kernel void computeHeatBathEnergy(__global mixed* restrict heatBathEnergy, int chainLength, int numDOFs,
+                                                 mixed kT, float frequency, __global const mixed2* restrict chainData){
+    // Note that this is always incremented; make sure it's zeroed properly before the first call
+    for (int bead = 0; bead < chainLength; ++bead) {
+        const mixed2 beadState = chainData[bead];
+        const mixed prefac = (bead == 0 ? numDOFs : 1);
+        const mixed mass = prefac * kT / (frequency * frequency);
+        // The kinetic energy of this bead (the y component is the bead velocity)
+        const mixed velocity = beadState.y;
+        heatBathEnergy[0] += 0.5f * mass * velocity * velocity;
+        // The potential energy of this bead (the x component is the bead position)
+        const mixed position = beadState.x;
+        heatBathEnergy[0] += prefac * kT * position;
+    }
+}
+/**
+ * Compute the kinetic energy of the particles listed in `atoms`.
+ *
+ * Each work-item sums 0.5*m*v^2 over its share of the list and stores the result in the x
+ * component of its own energyBuffer entry (the y component is set to zero).  The mass is the
+ * reciprocal of the velm w component; entries with w == 0 contribute nothing.
+ */
+__kernel void computeAtomsKineticEnergy(__global mixed2 * restrict energyBuffer, int numAtoms,
+                                        __global const mixed4* restrict velm, __global const int *restrict atoms){
+    mixed2 energy = (mixed2) (0,0);
+    for (int index = get_global_id(0); index < numAtoms; index += get_global_size(0)) {
+        const mixed4 v = velm[atoms[index]];
+        const mixed mass = (v.w != 0 ? 1 / v.w : 0);
+        energy.x += 0.5f * mass * (v.x*v.x + v.y*v.y + v.z*v.z);
+    }
+    // Assigned rather than accumulated, so this also resets the buffer entry.
+    energyBuffer[get_global_id(0)] = energy;
+}
+/**
+ * Compute the kinetic energy of the particle pairs listed in `pairs`, split into the
+ * center of mass motion (added to the x component) and the relative motion (added to the
+ * y component, using the reduced mass m1*m2/(m1+m2)).
+ *
+ * Masses are the reciprocals of the velm w components, with w == 0 treated as mass 0.
+ * A pair whose total mass is zero is skipped: it carries no kinetic energy, and dividing
+ * by the total mass would otherwise put NaN into the sums.
+ */
+__kernel void computePairsKineticEnergy(__global mixed2 * restrict energyBuffer, int numPairs,
+                                        __global const mixed4* restrict velm, __global const int2 *restrict pairs){
+    mixed2 energy = (mixed2) (0,0);
+    for (int index = get_global_id(0); index < numPairs; index += get_global_size(0)) {
+        const int2 pair = pairs[index];
+        const mixed4 v1 = velm[pair.x];
+        const mixed4 v2 = velm[pair.y];
+        const mixed m1 = (v1.w == 0 ? 0 : 1 / v1.w);
+        const mixed m2 = (v2.w == 0 ? 0 : 1 / v2.w);
+        const mixed totalMass = m1 + m2;
+        if (totalMass == 0)
+            continue; // Both particles are massless; avoid 0/0.
+        // Center of mass velocity.
+        mixed4 cv;
+        cv.xyz = (m1*v1.xyz + m2*v2.xyz) / totalMass;
+        // Relative velocity.
+        mixed4 rv;
+        rv.xyz = v2.xyz - v1.xyz;
+        energy.x += 0.5f * totalMass * (cv.x*cv.x + cv.y*cv.y + cv.z*cv.z);
+        energy.y += 0.5f * (m1 * m2 / totalMass) * (rv.x*rv.x + rv.y*rv.y + rv.z*rv.z);
+    }
+    // The atoms version of this has been called already, so accumulate instead of assigning here
+    energyBuffer[get_global_id(0)].xy += energy.xy;
+}
+/**
+ * Multiply the velocity of every particle listed in `atoms` by scaleFactor[0].x.
+ * The w component of each velm entry is left unchanged.
+ */
+__kernel void scaleAtomsVelocities(__global mixed2* restrict scaleFactor, int numAtoms,
+                                   __global mixed4* restrict velm, __global const int *restrict atoms){
+    const mixed scale = scaleFactor[0].x;
+    for (int index = get_global_id(0); index < numAtoms; index += get_global_size(0)) {
+        const int atom = atoms[index];
+        velm[atom].xyz *= scale;
+    }
+}
+/**
+ * Scale the velocities of the particle pairs listed in `pairs`.
+ *
+ * The center of mass velocity of each pair is multiplied by scaleFactor[0].x and the
+ * relative velocity by scaleFactor[0].y; the two particle velocities are then rebuilt from
+ * the scaled parts.  Masses are the reciprocals of the velm w components, with w == 0
+ * treated as mass 0.  A pair whose total mass is zero is left untouched, because dividing
+ * by the total mass would otherwise write NaN velocities.
+ */
+__kernel void scalePairsVelocities(__global mixed2 * restrict scaleFactor, int numPairs,
+                                   __global mixed4* restrict velm, __global const int2 *restrict pairs){
+    // This kernel never writes scaleFactor, so read it once.
+    const mixed2 scale = scaleFactor[0];
+    for (int index = get_global_id(0); index < numPairs; index += get_global_size(0)) {
+        const int2 pair = pairs[index];
+        mixed4 v1 = velm[pair.x];
+        mixed4 v2 = velm[pair.y];
+        const mixed m1 = (v1.w == 0 ? 0 : 1 / v1.w);
+        const mixed m2 = (v2.w == 0 ? 0 : 1 / v2.w);
+        const mixed totalMass = m1 + m2;
+        if (totalMass == 0)
+            continue; // Both particles are massless; avoid 0/0.
+        // Center of mass velocity.
+        mixed4 cv;
+        cv.xyz = (m1*v1.xyz + m2*v2.xyz) / totalMass;
+        // Relative velocity.
+        mixed4 rv;
+        rv.xyz = v2.xyz - v1.xyz;
+        // Only xyz is modified, so the w components are written back unchanged.
+        v1.xyz = scale.x * cv.xyz - scale.y * rv.xyz * m2 / totalMass;
+        v2.xyz = scale.x * cv.xyz + scale.y * rv.xyz * m1 / totalMass;
+        velm[pair.x] = v1;
+        velm[pair.y] = v2;
+    }
+}
+/**
+ * Sum the energy buffer containing a pair of energies stored as mixed2.  This is copied from utilities.cu with small modifications
+ *
+ * Each work-item first sums a strided share of energyBuffer; the partial sums are then
+ * combined pairwise in tempBuffer and work-item 0 stores the total in *result.
+ * NOTE(review): only local memory is used to combine the partial sums, so this presumably
+ * expects a single work-group of workGroupSize items -- confirm against the host code.
+ */
+__kernel void reduceEnergyPair(__global const mixed2* restrict energyBuffer, __global mixed2* restrict result, int bufferSize, int workGroupSize, __local mixed2* restrict tempBuffer) {
+    const unsigned int thread = get_local_id(0);
+
+    // Phase 1: private partial sum.
+    mixed2 partial = (mixed2) (0,0);
+    for (unsigned int index = thread; index < bufferSize; index += get_local_size(0))
+        partial += energyBuffer[index];
+    tempBuffer[thread] = partial;
+
+    // Phase 2: pairwise combination; after the pass with a given step, every entry whose
+    // index is a multiple of 2*step holds the sum of the 2*step entries starting there.
+    for (int step = 1; step < workGroupSize; step *= 2) {
+        barrier(CLK_LOCAL_MEM_FENCE);
+        const bool receives = (thread%(step*2) == 0);
+        if (receives && thread+step < workGroupSize)
+            tempBuffer[thread] += tempBuffer[thread+step];
+    }
+
+    if (thread == 0)
+        *result = tempBuffer[0];
+}
--- a/platforms/opencl/src/kernels/periodicTorsionForce.cl
+++ b/platforms/opencl/src/kernels/periodicTorsionForce.cl
-float4 torsionParams = PARAMS[index]; // per-torsion parameters: x scales the energy, y is the angle offset, z multiplies theta
-real deltaAngle = torsionParams.z*theta-torsionParams.y;
-energy += torsionParams.x*(1.0f+cos(deltaAngle)); // energy x*(1+cos(z*theta-y))
-real sinDeltaAngle = sin(deltaAngle);
-real dEdAngle = -torsionParams.x*torsionParams.z*sinDeltaAngle; // derivative of that energy with respect to theta
--- a/platforms/opencl/src/kernels/random.cl
+++ b/platforms/opencl/src/kernels/random.cl
-/**
- * Advance the generator by one step and return the next raw 32 bit value.
- * The x component follows a linear congruential recurrence, the y component a
- * shift/xor recurrence, and the z and w components a recurrence that also involves *carry.
- */
-unsigned int nextRandomUint(uint4* state, unsigned int* carry) {
-    uint4 s = *state;
-    s.x = s.x * 69069 + 1;
-    s.y ^= s.y << 13;
-    s.y ^= s.y >> 17;
-    s.y ^= s.y << 5;
-    unsigned int k = (s.z >> 2) + (s.w >> 3) + (*carry >> 2);
-    unsigned int m = s.w + s.w + s.z + *carry;
-    s.z = s.w;
-    s.w = m;
-    *carry = k >> 30;
-    *state = s;
-    return s.x + s.y + s.w;
-}
-/**
- * Draw two raw values and turn them into a pair of values using the
- * sqrt(-2 log(a)) * (cos(2 pi b), sin(2 pi b)) transform.
- */
-float2 nextRandomPair(uint4* state, unsigned int* carry) {
-    // The first value is clamped to at least 1 so that the logarithm stays finite.
-    float radius = (float)max(nextRandomUint(state, carry), 0x00000001u) / (float)0xffffffff;
-    radius = SQRT(-2.0f * LOG(radius));
-    float angle = (float)nextRandomUint(state, carry) / (float)0xffffffff;
-    return (float2) (radius * cos(2.0f * 3.14159265f * angle), radius * sin(2.0f * 3.14159265f * angle));
-}
-/**
- * Generate random numbers
- *
- * Each work-item loads its own generator state from `seed`, fills a strided share of
- * `random` with four values per entry, and stores the advanced state back so that the next
- * call continues the sequence.
- */
-__kernel void generateRandomNumbers(int numValues, __global float4* restrict random, __global uint4* restrict seed) {
-    uint4 state = seed[get_global_id(0)];
-    unsigned int carry = 0;
-    for (int index = get_global_id(0); index < numValues; index += get_global_size(0)) {
-        float4 value;
-        // Generate first two values.
-        value.xy = nextRandomPair(&state, &carry);
-        // Generate next two values.
-        value.zw = nextRandomPair(&state, &carry);
-        // Record the values.
-        random[index] = value;
-    }
-    seed[get_global_id(0)] = state;
-}
--- a/platforms/opencl/src/kernels/rbTorsionForce.cl
+++ b/platforms/opencl/src/kernels/rbTorsionForce.cl
-float8 torsionParams = PARAMS[index];
-if (theta < 0.0f)
-    theta += PI;
-else
-    theta -= PI;
-cosangle = -cosangle;
-real cosFactor = cosangle;
-real dEdAngle = -torsionParams.s1;
-real rbEnergy = torsionParams.s0;
-rbEnergy += torsionParams.s1*cosFactor;
-dEdAngle -= 2.0f*torsionParams.s2*cosFactor;
-cosFactor *= cosangle;
-dEdAngle -= 3.0f*torsionParams.s3*cosFactor;
-rbEnergy += torsionParams.s2*cosFactor;
-cosFactor *= cosangle;
-dEdAngle -= 4.0f*torsionParams.s4*cosFactor;
-rbEnergy += torsionParams.s3*cosFactor;
-cosFactor *= cosangle;
-dEdAngle -= 5.0f*torsionParams.s5*cosFactor;
-rbEnergy += torsionParams.s4*cosFactor;
-rbEnergy += torsionParams.s5*cosFactor*cosangle;
-energy += rbEnergy;
-dEdAngle *= sin(theta);