Merged 5.1Optimizations branch back to trunk

93c467b2 · Peter Eastman · f6d4557d · 93c467b2 · 93c467b2 · 93c467b2
Commit 93c467b2 authored Mar 22, 2013 by Peter Eastman
20 changed files
--- a/platforms/cuda/src/kernels/langevin.cu
+++ b/platforms/cuda/src/kernels/langevin.cu
@@ -95,8 +95,8 @@ extern "C" __global__ void selectLangevinStepSize(mixed maxStepSize, mixed error
    if (blockIdx.x*blockDim.x+threadIdx.x == 0) {
        // Select the new step size.

-        mixed totalError = sqrt(error[0]/(NUM_ATOMS*3));
-        mixed newStepSize = sqrt(errorTol/totalError);
+        mixed totalError = SQRT(error[0]/(NUM_ATOMS*3));
+        mixed newStepSize = SQRT(errorTol/totalError);
        mixed oldStepSize = dt[0].y;
        if (oldStepSize > 0.0f)
            newStepSize = min(newStepSize, oldStepSize*2.0f); // For safety, limit how quickly dt can increase.
@@ -108,9 +108,9 @@ extern "C" __global__ void selectLangevinStepSize(mixed maxStepSize, mixed error

        // Recalculate the integration parameters.

-        mixed vscale = exp(-newStepSize/tau);
+        mixed vscale = EXP(-newStepSize/tau);
        mixed fscale = (1-vscale)*tau;
-        mixed noisescale = sqrt(2*kT/tau)*sqrt(0.5f*(1-vscale*vscale)*tau);
+        mixed noisescale = SQRT(2*kT/tau)*SQRT(0.5f*(1-vscale*vscale)*tau);
        params[VelScale] = vscale;
        params[ForceScale] = fscale;
        params[NoiseScale] = noisescale;

--- a/platforms/cuda/src/kernels/nonbonded.cu
+++ b/platforms/cuda/src/kernels/nonbonded.cu
--- a/platforms/cuda/src/kernels/pme.cu
+++ b/platforms/cuda/src/kernels/pme.cu
-extern "C" __global__ void updateBsplines(const real4* __restrict__ posq, real4* __restrict__ pmeBsplineTheta, int2* __restrict__ pmeAtomGridIndex,
+extern "C" __global__ void findAtomGridIndex(const real4* __restrict__ posq, int2* __restrict__ pmeAtomGridIndex,
            real4 periodicBoxSize, real4 invPeriodicBoxSize) {
-    extern __shared__ real3 bsplinesCache[];
-    real3* data = &bsplinesCache[threadIdx.x*PME_ORDER];
-    const real3 scale = make_real3(RECIP(PME_ORDER-1));
+    // Compute the index of the grid point each atom is associated with.
+    
    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
        real4 pos = posq[i];
        pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
@@ -11,11 +10,40 @@ extern "C" __global__ void updateBsplines(const real4* __restrict__ posq, real4*
        real3 t = make_real3((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X,
                             (pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y,
                             (pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z);
-        real3 dr = make_real3(t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z);
        int3 gridIndex = make_int3(((int) t.x) % GRID_SIZE_X,
                                 ((int) t.y) % GRID_SIZE_Y,
                                 ((int) t.z) % GRID_SIZE_Z);
        pmeAtomGridIndex[i] = make_int2(i, gridIndex.x*GRID_SIZE_Y*GRID_SIZE_Z+gridIndex.y*GRID_SIZE_Z+gridIndex.z);
+    }
+}
+
+extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real* __restrict__ originalPmeGrid,
+        real4 periodicBoxSize, real4 invPeriodicBoxSize, const int2* __restrict__ pmeAtomGridIndex) {
+    real3 data[PME_ORDER];
+    const real scale = RECIP(PME_ORDER-1);
+    
+    // Process the atoms in spatially sorted order.  This improves efficiency when writing
+    // the grid values.
+    
+    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
+        int atom = pmeAtomGridIndex[i].x;
+        real charge = posq[atom].w;
+        real3 force = make_real3(0);
+        real4 pos = posq[atom];
+        pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
+        pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
+        pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
+        real3 t = make_real3((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X,
+                             (pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y,
+                             (pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z);
+        int3 gridIndex = make_int3(((int) t.x) % GRID_SIZE_X,
+                                   ((int) t.y) % GRID_SIZE_Y,
+                                   ((int) t.z) % GRID_SIZE_Z);
+
+        // Since we need the full set of thetas, it's faster to compute them here than load them
+        // from global memory.
+        
+        real3 dr = make_real3(t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z);
        data[PME_ORDER-1] = make_real3(0);
        data[1] = dr;
        data[0] = make_real3(1)-dr;
@@ -23,101 +51,49 @@ extern "C" __global__ void updateBsplines(const real4* __restrict__ posq, real4*
            real div = RECIP(j-1);
            data[j-1] = div*dr*data[j-2];
            for (int k = 1; k < (j-1); k++)
-                data[j-k-1] = div*((dr+make_real3(k)) *data[j-k-2] + (make_real3(j-k)-dr)*data[j-k-1]);
+                data[j-k-1] = div*((dr+make_real3(k))*data[j-k-2] + (make_real3(j-k)-dr)*data[j-k-1]);
            data[0] = div*(make_real3(1)-dr)*data[0];
        }
        data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
        for (int j = 1; j < (PME_ORDER-1); j++)
            data[PME_ORDER-j-1] = scale*((dr+make_real3(j))*data[PME_ORDER-j-2] + (make_real3(PME_ORDER-j)-dr)*data[PME_ORDER-j-1]);
        data[0] = scale*(make_real3(1)-dr)*data[0];
-        for (int j = 0; j < PME_ORDER; j++) {
-            real3 d = data[j]; // Copy it as a workaround for a bug in CUDA 5.0
-            pmeBsplineTheta[i+j*NUM_ATOMS] = make_real4(d.x, d.y, d.z, pos.w);  // Storing the charge here improves cache coherency in the charge spreading kernel
-        }
-    }
-}
        
-/**
- * For each grid point, find the range of sorted atoms associated with that point.
- */
-extern "C" __global__ void findAtomRangeForGrid(int2* __restrict__ pmeAtomGridIndex, int* __restrict__ pmeAtomRange, const real4* __restrict__ posq, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
-    int start = (NUM_ATOMS*(blockIdx.x*blockDim.x+threadIdx.x))/(blockDim.x*gridDim.x);
-    int end = (NUM_ATOMS*(blockIdx.x*blockDim.x+threadIdx.x+1))/(blockDim.x*gridDim.x);
-    int last = (start == 0 ? -1 : pmeAtomGridIndex[start-1].y);
-    for (int i = start; i < end; ++i) {
-        int2 atomData = pmeAtomGridIndex[i];
-        int gridIndex = atomData.y;
-        if (gridIndex != last) {
-            for (int j = last+1; j <= gridIndex; ++j)
-                pmeAtomRange[j] = i;
-            last = gridIndex;
-        }
-    }
+        // Spread the charge from this atom onto each grid point.
         
-    // Fill in values beyond the last atom.
+        for (int ix = 0; ix < PME_ORDER; ix++) {
+            int xbase = gridIndex.x+ix;
+            xbase -= (xbase >= GRID_SIZE_X ? GRID_SIZE_X : 0);
+            xbase = xbase*GRID_SIZE_Y*GRID_SIZE_Z;
+            real dx = data[ix].x;
            
-    if (blockIdx.x == gridDim.x-1 && threadIdx.x == blockDim.x-1) {
-        int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
-        for (int j = last+1; j <= gridSize; ++j)
-            pmeAtomRange[j] = NUM_ATOMS;
-    }
-}
+            for (int iy = 0; iy < PME_ORDER; iy++) {
+                int ybase = gridIndex.y+iy;
+                ybase -= (ybase >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
+                ybase = xbase + ybase*GRID_SIZE_Z;
+                real dy = data[iy].y;
                
-#define BUFFER_SIZE (PME_ORDER*PME_ORDER*PME_ORDER)
-extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real* __restrict__ originalPmeGrid,
-        const real4* __restrict__ pmeBsplineTheta, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
-    int ix = threadIdx.x/(PME_ORDER*PME_ORDER);
-    int remainder = threadIdx.x-ix*PME_ORDER*PME_ORDER;
-    int iy = remainder/PME_ORDER;
-    int iz = remainder-iy*PME_ORDER;
-    __shared__ real4 theta[PME_ORDER];
-    __shared__ real charge[BUFFER_SIZE];
-    __shared__ int basex[BUFFER_SIZE];
-    __shared__ int basey[BUFFER_SIZE];
-    __shared__ int basez[BUFFER_SIZE];
-    if (ix < PME_ORDER) {
-        for (int baseIndex = blockIdx.x*BUFFER_SIZE; baseIndex < NUM_ATOMS; baseIndex += gridDim.x*BUFFER_SIZE) {
-            // Load the next block of atoms into the buffers.
+                for (int iz = 0; iz < PME_ORDER; iz++) {
+                    int zindex = gridIndex.z+iz;
+                    zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
+                    int index = ybase + zindex;

-            int atomIndex = baseIndex+threadIdx.x;
-            if (atomIndex < NUM_ATOMS) {
-                real4 pos = posq[atomIndex];
-                charge[threadIdx.x] = pos.w;
-                pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
-                pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
-                pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
-                basex[threadIdx.x] = (int) ((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X);
-                basey[threadIdx.x] = (int) ((pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y);
-                basez[threadIdx.x] = (int) ((pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z);
-            }
-            __syncthreads();
-            int lastIndex = min(BUFFER_SIZE, NUM_ATOMS-baseIndex);
-            for (int index = 0; index < lastIndex; index++) {
-                int atomIndex = index+baseIndex;
-                if (threadIdx.x < PME_ORDER)
-                    theta[threadIdx.x] = pmeBsplineTheta[atomIndex+threadIdx.x*NUM_ATOMS];
-                __syncthreads();
-                real add = charge[index]*theta[ix].x*theta[iy].y*theta[iz].z;
-                int x = basex[index]+ix;
-                int y = basey[index]+iy;
-                int z = basez[index]+iz;
-                x -= (x >= GRID_SIZE_X ? GRID_SIZE_X : 0);
-                y -= (y >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
-                z -= (z >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
+                    real add = charge*dx*dy*data[iz].z;
 #ifdef USE_DOUBLE_PRECISION
                    unsigned long long * ulonglong_p = (unsigned long long *) originalPmeGrid;
-                atomicAdd(&ulonglong_p[x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z],  static_cast<unsigned long long>((long long) (add*0x100000000)));
+                    atomicAdd(&ulonglong_p[index],  static_cast<unsigned long long>((long long) (add*0x100000000)));
 #elif __CUDA_ARCH__ < 200
                    unsigned long long * ulonglong_p = (unsigned long long *) originalPmeGrid;
-                int gridIndex = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z;
+                    int gridIndex = index;
                    gridIndex = (gridIndex%2 == 0 ? gridIndex/2 : (gridIndex+GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z)/2);
                    atomicAdd(&ulonglong_p[gridIndex],  static_cast<unsigned long long>((long long) (add*0x100000000)));
 #else
-                atomicAdd(&originalPmeGrid[x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z], add*EPSILON_FACTOR);
+                    atomicAdd(&originalPmeGrid[index], add*EPSILON_FACTOR);
 #endif
                }
            }
        }
+    }
 }

 extern "C" __global__ void finishSpreadCharge(long long* __restrict__ originalPmeGrid) {
@@ -218,12 +194,16 @@ gridEvaluateEnergy(real2* __restrict__ halfcomplex_pmeGrid, real* __restrict__ e

 extern "C" __global__
 void gridInterpolateForce(const real4* __restrict__ posq, unsigned long long* __restrict__ forceBuffers, const real* __restrict__ originalPmeGrid,
-        real4 periodicBoxSize, real4 invPeriodicBoxSize) {
+        real4 periodicBoxSize, real4 invPeriodicBoxSize, const int2* __restrict__ pmeAtomGridIndex) {
    real3 data[PME_ORDER];
    real3 ddata[PME_ORDER];
    const real scale = RECIP(PME_ORDER-1);
    
-    for (int atom = blockIdx.x*blockDim.x+threadIdx.x; atom < NUM_ATOMS; atom += blockDim.x*gridDim.x) {
+    // Process the atoms in spatially sorted order.  This improves cache performance when loading
+    // the grid values.
+    
+    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
+        int atom = pmeAtomGridIndex[i].x;
        real3 force = make_real3(0);
        real4 pos = posq[atom];
        pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
@@ -243,7 +223,6 @@ void gridInterpolateForce(const real4* __restrict__ posq, unsigned long long* __
        data[PME_ORDER-1] = make_real3(0);
        data[1] = dr;
        data[0] = make_real3(1)-dr;
-
        for (int j = 3; j < PME_ORDER; j++) {
            real div = RECIP(j-1);
            data[j-1] = div*dr*data[j-2];
@@ -252,11 +231,9 @@ void gridInterpolateForce(const real4* __restrict__ posq, unsigned long long* __
            data[0] = div*(make_real3(1)-dr)*data[0];
        }
        ddata[0] = -data[0];
-         
        for (int j = 1; j < PME_ORDER; j++)
            ddata[j] = data[j-1]-data[j];
        data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
-        
        for (int j = 1; j < (PME_ORDER-1); j++)
            data[PME_ORDER-j-1] = scale*((dr+make_real3(j))*data[PME_ORDER-j-2] + (make_real3(PME_ORDER-j)-dr)*data[PME_ORDER-j-1]);
        data[0] = scale*(make_real3(1)-dr)*data[0];

--- a/platforms/cuda/src/kernels/sort.cu
+++ b/platforms/cuda/src/kernels/sort.cu
@@ -4,6 +4,48 @@ __device__ KEY_TYPE getValue(DATA_TYPE value) {

 extern "C" {

+/**
+ * Sort a list that is short enough to fit entirely in the block's shared memory (the dynamically sized dataBuffer).
+ * This is executed as a single thread block: elements are indexed using only threadIdx.x and blockDim.x.
+ */
+__global__ void sortShortList(DATA_TYPE* __restrict__ data, unsigned int length) {
+    // Copy the list from global memory into the shared-memory buffer.
+    
+    extern __shared__ DATA_TYPE dataBuffer[]; // NOTE(review): sized at launch; presumably at least length elements - confirm against the host code.
+    for (int index = threadIdx.x; index < length; index += blockDim.x)
+        dataBuffer[index] = data[index];
+    __syncthreads(); // Every element must be loaded before any thread starts comparing.
+
+    // Perform a bitonic sort on the shared-memory copy.  k doubles on each outer pass; j (halving from k/2 down to 1) is the distance between the two elements of each compared pair.
+
+    for (unsigned int k = 2; k < 2*length; k *= 2) {
+        for (unsigned int j = k/2; j > 0; j /= 2) {
+            for (unsigned int i = threadIdx.x; i < length; i += blockDim.x) {
+                int ixj = i^j; // Partner index.  NOTE(review): this is an int while i and length are unsigned, so the comparisons below are mixed-sign.
+                if (ixj > i && ixj < length) { // Only the lower index of each pair does the work; partners past the end of the list are skipped.
+                    DATA_TYPE value1 = dataBuffer[i];
+                    DATA_TYPE value2 = dataBuffer[ixj];
+                    bool ascending = ((i&k) == 0);
+                    for (unsigned int mask = k*2; mask < 2*length; mask *= 2) // Toggle the direction once for every bit above k (and below 2*length) that is clear in i.
+                        ascending = ((i&mask) == 0 ? !ascending : ascending);
+                    KEY_TYPE lowKey  = (ascending ? getValue(value1) : getValue(value2));
+                    KEY_TYPE highKey = (ascending ? getValue(value2) : getValue(value1));
+                    if (lowKey > highKey) { // Out of order for this direction, so swap the two elements.
+                        dataBuffer[i] = value2;
+                        dataBuffer[ixj] = value1;
+                    }
+                }
+            }
+            __syncthreads(); // Finish every exchange at this distance before the next pass reads the buffer.
+        }
+    }
+
+    // Write the sorted data back to global memory.
+
+    for (int index = threadIdx.x; index < length; index += blockDim.x)
+        data[index] = dataBuffer[index];
+}
+
 /**
 * Calculate the minimum and maximum value in the array to be sorted.  This kernel
 * is executed as a single work group.

--- a/platforms/cuda/src/kernels/torsionForce.cu
+++ b/platforms/cuda/src/kernels/torsionForce.cu
@@ -16,12 +16,12 @@ if (cosangle > 0.99f || cosangle < -0.99f) {
        theta = PI-theta;
 }
 else
-   theta = acos(cosangle);
+   theta = ACOS(cosangle);
 theta = (dot(v0, cp1) >= 0 ? theta : -theta);
 COMPUTE_FORCE
 real normCross1 = dot(cp0, cp0);
 real normSqrBC = dot(v1, v1);
-real normBC = sqrt(normSqrBC);
+real normBC = SQRT(normSqrBC);
 real normCross2 = dot(cp1, cp1);
 real dp = RECIP(normSqrBC);
 real4 ff = make_real4((-dEdAngle*normBC)/normCross1, dot(v0, v1)*dp, dot(v2, v1)*dp, (dEdAngle*normBC)/normCross2);

--- a/platforms/cuda/src/kernels/verlet.cu
+++ b/platforms/cuda/src/kernels/verlet.cu
@@ -93,8 +93,8 @@ extern "C" __global__ void selectVerletStepSize(mixed maxStepSize, mixed errorTo
        __syncthreads();
    }
    if (threadIdx.x == 0) {
-        mixed totalError = sqrt(error[0]/(NUM_ATOMS*3));
-        mixed newStepSize = sqrt(errorTol/totalError);
+        mixed totalError = SQRT(error[0]/(NUM_ATOMS*3));
+        mixed newStepSize = SQRT(errorTol/totalError);
        mixed oldStepSize = dt[0].y;
        if (oldStepSize > 0.0f)
            newStepSize = min(newStepSize, oldStepSize*2.0f); // For safety, limit how quickly dt can increase.

--- a/platforms/cuda/tests/TestCudaNonbondedForce.cpp
+++ b/platforms/cuda/tests/TestCudaNonbondedForce.cpp
@@ -438,9 +438,9 @@ void testLargeSystem() {
    }
    ASSERT_EQUAL_TOL(cuState.getPotentialEnergy(), referenceState.getPotentialEnergy(), tol);
 }
-
+/*
 void testBlockInteractions(bool periodic) {
-    const int blockSize = 32;
+    const int blockSize = CudaContext::TileSize;
    const int numBlocks = 100;
    const int numParticles = blockSize*numBlocks;
    const double cutoff = 1.0;
@@ -597,6 +597,8 @@ void testBlockInteractions(bool periodic) {
        if (!hasInteractions[i]) {
            unsigned int y = (unsigned int) std::floor(numBlocks+0.5-std::sqrt((numBlocks+0.5)*(numBlocks+0.5)-2*i));
            unsigned int x = (i-y*numBlocks+y*(y+1)/2);
+            if (x == y)
+                continue; // This block has exclusions, so it will not be in the neighbor list.
            for (int atom1 = 0; atom1 < blockSize; ++atom1) {
                double4 pos1 = posq[x*blockSize+atom1];
                for (int atom2 = 0; atom2 < blockSize; ++atom2) {
@@ -613,14 +615,14 @@ void testBlockInteractions(bool periodic) {
                }
            }
        }
-}
+}*/

 void testDispersionCorrection() {
    // Create a box full of identical particles.

    int gridSize = 5;
    int numParticles = gridSize*gridSize*gridSize;
-    double boxSize = gridSize*0.5;
+    double boxSize = gridSize*0.7;
    double cutoff = boxSize/3;
    System system;
    VerletIntegrator integrator(0.01);
@@ -822,8 +824,8 @@ int main(int argc, char* argv[]) {
        testCutoff14();
        testPeriodic();
        testLargeSystem();
-        testBlockInteractions(false);
-        testBlockInteractions(true);
+        //testBlockInteractions(false);
+        //testBlockInteractions(true);
        testDispersionCorrection();
        testChangingParameters();
        testParallelComputation(false);

--- a/platforms/cuda/tests/TestCudaSort.cpp
+++ b/platforms/cuda/tests/TestCudaSort.cpp
@@ -87,8 +87,7 @@ void verifySorting(vector<float> array) {
    ASSERT(elements1 == elements2);
 }

-void testUniformValues()
-{
+void testUniformValues() {
    OpenMM_SFMT::SFMT sfmt;
    init_gen_rand(0, sfmt);

@@ -98,8 +97,7 @@ void testUniformValues()
    verifySorting(array);
 }

-void testLogValues()
-{
+void testLogValues() {
    OpenMM_SFMT::SFMT sfmt;
    init_gen_rand(0, sfmt);

@@ -109,12 +107,23 @@ void testLogValues()
    verifySorting(array);
 }

+void testShortList() { // Sorts a 500-element array; presumably chosen to exercise the short-list sort kernel - confirm against the host-side sort code.
+    OpenMM_SFMT::SFMT sfmt;
+    init_gen_rand(0, sfmt); // Seeded with the constant 0.
+
+    vector<float> array(500);
+    for (int i = 0; i < (int) array.size(); i++)
+        array[i] = (float) log(genrand_real2(sfmt)); // Log of each generated number, stored as a float.
+    verifySorting(array);
+}
+
 int main(int argc, char* argv[]) {
    try {
        if (argc > 1)
            platform.setPropertyDefaultValue("CudaPrecision", string(argv[1]));
        testUniformValues();
        testLogValues();
+        testShortList();
    }
    catch(const exception& e) {
        cout << "exception: " << e.what() << endl;

--- a/platforms/opencl/src/OpenCLBondedUtilities.cpp
+++ b/platforms/opencl/src/OpenCLBondedUtilities.cpp
@@ -99,6 +99,18 @@ void OpenCLBondedUtilities::initialize(const System& system) {
            numBuffers[i] = max(numBuffers[i], bufferCounter[i][j]);
    }
    
+    // For efficiency, we want to merge multiple forces into a single kernel - but only if that
+    // won't increase the number of force buffers.
+    
+    if (context.getSupports64BitGlobalAtomics()) {
+        // Put all the forces in the same set.
+        
+        numForceBuffers = 1;
+        forceSets.push_back(vector<int>());
+        for (int i = 0; i < numForces; i++)
+            forceSets[0].push_back(i);
+    }
+    else {
        // Figure out how many force buffers will be required.
    
        for (int i = 0; i < numForces; i++)
@@ -107,8 +119,7 @@ void OpenCLBondedUtilities::initialize(const System& system) {
        if (context.getNonbondedUtilities().getHasInteractions())
            bufferLimit = max(bufferLimit, context.getNonbondedUtilities().getNumForceBuffers());
        
-    // For efficiency, we want to merge multiple forces into a single kernel - but only if that
-    // won't increase the number of force buffers.  Figure out sets of forces that can be merged.
+        // Figure out sets of forces that can be merged.
        
        vector<int> unmerged(numForces);
        for (int i = 0; i < numForces; i++)
@@ -137,6 +148,7 @@ void OpenCLBondedUtilities::initialize(const System& system) {
                unmerged.erase(unmerged.begin());
            unmerged.pop_back();
        }
+    }

    // Update the buffer indices based on merged sets.
    
@@ -162,9 +174,13 @@ void OpenCLBondedUtilities::initialize(const System& system) {
        const vector<int>& set = *iter;
        int setSize = set.size();
        stringstream s;
+        s<<"#ifdef SUPPORTS_64_BIT_ATOMICS\n";
+        s<<"#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable\n";
+        s<<"#endif\n";
        for (int i = 0; i < (int) prefixCode.size(); i++)
            s<<prefixCode[i];
-        s<<"__kernel void computeBondedForces(__global real4* restrict forceBuffers, __global real* restrict energyBuffer, __global const real4* restrict posq, int groups";
+        string bufferType = (context.getSupports64BitGlobalAtomics() ? "long" : "real4");
+        s<<"__kernel void computeBondedForces(__global "<<bufferType<<"* restrict forceBuffers, __global real* restrict energyBuffer, __global const real4* restrict posq, int groups";
        for (int i = 0; i < setSize; i++) {
            int force = set[i];
            string indexType = "uint"+(indexWidth[force] == 1 ? "" : context.intToString(indexWidth[force]));
@@ -219,10 +235,17 @@ string OpenCLBondedUtilities::createForceSource(int forceIndex, int numBonds, in
    s<<computeForce<<"\n";
    for (int i = 0; i < numAtoms; i++) {
        s<<"    {\n";
+        if (context.getSupports64BitGlobalAtomics()) {
+            s<<"    atom_add(&forceBuffers[atom"<<(i+1)<<"], (long) (force"<<(i+1)<<".x*0x100000000));\n";
+            s<<"    atom_add(&forceBuffers[atom"<<(i+1)<<"+PADDED_NUM_ATOMS], (long) (force"<<(i+1)<<".y*0x100000000));\n";
+            s<<"    atom_add(&forceBuffers[atom"<<(i+1)<<"+2*PADDED_NUM_ATOMS], (long) (force"<<(i+1)<<".z*0x100000000));\n";
+        }
+        else {
            s<<"    unsigned int offset = atom"<<(i+1)<<"+buffers"<<suffix[i]<<"*PADDED_NUM_ATOMS;\n";
            s<<"    real4 force = forceBuffers[offset];\n";
            s<<"    force.xyz += force"<<(i+1)<<".xyz;\n";
            s<<"    forceBuffers[offset] = force;\n";
+        }
        s<<"    }\n";
    }
    s<<"}\n";
@@ -235,6 +258,9 @@ void OpenCLBondedUtilities::computeInteractions(int groups) {
        for (int i = 0; i < (int) forceSets.size(); i++) {
            int index = 0;
            cl::Kernel& kernel = kernels[i];
+            if (context.getSupports64BitGlobalAtomics())
+                kernel.setArg<cl::Buffer>(index++, context.getLongForceBuffer().getDeviceBuffer());
+            else
                kernel.setArg<cl::Buffer>(index++, context.getForceBuffers().getDeviceBuffer());
            kernel.setArg<cl::Buffer>(index++, context.getEnergyBuffer().getDeviceBuffer());
            kernel.setArg<cl::Buffer>(index++, context.getPosq().getDeviceBuffer());

--- a/platforms/opencl/src/OpenCLContext.cpp
+++ b/platforms/opencl/src/OpenCLContext.cpp
@@ -97,6 +97,7 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
            // Try to figure out which device is the fastest.

            int bestSpeed = -1;
+            bool bestSupportsDouble = false;
            for (int i = 0; i < (int) devices.size(); i++) {
                if (platformVendor == "Apple" && devices[i].getInfo<CL_DEVICE_VENDOR>() == "AMD")
                    continue; // Don't use AMD GPUs on OS X due to serious bugs.
@@ -135,9 +136,11 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
                    }
                }
                int speed = devices[i].getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>()*processingElementsPerComputeUnit*devices[i].getInfo<CL_DEVICE_MAX_CLOCK_FREQUENCY>();
-                if (maxSize >= minThreadBlockSize && speed > bestSpeed) {
+                bool supportsDouble = (devices[i].getInfo<CL_DEVICE_EXTENSIONS>().find("cl_khr_fp64") != string::npos);
+                if (maxSize >= minThreadBlockSize && speed > bestSpeed && (supportsDouble || !bestSupportsDouble)) {
                    deviceIndex = i;
                    bestSpeed = speed;
+                    bestSupportsDouble = supportsDouble;
                }
            }
        }
@@ -173,9 +176,6 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
            }
        }
        else if (vendor.size() >= 28 && vendor.substr(0, 28) == "Advanced Micro Devices, Inc.") {
-            // Disable 64 bit atomics.  A future version of the driver will support them, but until we can test that,
-            // it's safest not to use them.
-            supports64BitGlobalAtomics = false;
            if (device.getInfo<CL_DEVICE_TYPE>() != CL_DEVICE_TYPE_GPU) {
                /// \todo Is 6 a good value for the OpenCL CPU device?
                // numThreadBlocksPerComputeUnit = ?;
@@ -190,14 +190,11 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
                    // check for errors.
                    try {
 #ifdef CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD
-                        // AMD has both 32 and 64 width SIMDs. Can determine by using:
-                        // simdWidth = device.getInfo<CL_DEVICE_WAVEFRONT_WIDTH_AMD>();
                        // Must catch cl:Error as will fail if runtime does not support queries.
-                        // However, the 32 width NVIDIA kernels do not have all the necessary
-                        // barriers and so will not work for AMD.
-                        // So for now leave default of 1 which will use the default kernels.

                        cl_uint simdPerComputeUnit = device.getInfo<CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD>();
+                        simdWidth = device.getInfo<CL_DEVICE_WAVEFRONT_WIDTH_AMD>();
+
                        // If the GPU has multiple SIMDs per compute unit then it is uses the scalar instruction
                        // set instead of the VLIW instruction set. It therefore needs more thread blocks per
                        // compute unit to hide memory latency.
@@ -226,6 +223,10 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
            compilationDefines["SUPPORTS_64_BIT_ATOMICS"] = "";
        if (supportsDoublePrecision)
            compilationDefines["SUPPORTS_DOUBLE_PRECISION"] = "";
+        if (simdWidth >= 32)
+            compilationDefines["SYNC_WARPS"] = "";
+        else
+            compilationDefines["SYNC_WARPS"] = "barrier(CLK_LOCAL_MEM_FENCE)";
        vector<cl::Device> contextDevices;
        contextDevices.push_back(device);
        cl_context_properties cprops[] = {CL_CONTEXT_PLATFORM, (cl_context_properties) platforms[platformIndex](), 0};

--- a/platforms/opencl/src/OpenCLFFT3D.cpp
+++ b/platforms/opencl/src/OpenCLFFT3D.cpp
@@ -36,27 +36,24 @@ using namespace OpenMM;
 using namespace std;

 OpenCLFFT3D::OpenCLFFT3D(OpenCLContext& context, int xsize, int ysize, int zsize) : context(context), xsize(xsize), ysize(ysize), zsize(zsize) {
-    zkernel = createKernel(xsize, ysize, zsize);
-    xkernel = createKernel(ysize, zsize, xsize);
-    ykernel = createKernel(zsize, xsize, ysize);
+    zkernel = createKernel(xsize, ysize, zsize, zthreads);
+    xkernel = createKernel(ysize, zsize, xsize, xthreads);
+    ykernel = createKernel(zsize, xsize, ysize, ythreads);
 }

 void OpenCLFFT3D::execFFT(OpenCLArray& in, OpenCLArray& out, bool forward) {
-    int maxSize = xkernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice());
-    if (context.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU)
-        maxSize = 1;
    zkernel.setArg<cl::Buffer>(0, in.getDeviceBuffer());
    zkernel.setArg<cl::Buffer>(1, out.getDeviceBuffer());
    zkernel.setArg<cl_int>(2, forward ? 1 : -1);
-    context.executeKernel(zkernel, xsize*ysize*zsize, min(zsize, (int) maxSize));
+    context.executeKernel(zkernel, xsize*ysize*zsize, zthreads);
    xkernel.setArg<cl::Buffer>(0, out.getDeviceBuffer());
    xkernel.setArg<cl::Buffer>(1, in.getDeviceBuffer());
    xkernel.setArg<cl_int>(2, forward ? 1 : -1);
-    context.executeKernel(xkernel, xsize*ysize*zsize, min(xsize, (int) maxSize));
+    context.executeKernel(xkernel, xsize*ysize*zsize, xthreads);
    ykernel.setArg<cl::Buffer>(0, in.getDeviceBuffer());
    ykernel.setArg<cl::Buffer>(1, out.getDeviceBuffer());
    ykernel.setArg<cl_int>(2, forward ? 1 : -1);
-    context.executeKernel(ykernel, xsize*ysize*zsize, min(ysize, (int) maxSize));
+    context.executeKernel(ykernel, xsize*ysize*zsize, ythreads);
 }

 int OpenCLFFT3D::findLegalDimension(int minimum) {
@@ -66,7 +63,7 @@ int OpenCLFFT3D::findLegalDimension(int minimum) {
        // Attempt to factor the current value.

        int unfactored = minimum;
-        for (int factor = 2; factor < 6; factor++) {
+        for (int factor = 2; factor < 8; factor++) {
            while (unfactored > 1 && unfactored%factor == 0)
                unfactored /= factor;
        }
@@ -76,9 +73,10 @@ int OpenCLFFT3D::findLegalDimension(int minimum) {
    }
 }

-cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize) {
+cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threads) {
    bool loopRequired = (context.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU);
    stringstream source;
+    int blocksPerGroup = (loopRequired ? 1 : max(1, 256/zsize));
    int stage = 0;
    int L = zsize;
    int m = 1;
@@ -88,22 +86,85 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize) {
    while (L > 1) {
        int input = stage%2;
        int output = 1-input;
+        int radix;
+        if (L%7 == 0)
+            radix = 7;
+        else if (L%5 == 0)
+            radix = 5;
+        else if (L%4 == 0)
+            radix = 4;
+        else if (L%3 == 0)
+            radix = 3;
+        else if (L%2 == 0)
+            radix = 2;
+        else
+            throw OpenMMException("Illegal size for FFT: "+context.intToString(zsize));
        source<<"{\n";
-        if (L%5 == 0) {
-            L = L/5;
-            source<<"// Pass "<<(stage+1)<<" (radix 5)\n";
-            if (loopRequired)
+        L = L/radix;
+        source<<"// Pass "<<(stage+1)<<" (radix "<<radix<<")\n";
+        if (loopRequired) {
            source<<"for (int i = get_local_id(0); i < "<<(L*m)<<"; i += get_local_size(0)) {\n";
+            source<<"int base = i;\n";
+        }
        else {
-                source<<"if (get_local_id(0) < "<<(L*m)<<") {\n";
-                source<<"int i = get_local_id(0);\n";
+            source<<"if (get_local_id(0) < "<<(blocksPerGroup*L*m)<<") {\n";
+            source<<"int block = get_local_id(0)/"<<(L*m)<<";\n";
+            source<<"int i = get_local_id(0)-block*"<<(L*m)<<";\n";
+            source<<"int base = i+block*"<<zsize<<";\n";
        }
        source<<"int j = i/"<<m<<";\n";
-            source<<"real2 c0 = data"<<input<<"[i];\n";
-            source<<"real2 c1 = data"<<input<<"[i+"<<(L*m)<<"];\n";
-            source<<"real2 c2 = data"<<input<<"[i+"<<(2*L*m)<<"];\n";
-            source<<"real2 c3 = data"<<input<<"[i+"<<(3*L*m)<<"];\n";
-            source<<"real2 c4 = data"<<input<<"[i+"<<(4*L*m)<<"];\n";
+        if (radix == 7) {
+            source<<"real2 c0 = data"<<input<<"[base];\n";
+            source<<"real2 c1 = data"<<input<<"[base+"<<(L*m)<<"];\n";
+            source<<"real2 c2 = data"<<input<<"[base+"<<(2*L*m)<<"];\n";
+            source<<"real2 c3 = data"<<input<<"[base+"<<(3*L*m)<<"];\n";
+            source<<"real2 c4 = data"<<input<<"[base+"<<(4*L*m)<<"];\n";
+            source<<"real2 c5 = data"<<input<<"[base+"<<(5*L*m)<<"];\n";
+            source<<"real2 c6 = data"<<input<<"[base+"<<(6*L*m)<<"];\n";
+            source<<"real2 d0 = c1+c6;\n";
+            source<<"real2 d1 = c1-c6;\n";
+            source<<"real2 d2 = c2+c5;\n";
+            source<<"real2 d3 = c2-c5;\n";
+            source<<"real2 d4 = c4+c3;\n";
+            source<<"real2 d5 = c4-c3;\n";
+            source<<"real2 d6 = d2+d0;\n";
+            source<<"real2 d7 = d5+d3;\n";
+            source<<"real2 b0 = c0+d6+d4;\n";
+            source<<"real2 b1 = "<<context.doubleToString((cos(2*M_PI/7)+cos(4*M_PI/7)+cos(6*M_PI/7))/3-1)<<"*(d6+d4);\n";
+            source<<"real2 b2 = "<<context.doubleToString((2*cos(2*M_PI/7)-cos(4*M_PI/7)-cos(6*M_PI/7))/3)<<"*(d0-d4);\n";
+            source<<"real2 b3 = "<<context.doubleToString((cos(2*M_PI/7)-2*cos(4*M_PI/7)+cos(6*M_PI/7))/3)<<"*(d4-d2);\n";
+            source<<"real2 b4 = "<<context.doubleToString((cos(2*M_PI/7)+cos(4*M_PI/7)-2*cos(6*M_PI/7))/3)<<"*(d2-d0);\n";
+            source<<"real2 b5 = -sign*"<<context.doubleToString((sin(2*M_PI/7)+sin(4*M_PI/7)-sin(6*M_PI/7))/3)<<"*(d7+d1);\n";
+            source<<"real2 b6 = -sign*"<<context.doubleToString((2*sin(2*M_PI/7)-sin(4*M_PI/7)+sin(6*M_PI/7))/3)<<"*(d1-d5);\n";
+            source<<"real2 b7 = -sign*"<<context.doubleToString((sin(2*M_PI/7)-2*sin(4*M_PI/7)-sin(6*M_PI/7))/3)<<"*(d5-d3);\n";
+            source<<"real2 b8 = -sign*"<<context.doubleToString((sin(2*M_PI/7)+sin(4*M_PI/7)+2*sin(6*M_PI/7))/3)<<"*(d3-d1);\n";
+            source<<"real2 t0 = b0+b1;\n";
+            source<<"real2 t1 = b2+b3;\n";
+            source<<"real2 t2 = b4-b3;\n";
+            source<<"real2 t3 = -b2-b4;\n";
+            source<<"real2 t4 = b6+b7;\n";
+            source<<"real2 t5 = b8-b7;\n";
+            source<<"real2 t6 = -b8-b6;\n";
+            source<<"real2 t7 = t0+t1;\n";
+            source<<"real2 t8 = t0+t2;\n";
+            source<<"real2 t9 = t0+t3;\n";
+            source<<"real2 t10 = (real2) (t4.y+b5.y, -(t4.x+b5.x));\n";
+            source<<"real2 t11 = (real2) (t5.y+b5.y, -(t5.x+b5.x));\n";
+            source<<"real2 t12 = (real2) (t6.y+b5.y, -(t6.x+b5.x));\n";
+            source<<"data"<<output<<"[base+6*j*"<<m<<"] = b0;\n";
+            source<<"data"<<output<<"[base+(6*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(7*L)<<"], t7-t10);\n";
+            source<<"data"<<output<<"[base+(6*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(7*L)<<"], t9-t12);\n";
+            source<<"data"<<output<<"[base+(6*j+3)*"<<m<<"] = multiplyComplex(w[j*"<<(3*zsize)<<"/"<<(7*L)<<"], t8+t11);\n";
+            source<<"data"<<output<<"[base+(6*j+4)*"<<m<<"] = multiplyComplex(w[j*"<<(4*zsize)<<"/"<<(7*L)<<"], t8-t11);\n";
+            source<<"data"<<output<<"[base+(6*j+5)*"<<m<<"] = multiplyComplex(w[j*"<<(5*zsize)<<"/"<<(7*L)<<"], t9+t12);\n";
+            source<<"data"<<output<<"[base+(6*j+6)*"<<m<<"] = multiplyComplex(w[j*"<<(6*zsize)<<"/"<<(7*L)<<"], t7+t10);\n";
+        }
+        else if (radix == 5) {
+            source<<"real2 c0 = data"<<input<<"[base];\n";
+            source<<"real2 c1 = data"<<input<<"[base+"<<(L*m)<<"];\n";
+            source<<"real2 c2 = data"<<input<<"[base+"<<(2*L*m)<<"];\n";
+            source<<"real2 c3 = data"<<input<<"[base+"<<(3*L*m)<<"];\n";
+            source<<"real2 c4 = data"<<input<<"[base+"<<(4*L*m)<<"];\n";
            source<<"real2 d0 = c1+c4;\n";
            source<<"real2 d1 = c2+c3;\n";
            source<<"real2 d2 = "<<context.doubleToString(sin(0.4*M_PI))<<"*(c1-c4);\n";
@@ -116,80 +177,45 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize) {
            string coeff = context.doubleToString(sin(0.2*M_PI)/sin(0.4*M_PI));
            source<<"real2 d9 = sign*(real2) (d2.y+"<<coeff<<"*d3.y, -d2.x-"<<coeff<<"*d3.x);\n";
            source<<"real2 d10 = sign*(real2) ("<<coeff<<"*d2.y-d3.y, d3.x-"<<coeff<<"*d2.x);\n";
-            source<<"data"<<output<<"[i+4*j*"<<m<<"] = c0+d4;\n";
-            source<<"data"<<output<<"[i+(4*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(5*L)<<"], d7+d9);\n";
-            source<<"data"<<output<<"[i+(4*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(5*L)<<"], d8+d10);\n";
-            source<<"data"<<output<<"[i+(4*j+3)*"<<m<<"] = multiplyComplex(w[j*"<<(3*zsize)<<"/"<<(5*L)<<"], d8-d10);\n";
-            source<<"data"<<output<<"[i+(4*j+4)*"<<m<<"] = multiplyComplex(w[j*"<<(4*zsize)<<"/"<<(5*L)<<"], d7-d9);\n";
-            source<<"}\n";
-            m = m*5;
+            source<<"data"<<output<<"[base+4*j*"<<m<<"] = c0+d4;\n";
+            source<<"data"<<output<<"[base+(4*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(5*L)<<"], d7+d9);\n";
+            source<<"data"<<output<<"[base+(4*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(5*L)<<"], d8+d10);\n";
+            source<<"data"<<output<<"[base+(4*j+3)*"<<m<<"] = multiplyComplex(w[j*"<<(3*zsize)<<"/"<<(5*L)<<"], d8-d10);\n";
+            source<<"data"<<output<<"[base+(4*j+4)*"<<m<<"] = multiplyComplex(w[j*"<<(4*zsize)<<"/"<<(5*L)<<"], d7-d9);\n";
        }
-        else if (L%4 == 0) {
-            L = L/4;
-            source<<"// Pass "<<(stage+1)<<" (radix 4)\n";
-            if (loopRequired)
-                source<<"for (int i = get_local_id(0); i < "<<(L*m)<<"; i += get_local_size(0)) {\n";
-            else {
-                source<<"if (get_local_id(0) < "<<(L*m)<<") {\n";
-                source<<"int i = get_local_id(0);\n";
-            }
-            source<<"int j = i/"<<m<<";\n";
-            source<<"real2 c0 = data"<<input<<"[i];\n";
-            source<<"real2 c1 = data"<<input<<"[i+"<<(L*m)<<"];\n";
-            source<<"real2 c2 = data"<<input<<"[i+"<<(2*L*m)<<"];\n";
-            source<<"real2 c3 = data"<<input<<"[i+"<<(3*L*m)<<"];\n";
+        else if (radix == 4) {
+            source<<"real2 c0 = data"<<input<<"[base];\n";
+            source<<"real2 c1 = data"<<input<<"[base+"<<(L*m)<<"];\n";
+            source<<"real2 c2 = data"<<input<<"[base+"<<(2*L*m)<<"];\n";
+            source<<"real2 c3 = data"<<input<<"[base+"<<(3*L*m)<<"];\n";
            source<<"real2 d0 = c0+c2;\n";
            source<<"real2 d1 = c0-c2;\n";
            source<<"real2 d2 = c1+c3;\n";
            source<<"real2 d3 = sign*(real2) (c1.y-c3.y, c3.x-c1.x);\n";
-            source<<"data"<<output<<"[i+3*j*"<<m<<"] = d0+d2;\n";
-            source<<"data"<<output<<"[i+(3*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(4*L)<<"], d1+d3);\n";
-            source<<"data"<<output<<"[i+(3*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(4*L)<<"], d0-d2);\n";
-            source<<"data"<<output<<"[i+(3*j+3)*"<<m<<"] = multiplyComplex(w[j*"<<(3*zsize)<<"/"<<(4*L)<<"], d1-d3);\n";
-            source<<"}\n";
-            m = m*4;
+            source<<"data"<<output<<"[base+3*j*"<<m<<"] = d0+d2;\n";
+            source<<"data"<<output<<"[base+(3*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(4*L)<<"], d1+d3);\n";
+            source<<"data"<<output<<"[base+(3*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(4*L)<<"], d0-d2);\n";
+            source<<"data"<<output<<"[base+(3*j+3)*"<<m<<"] = multiplyComplex(w[j*"<<(3*zsize)<<"/"<<(4*L)<<"], d1-d3);\n";
        }
-        else if (L%3 == 0) {
-            L = L/3;
-            source<<"// Pass "<<(stage+1)<<" (radix 3)\n";
-            if (loopRequired)
-                source<<"for (int i = get_local_id(0); i < "<<(L*m)<<"; i += get_local_size(0)) {\n";
-            else {
-                source<<"if (get_local_id(0) < "<<(L*m)<<") {\n";
-                source<<"int i = get_local_id(0);\n";
-            }
-            source<<"int j = i/"<<m<<";\n";
-            source<<"real2 c0 = data"<<input<<"[i];\n";
-            source<<"real2 c1 = data"<<input<<"[i+"<<(L*m)<<"];\n";
-            source<<"real2 c2 = data"<<input<<"[i+"<<(2*L*m)<<"];\n";
+        else if (radix == 3) {
+            source<<"real2 c0 = data"<<input<<"[base];\n";
+            source<<"real2 c1 = data"<<input<<"[base+"<<(L*m)<<"];\n";
+            source<<"real2 c2 = data"<<input<<"[base+"<<(2*L*m)<<"];\n";
            source<<"real2 d0 = c1+c2;\n";
            source<<"real2 d1 = c0-0.5f*d0;\n";
            source<<"real2 d2 = sign*"<<context.doubleToString(sin(M_PI/3.0))<<"*(real2) (c1.y-c2.y, c2.x-c1.x);\n";
-            source<<"data"<<output<<"[i+2*j*"<<m<<"] = c0+d0;\n";
-            source<<"data"<<output<<"[i+(2*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(3*L)<<"], d1+d2);\n";
-            source<<"data"<<output<<"[i+(2*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(3*L)<<"], d1-d2);\n";
-            source<<"}\n";
-            m = m*3;
+            source<<"data"<<output<<"[base+2*j*"<<m<<"] = c0+d0;\n";
+            source<<"data"<<output<<"[base+(2*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(3*L)<<"], d1+d2);\n";
+            source<<"data"<<output<<"[base+(2*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(3*L)<<"], d1-d2);\n";
        }
-        else if (L%2 == 0) {
-            L = L/2;
-            source<<"// Pass "<<(stage+1)<<" (radix 2)\n";
-            if (loopRequired)
-                source<<"for (int i = get_local_id(0); i < "<<(L*m)<<"; i += get_local_size(0)) {\n";
-            else {
-                source<<"if (get_local_id(0) < "<<(L*m)<<") {\n";
-                source<<"int i = get_local_id(0);\n";
+        else if (radix == 2) {
+            source<<"real2 c0 = data"<<input<<"[base];\n";
+            source<<"real2 c1 = data"<<input<<"[base+"<<(L*m)<<"];\n";
+            source<<"data"<<output<<"[base+j*"<<m<<"] = c0+c1;\n";
+            source<<"data"<<output<<"[base+(j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(2*L)<<"], c0-c1);\n";
        }
-            source<<"int j = i/"<<m<<";\n";
-            source<<"real2 c0 = data"<<input<<"[i];\n";
-            source<<"real2 c1 = data"<<input<<"[i+"<<(L*m)<<"];\n";
-            source<<"data"<<output<<"[i+j*"<<m<<"] = c0+c1;\n";
-            source<<"data"<<output<<"[i+(j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(2*L)<<"], c0-c1);\n";
        source<<"}\n";
-            m = m*2;
-        }
-        else
-            throw OpenMMException("Illegal size for FFT: "+context.intToString(zsize));
+        m = m*radix;
        source<<"barrier(CLK_LOCAL_MEM_FENCE);\n";
        source<<"}\n";
        ++stage;
@@ -202,20 +228,22 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize) {
        source<<"out[y*(ZSIZE*XSIZE)+z*XSIZE+x] = data"<<(stage%2)<<"[z];\n";
    }
    else
-        source<<"out[y*(ZSIZE*XSIZE)+get_local_id(0)*XSIZE+x] = data"<<(stage%2)<<"[get_local_id(0)];\n";
+        source<<"out[y*(ZSIZE*XSIZE)+(get_local_id(0)%ZSIZE)*XSIZE+x] = data"<<(stage%2)<<"[get_local_id(0)];\n";
    source<<"barrier(CLK_GLOBAL_MEM_FENCE);";
    map<string, string> replacements;
    replacements["XSIZE"] = context.intToString(xsize);
    replacements["YSIZE"] = context.intToString(ysize);
    replacements["ZSIZE"] = context.intToString(zsize);
+    replacements["BLOCKS_PER_GROUP"] = context.intToString(blocksPerGroup);
    replacements["M_PI"] = context.doubleToString(M_PI);
    replacements["COMPUTE_FFT"] = source.str();
    replacements["LOOP_REQUIRED"] = (loopRequired ? "1" : "0");
    cl::Program program = context.createProgram(context.replaceStrings(OpenCLKernelSources::fft, replacements));
    cl::Kernel kernel(program, "execFFT");
-    int bufferSize = zsize*(context.getUseDoublePrecision() ? sizeof(mm_double2) : sizeof(mm_float2));
+    int bufferSize = blocksPerGroup*zsize*(context.getUseDoublePrecision() ? sizeof(mm_double2) : sizeof(mm_float2));
    kernel.setArg(3, bufferSize, NULL);
    kernel.setArg(4, bufferSize, NULL);
    kernel.setArg(5, bufferSize, NULL);
+    threads = (loopRequired ? 1 : blocksPerGroup*zsize);
    return kernel;
 }
--- a/platforms/opencl/src/OpenCLFFT3D.h
+++ b/platforms/opencl/src/OpenCLFFT3D.h
@@ -81,8 +81,9 @@ public:
     */
    static int findLegalDimension(int minimum);
 private:
-    cl::Kernel createKernel(int xsize, int ysize, int zsize);
+    cl::Kernel createKernel(int xsize, int ysize, int zsize, int& threads);
    int xsize, ysize, zsize;
+    int xthreads, ythreads, zthreads;
    OpenCLContext& context;
    cl::Kernel xkernel, ykernel, zkernel;
 };

--- a/platforms/opencl/src/OpenCLIntegrationUtilities.cpp
+++ b/platforms/opencl/src/OpenCLIntegrationUtilities.cpp
@@ -99,7 +99,7 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
        random(NULL), randomSeed(NULL), randomPos(0), stepSize(NULL), ccmaAtoms(NULL), ccmaDistance(NULL),
        ccmaReducedMass(NULL), ccmaAtomConstraints(NULL), ccmaNumAtomConstraints(NULL), ccmaConstraintMatrixColumn(NULL),
        ccmaConstraintMatrixValue(NULL), ccmaDelta1(NULL), ccmaDelta2(NULL), ccmaConverged(NULL),
-        ccmaConvergedBuffer(NULL), vsite2AvgAtoms(NULL), vsite2AvgWeights(NULL), vsite3AvgAtoms(NULL), vsite3AvgWeights(NULL),
+        vsite2AvgAtoms(NULL), vsite2AvgWeights(NULL), vsite3AvgAtoms(NULL), vsite3AvgWeights(NULL),
        vsiteOutOfPlaneAtoms(NULL), vsiteOutOfPlaneWeights(NULL), hasInitializedPosConstraintKernels(false), hasInitializedVelConstraintKernels(false) {
    // Create workspace arrays.

@@ -479,8 +479,6 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
        ccmaNumAtomConstraints = OpenCLArray::create<cl_int>(context, numAtoms, "CcmaAtomConstraintsIndex");
        ccmaConstraintMatrixColumn = OpenCLArray::create<cl_int>(context, numCCMA*maxRowElements, "ConstraintMatrixColumn");
        ccmaConverged = OpenCLArray::create<cl_int>(context, 2, "CcmaConverged");
-        ccmaConvergedBuffer = new cl::Buffer(context.getContext(), CL_MEM_ALLOC_HOST_PTR, 2*sizeof(cl_int));
-        ccmaConvergedMemory = (cl_int*) context.getQueue().enqueueMapBuffer(*ccmaConvergedBuffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, 2*sizeof(cl_int));
        vector<mm_int2> atomsVec(ccmaAtoms->getSize());
        vector<cl_int> atomConstraintsVec(ccmaAtomConstraints->getSize());
        vector<cl_int> numAtomConstraintsVec(ccmaNumAtomConstraints->getSize());
@@ -660,24 +658,28 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
    defines["NUM_OUT_OF_PLANE"] = context.intToString(numOutOfPlane);
    cl::Program vsiteProgram = context.createProgram(OpenCLKernelSources::virtualSites, defines);
    vsitePositionKernel = cl::Kernel(vsiteProgram, "computeVirtualSites");
-    vsitePositionKernel.setArg<cl::Buffer>(0, context.getPosq().getDeviceBuffer());
-    setPosqCorrectionArg(context, vsitePositionKernel, 1);
-    vsitePositionKernel.setArg<cl::Buffer>(2, vsite2AvgAtoms->getDeviceBuffer());
-    vsitePositionKernel.setArg<cl::Buffer>(3, vsite2AvgWeights->getDeviceBuffer());
-    vsitePositionKernel.setArg<cl::Buffer>(4, vsite3AvgAtoms->getDeviceBuffer());
-    vsitePositionKernel.setArg<cl::Buffer>(5, vsite3AvgWeights->getDeviceBuffer());
-    vsitePositionKernel.setArg<cl::Buffer>(6, vsiteOutOfPlaneAtoms->getDeviceBuffer());
-    vsitePositionKernel.setArg<cl::Buffer>(7, vsiteOutOfPlaneWeights->getDeviceBuffer());
+    int index = 0;
+    vsitePositionKernel.setArg<cl::Buffer>(index++, context.getPosq().getDeviceBuffer());
+    if (context.getUseMixedPrecision())
+        vsitePositionKernel.setArg<cl::Buffer>(index++, context.getPosqCorrection().getDeviceBuffer());
+    vsitePositionKernel.setArg<cl::Buffer>(index++, vsite2AvgAtoms->getDeviceBuffer());
+    vsitePositionKernel.setArg<cl::Buffer>(index++, vsite2AvgWeights->getDeviceBuffer());
+    vsitePositionKernel.setArg<cl::Buffer>(index++, vsite3AvgAtoms->getDeviceBuffer());
+    vsitePositionKernel.setArg<cl::Buffer>(index++, vsite3AvgWeights->getDeviceBuffer());
+    vsitePositionKernel.setArg<cl::Buffer>(index++, vsiteOutOfPlaneAtoms->getDeviceBuffer());
+    vsitePositionKernel.setArg<cl::Buffer>(index++, vsiteOutOfPlaneWeights->getDeviceBuffer());
    vsiteForceKernel = cl::Kernel(vsiteProgram, "distributeForces");
-    vsiteForceKernel.setArg<cl::Buffer>(0, context.getPosq().getDeviceBuffer());
-    setPosqCorrectionArg(context, vsiteForceKernel, 1);
-    // Skip argument 2: the force array hasn't been created yet.
-    vsiteForceKernel.setArg<cl::Buffer>(3, vsite2AvgAtoms->getDeviceBuffer());
-    vsiteForceKernel.setArg<cl::Buffer>(4, vsite2AvgWeights->getDeviceBuffer());
-    vsiteForceKernel.setArg<cl::Buffer>(5, vsite3AvgAtoms->getDeviceBuffer());
-    vsiteForceKernel.setArg<cl::Buffer>(6, vsite3AvgWeights->getDeviceBuffer());
-    vsiteForceKernel.setArg<cl::Buffer>(7, vsiteOutOfPlaneAtoms->getDeviceBuffer());
-    vsiteForceKernel.setArg<cl::Buffer>(8, vsiteOutOfPlaneWeights->getDeviceBuffer());
+    index = 0;
+    vsiteForceKernel.setArg<cl::Buffer>(index++, context.getPosq().getDeviceBuffer());
+    index++; // Skip argument 1: the force array hasn't been created yet.
+    if (context.getUseMixedPrecision())
+        vsiteForceKernel.setArg<cl::Buffer>(index++, context.getPosqCorrection().getDeviceBuffer());
+    vsiteForceKernel.setArg<cl::Buffer>(index++, vsite2AvgAtoms->getDeviceBuffer());
+    vsiteForceKernel.setArg<cl::Buffer>(index++, vsite2AvgWeights->getDeviceBuffer());
+    vsiteForceKernel.setArg<cl::Buffer>(index++, vsite3AvgAtoms->getDeviceBuffer());
+    vsiteForceKernel.setArg<cl::Buffer>(index++, vsite3AvgWeights->getDeviceBuffer());
+    vsiteForceKernel.setArg<cl::Buffer>(index++, vsiteOutOfPlaneAtoms->getDeviceBuffer());
+    vsiteForceKernel.setArg<cl::Buffer>(index++, vsiteOutOfPlaneWeights->getDeviceBuffer());
    numVsites = num2Avg+num3Avg+numOutOfPlane;
 }

@@ -718,8 +720,6 @@ OpenCLIntegrationUtilities::~OpenCLIntegrationUtilities() {
        delete ccmaDelta2;
    if (ccmaConverged != NULL)
        delete ccmaConverged;
-    if (ccmaConvergedBuffer != NULL)
-        delete ccmaConvergedBuffer;
    if (vsite2AvgAtoms != NULL)
        delete vsite2AvgAtoms;
    if (vsite2AvgWeights != NULL)
@@ -807,6 +807,7 @@ void OpenCLIntegrationUtilities::applyConstraints(bool constrainVelocities, doub
                ccmaDirectionsKernel.setArg<cl::Buffer>(3, context.getPosqCorrection().getDeviceBuffer());
            else
                ccmaDirectionsKernel.setArg<void*>(3, NULL);
+            ccmaDirectionsKernel.setArg<cl::Buffer>(4, ccmaConverged->getDeviceBuffer());
            ccmaForceKernel.setArg<cl::Buffer>(0, ccmaAtoms->getDeviceBuffer());
            ccmaForceKernel.setArg<cl::Buffer>(1, ccmaDistance->getDeviceBuffer());
            ccmaForceKernel.setArg<cl::Buffer>(2, constrainVelocities ? context.getVelm().getDeviceBuffer() : posDelta->getDeviceBuffer());
@@ -834,23 +835,19 @@ void OpenCLIntegrationUtilities::applyConstraints(bool constrainVelocities, doub
        context.executeKernel(ccmaDirectionsKernel, ccmaAtoms->getSize());
        const int checkInterval = 4;
        cl::Event event;
+        int* converged = (int*) context.getPinnedBuffer();
        for (int i = 0; i < 150; i++) {
            ccmaForceKernel.setArg<cl_int>(7, i);
-            if (i == 0) {
-                ccmaConvergedMemory[0] = 1;
-                ccmaConvergedMemory[1] = 0;
-                context.getQueue().enqueueWriteBuffer(ccmaConverged->getDeviceBuffer(), CL_FALSE, 0, 2*sizeof(cl_int), ccmaConvergedMemory);
-            }
            context.executeKernel(ccmaForceKernel, ccmaAtoms->getSize());
            if ((i+1)%checkInterval == 0)
-                context.getQueue().enqueueReadBuffer(ccmaConverged->getDeviceBuffer(), CL_FALSE, 0, 2*sizeof(cl_int), ccmaConvergedMemory, NULL, &event);
+                context.getQueue().enqueueReadBuffer(ccmaConverged->getDeviceBuffer(), CL_FALSE, 0, 2*sizeof(cl_int), converged, NULL, &event);
            ccmaMultiplyKernel.setArg<cl_int>(5, i);
            context.executeKernel(ccmaMultiplyKernel, ccmaAtoms->getSize());
            ccmaUpdateKernel.setArg<cl_int>(8, i);
            context.executeKernel(ccmaUpdateKernel, context.getNumAtoms());
            if ((i+1)%checkInterval == 0) {
                event.wait();
-                if (ccmaConvergedMemory[i%2])
+                if (converged[i%2])
                    break;
            }
        }
@@ -864,7 +861,7 @@ void OpenCLIntegrationUtilities::computeVirtualSites() {

 void OpenCLIntegrationUtilities::distributeForcesFromVirtualSites() {
    if (numVsites > 0) {
-        vsiteForceKernel.setArg<cl::Buffer>(2, context.getForce().getDeviceBuffer());
+        vsiteForceKernel.setArg<cl::Buffer>(1, context.getForce().getDeviceBuffer()); // slot 1 is the force buffer; the constructor skips it because the force array has not been created yet at that point
        context.executeKernel(vsiteForceKernel, numVsites);
    }
 }

--- a/platforms/opencl/src/OpenCLIntegrationUtilities.h
+++ b/platforms/opencl/src/OpenCLIntegrationUtilities.h
@@ -141,8 +141,6 @@ private:
    OpenCLArray* ccmaDelta1;
    OpenCLArray* ccmaDelta2;
    OpenCLArray* ccmaConverged;
-    cl::Buffer* ccmaConvergedBuffer;
-    cl_int* ccmaConvergedMemory;
    OpenCLArray* vsite2AvgAtoms;
    OpenCLArray* vsite2AvgWeights;
    OpenCLArray* vsite3AvgAtoms;

--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
--- a/platforms/opencl/src/OpenCLKernels.h
+++ b/platforms/opencl/src/OpenCLKernels.h
@@ -556,7 +556,7 @@ class OpenCLCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
 public:
    OpenCLCalcNonbondedForceKernel(std::string name, const Platform& platform, OpenCLContext& cl, System& system) : CalcNonbondedForceKernel(name, platform),
            hasInitializedKernel(false), cl(cl), sigmaEpsilon(NULL), exceptionParams(NULL), cosSinSums(NULL), pmeGrid(NULL),
-            pmeGrid2(NULL), pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeBsplineTheta(NULL), pmeBsplineDTheta(NULL),
+            pmeGrid2(NULL), pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeBsplineTheta(NULL),
            pmeAtomRange(NULL), pmeAtomGridIndex(NULL), sort(NULL), fft(NULL) {
    }
    ~OpenCLCalcNonbondedForceKernel();
@@ -586,15 +586,15 @@ public:
     */
    void copyParametersToContext(ContextImpl& context, const NonbondedForce& force);
 private:
-    struct SortTrait {
-        typedef mm_int2 DataType;
-        typedef cl_int KeyType;
-        static const char* clDataType() {return "int2";}
-        static const char* clKeyType() {return "int";}
-        static const char* clMinKey() {return "INT_MIN";}
-        static const char* clMaxKey() {return "INT_MAX";}
-        static const char* clMaxValue() {return "(int2) (INT_MAX, INT_MAX)";}
-        static const char* clSortKey() {return "value.y";}
+    class SortTrait : public OpenCLSort::SortTrait { // trait for sorting int2 values by an int key
+        int getDataSize() const {return 8;} // presumably sizeof(mm_int2) in bytes -- TODO confirm against OpenCLSort
+        int getKeySize() const {return 4;} // presumably sizeof(cl_int) in bytes -- TODO confirm against OpenCLSort
+        const char* getDataType() const {return "int2";} // OpenCL type name of each value
+        const char* getKeyType() const {return "int";} // OpenCL type name of the key
+        const char* getMinKey() const {return "INT_MIN";} // smallest possible key, as OpenCL source text
+        const char* getMaxKey() const {return "INT_MAX";} // largest possible key, as OpenCL source text
+        const char* getMaxValue() const {return "(int2) (INT_MAX, INT_MAX)";} // value whose key (.y) equals the maximum key
+        const char* getSortKey() const {return "value.y";} // expression yielding the key: the .y component of a value
    };
    OpenCLContext& cl;
    bool hasInitializedKernel;
@@ -607,10 +607,9 @@ private:
    OpenCLArray* pmeBsplineModuliY;
    OpenCLArray* pmeBsplineModuliZ;
    OpenCLArray* pmeBsplineTheta;
-    OpenCLArray* pmeBsplineDTheta;
    OpenCLArray* pmeAtomRange;
    OpenCLArray* pmeAtomGridIndex;
-    OpenCLSort<SortTrait>* sort;
+    OpenCLSort* sort;
    OpenCLFFT3D* fft;
    cl::Kernel ewaldSumsKernel;
    cl::Kernel ewaldForcesKernel;
@@ -625,7 +624,6 @@ private:
    std::map<std::string, std::string> pmeDefines;
    std::vector<std::pair<int, int> > exceptionAtoms;
    double ewaldSelfEnergy, dispersionCoefficient, alpha;
-    int interpolateForceThreads;
    bool hasCoulomb, hasLJ;
    static const int PmeOrder = 5;
 };
@@ -775,6 +773,8 @@ private:
    std::vector<bool> pairValueUsesParam, pairEnergyUsesParam, pairEnergyUsesValue;
    System& system;
    cl::Kernel pairValueKernel, perParticleValueKernel, pairEnergyKernel, perParticleEnergyKernel, gradientChainRuleKernel;
+    std::string pairValueSrc, pairEnergySrc;
+    std::map<std::string, std::string> pairValueDefines, pairEnergyDefines;
 };

 /**

--- a/platforms/opencl/src/OpenCLNonbondedUtilities.cpp
+++ b/platforms/opencl/src/OpenCLNonbondedUtilities.cpp
--- a/platforms/opencl/src/OpenCLNonbondedUtilities.h
+++ b/platforms/opencl/src/OpenCLNonbondedUtilities.h
--- a/platforms/opencl/src/OpenCLParallelKernels.cpp
+++ b/platforms/opencl/src/OpenCLParallelKernels.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2011-2012 Stanford University and the Authors.      *
+ * Portions copyright (c) 2011-2013 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -108,7 +108,7 @@ private:
 };

 OpenCLParallelCalcForcesAndEnergyKernel::OpenCLParallelCalcForcesAndEnergyKernel(string name, const Platform& platform, OpenCLPlatform::PlatformData& data) :
-        CalcForcesAndEnergyKernel(name, platform), data(data), completionTimes(data.contexts.size()), contextTiles(data.contexts.size()), contextForces(NULL),
+        CalcForcesAndEnergyKernel(name, platform), data(data), completionTimes(data.contexts.size()), contextNonbondedFractions(data.contexts.size()), contextForces(NULL),
        pinnedPositionBuffer(NULL), pinnedPositionMemory(NULL), pinnedForceBuffer(NULL), pinnedForceMemory(NULL) {
    for (int i = 0; i < (int) data.contexts.size(); i++)
        kernels.push_back(Kernel(new OpenCLCalcForcesAndEnergyKernel(name, platform, *data.contexts[i])));
@@ -126,6 +126,8 @@ OpenCLParallelCalcForcesAndEnergyKernel::~OpenCLParallelCalcForcesAndEnergyKerne
 void OpenCLParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
    for (int i = 0; i < (int) kernels.size(); i++)
        getKernel(i).initialize(system);
+    for (int i = 0; i < (int) contextNonbondedFractions.size(); i++)
+        contextNonbondedFractions[i] = 1/(double) contextNonbondedFractions.size();
 }

 void OpenCLParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) {
@@ -172,30 +174,26 @@ double OpenCLParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& c
                numAtoms*(data.contexts.size()-1)*elementSize, pinnedForceMemory);
        cl.reduceBuffer(*contextForces, data.contexts.size());
        
-        // Balance work between the contexts by transferring a few nonbonded tiles from the context that
+        // Balance work between the contexts by transferring a little nonbonded work from the context that
        // finished last to the one that finished first.
        
        int firstIndex = 0, lastIndex = 0;
-        int totalTiles = 0;
        for (int i = 0; i < (int) completionTimes.size(); i++) {
            if (completionTimes[i] < completionTimes[firstIndex])
                firstIndex = i;
            if (completionTimes[i] > completionTimes[lastIndex])
                lastIndex = i;
-            contextTiles[i] = data.contexts[i]->getNonbondedUtilities().getNumTiles();
-            totalTiles += contextTiles[i];
-        }
-        int tilesToTransfer = totalTiles/1000;
-        if (tilesToTransfer < 1)
-            tilesToTransfer = 1;
-        if (tilesToTransfer > contextTiles[lastIndex])
-            tilesToTransfer = contextTiles[lastIndex];
-        contextTiles[firstIndex] += tilesToTransfer;
-        contextTiles[lastIndex] -= tilesToTransfer;
-        int startIndex = 0;
-        for (int i = 0; i < (int) contextTiles.size(); i++) {
-            data.contexts[i]->getNonbondedUtilities().setTileRange(startIndex, contextTiles[i]);
-            startIndex += contextTiles[i];
+        }
+        double fractionToTransfer = min(0.001, contextNonbondedFractions[lastIndex]);
+        contextNonbondedFractions[firstIndex] += fractionToTransfer;
+        contextNonbondedFractions[lastIndex] -= fractionToTransfer;
+        double startFraction = 0.0;
+        for (int i = 0; i < (int) contextNonbondedFractions.size(); i++) {
+            double endFraction = startFraction+contextNonbondedFractions[i];
+            if (i == contextNonbondedFractions.size()-1)
+                endFraction = 1.0; // Avoid roundoff error
+            data.contexts[i]->getNonbondedUtilities().setAtomBlockRange(startFraction, endFraction);
+            startFraction = endFraction;
        }
    }
    return energy;

--- a/platforms/opencl/src/OpenCLParallelKernels.h
+++ b/platforms/opencl/src/OpenCLParallelKernels.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2011 Stanford University and the Authors.           *
+ * Portions copyright (c) 2011-2013 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -80,7 +80,7 @@ private:
    OpenCLPlatform::PlatformData& data;
    std::vector<Kernel> kernels;
    std::vector<long long> completionTimes;
-    std::vector<int> contextTiles;
+    std::vector<double> contextNonbondedFractions;
    OpenCLArray* contextForces;
    cl::Buffer* pinnedPositionBuffer;
    cl::Buffer* pinnedForceBuffer;