Merged 5.1Optimizations branch back to trunk

93c467b2 · Peter Eastman · f6d4557d · 93c467b2 · 93c467b2 · 93c467b2
Commit 93c467b2 authored Mar 22, 2013 by Peter Eastman
20 changed files
--- a/platforms/cuda/src/kernels/langevin.cu
+++ b/platforms/cuda/src/kernels/langevin.cu
@@ -95,8 +95,8 @@ extern "C" __global__ void selectLangevinStepSize(mixed maxStepSize, mixed error
    if (blockIdx.x*blockDim.x+threadIdx.x == 0) {
        // Select the new step size.

-        mixed totalError = sqrt(error[0]/(NUM_ATOMS*3));
-        mixed newStepSize = sqrt(errorTol/totalError);
+        mixed totalError = SQRT(error[0]/(NUM_ATOMS*3));
+        mixed newStepSize = SQRT(errorTol/totalError);
        mixed oldStepSize = dt[0].y;
        if (oldStepSize > 0.0f)
            newStepSize = min(newStepSize, oldStepSize*2.0f); // For safety, limit how quickly dt can increase.
@@ -108,9 +108,9 @@ extern "C" __global__ void selectLangevinStepSize(mixed maxStepSize, mixed error

        // Recalculate the integration parameters.

-        mixed vscale = exp(-newStepSize/tau);
+        mixed vscale = EXP(-newStepSize/tau);
        mixed fscale = (1-vscale)*tau;
-        mixed noisescale = sqrt(2*kT/tau)*sqrt(0.5f*(1-vscale*vscale)*tau);
+        mixed noisescale = SQRT(2*kT/tau)*SQRT(0.5f*(1-vscale*vscale)*tau);
        params[VelScale] = vscale;
        params[ForceScale] = fscale;
        params[NoiseScale] = noisescale;

--- a/platforms/cuda/src/kernels/nonbonded.cu
+++ b/platforms/cuda/src/kernels/nonbonded.cu
--- a/platforms/cuda/src/kernels/pme.cu
+++ b/platforms/cuda/src/kernels/pme.cu
-extern "C" __global__ void updateBsplines(const real4* __restrict__ posq, real4* __restrict__ pmeBsplineTheta, int2* __restrict__ pmeAtomGridIndex,
+extern "C" __global__ void findAtomGridIndex(const real4* __restrict__ posq, int2* __restrict__ pmeAtomGridIndex,
            real4 periodicBoxSize, real4 invPeriodicBoxSize) {
-    extern __shared__ real3 bsplinesCache[];
-    real3* data = &bsplinesCache[threadIdx.x*PME_ORDER];
-    const real3 scale = make_real3(RECIP(PME_ORDER-1));
+    // Compute the index of the grid point each atom is associated with.
+    
    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
        real4 pos = posq[i];
        pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
@@ -11,11 +10,40 @@ extern "C" __global__ void updateBsplines(const real4* __restrict__ posq, real4*
        real3 t = make_real3((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X,
                             (pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y,
                             (pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z);
-        real3 dr = make_real3(t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z);
        int3 gridIndex = make_int3(((int) t.x) % GRID_SIZE_X,
                                 ((int) t.y) % GRID_SIZE_Y,
                                 ((int) t.z) % GRID_SIZE_Z);
        pmeAtomGridIndex[i] = make_int2(i, gridIndex.x*GRID_SIZE_Y*GRID_SIZE_Z+gridIndex.y*GRID_SIZE_Z+gridIndex.z);
+    }
+}
+
+extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real* __restrict__ originalPmeGrid,
+        real4 periodicBoxSize, real4 invPeriodicBoxSize, const int2* __restrict__ pmeAtomGridIndex) {
+    real3 data[PME_ORDER];
+    const real scale = RECIP(PME_ORDER-1);
+    
+    // Process the atoms in spatially sorted order.  This improves efficiency when writing
+    // the grid values.
+    
+    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
+        int atom = pmeAtomGridIndex[i].x;
+        real charge = posq[atom].w;
+        real3 force = make_real3(0);
+        real4 pos = posq[atom];
+        pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
+        pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
+        pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
+        real3 t = make_real3((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X,
+                             (pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y,
+                             (pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z);
+        int3 gridIndex = make_int3(((int) t.x) % GRID_SIZE_X,
+                                   ((int) t.y) % GRID_SIZE_Y,
+                                   ((int) t.z) % GRID_SIZE_Z);
+
+        // Since we need the full set of thetas, it's faster to compute them here than load them
+        // from global memory.
+        
+        real3 dr = make_real3(t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z);
        data[PME_ORDER-1] = make_real3(0);
        data[1] = dr;
        data[0] = make_real3(1)-dr;
@@ -23,98 +51,46 @@ extern "C" __global__ void updateBsplines(const real4* __restrict__ posq, real4*
            real div = RECIP(j-1);
            data[j-1] = div*dr*data[j-2];
            for (int k = 1; k < (j-1); k++)
-                data[j-k-1] = div*((dr+make_real3(k)) *data[j-k-2] + (make_real3(j-k)-dr)*data[j-k-1]);
+                data[j-k-1] = div*((dr+make_real3(k))*data[j-k-2] + (make_real3(j-k)-dr)*data[j-k-1]);
            data[0] = div*(make_real3(1)-dr)*data[0];
        }
        data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
        for (int j = 1; j < (PME_ORDER-1); j++)
            data[PME_ORDER-j-1] = scale*((dr+make_real3(j))*data[PME_ORDER-j-2] + (make_real3(PME_ORDER-j)-dr)*data[PME_ORDER-j-1]);
        data[0] = scale*(make_real3(1)-dr)*data[0];
-        for (int j = 0; j < PME_ORDER; j++) {
-            real3 d = data[j]; // Copy it as a workaround for a bug in CUDA 5.0
-            pmeBsplineTheta[i+j*NUM_ATOMS] = make_real4(d.x, d.y, d.z, pos.w);  // Storing the charge here improves cache coherency in the charge spreading kernel
-        }
-    }
-}
-
-/**
- * For each grid point, find the range of sorted atoms associated with that point.
- */
-extern "C" __global__ void findAtomRangeForGrid(int2* __restrict__ pmeAtomGridIndex, int* __restrict__ pmeAtomRange, const real4* __restrict__ posq, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
-    int start = (NUM_ATOMS*(blockIdx.x*blockDim.x+threadIdx.x))/(blockDim.x*gridDim.x);
-    int end = (NUM_ATOMS*(blockIdx.x*blockDim.x+threadIdx.x+1))/(blockDim.x*gridDim.x);
-    int last = (start == 0 ? -1 : pmeAtomGridIndex[start-1].y);
-    for (int i = start; i < end; ++i) {
-        int2 atomData = pmeAtomGridIndex[i];
-        int gridIndex = atomData.y;
-        if (gridIndex != last) {
-            for (int j = last+1; j <= gridIndex; ++j)
-                pmeAtomRange[j] = i;
-            last = gridIndex;
-        }
-    }
-
-    // Fill in values beyond the last atom.
-    
-    if (blockIdx.x == gridDim.x-1 && threadIdx.x == blockDim.x-1) {
-        int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
-        for (int j = last+1; j <= gridSize; ++j)
-            pmeAtomRange[j] = NUM_ATOMS;
-    }
-}
-
-#define BUFFER_SIZE (PME_ORDER*PME_ORDER*PME_ORDER)
-extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real* __restrict__ originalPmeGrid,
-        const real4* __restrict__ pmeBsplineTheta, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
-    int ix = threadIdx.x/(PME_ORDER*PME_ORDER);
-    int remainder = threadIdx.x-ix*PME_ORDER*PME_ORDER;
-    int iy = remainder/PME_ORDER;
-    int iz = remainder-iy*PME_ORDER;
-    __shared__ real4 theta[PME_ORDER];
-    __shared__ real charge[BUFFER_SIZE];
-    __shared__ int basex[BUFFER_SIZE];
-    __shared__ int basey[BUFFER_SIZE];
-    __shared__ int basez[BUFFER_SIZE];
-    if (ix < PME_ORDER) {
-        for (int baseIndex = blockIdx.x*BUFFER_SIZE; baseIndex < NUM_ATOMS; baseIndex += gridDim.x*BUFFER_SIZE) {
-            // Load the next block of atoms into the buffers.
+        
+        // Spread the charge from this atom onto each grid point.
+         
+        for (int ix = 0; ix < PME_ORDER; ix++) {
+            int xbase = gridIndex.x+ix;
+            xbase -= (xbase >= GRID_SIZE_X ? GRID_SIZE_X : 0);
+            xbase = xbase*GRID_SIZE_Y*GRID_SIZE_Z;
+            real dx = data[ix].x;
+            
+            for (int iy = 0; iy < PME_ORDER; iy++) {
+                int ybase = gridIndex.y+iy;
+                ybase -= (ybase >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
+                ybase = xbase + ybase*GRID_SIZE_Z;
+                real dy = data[iy].y;
+                
+                for (int iz = 0; iz < PME_ORDER; iz++) {
+                    int zindex = gridIndex.z+iz;
+                    zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
+                    int index = ybase + zindex;

-            int atomIndex = baseIndex+threadIdx.x;
-            if (atomIndex < NUM_ATOMS) {
-                real4 pos = posq[atomIndex];
-                charge[threadIdx.x] = pos.w;
-                pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
-                pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
-                pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
-                basex[threadIdx.x] = (int) ((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X);
-                basey[threadIdx.x] = (int) ((pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y);
-                basez[threadIdx.x] = (int) ((pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z);
-            }
-            __syncthreads();
-            int lastIndex = min(BUFFER_SIZE, NUM_ATOMS-baseIndex);
-            for (int index = 0; index < lastIndex; index++) {
-                int atomIndex = index+baseIndex;
-                if (threadIdx.x < PME_ORDER)
-                    theta[threadIdx.x] = pmeBsplineTheta[atomIndex+threadIdx.x*NUM_ATOMS];
-                __syncthreads();
-                real add = charge[index]*theta[ix].x*theta[iy].y*theta[iz].z;
-                int x = basex[index]+ix;
-                int y = basey[index]+iy;
-                int z = basez[index]+iz;
-                x -= (x >= GRID_SIZE_X ? GRID_SIZE_X : 0);
-                y -= (y >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
-                z -= (z >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
+                    real add = charge*dx*dy*data[iz].z;
 #ifdef USE_DOUBLE_PRECISION
-                unsigned long long * ulonglong_p = (unsigned long long *) originalPmeGrid;
-                atomicAdd(&ulonglong_p[x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z],  static_cast<unsigned long long>((long long) (add*0x100000000)));
+                    unsigned long long * ulonglong_p = (unsigned long long *) originalPmeGrid;
+                    atomicAdd(&ulonglong_p[index],  static_cast<unsigned long long>((long long) (add*0x100000000)));
 #elif __CUDA_ARCH__ < 200
-                unsigned long long * ulonglong_p = (unsigned long long *) originalPmeGrid;
-                int gridIndex = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z;
-                gridIndex = (gridIndex%2 == 0 ? gridIndex/2 : (gridIndex+GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z)/2);
-                atomicAdd(&ulonglong_p[gridIndex],  static_cast<unsigned long long>((long long) (add*0x100000000)));
+                    unsigned long long * ulonglong_p = (unsigned long long *) originalPmeGrid;
+                    int gridIndex = index;
+                    gridIndex = (gridIndex%2 == 0 ? gridIndex/2 : (gridIndex+GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z)/2);
+                    atomicAdd(&ulonglong_p[gridIndex],  static_cast<unsigned long long>((long long) (add*0x100000000)));
 #else
-                atomicAdd(&originalPmeGrid[x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z], add*EPSILON_FACTOR);
+                    atomicAdd(&originalPmeGrid[index], add*EPSILON_FACTOR);
 #endif
+                }
            }
        }
    }
@@ -182,48 +158,52 @@ gridEvaluateEnergy(real2* __restrict__ halfcomplex_pmeGrid, real* __restrict__ e
    const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
    const real recipScaleFactor = RECIP(M_PI*periodicBoxSize.x*periodicBoxSize.y*periodicBoxSize.z);
 
-	real energy = 0;
+    real energy = 0;
    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x) {
        // real indices
        int kx = index/(GRID_SIZE_Y*(GRID_SIZE_Z));
        int remainder = index-kx*GRID_SIZE_Y*(GRID_SIZE_Z);
        int ky = remainder/(GRID_SIZE_Z);
        int kz = remainder-ky*(GRID_SIZE_Z);
-		int mx = (kx < (GRID_SIZE_X+1)/2) ? kx : (kx-GRID_SIZE_X);
-		int my = (ky < (GRID_SIZE_Y+1)/2) ? ky : (ky-GRID_SIZE_Y);
-		int mz = (kz < (GRID_SIZE_Z+1)/2) ? kz : (kz-GRID_SIZE_Z);
-		real mhx = mx*invPeriodicBoxSize.x;
-		real mhy = my*invPeriodicBoxSize.y;
-		real mhz = mz*invPeriodicBoxSize.z;
-		real m2 = mhx*mhx+mhy*mhy+mhz*mhz;
-		real bx = pmeBsplineModuliX[kx];
-		real by = pmeBsplineModuliY[ky];
-		real bz = pmeBsplineModuliZ[kz];
-		real denom = m2*bx*by*bz;
-		real eterm = recipScaleFactor*EXP(-RECIP_EXP_FACTOR*m2)/denom;
+    	int mx = (kx < (GRID_SIZE_X+1)/2) ? kx : (kx-GRID_SIZE_X);
+    	int my = (ky < (GRID_SIZE_Y+1)/2) ? ky : (ky-GRID_SIZE_Y);
+    	int mz = (kz < (GRID_SIZE_Z+1)/2) ? kz : (kz-GRID_SIZE_Z);
+    	real mhx = mx*invPeriodicBoxSize.x;
+    	real mhy = my*invPeriodicBoxSize.y;
+    	real mhz = mz*invPeriodicBoxSize.z;
+    	real m2 = mhx*mhx+mhy*mhy+mhz*mhz;
+    	real bx = pmeBsplineModuliX[kx];
+    	real by = pmeBsplineModuliY[ky];
+    	real bz = pmeBsplineModuliZ[kz];
+    	real denom = m2*bx*by*bz;
+    	real eterm = recipScaleFactor*EXP(-RECIP_EXP_FACTOR*m2)/denom;

-		if(kz >= (GRID_SIZE_Z/2+1)) {
-			kx = ((kx == 0) ? kx : GRID_SIZE_X-kx);
-			ky = ((ky == 0) ? ky : GRID_SIZE_Y-ky);
-			kz = GRID_SIZE_Z-kz;
-		} 
-		int indexInHalfComplexGrid = kz + ky*(GRID_SIZE_Z/2+1)+kx*(GRID_SIZE_Y*(GRID_SIZE_Z/2+1));
-		real2 grid = halfcomplex_pmeGrid[indexInHalfComplexGrid];
-		if (kx != 0 || ky != 0 || kz != 0) {
-			energy += eterm*(grid.x*grid.x + grid.y*grid.y);
-		}
+    	if(kz >= (GRID_SIZE_Z/2+1)) {
+        	kx = ((kx == 0) ? kx : GRID_SIZE_X-kx);
+        	ky = ((ky == 0) ? ky : GRID_SIZE_Y-ky);
+        	kz = GRID_SIZE_Z-kz;
+        } 
+    	int indexInHalfComplexGrid = kz + ky*(GRID_SIZE_Z/2+1)+kx*(GRID_SIZE_Y*(GRID_SIZE_Z/2+1));
+    	real2 grid = halfcomplex_pmeGrid[indexInHalfComplexGrid];
+    	if (kx != 0 || ky != 0 || kz != 0) {
+        	energy += eterm*(grid.x*grid.x + grid.y*grid.y);
+        }
    }
 	energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += 0.5f*energy;
 }

 extern "C" __global__
 void gridInterpolateForce(const real4* __restrict__ posq, unsigned long long* __restrict__ forceBuffers, const real* __restrict__ originalPmeGrid,
-        real4 periodicBoxSize, real4 invPeriodicBoxSize) {
+        real4 periodicBoxSize, real4 invPeriodicBoxSize, const int2* __restrict__ pmeAtomGridIndex) {
    real3 data[PME_ORDER];
    real3 ddata[PME_ORDER];
    const real scale = RECIP(PME_ORDER-1);
-     
-    for (int atom = blockIdx.x*blockDim.x+threadIdx.x; atom < NUM_ATOMS; atom += blockDim.x*gridDim.x) {
+    
+    // Process the atoms in spatially sorted order.  This improves cache performance when loading
+    // the grid values.
+    
+    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
+        int atom = pmeAtomGridIndex[i].x;
        real3 force = make_real3(0);
        real4 pos = posq[atom];
        pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
@@ -233,8 +213,8 @@ void gridInterpolateForce(const real4* __restrict__ posq, unsigned long long* __
                             (pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y,
                             (pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z);
        int3 gridIndex = make_int3(((int) t.x) % GRID_SIZE_X,
-                                 ((int) t.y) % GRID_SIZE_Y,
-                                 ((int) t.z) % GRID_SIZE_Z);
+                                   ((int) t.y) % GRID_SIZE_Y,
+                                   ((int) t.z) % GRID_SIZE_Z);

        // Since we need the full set of thetas, it's faster to compute them here than load them
        // from global memory.
@@ -243,7 +223,6 @@ void gridInterpolateForce(const real4* __restrict__ posq, unsigned long long* __
        data[PME_ORDER-1] = make_real3(0);
        data[1] = dr;
        data[0] = make_real3(1)-dr;
-
        for (int j = 3; j < PME_ORDER; j++) {
            real div = RECIP(j-1);
            data[j-1] = div*dr*data[j-2];
@@ -252,15 +231,13 @@ void gridInterpolateForce(const real4* __restrict__ posq, unsigned long long* __
            data[0] = div*(make_real3(1)-dr)*data[0];
        }
        ddata[0] = -data[0];
-         
        for (int j = 1; j < PME_ORDER; j++)
            ddata[j] = data[j-1]-data[j];
        data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
-        
        for (int j = 1; j < (PME_ORDER-1); j++)
            data[PME_ORDER-j-1] = scale*((dr+make_real3(j))*data[PME_ORDER-j-2] + (make_real3(PME_ORDER-j)-dr)*data[PME_ORDER-j-1]);
        data[0] = scale*(make_real3(1)-dr)*data[0];
-
+        
        // Compute the force on this atom.
         
        for (int ix = 0; ix < PME_ORDER; ix++) {

--- a/platforms/cuda/src/kernels/sort.cu
+++ b/platforms/cuda/src/kernels/sort.cu
@@ -3,7 +3,49 @@ __device__ KEY_TYPE getValue(DATA_TYPE value) {
 }

 extern "C" {
+
+/**
+ * Sort a list that is short enough to entirely fit in local memory.  This is executed as
+ * a single thread block.
+ */
+__global__ void sortShortList(DATA_TYPE* __restrict__ data, unsigned int length) {
+    // Load the data into local memory.
    
+    extern __shared__ DATA_TYPE dataBuffer[];
+    for (int index = threadIdx.x; index < length; index += blockDim.x)
+        dataBuffer[index] = data[index];
+    __syncthreads();
+
+    // Perform a bitonic sort in local memory.
+
+    for (unsigned int k = 2; k < 2*length; k *= 2) {
+        for (unsigned int j = k/2; j > 0; j /= 2) {
+            for (unsigned int i = threadIdx.x; i < length; i += blockDim.x) {
+                int ixj = i^j;
+                if (ixj > i && ixj < length) {
+                    DATA_TYPE value1 = dataBuffer[i];
+                    DATA_TYPE value2 = dataBuffer[ixj];
+                    bool ascending = ((i&k) == 0);
+                    for (unsigned int mask = k*2; mask < 2*length; mask *= 2)
+                        ascending = ((i&mask) == 0 ? !ascending : ascending);
+                    KEY_TYPE lowKey  = (ascending ? getValue(value1) : getValue(value2));
+                    KEY_TYPE highKey = (ascending ? getValue(value2) : getValue(value1));
+                    if (lowKey > highKey) {
+                        dataBuffer[i] = value2;
+                        dataBuffer[ixj] = value1;
+                    }
+                }
+            }
+            __syncthreads();
+        }
+    }
+
+    // Write the data back to global memory.
+
+    for (int index = threadIdx.x; index < length; index += blockDim.x)
+        data[index] = dataBuffer[index];
+}
+
 /**
 * Calculate the minimum and maximum value in the array to be sorted.  This kernel
 * is executed as a single work group.

--- a/platforms/cuda/src/kernels/torsionForce.cu
+++ b/platforms/cuda/src/kernels/torsionForce.cu
@@ -16,12 +16,12 @@ if (cosangle > 0.99f || cosangle < -0.99f) {
        theta = PI-theta;
 }
 else
-   theta = acos(cosangle);
+   theta = ACOS(cosangle);
 theta = (dot(v0, cp1) >= 0 ? theta : -theta);
 COMPUTE_FORCE
 real normCross1 = dot(cp0, cp0);
 real normSqrBC = dot(v1, v1);
-real normBC = sqrt(normSqrBC);
+real normBC = SQRT(normSqrBC);
 real normCross2 = dot(cp1, cp1);
 real dp = RECIP(normSqrBC);
 real4 ff = make_real4((-dEdAngle*normBC)/normCross1, dot(v0, v1)*dp, dot(v2, v1)*dp, (dEdAngle*normBC)/normCross2);

--- a/platforms/cuda/src/kernels/verlet.cu
+++ b/platforms/cuda/src/kernels/verlet.cu
@@ -93,8 +93,8 @@ extern "C" __global__ void selectVerletStepSize(mixed maxStepSize, mixed errorTo
        __syncthreads();
    }
    if (threadIdx.x == 0) {
-        mixed totalError = sqrt(error[0]/(NUM_ATOMS*3));
-        mixed newStepSize = sqrt(errorTol/totalError);
+        mixed totalError = SQRT(error[0]/(NUM_ATOMS*3));
+        mixed newStepSize = SQRT(errorTol/totalError);
        mixed oldStepSize = dt[0].y;
        if (oldStepSize > 0.0f)
            newStepSize = min(newStepSize, oldStepSize*2.0f); // For safety, limit how quickly dt can increase.

--- a/platforms/cuda/tests/TestCudaNonbondedForce.cpp
+++ b/platforms/cuda/tests/TestCudaNonbondedForce.cpp
--- a/platforms/cuda/tests/TestCudaSort.cpp
+++ b/platforms/cuda/tests/TestCudaSort.cpp
--- a/platforms/opencl/src/OpenCLBondedUtilities.cpp
+++ b/platforms/opencl/src/OpenCLBondedUtilities.cpp
--- a/platforms/opencl/src/OpenCLContext.cpp
+++ b/platforms/opencl/src/OpenCLContext.cpp
--- a/platforms/opencl/src/OpenCLFFT3D.cpp
+++ b/platforms/opencl/src/OpenCLFFT3D.cpp
--- a/platforms/opencl/src/OpenCLFFT3D.h
+++ b/platforms/opencl/src/OpenCLFFT3D.h
@@ -81,8 +81,9 @@ public:
     */
    static int findLegalDimension(int minimum);
 private:
-    cl::Kernel createKernel(int xsize, int ysize, int zsize);
+    cl::Kernel createKernel(int xsize, int ysize, int zsize, int& threads);
    int xsize, ysize, zsize;
+    int xthreads, ythreads, zthreads;
    OpenCLContext& context;
    cl::Kernel xkernel, ykernel, zkernel;
 };

--- a/platforms/opencl/src/OpenCLIntegrationUtilities.cpp
+++ b/platforms/opencl/src/OpenCLIntegrationUtilities.cpp
--- a/platforms/opencl/src/OpenCLIntegrationUtilities.h
+++ b/platforms/opencl/src/OpenCLIntegrationUtilities.h
@@ -141,8 +141,6 @@ private:
    OpenCLArray* ccmaDelta1;
    OpenCLArray* ccmaDelta2;
    OpenCLArray* ccmaConverged;
-    cl::Buffer* ccmaConvergedBuffer;
-    cl_int* ccmaConvergedMemory;
    OpenCLArray* vsite2AvgAtoms;
    OpenCLArray* vsite2AvgWeights;
    OpenCLArray* vsite3AvgAtoms;

--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
--- a/platforms/opencl/src/OpenCLKernels.h
+++ b/platforms/opencl/src/OpenCLKernels.h
--- a/platforms/opencl/src/OpenCLNonbondedUtilities.cpp
+++ b/platforms/opencl/src/OpenCLNonbondedUtilities.cpp
--- a/platforms/opencl/src/OpenCLNonbondedUtilities.h
+++ b/platforms/opencl/src/OpenCLNonbondedUtilities.h
--- a/platforms/opencl/src/OpenCLParallelKernels.cpp
+++ b/platforms/opencl/src/OpenCLParallelKernels.cpp
--- a/platforms/opencl/src/OpenCLParallelKernels.h
+++ b/platforms/opencl/src/OpenCLParallelKernels.h