Merge branch 'master' into pthreads

Conflicts: libraries/pthreads/include/pthread.h

Merge branch 'master' into pthreads
Conflicts: libraries/pthreads/include/pthread.h
59bccb15 · peastman · 6cf75568 · 4bdbcf4d · 59bccb15 · 59bccb15
Commit 59bccb15 authored May 15, 2015 by peastman
20 changed files
--- a/platforms/opencl/src/OpenCLContext.cpp
+++ b/platforms/opencl/src/OpenCLContext.cpp
@@ -106,9 +106,8 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
                // if they supplied a valid deviceIndex, we only look through that one
                if (i != deviceIndex && deviceIndex >= 0 && deviceIndex < (int) devices.size())
                    continue;
-
-                if (platformVendor == "Apple" && (devices[i].getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU || devices[i].getInfo<CL_DEVICE_VENDOR>() == "AMD"))
-                    continue; // The CPU device on OS X won't work correctly, and there are serious bugs using AMD GPUs.
+                if (platformVendor == "Apple" && (devices[i].getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU))
+                    continue; // The CPU device on OS X won't work correctly.
                int maxSize = devices[i].getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>()[0];
                int processingElementsPerComputeUnit = 8;
                if (devices[i].getInfo<CL_DEVICE_TYPE>() != CL_DEVICE_TYPE_GPU) {
@@ -170,6 +169,8 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
        compilationDefines["WORK_GROUP_SIZE"] = intToString(ThreadBlockSize);
        if (platformVendor.size() >= 5 && platformVendor.substr(0, 5) == "Intel")
            defaultOptimizationOptions = "";
+        else if (platformVendor == "Apple")
+            defaultOptimizationOptions = "-cl-mad-enable -cl-no-signed-zeros";
        else
            defaultOptimizationOptions = "-cl-fast-relaxed-math";
        supports64BitGlobalAtomics = (device.getInfo<CL_DEVICE_EXTENSIONS>().find("cl_khr_int64_base_atomics") != string::npos);
@@ -241,8 +242,6 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
        }
        else
            simdWidth = 1;
-        if (platformVendor == "Apple" && vendor == "AMD")
-            compilationDefines["MAC_AMD_WORKAROUND"] = "";
        if (supports64BitGlobalAtomics)
            compilationDefines["SUPPORTS_64_BIT_ATOMICS"] = "";
        if (supportsDoublePrecision)

--- a/platforms/opencl/src/OpenCLExpressionUtilities.cpp
+++ b/platforms/opencl/src/OpenCLExpressionUtilities.cpp
@@ -455,6 +455,9 @@ void OpenCLExpressionUtilities::processExpression(stringstream& out, const Expre
        case Operation::CEIL:
            out << "ceil(" << getTempName(node.getChildren()[0], temps) << ")";
            break;
+        case Operation::SELECT:
+            out << "(" << getTempName(node.getChildren()[0], temps) << " != 0 ? " << getTempName(node.getChildren()[1], temps) << " : " << getTempName(node.getChildren()[2], temps) << ")";
+            break;
        default:
            throw OpenMMException("Internal error: Unknown operation in user-defined expression: "+node.getOperation().getName());
    }

--- a/platforms/opencl/src/OpenCLFFT3D.cpp
+++ b/platforms/opencl/src/OpenCLFFT3D.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2012 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2015 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -35,25 +35,109 @@
 using namespace OpenMM;
 using namespace std;

-OpenCLFFT3D::OpenCLFFT3D(OpenCLContext& context, int xsize, int ysize, int zsize) : context(context), xsize(xsize), ysize(ysize), zsize(zsize) {
-    zkernel = createKernel(xsize, ysize, zsize, zthreads);
-    xkernel = createKernel(ysize, zsize, xsize, xthreads);
-    ykernel = createKernel(zsize, xsize, ysize, ythreads);
+OpenCLFFT3D::OpenCLFFT3D(OpenCLContext& context, int xsize, int ysize, int zsize, bool realToComplex) :
+        context(context), xsize(xsize), ysize(ysize), zsize(zsize) {
+    packRealAsComplex = false;
+    int packedXSize = xsize;
+    int packedYSize = ysize;
+    int packedZSize = zsize;
+    if (realToComplex) {
+        // If any axis size is even, we can pack the real values into a complex grid that is only half as large.
+        // Look for an appropriate axis.
+        
+        packRealAsComplex = true;
+        int packedAxis, bufferSize;
+        if (xsize%2 == 0) {
+            packedAxis = 0;
+            packedXSize /= 2;
+            bufferSize = packedXSize;
+        }
+        else if (ysize%2 == 0) {
+            packedAxis = 1;
+            packedYSize /= 2;
+            bufferSize = packedYSize;
+        }
+        else if (zsize%2 == 0) {
+            packedAxis = 2;
+            packedZSize /= 2;
+            bufferSize = packedZSize;
+        }
+        else
+            packRealAsComplex = false;
+        if (packRealAsComplex) {
+            // Build the kernels for packing and unpacking the data.
+            
+            map<string, string> defines;
+            defines["XSIZE"] = context.intToString(xsize);
+            defines["YSIZE"] = context.intToString(ysize);
+            defines["ZSIZE"] = context.intToString(zsize);
+            defines["PACKED_AXIS"] = context.intToString(packedAxis);
+            defines["PACKED_XSIZE"] = context.intToString(packedXSize);
+            defines["PACKED_YSIZE"] = context.intToString(packedYSize);
+            defines["PACKED_ZSIZE"] = context.intToString(packedZSize);
+            defines["M_PI"] = context.doubleToString(M_PI);
+            cl::Program program = context.createProgram(OpenCLKernelSources::fftR2C, defines);
+            packForwardKernel = cl::Kernel(program, "packForwardData");
+            unpackForwardKernel = cl::Kernel(program, "unpackForwardData");
+            unpackForwardKernel.setArg(2, bufferSize*(context.getUseDoublePrecision() ? sizeof(mm_double2) : sizeof(mm_float2)), NULL);
+            packBackwardKernel = cl::Kernel(program, "packBackwardData");
+            packBackwardKernel.setArg(2, bufferSize*(context.getUseDoublePrecision() ? sizeof(mm_double2) : sizeof(mm_float2)), NULL);
+            unpackBackwardKernel = cl::Kernel(program, "unpackBackwardData");
+        }
+    }
+    bool inputIsReal = (realToComplex && !packRealAsComplex);
+    zkernel = createKernel(packedXSize, packedYSize, packedZSize, zthreads, 0, true, inputIsReal);
+    xkernel = createKernel(packedYSize, packedZSize, packedXSize, xthreads, 1, true, inputIsReal);
+    ykernel = createKernel(packedZSize, packedXSize, packedYSize, ythreads, 2, true, inputIsReal);
+    invzkernel = createKernel(packedXSize, packedYSize, packedZSize, zthreads, 0, false, inputIsReal);
+    invxkernel = createKernel(packedYSize, packedZSize, packedXSize, xthreads, 1, false, inputIsReal);
+    invykernel = createKernel(packedZSize, packedXSize, packedYSize, ythreads, 2, false, inputIsReal);
 }

 void OpenCLFFT3D::execFFT(OpenCLArray& in, OpenCLArray& out, bool forward) {
-    zkernel.setArg<cl::Buffer>(0, in.getDeviceBuffer());
-    zkernel.setArg<cl::Buffer>(1, out.getDeviceBuffer());
-    zkernel.setArg<cl_int>(2, forward ? 1 : -1);
-    context.executeKernel(zkernel, xsize*ysize*zsize, zthreads);
-    xkernel.setArg<cl::Buffer>(0, out.getDeviceBuffer());
-    xkernel.setArg<cl::Buffer>(1, in.getDeviceBuffer());
-    xkernel.setArg<cl_int>(2, forward ? 1 : -1);
-    context.executeKernel(xkernel, xsize*ysize*zsize, xthreads);
-    ykernel.setArg<cl::Buffer>(0, in.getDeviceBuffer());
-    ykernel.setArg<cl::Buffer>(1, out.getDeviceBuffer());
-    ykernel.setArg<cl_int>(2, forward ? 1 : -1);
-    context.executeKernel(ykernel, xsize*ysize*zsize, ythreads);
+    cl::Kernel kernel1 = (forward ? zkernel : invzkernel);
+    cl::Kernel kernel2 = (forward ? xkernel : invxkernel);
+    cl::Kernel kernel3 = (forward ? ykernel : invykernel);
+    if (packRealAsComplex) {
+        cl::Kernel packKernel = (forward ? packForwardKernel : packBackwardKernel);
+        cl::Kernel unpackKernel = (forward ? unpackForwardKernel : unpackBackwardKernel);
+        int gridSize = xsize*ysize*zsize/2;
+
+        // Pack the data into a half sized grid.
+        
+        packKernel.setArg<cl::Buffer>(0, in.getDeviceBuffer());
+        packKernel.setArg<cl::Buffer>(1, out.getDeviceBuffer());
+        context.executeKernel(packKernel, gridSize);
+        
+        // Perform the FFT.
+        
+        kernel1.setArg<cl::Buffer>(0, out.getDeviceBuffer());
+        kernel1.setArg<cl::Buffer>(1, in.getDeviceBuffer());
+        context.executeKernel(kernel1, gridSize, zthreads);
+        kernel2.setArg<cl::Buffer>(0, in.getDeviceBuffer());
+        kernel2.setArg<cl::Buffer>(1, out.getDeviceBuffer());
+        context.executeKernel(kernel2, gridSize, xthreads);
+        kernel3.setArg<cl::Buffer>(0, out.getDeviceBuffer());
+        kernel3.setArg<cl::Buffer>(1, in.getDeviceBuffer());
+        context.executeKernel(kernel3, gridSize, ythreads);
+        
+        // Unpack the data.
+        
+        unpackKernel.setArg<cl::Buffer>(0, in.getDeviceBuffer());
+        unpackKernel.setArg<cl::Buffer>(1, out.getDeviceBuffer());
+        context.executeKernel(unpackKernel, gridSize);
+    }
+    else {
+        kernel1.setArg<cl::Buffer>(0, in.getDeviceBuffer());
+        kernel1.setArg<cl::Buffer>(1, out.getDeviceBuffer());
+        context.executeKernel(kernel1, xsize*ysize*zsize, zthreads);
+        kernel2.setArg<cl::Buffer>(0, out.getDeviceBuffer());
+        kernel2.setArg<cl::Buffer>(1, in.getDeviceBuffer());
+        context.executeKernel(kernel2, xsize*ysize*zsize, xthreads);
+        kernel3.setArg<cl::Buffer>(0, in.getDeviceBuffer());
+        kernel3.setArg<cl::Buffer>(1, out.getDeviceBuffer());
+        context.executeKernel(kernel3, xsize*ysize*zsize, ythreads);
+    }
 }

 int OpenCLFFT3D::findLegalDimension(int minimum) {
@@ -73,8 +157,10 @@ int OpenCLFFT3D::findLegalDimension(int minimum) {
    }
 }

-cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threads) {
+cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threads, int axis, bool forward, bool inputIsReal) {
    int maxThreads = std::min(256, (int) context.getDevice().getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>());
+    while (maxThreads > 128 && maxThreads-64 >= zsize)
+        maxThreads -= 64;
    bool isCPU = context.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU;
    while (true) {
        bool loopRequired = (zsize > maxThreads || isCPU);
@@ -137,10 +223,10 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
                source<<"real2 b2 = "<<context.doubleToString((2*cos(2*M_PI/7)-cos(4*M_PI/7)-cos(6*M_PI/7))/3)<<"*(d0-d4);\n";
                source<<"real2 b3 = "<<context.doubleToString((cos(2*M_PI/7)-2*cos(4*M_PI/7)+cos(6*M_PI/7))/3)<<"*(d4-d2);\n";
                source<<"real2 b4 = "<<context.doubleToString((cos(2*M_PI/7)+cos(4*M_PI/7)-2*cos(6*M_PI/7))/3)<<"*(d2-d0);\n";
-                source<<"real2 b5 = -sign*"<<context.doubleToString((sin(2*M_PI/7)+sin(4*M_PI/7)-sin(6*M_PI/7))/3)<<"*(d7+d1);\n";
-                source<<"real2 b6 = -sign*"<<context.doubleToString((2*sin(2*M_PI/7)-sin(4*M_PI/7)+sin(6*M_PI/7))/3)<<"*(d1-d5);\n";
-                source<<"real2 b7 = -sign*"<<context.doubleToString((sin(2*M_PI/7)-2*sin(4*M_PI/7)-sin(6*M_PI/7))/3)<<"*(d5-d3);\n";
-                source<<"real2 b8 = -sign*"<<context.doubleToString((sin(2*M_PI/7)+sin(4*M_PI/7)+2*sin(6*M_PI/7))/3)<<"*(d3-d1);\n";
+                source<<"real2 b5 = -(SIGN)*"<<context.doubleToString((sin(2*M_PI/7)+sin(4*M_PI/7)-sin(6*M_PI/7))/3)<<"*(d7+d1);\n";
+                source<<"real2 b6 = -(SIGN)*"<<context.doubleToString((2*sin(2*M_PI/7)-sin(4*M_PI/7)+sin(6*M_PI/7))/3)<<"*(d1-d5);\n";
+                source<<"real2 b7 = -(SIGN)*"<<context.doubleToString((sin(2*M_PI/7)-2*sin(4*M_PI/7)-sin(6*M_PI/7))/3)<<"*(d5-d3);\n";
+                source<<"real2 b8 = -(SIGN)*"<<context.doubleToString((sin(2*M_PI/7)+sin(4*M_PI/7)+2*sin(6*M_PI/7))/3)<<"*(d3-d1);\n";
                source<<"real2 t0 = b0+b1;\n";
                source<<"real2 t1 = b2+b3;\n";
                source<<"real2 t2 = b4-b3;\n";
@@ -178,8 +264,8 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
                source<<"real2 d7 = d6+d5;\n";
                source<<"real2 d8 = d6-d5;\n";
                string coeff = context.doubleToString(sin(0.2*M_PI)/sin(0.4*M_PI));
-                source<<"real2 d9 = sign*(real2) (d2.y+"<<coeff<<"*d3.y, -d2.x-"<<coeff<<"*d3.x);\n";
-                source<<"real2 d10 = sign*(real2) ("<<coeff<<"*d2.y-d3.y, d3.x-"<<coeff<<"*d2.x);\n";
+                source<<"real2 d9 = (SIGN)*(real2) (d2.y+"<<coeff<<"*d3.y, -d2.x-"<<coeff<<"*d3.x);\n";
+                source<<"real2 d10 = (SIGN)*(real2) ("<<coeff<<"*d2.y-d3.y, d3.x-"<<coeff<<"*d2.x);\n";
                source<<"data"<<output<<"[base+4*j*"<<m<<"] = c0+d4;\n";
                source<<"data"<<output<<"[base+(4*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(5*L)<<"], d7+d9);\n";
                source<<"data"<<output<<"[base+(4*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(5*L)<<"], d8+d10);\n";
@@ -194,7 +280,7 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
                source<<"real2 d0 = c0+c2;\n";
                source<<"real2 d1 = c0-c2;\n";
                source<<"real2 d2 = c1+c3;\n";
-                source<<"real2 d3 = sign*(real2) (c1.y-c3.y, c3.x-c1.x);\n";
+                source<<"real2 d3 = (SIGN)*(real2) (c1.y-c3.y, c3.x-c1.x);\n";
                source<<"data"<<output<<"[base+3*j*"<<m<<"] = d0+d2;\n";
                source<<"data"<<output<<"[base+(3*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(4*L)<<"], d1+d3);\n";
                source<<"data"<<output<<"[base+(3*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(4*L)<<"], d0-d2);\n";
@@ -206,7 +292,7 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
                source<<"real2 c2 = data"<<input<<"[base+"<<(2*L*m)<<"];\n";
                source<<"real2 d0 = c1+c2;\n";
                source<<"real2 d1 = c0-0.5f*d0;\n";
-                source<<"real2 d2 = sign*"<<context.doubleToString(sin(M_PI/3.0))<<"*(real2) (c1.y-c2.y, c2.x-c1.x);\n";
+                source<<"real2 d2 = (SIGN)*"<<context.doubleToString(sin(M_PI/3.0))<<"*(real2) (c1.y-c2.y, c2.x-c1.x);\n";
                source<<"data"<<output<<"[base+2*j*"<<m<<"] = c0+d0;\n";
                source<<"data"<<output<<"[base+(2*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(3*L)<<"], d1+d2);\n";
                source<<"data"<<output<<"[base+(2*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(3*L)<<"], d1-d2);\n";
@@ -226,13 +312,27 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa

        // Create the kernel.

+        bool outputIsReal = (inputIsReal && axis == 2 && !forward);
+        bool outputIsPacked = (inputIsReal && axis == 2 && forward);
+        string outputSuffix = (outputIsReal ? ".x" : "");
        if (loopRequired) {
+            if (outputIsPacked)
+                source<<"if (x < XSIZE/2+1)\n";
            source<<"for (int z = get_local_id(0); z < ZSIZE; z += get_local_size(0))\n";
-            source<<"out[y*(ZSIZE*XSIZE)+z*XSIZE+x] = data"<<(stage%2)<<"[z];\n";
+            if (outputIsPacked)
+                source<<"out[y*(ZSIZE*(XSIZE/2+1))+z*(XSIZE/2+1)+x] = data"<<(stage%2)<<"[z]"<<outputSuffix<<";\n";
+            else
+                source<<"out[y*(ZSIZE*XSIZE)+z*XSIZE+x] = data"<<(stage%2)<<"[z]"<<outputSuffix<<";\n";
+        }
+        else {
+            if (outputIsPacked) {
+                source<<"if (index < XSIZE*YSIZE && x < XSIZE/2+1)\n";
+                source<<"out[y*(ZSIZE*(XSIZE/2+1))+(get_local_id(0)%ZSIZE)*(XSIZE/2+1)+x] = data"<<(stage%2)<<"[get_local_id(0)]"<<outputSuffix<<";\n";
            }
            else {
                source<<"if (index < XSIZE*YSIZE)\n";
-            source<<"out[y*(ZSIZE*XSIZE)+(get_local_id(0)%ZSIZE)*XSIZE+x] = data"<<(stage%2)<<"[get_local_id(0)];\n";
+                source<<"out[y*(ZSIZE*XSIZE)+(get_local_id(0)%ZSIZE)*XSIZE+x] = data"<<(stage%2)<<"[get_local_id(0)]"<<outputSuffix<<";\n";
+            }
        }
        map<string, string> replacements;
        replacements["XSIZE"] = context.intToString(xsize);
@@ -242,6 +342,12 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
        replacements["M_PI"] = context.doubleToString(M_PI);
        replacements["COMPUTE_FFT"] = source.str();
        replacements["LOOP_REQUIRED"] = (loopRequired ? "1" : "0");
+        replacements["SIGN"] = (forward ? "1" : "-1");
+        replacements["INPUT_TYPE"] = (inputIsReal && axis == 0 && forward ? "real" : "real2");
+        replacements["OUTPUT_TYPE"] = (outputIsReal ? "real" : "real2");
+        replacements["INPUT_IS_REAL"] = (inputIsReal && axis == 0 && forward ? "1" : "0");
+        replacements["INPUT_IS_PACKED"] = (inputIsReal && axis == 0 && !forward ? "1" : "0");
+        replacements["OUTPUT_IS_PACKED"] = (outputIsPacked ? "1" : "0");
        cl::Program program = context.createProgram(context.replaceStrings(OpenCLKernelSources::fft, replacements));
        cl::Kernel kernel(program, "execFFT");
        threads = (isCPU ? 1 : blocksPerGroup*zsize);
@@ -253,9 +359,9 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
            continue;
        }
        int bufferSize = blocksPerGroup*zsize*(context.getUseDoublePrecision() ? sizeof(mm_double2) : sizeof(mm_float2));
+        kernel.setArg(2, bufferSize, NULL);
        kernel.setArg(3, bufferSize, NULL);
        kernel.setArg(4, bufferSize, NULL);
-        kernel.setArg(5, bufferSize, NULL);
        return kernel;
    }
 }
--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
@@ -1628,6 +1628,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
            pmeDefines["GRID_SIZE_Y"] = cl.intToString(gridSizeY);
            pmeDefines["GRID_SIZE_Z"] = cl.intToString(gridSizeZ);
            pmeDefines["EPSILON_FACTOR"] = cl.doubleToString(sqrt(ONE_4PI_EPS0));
+            pmeDefines["M_PI"] = cl.doubleToString(M_PI);
            bool deviceIsCpu = (cl.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU);
            if (deviceIsCpu)
                pmeDefines["DEVICE_IS_CPU"] = "1";
@@ -1652,8 +1653,11 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb

                int elementSize = (cl.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
                pmeGrid = new OpenCLArray(cl, gridSizeX*gridSizeY*gridSizeZ, 2*elementSize, "pmeGrid");
-                cl.addAutoclearBuffer(*pmeGrid);
                pmeGrid2 = new OpenCLArray(cl, gridSizeX*gridSizeY*gridSizeZ, 2*elementSize, "pmeGrid2");
+                if (cl.getSupports64BitGlobalAtomics())
+                    cl.addAutoclearBuffer(*pmeGrid2);
+                else
+                    cl.addAutoclearBuffer(*pmeGrid);
                pmeBsplineModuliX = new OpenCLArray(cl, gridSizeX, elementSize, "pmeBsplineModuliX");
                pmeBsplineModuliY = new OpenCLArray(cl, gridSizeY, elementSize, "pmeBsplineModuliY");
                pmeBsplineModuliZ = new OpenCLArray(cl, gridSizeZ, elementSize, "pmeBsplineModuliZ");
@@ -1661,9 +1665,12 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
                pmeAtomRange = OpenCLArray::create<cl_int>(cl, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange");
                pmeAtomGridIndex = OpenCLArray::create<mm_int2>(cl, numParticles, "pmeAtomGridIndex");
                sort = new OpenCLSort(cl, new SortTrait(), cl.getNumAtoms());
-                fft = new OpenCLFFT3D(cl, gridSizeX, gridSizeY, gridSizeZ);
+                fft = new OpenCLFFT3D(cl, gridSizeX, gridSizeY, gridSizeZ, true);
                string vendor = cl.getDevice().getInfo<CL_DEVICE_VENDOR>();
-                usePmeQueue = (vendor.size() >= 6 && vendor.substr(0, 6) == "NVIDIA");
+                bool isNvidia = (vendor.size() >= 6 && vendor.substr(0, 6) == "NVIDIA");
+                if (isNvidia)
+                    pmeDefines["USE_ALTERNATE_MEMORY_ACCESS_PATTERN"] = "1";
+                usePmeQueue = isNvidia;
                if (usePmeQueue) {
                    pmeQueue = cl::CommandQueue(cl.getContext(), cl.getDevice());
                    int recipForceGroup = force.getReciprocalSpaceForceGroup();
@@ -1800,6 +1807,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
            pmeZIndexKernel = cl::Kernel(program, "recordZIndex");
            pmeSpreadChargeKernel = cl::Kernel(program, "gridSpreadCharge");
            pmeConvolutionKernel = cl::Kernel(program, "reciprocalConvolution");
+            pmeEvalEnergyKernel = cl::Kernel(program, "gridEvaluateEnergy");
            pmeInterpolateForceKernel = cl::Kernel(program, "gridInterpolateForce");
            int elementSize = (cl.getUseDoublePrecision() ? sizeof(mm_double4) : sizeof(mm_float4));
            pmeUpdateBsplinesKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
@@ -1814,20 +1822,28 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
            pmeSpreadChargeKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
            pmeSpreadChargeKernel.setArg<cl::Buffer>(1, pmeAtomGridIndex->getDeviceBuffer());
            pmeSpreadChargeKernel.setArg<cl::Buffer>(2, pmeAtomRange->getDeviceBuffer());
+            if (cl.getSupports64BitGlobalAtomics())
+                pmeSpreadChargeKernel.setArg<cl::Buffer>(3, pmeGrid2->getDeviceBuffer());
+            else
                pmeSpreadChargeKernel.setArg<cl::Buffer>(3, pmeGrid->getDeviceBuffer());
            pmeSpreadChargeKernel.setArg<cl::Buffer>(4, pmeBsplineTheta->getDeviceBuffer());
            pmeConvolutionKernel.setArg<cl::Buffer>(0, pmeGrid2->getDeviceBuffer());
-            pmeConvolutionKernel.setArg<cl::Buffer>(1, cl.getEnergyBuffer().getDeviceBuffer());
-            pmeConvolutionKernel.setArg<cl::Buffer>(2, pmeBsplineModuliX->getDeviceBuffer());
-            pmeConvolutionKernel.setArg<cl::Buffer>(3, pmeBsplineModuliY->getDeviceBuffer());
-            pmeConvolutionKernel.setArg<cl::Buffer>(4, pmeBsplineModuliZ->getDeviceBuffer());
+            pmeConvolutionKernel.setArg<cl::Buffer>(1, pmeBsplineModuliX->getDeviceBuffer());
+            pmeConvolutionKernel.setArg<cl::Buffer>(2, pmeBsplineModuliY->getDeviceBuffer());
+            pmeConvolutionKernel.setArg<cl::Buffer>(3, pmeBsplineModuliZ->getDeviceBuffer());
+            pmeEvalEnergyKernel.setArg<cl::Buffer>(0, pmeGrid2->getDeviceBuffer());
+            pmeEvalEnergyKernel.setArg<cl::Buffer>(1, cl.getEnergyBuffer().getDeviceBuffer());
+            pmeEvalEnergyKernel.setArg<cl::Buffer>(2, pmeBsplineModuliX->getDeviceBuffer());
+            pmeEvalEnergyKernel.setArg<cl::Buffer>(3, pmeBsplineModuliY->getDeviceBuffer());
+            pmeEvalEnergyKernel.setArg<cl::Buffer>(4, pmeBsplineModuliZ->getDeviceBuffer());
            pmeInterpolateForceKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
            pmeInterpolateForceKernel.setArg<cl::Buffer>(1, cl.getForceBuffers().getDeviceBuffer());
            pmeInterpolateForceKernel.setArg<cl::Buffer>(2, pmeGrid->getDeviceBuffer());
            pmeInterpolateForceKernel.setArg<cl::Buffer>(7, pmeAtomGridIndex->getDeviceBuffer());
            if (cl.getSupports64BitGlobalAtomics()) {
                pmeFinishSpreadChargeKernel = cl::Kernel(program, "finishSpreadCharge");
-                pmeFinishSpreadChargeKernel.setArg<cl::Buffer>(0, pmeGrid->getDeviceBuffer());
+                pmeFinishSpreadChargeKernel.setArg<cl::Buffer>(0, pmeGrid2->getDeviceBuffer());
+                pmeFinishSpreadChargeKernel.setArg<cl::Buffer>(1, pmeGrid->getDeviceBuffer());
            }
       }
    }
@@ -1851,7 +1867,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
        cl.executeKernel(ewaldForcesKernel, cl.getNumAtoms());
    }
    if (pmeGrid != NULL && includeReciprocal) {
-        if (usePmeQueue)
+        if (usePmeQueue && !includeEnergy)
            cl.setQueue(pmeQueue);
        
        // Invert the periodic box vectors.
@@ -1926,19 +1942,24 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
        }
        fft->execFFT(*pmeGrid, *pmeGrid2, true);
        mm_double4 boxSize = cl.getPeriodicBoxSizeDouble();
-        double scaleFactor = 1.0/(M_PI*boxSize.x*boxSize.y*boxSize.z);
        if (cl.getUseDoublePrecision()) {
-            pmeConvolutionKernel.setArg<mm_double4>(5, recipBoxVectors[0]);
-            pmeConvolutionKernel.setArg<mm_double4>(6, recipBoxVectors[1]);
-            pmeConvolutionKernel.setArg<mm_double4>(7, recipBoxVectors[2]);
-            pmeConvolutionKernel.setArg<cl_double>(8, scaleFactor);
+            pmeConvolutionKernel.setArg<mm_double4>(4, recipBoxVectors[0]);
+            pmeConvolutionKernel.setArg<mm_double4>(5, recipBoxVectors[1]);
+            pmeConvolutionKernel.setArg<mm_double4>(6, recipBoxVectors[2]);
+            pmeEvalEnergyKernel.setArg<mm_double4>(5, recipBoxVectors[0]);
+            pmeEvalEnergyKernel.setArg<mm_double4>(6, recipBoxVectors[1]);
+            pmeEvalEnergyKernel.setArg<mm_double4>(7, recipBoxVectors[2]);
        }
        else {
-            pmeConvolutionKernel.setArg<mm_float4>(5, recipBoxVectorsFloat[0]);
-            pmeConvolutionKernel.setArg<mm_float4>(6, recipBoxVectorsFloat[1]);
-            pmeConvolutionKernel.setArg<mm_float4>(7, recipBoxVectorsFloat[2]);
-            pmeConvolutionKernel.setArg<cl_float>(8, (float) scaleFactor);
-        }
+            pmeConvolutionKernel.setArg<mm_float4>(4, recipBoxVectorsFloat[0]);
+            pmeConvolutionKernel.setArg<mm_float4>(5, recipBoxVectorsFloat[1]);
+            pmeConvolutionKernel.setArg<mm_float4>(6, recipBoxVectorsFloat[2]);
+            pmeEvalEnergyKernel.setArg<mm_float4>(5, recipBoxVectorsFloat[0]);
+            pmeEvalEnergyKernel.setArg<mm_float4>(6, recipBoxVectorsFloat[1]);
+            pmeEvalEnergyKernel.setArg<mm_float4>(7, recipBoxVectorsFloat[2]);
+        }
+        if (includeEnergy)
+            cl.executeKernel(pmeEvalEnergyKernel, cl.getNumAtoms());
        cl.executeKernel(pmeConvolutionKernel, cl.getNumAtoms());
        fft->execFFT(*pmeGrid2, *pmeGrid, false);
        setPeriodicBoxSizeArg(cl, pmeInterpolateForceKernel, 3);
@@ -5972,7 +5993,7 @@ string OpenCLIntegrateCustomStepKernel::createGlobalComputation(const string& va
    variables["dt"] = "dt[0].y";
    variables["uniform"] = "uniform";
    variables["gaussian"] = "gaussian";
-    variables[energyName] = "energy[0]";
+    variables[energyName] = "energy";
    for (int i = 0; i < integrator.getNumGlobalVariables(); i++)
        variables[integrator.getGlobalVariableName(i)] = "globals["+cl.intToString(i)+"]";
    for (int i = 0; i < (int) parameterNames.size(); i++)
@@ -6008,7 +6029,7 @@ string OpenCLIntegrateCustomStepKernel::createPerDofComputation(const string& va
    variables["m"] = "mass";
    variables["dt"] = "stepSize";
    if (energyName != "")
-        variables[energyName] = "energy[0]";
+        variables[energyName] = "energy";
    for (int i = 0; i < integrator.getNumGlobalVariables(); i++)
        variables[integrator.getGlobalVariableName(i)] = "globals["+cl.intToString(i)+"]";
    for (int i = 0; i < integrator.getNumPerDofVariables(); i++)
@@ -6238,8 +6259,7 @@ void OpenCLIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context
                kernel.setArg<cl::Buffer>(index++, globalValues->getDeviceBuffer());
                kernel.setArg<cl::Buffer>(index++, contextParameterValues->getDeviceBuffer());
                kernel.setArg<cl::Buffer>(index++, sumBuffer->getDeviceBuffer());
-                index += 3;
-                kernel.setArg<cl::Buffer>(index++, potentialEnergy->getDeviceBuffer());
+                index += 4;
                for (int i = 0; i < (int) perDofValues->getBuffers().size(); i++)
                    kernel.setArg<cl::Memory>(index++, perDofValues->getBuffers()[i].getMemory());
                if (stepType[step] == CustomIntegrator::ComputeSum) {
@@ -6284,8 +6304,6 @@ void OpenCLIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context
                kernel.setArg<cl::Buffer>(index++, integration.getStepSize().getDeviceBuffer());
                kernel.setArg<cl::Buffer>(index++, globalValues->getDeviceBuffer());
                kernel.setArg<cl::Buffer>(index++, contextParameterValues->getDeviceBuffer());
-                index += 2;
-                kernel.setArg<cl::Buffer>(index++, potentialEnergy->getDeviceBuffer());
            }
            else if (stepType[step] == CustomIntegrator::ConstrainPositions) {
                // Apply position constraints.
@@ -6326,16 +6344,6 @@ void OpenCLIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context
        randomKernel.setArg<cl::Buffer>(1, uniformRandoms->getDeviceBuffer());
        randomKernel.setArg<cl::Buffer>(2, randomSeed->getDeviceBuffer());
        
-        // Create the kernel for summing the potential energy.
-
-        cl::Program program = cl.createProgram(OpenCLKernelSources::customIntegrator, defines);
-        sumPotentialEnergyKernel = cl::Kernel(program, cl.getUseDoublePrecision() ? "computeDoubleSum" : "computeFloatSum");
-        int index = 0;
-        sumPotentialEnergyKernel.setArg<cl::Buffer>(index++, cl.getEnergyBuffer().getDeviceBuffer());
-        sumPotentialEnergyKernel.setArg<cl::Buffer>(index++, potentialEnergy->getDeviceBuffer());
-        sumPotentialEnergyKernel.setArg<cl_int>(index++, 0);
-        sumPotentialEnergyKernel.setArg<cl_int>(index++, cl.getEnergyBuffer().getSize());
-        
        // Create the kernel for computing kinetic energy.

        stringstream computeKE;
@@ -6359,9 +6367,9 @@ void OpenCLIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context
        replacements["PARAMETER_ARGUMENTS"] = args.str();
        if (defines.find("LOAD_POS_AS_DELTA") != defines.end())
            defines.erase("LOAD_POS_AS_DELTA");
-        program = cl.createProgram(cl.replaceStrings(OpenCLKernelSources::customIntegratorPerDof, replacements), defines);
+        cl::Program program = cl.createProgram(cl.replaceStrings(OpenCLKernelSources::customIntegratorPerDof, replacements), defines);
        kineticEnergyKernel = cl::Kernel(program, "computePerDof");
-        index = 0;
+        int index = 0;
        kineticEnergyKernel.setArg<cl::Buffer>(index++, cl.getPosq().getDeviceBuffer());
        setPosqCorrectionArg(cl, kineticEnergyKernel, index++);
        kineticEnergyKernel.setArg<cl::Buffer>(index++, integration.getPosDelta().getDeviceBuffer());
@@ -6373,7 +6381,10 @@ void OpenCLIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context
        kineticEnergyKernel.setArg<cl::Buffer>(index++, sumBuffer->getDeviceBuffer());
        index += 2;
        kineticEnergyKernel.setArg<cl::Buffer>(index++, uniformRandoms->getDeviceBuffer());
-        kineticEnergyKernel.setArg<cl::Buffer>(index++, potentialEnergy->getDeviceBuffer());
+        if (cl.getUseDoublePrecision())
+            kineticEnergyKernel.setArg<cl_double>(index++, 0.0);
+        else
+            kineticEnergyKernel.setArg<cl_float>(index++, 0.0f);
        for (int i = 0; i < (int) perDofValues->getBuffers().size(); i++)
            kineticEnergyKernel.setArg<cl::Memory>(index++, perDofValues->getBuffers()[i].getMemory());
        keNeedsForce = usesVariable(keExpression, "f");
@@ -6480,9 +6491,7 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
            }
            else {
                recordChangedParameters(context);
-                context.calcForcesAndEnergy(computeForce, computeEnergy, forceGroup[i]);
-                if (computeEnergy)
-                    cl.executeKernel(sumPotentialEnergyKernel, OpenCLContext::ThreadBlockSize, OpenCLContext::ThreadBlockSize);
+                energy = context.calcForcesAndEnergy(computeForce, computeEnergy, forceGroup[i]);
            forcesAreValid = true;
            }
        }
@@ -6490,6 +6499,10 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
            kernels[i][0].setArg<cl_uint>(10, integration.prepareRandomNumbers(requiredGaussian[i]));
            kernels[i][0].setArg<cl::Buffer>(9, integration.getRandom().getDeviceBuffer());
            kernels[i][0].setArg<cl::Buffer>(11, uniformRandoms->getDeviceBuffer());
+            if (cl.getUseDoublePrecision())
+                kernels[i][0].setArg<cl_double>(12, energy);
+            else
+                kernels[i][0].setArg<cl_float>(12, (cl_float) energy);
            if (requiredUniform[i] > 0)
                cl.executeKernel(randomKernel, numAtoms);
            cl.executeKernel(kernels[i][0], numAtoms);
@@ -6497,12 +6510,20 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
        else if (stepType[i] == CustomIntegrator::ComputeGlobal && !merged[i]) {
            kernels[i][0].setArg<cl_float>(3, (cl_float) SimTKOpenMMUtilities::getUniformlyDistributedRandomNumber());
            kernels[i][0].setArg<cl_float>(4, (cl_float) SimTKOpenMMUtilities::getNormallyDistributedRandomNumber());
+            if (cl.getUseDoublePrecision())
+                kernels[i][0].setArg<cl_double>(5, energy);
+            else
+                kernels[i][0].setArg<cl_float>(5, (cl_float) energy);
            cl.executeKernel(kernels[i][0], 1, 1);
        }
        else if (stepType[i] == CustomIntegrator::ComputeSum) {
            kernels[i][0].setArg<cl_uint>(10, integration.prepareRandomNumbers(requiredGaussian[i]));
            kernels[i][0].setArg<cl::Buffer>(9, integration.getRandom().getDeviceBuffer());
            kernels[i][0].setArg<cl::Buffer>(11, uniformRandoms->getDeviceBuffer());
+            if (cl.getUseDoublePrecision())
+                kernels[i][0].setArg<cl_double>(12, energy);
+            else
+                kernels[i][0].setArg<cl_float>(12, (cl_float) energy);
            if (requiredUniform[i] > 0)
                cl.executeKernel(randomKernel, numAtoms);
            cl.clearBuffer(*sumBuffer);
@@ -6552,9 +6573,7 @@ double OpenCLIntegrateCustomStepKernel::computeKineticEnergy(ContextImpl& contex
        bool willNeedEnergy = false;
        for (int i = 0; i < integrator.getNumComputations(); i++)
            willNeedEnergy |= needsEnergy[i];
-        context.calcForcesAndEnergy(true, willNeedEnergy, -1);
-        if (willNeedEnergy)
-            cl.executeKernel(sumPotentialEnergyKernel, OpenCLContext::ThreadBlockSize, OpenCLContext::ThreadBlockSize);
+        energy = context.calcForcesAndEnergy(true, willNeedEnergy, -1);
        forcesAreValid = true;
    }
    cl.clearBuffer(*sumBuffer);

--- a/platforms/opencl/src/OpenCLNonbondedUtilities.cpp
+++ b/platforms/opencl/src/OpenCLNonbondedUtilities.cpp
@@ -303,6 +303,7 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
        defines["PADDED_CUTOFF_SQUARED"] = context.doubleToString(paddedCutoff*paddedCutoff);
        defines["NUM_TILES_WITH_EXCLUSIONS"] = context.intToString(exclusionTiles->getSize());
        defines["NUM_BLOCKS"] = context.intToString(context.getNumAtomBlocks());
+        defines["SIMD_WIDTH"] = context.intToString(context.getSIMDWidth());
        if (usePeriodic)
            defines["USE_PERIODIC"] = "1";
        int maxExclusions = 0;

--- a/platforms/opencl/src/OpenCLPlatform.cpp
+++ b/platforms/opencl/src/OpenCLPlatform.cpp
@@ -109,7 +109,7 @@ bool OpenCLPlatform::supportsDoublePrecision() const {

 bool OpenCLPlatform::isPlatformSupported() {
    // Return false for OpenCL implementations that are known
-    // to be buggy (Apple OSX since 10.7.5)
+    // to be buggy (Apple OS X prior to 10.10).

 #ifdef __APPLE__
    char str[256];
@@ -122,12 +122,10 @@ bool OpenCLPlatform::isPlatformSupported() {
    if (sscanf(str, "%d.%d.%d", &major, &minor, &micro) != 3)
        return false;

-    if ((major > 11) || (major == 11 && minor > 4) || (major == 11 && minor == 4 && micro >= 2))
-        // 11.4.2 is the darwin release corresponding to OSX 10.7.5, which is the
-        // point at which a number of serious bugs were introduced into the
-        // Apple OpenCL libraries, resulting in catistrophically incorrect MD simulations
-        // (see https://github.com/SimTk/openmm/issues/395 for example). Once a fix is released,
-        // this version check should be updated.
+    if (major < 14 || (major == 14 && minor < 3))
+        // 14.3.0 is the darwin release corresponding to OS X 10.10.3. Versions prior to that
+        // contained a number of serious bugs in the Apple OpenCL libraries.
+        // (See https://github.com/SimTk/openmm/issues/395 for example.)
        return false;
 #endif


--- a/platforms/opencl/src/OpenCLSort.cpp
+++ b/platforms/opencl/src/OpenCLSort.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2010-2013 Stanford University and the Authors.      *
+ * Portions copyright (c) 2010-2015 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -42,7 +42,6 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le
    replacements["MIN_KEY"] = trait->getMinKey();
    replacements["MAX_KEY"] = trait->getMaxKey();
    replacements["MAX_VALUE"] = trait->getMaxValue();
-    replacements["VALUE_IS_INT2"] = (trait->getDataType() == std::string("int2") ? "1" : "0");
    cl::Program program = context.createProgram(context.replaceStrings(OpenCLKernelSources::sort, replacements));
    shortListKernel = cl::Kernel(program, "sortShortList");
    computeRangeKernel = cl::Kernel(program, "computeRange");
@@ -59,7 +58,11 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le
    unsigned int maxRangeSize = std::min(maxGroupSize, (unsigned int) computeRangeKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice()));
    unsigned int maxPositionsSize = std::min(maxGroupSize, (unsigned int) computeBucketPositionsKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice()));
    unsigned int maxShortListSize = shortListKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice());
-    isShortList = (length <= maxLocalBuffer && length < maxShortListSize);
+    // On Qualcomm's OpenCL, it's essential to check against maxShortListSize.  Otherwise you get a crash.
+    // But AMD's OpenCL returns an inappropriately small value for it that is much shorter than the actual
+    // maximum, so including the check hurts performance.  For the moment I'm going to just comment it out.
+    // If we officially support Qualcomm in the future, we'll need to do something better.
+    isShortList = (length <= maxLocalBuffer/* && length < maxShortListSize*/);
    for (rangeKernelSize = 1; rangeKernelSize*2 <= maxRangeSize; rangeKernelSize *= 2)
        ;
    positionsKernelSize = std::min(rangeKernelSize, maxPositionsSize);

--- a/platforms/opencl/src/kernels/customIntegratorGlobal.cl
+++ b/platforms/opencl/src/kernels/customIntegratorGlobal.cl
 __kernel void computeGlobal(__global mixed2* restrict dt, __global mixed* restrict globals, __global mixed* restrict params,
-        float uniform, float gaussian, __global const real* restrict energy) {
+        float uniform, float gaussian, const real energy) {
    COMPUTE_STEP
 }
--- a/platforms/opencl/src/kernels/customIntegratorPerDof.cl
+++ b/platforms/opencl/src/kernels/customIntegratorPerDof.cl
@@ -26,7 +26,7 @@ void storePos(__global real4* restrict posq, __global real4* restrict posqCorrec
 __kernel void computePerDof(__global real4* restrict posq, __global real4* restrict posqCorrection, __global mixed4* restrict posDelta,
        __global mixed4* restrict velm, __global const real4* restrict force, __global const mixed2* restrict dt, __global const mixed* restrict globals,
        __global const mixed* restrict params, __global mixed* restrict sum, __global const float4* restrict gaussianValues,
-        unsigned int gaussianBaseIndex, __global const float4* restrict uniformValues, __global const real* restrict energy
+        unsigned int gaussianBaseIndex, __global const float4* restrict uniformValues, const real energy
        PARAMETER_ARGUMENTS) {
    mixed stepSize = dt[0].y;
    int index = get_global_id(0);

--- a/platforms/opencl/src/kernels/fft.cl
+++ b/platforms/opencl/src/kernels/fft.cl
@@ -2,26 +2,57 @@ real2 multiplyComplex(real2 c1, real2 c2) {
    return (real2) (c1.x*c2.x-c1.y*c2.y, c1.x*c2.y+c1.y*c2.x);
 }

+/**
+ * Load a value from the half-complex grid produces by a real-to-complex transform.
+ */
+real2 loadComplexValue(__global const real2* restrict in, int x, int y, int z) {
+    const int inputZSize = ZSIZE/2+1;
+    if (z < inputZSize)
+        return in[x*YSIZE*inputZSize+y*inputZSize+z];
+    int xp = (x == 0 ? 0 : XSIZE-x);
+    int yp = (y == 0 ? 0 : YSIZE-y);
+    real2 value = in[xp*YSIZE*inputZSize+yp*inputZSize+(ZSIZE-z)];
+    return (real2) (value.x, -value.y);
+}
+
 /**
 * Perform a 1D FFT on each row along one axis.
 */

-__kernel void execFFT(__global const real2* restrict in, __global real2* restrict out, int sign, __local real2* restrict w,
+__kernel void execFFT(__global const INPUT_TYPE* restrict in, __global OUTPUT_TYPE* restrict out, __local real2* restrict w,
        __local real2* restrict data0, __local real2* restrict data1) {
    for (int i = get_local_id(0); i < ZSIZE; i += get_local_size(0))
-        w[i] = (real2) (cos(-sign*i*2*M_PI/ZSIZE), sin(-sign*i*2*M_PI/ZSIZE));
+        w[i] = (real2) (cos(-(SIGN)*i*2*M_PI/ZSIZE), sin(-(SIGN)*i*2*M_PI/ZSIZE));
    barrier(CLK_LOCAL_MEM_FENCE);
    
    for (int baseIndex = get_group_id(0)*BLOCKS_PER_GROUP; baseIndex < XSIZE*YSIZE; baseIndex += get_num_groups(0)*BLOCKS_PER_GROUP) {
        int index = baseIndex+get_local_id(0)/ZSIZE;
        int x = index/YSIZE;
        int y = index-x*YSIZE;
+#if OUTPUT_IS_PACKED
+        if (x < XSIZE/2+1) {
+#endif
 #if LOOP_REQUIRED
        for (int z = get_local_id(0); z < ZSIZE; z += get_local_size(0))
+    #if INPUT_IS_REAL
+            data0[z] = (real2) (in[x*(YSIZE*ZSIZE)+y*ZSIZE+z], 0);
+    #elif INPUT_IS_PACKED
+            data0[z] = loadComplexValue(in, x, y, z);
+    #else
            data0[z] = in[x*(YSIZE*ZSIZE)+y*ZSIZE+z];
+    #endif
 #else
        if (index < XSIZE*YSIZE)
+    #if INPUT_IS_REAL
+            data0[get_local_id(0)] = (real2) (in[x*(YSIZE*ZSIZE)+y*ZSIZE+get_local_id(0)%ZSIZE], 0);
+    #elif INPUT_IS_PACKED
+            data0[get_local_id(0)] = loadComplexValue(in, x, y, get_local_id(0)%ZSIZE);
+    #else
            data0[get_local_id(0)] = in[x*(YSIZE*ZSIZE)+y*ZSIZE+get_local_id(0)%ZSIZE];
+    #endif
+#endif
+#if OUTPUT_IS_PACKED
+        }
 #endif
        barrier(CLK_LOCAL_MEM_FENCE);
        COMPUTE_FFT

--- a/platforms/opencl/src/kernels/fftR2C.cl
+++ b/platforms/opencl/src/kernels/fftR2C.cl
+/**
+ * Combine the two halves of a real grid into a complex grid that is half as large.
+ */
+__kernel void packForwardData(__global const real* restrict in, __global real2* restrict out) {
+    const int gridSize = PACKED_XSIZE*PACKED_YSIZE*PACKED_ZSIZE;
+    for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
+        int x = index/(PACKED_YSIZE*PACKED_ZSIZE);
+        int remainder = index-x*(PACKED_YSIZE*PACKED_ZSIZE);
+        int y = remainder/PACKED_ZSIZE;
+        int z = remainder-y*PACKED_ZSIZE;
+#if PACKED_AXIS == 0
+        real2 value = (real2) (in[2*x*YSIZE*ZSIZE+y*ZSIZE+z], in[(2*x+1)*YSIZE*ZSIZE+y*ZSIZE+z]);
+#elif PACKED_AXIS == 1
+        real2 value = (real2) (in[x*YSIZE*ZSIZE+2*y*ZSIZE+z], in[x*YSIZE*ZSIZE+(2*y+1)*ZSIZE+z]);
+#else
+        real2 value = (real2) (in[x*YSIZE*ZSIZE+y*ZSIZE+2*z], in[x*YSIZE*ZSIZE+y*ZSIZE+(2*z+1)]);
+#endif
+        out[index] = value;
+    }
+}
+
+/**
+ * Split the transformed data back into a full sized, symmetric grid.
+ */
+__kernel void unpackForwardData(__global const real2* restrict in, __global real2* restrict out, __local real2* restrict w) {
+    // Compute the phase factors.
+    
+#if PACKED_AXIS == 0
+    for (int i = get_local_id(0); i < PACKED_XSIZE; i += get_local_size(0))
+        w[i] = (real2) (sin(i*2*M_PI/XSIZE), cos(i*2*M_PI/XSIZE));
+#elif PACKED_AXIS == 1
+    for (int i = get_local_id(0); i < PACKED_YSIZE; i += get_local_size(0))
+        w[i] = (real2) (sin(i*2*M_PI/YSIZE), cos(i*2*M_PI/YSIZE));
+#else
+    for (int i = get_local_id(0); i < PACKED_ZSIZE; i += get_local_size(0))
+        w[i] = (real2) (sin(i*2*M_PI/ZSIZE), cos(i*2*M_PI/ZSIZE));
+#endif
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // Transform the data.
+    
+    const int gridSize = PACKED_XSIZE*PACKED_YSIZE*PACKED_ZSIZE;
+    const int outputZSize = ZSIZE/2+1;
+    for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
+        int x = index/(PACKED_YSIZE*PACKED_ZSIZE);
+        int remainder = index-x*(PACKED_YSIZE*PACKED_ZSIZE);
+        int y = remainder/PACKED_ZSIZE;
+        int z = remainder-y*PACKED_ZSIZE;
+        int xp = (x == 0 ? 0 : PACKED_XSIZE-x);
+        int yp = (y == 0 ? 0 : PACKED_YSIZE-y);
+        int zp = (z == 0 ? 0 : PACKED_ZSIZE-z);
+        real2 z1 = in[x*PACKED_YSIZE*PACKED_ZSIZE+y*PACKED_ZSIZE+z];
+        real2 z2 = in[xp*PACKED_YSIZE*PACKED_ZSIZE+yp*PACKED_ZSIZE+zp];
+#if PACKED_AXIS == 0
+        real2 wfac = w[x];
+#elif PACKED_AXIS == 1
+        real2 wfac = w[y];
+#else
+        real2 wfac = w[z];
+#endif
+        real2 output = (real2) ((z1.x+z2.x - wfac.x*(z1.x-z2.x) + wfac.y*(z1.y+z2.y))/2, (z1.y-z2.y - wfac.y*(z1.x-z2.x) - wfac.x*(z1.y+z2.y))/2);
+        if (z < outputZSize)
+            out[x*YSIZE*outputZSize+y*outputZSize+z] = output;
+        xp = (x == 0 ? 0 : XSIZE-x);
+        yp = (y == 0 ? 0 : YSIZE-y);
+        zp = (z == 0 ? 0 : ZSIZE-z);
+        if (zp < outputZSize) {
+#if PACKED_AXIS == 0
+            if (x == 0)
+                out[PACKED_XSIZE*YSIZE*outputZSize+yp*outputZSize+zp] = (real2) ((z1.x-z1.y+z2.x-z2.y)/2, (-z1.x-z1.y+z2.x+z2.y)/2);
+#elif PACKED_AXIS == 1
+            if (y == 0)
+                out[xp*YSIZE*outputZSize+PACKED_YSIZE*outputZSize+zp] = (real2) ((z1.x-z1.y+z2.x-z2.y)/2, (-z1.x-z1.y+z2.x+z2.y)/2);
+#else
+            if (z == 0)
+                out[xp*YSIZE*outputZSize+yp*outputZSize+PACKED_ZSIZE] = (real2) ((z1.x-z1.y+z2.x-z2.y)/2, (-z1.x-z1.y+z2.x+z2.y)/2);
+#endif
+            else
+                out[xp*YSIZE*outputZSize+yp*outputZSize+zp] = (real2) (output.x, -output.y);
+        }
+    }
+}
+
+/**
+ * Load a value from the half-complex grid produced by a real-to-complex transform.
+ */
+real2 loadComplexValue(__global const real2* restrict in, int x, int y, int z) {
+    const int inputZSize = ZSIZE/2+1;
+    if (z < inputZSize)
+        return in[x*YSIZE*inputZSize+y*inputZSize+z];
+    int xp = (x == 0 ? 0 : XSIZE-x);
+    int yp = (y == 0 ? 0 : YSIZE-y);
+    real2 value = in[xp*YSIZE*inputZSize+yp*inputZSize+(ZSIZE-z)];
+    return (real2) (value.x, -value.y);
+}
+
+/**
+ * Repack the symmetric complex grid into one half as large in preparation for doing an inverse complex-to-real transform.
+ */
+__kernel void packBackwardData(__global const real2* restrict in, __global real2* restrict out, __local real2* restrict w) {
+    // Compute the phase factors.
+    
+#if PACKED_AXIS == 0
+    for (int i = get_local_id(0); i < PACKED_XSIZE; i += get_local_size(0))
+        w[i] = (real2) (cos(i*2*M_PI/XSIZE), sin(i*2*M_PI/XSIZE));
+#elif PACKED_AXIS == 1
+    for (int i = get_local_id(0); i < PACKED_YSIZE; i += get_local_size(0))
+        w[i] = (real2) (cos(i*2*M_PI/YSIZE), sin(i*2*M_PI/YSIZE));
+#else
+    for (int i = get_local_id(0); i < PACKED_ZSIZE; i += get_local_size(0))
+        w[i] = (real2) (cos(i*2*M_PI/ZSIZE), sin(i*2*M_PI/ZSIZE));
+#endif
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // Transform the data.
+    
+    const int gridSize = PACKED_XSIZE*PACKED_YSIZE*PACKED_ZSIZE;
+    for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
+        int x = index/(PACKED_YSIZE*PACKED_ZSIZE);
+        int remainder = index-x*(PACKED_YSIZE*PACKED_ZSIZE);
+        int y = remainder/PACKED_ZSIZE;
+        int z = remainder-y*PACKED_ZSIZE;
+        int xp = (x == 0 ? 0 : PACKED_XSIZE-x);
+        int yp = (y == 0 ? 0 : PACKED_YSIZE-y);
+        int zp = (z == 0 ? 0 : PACKED_ZSIZE-z);
+        real2 z1 = loadComplexValue(in, x, y, z);
+#if PACKED_AXIS == 0
+        real2 wfac = w[x];
+        real2 z2 = loadComplexValue(in, PACKED_XSIZE-x, yp, zp);
+#elif PACKED_AXIS == 1
+        real2 wfac = w[y];
+        real2 z2 = loadComplexValue(in, xp, PACKED_YSIZE-y, zp);
+#else
+        real2 wfac = w[z];
+        real2 z2 = loadComplexValue(in, xp, yp, PACKED_ZSIZE-z);
+#endif
+        real2 even = (real2) ((z1.x+z2.x)/2, (z1.y-z2.y)/2);
+        real2 odd = (real2) ((z1.x-z2.x)/2, (z1.y+z2.y)/2);
+        odd = (real2) (odd.x*wfac.x-odd.y*wfac.y, odd.y*wfac.x+odd.x*wfac.y);
+        out[x*PACKED_YSIZE*PACKED_ZSIZE+y*PACKED_ZSIZE+z] = (real2) (even.x-odd.y, even.y+odd.x);
+    }
+}
+
+/**
+ * Split the data back into a full sized, real grid after an inverse transform.
+ */
+__kernel void unpackBackwardData(__global const real2* restrict in, __global real* restrict out) {
+    const int gridSize = PACKED_XSIZE*PACKED_YSIZE*PACKED_ZSIZE;
+    for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
+        int x = index/(PACKED_YSIZE*PACKED_ZSIZE);
+        int remainder = index-x*(PACKED_YSIZE*PACKED_ZSIZE);
+        int y = remainder/PACKED_ZSIZE;
+        int z = remainder-y*PACKED_ZSIZE;
+        real2 value = 2*in[index];
+#if PACKED_AXIS == 0
+        out[2*x*YSIZE*ZSIZE+y*ZSIZE+z] = value.x;
+        out[(2*x+1)*YSIZE*ZSIZE+y*ZSIZE+z] = value.y;
+#elif PACKED_AXIS == 1
+        out[x*YSIZE*ZSIZE+2*y*ZSIZE+z] = value.x;
+        out[x*YSIZE*ZSIZE+(2*y+1)*ZSIZE+z] = value.y;
+#else
+        out[x*YSIZE*ZSIZE+y*ZSIZE+2*z] = value.x;
+        out[x*YSIZE*ZSIZE+y*ZSIZE+(2*z+1)] = value.y;
+#endif
+    }
+}
--- a/platforms/opencl/src/kernels/findInteractingBlocks.cl
+++ b/platforms/opencl/src/kernels/findInteractingBlocks.cl
 #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
 #pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
-#define BUFFER_SIZE 256

 /**
 * Find a bounding box for the atoms in each block.
@@ -65,6 +64,10 @@ __kernel void sortBoxData(__global const real2* restrict sortedBlock, __global c
    }
 }

+#if SIMD_WIDTH <= 32
+
+#define BUFFER_SIZE 256
+
 __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
        __global unsigned int* restrict interactionCount, __global int* restrict interactingTiles, __global unsigned int* restrict interactingAtoms,
        __global const real4* restrict posq, unsigned int maxTiles, unsigned int startBlockIndex, unsigned int numBlocks, __global real2* restrict sortedBlocks,
@@ -133,8 +136,8 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi
            int block2 = block2Base+indexInWarp;
            bool includeBlock2 = (block2 < NUM_BLOCKS);
            if (includeBlock2) {
-                real4 blockCenterY = (block2 < NUM_BLOCKS ? sortedBlockCenter[block2] : (real4) (0));
-                real4 blockSizeY = (block2 < NUM_BLOCKS ? sortedBlockBoundingBox[block2] : (real4) (0));
+                real4 blockCenterY = sortedBlockCenter[block2];
+                real4 blockSizeY = sortedBlockBoundingBox[block2];
                real4 blockDelta = blockCenterX-blockCenterY;
 #ifdef USE_PERIODIC
                APPLY_PERIODIC_TO_DELTA(blockDelta)
@@ -252,3 +255,279 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi
    for (int i = get_global_id(0); i < NUM_ATOMS; i += get_global_size(0))
        oldPositions[i] = posq[i];
 }
+
+#else
+// This is the old implementation of finding interacting blocks.  It is quite a bit more complicated,
+// and slower on most GPUs.  On AMD, however, it is faster, so we keep it around to use there.
+
+#define BUFFER_SIZE BUFFER_GROUPS*GROUP_SIZE
+#define WARP_SIZE 32
+#define INVALID 0xFFFF
+
+/**
+ * Perform a parallel prefix sum over an array.  The input values are all assumed to be 0 or 1.
+ */
+void prefixSum(__local short* sum, __local ushort2* temp) {
+    for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
+        temp[i].x = sum[i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    int whichBuffer = 0;
+    for (int offset = 1; offset < BUFFER_SIZE; offset *= 2) {
+        if (whichBuffer == 0)
+            for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
+                temp[i].y = (i < offset ? temp[i].x : temp[i].x+temp[i-offset].x);
+        else
+            for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
+                temp[i].x = (i < offset ? temp[i].y : temp[i].y+temp[i-offset].y);
+        whichBuffer = 1-whichBuffer;
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if (whichBuffer == 0)
+        for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
+            sum[i] = temp[i].x;
+    else
+        for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
+            sum[i] = temp[i].y;
+    barrier(CLK_LOCAL_MEM_FENCE);
+}
+
+/**
+ * This is called by findBlocksWithInteractions().  It compacts the list of blocks, identifies interactions
+ * in them, and writes the result to global memory.
+ */
+void storeInteractionData(int x, __local unsigned short* buffer, __local short* sum, __local ushort2* temp, __local int* atoms, __local int* numAtoms,
+            __local int* baseIndex, __global unsigned int* interactionCount, __global int* interactingTiles, __global unsigned int* interactingAtoms, real4 periodicBoxSize,
+            real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, __global const real4* posq, __local real4* posBuffer,
+            real4 blockCenterX, real4 blockSizeX, unsigned int maxTiles, bool finish) {
+    const bool singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= PADDED_CUTOFF &&
+                                     0.5f*periodicBoxSize.y-blockSizeX.y >= PADDED_CUTOFF &&
+                                     0.5f*periodicBoxSize.z-blockSizeX.z >= PADDED_CUTOFF);
+    if (get_local_id(0) < TILE_SIZE) {
+        real4 pos = posq[x*TILE_SIZE+get_local_id(0)];
+#ifdef USE_PERIODIC
+        if (singlePeriodicCopy) {
+            // The box is small enough that we can just translate all the atoms into a single periodic
+            // box, then skip having to apply periodic boundary conditions later.
+            
+            APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, blockCenterX)
+        }
+#endif
+        posBuffer[get_local_id(0)] = pos;
+    }
+    
+    // The buffer is full, so we need to compact it and write out results.  Start by doing a parallel prefix sum.
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+    for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
+        sum[i] = (buffer[i] == INVALID ? 0 : 1);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    prefixSum(sum, temp);
+    int numValid = sum[BUFFER_SIZE-1];
+
+    // Compact the buffer.
+
+    for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
+        if (buffer[i] != INVALID)
+            temp[sum[i]-1].x = buffer[i];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
+        buffer[i] = temp[i].x;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // Loop over the tiles and find specific interactions in them.
+
+    const int indexInWarp = get_local_id(0)%WARP_SIZE;
+    for (int base = 0; base < numValid; base += BUFFER_SIZE/WARP_SIZE) {
+        for (int i = get_local_id(0)/WARP_SIZE; i < BUFFER_SIZE/WARP_SIZE && base+i < numValid; i += GROUP_SIZE/WARP_SIZE) {
+            // Check each atom in block Y for interactions.
+            
+            real4 pos = posq[buffer[base+i]*TILE_SIZE+indexInWarp];
+#ifdef USE_PERIODIC
+            if (singlePeriodicCopy)
+                APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, blockCenterX)
+#endif
+            bool interacts = false;
+#ifdef USE_PERIODIC
+            if (!singlePeriodicCopy) {
+                for (int j = 0; j < TILE_SIZE; j++) {
+                    real4 delta = pos-posBuffer[j];
+                    APPLY_PERIODIC_TO_DELTA(delta)
+                    interacts |= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED);
+                }
+            }
+            else {
+#endif
+                for (int j = 0; j < TILE_SIZE; j++) {
+                    real4 delta = pos-posBuffer[j];
+                    interacts |= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED);
+                }
+#ifdef USE_PERIODIC
+            }
+#endif
+            sum[i*WARP_SIZE+indexInWarp] = (interacts ? 1 : 0);
+        }
+        for (int i = numValid-base+get_local_id(0)/WARP_SIZE; i < BUFFER_SIZE/WARP_SIZE; i += GROUP_SIZE/WARP_SIZE)
+            sum[i*WARP_SIZE+indexInWarp] = 0;
+
+        // Compact the list of atoms.
+
+        barrier(CLK_LOCAL_MEM_FENCE);
+        prefixSum(sum, temp);
+        for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
+            if (sum[i] != (i == 0 ? 0 : sum[i-1]))
+                atoms[*numAtoms+sum[i]-1] = buffer[base+i/WARP_SIZE]*TILE_SIZE+indexInWarp;
+
+        // Store them to global memory.
+
+        int atomsToStore = *numAtoms+sum[BUFFER_SIZE-1];
+        bool storePartialTile = (finish && base >= numValid-BUFFER_SIZE/WARP_SIZE);
+        int tilesToStore = (storePartialTile ? (atomsToStore+TILE_SIZE-1)/TILE_SIZE : atomsToStore/TILE_SIZE);
+        if (tilesToStore > 0) {
+            if (get_local_id(0) == 0)
+                *baseIndex = atom_add(interactionCount, tilesToStore);
+            barrier(CLK_LOCAL_MEM_FENCE);
+            if (get_local_id(0) == 0)
+                *numAtoms = atomsToStore-tilesToStore*TILE_SIZE;
+            if (*baseIndex+tilesToStore <= maxTiles) {
+                if (get_local_id(0) < tilesToStore)
+                    interactingTiles[*baseIndex+get_local_id(0)] = x;
+                for (int i = get_local_id(0); i < tilesToStore*TILE_SIZE; i += get_local_size(0))
+                    interactingAtoms[*baseIndex*TILE_SIZE+i] = (i < atomsToStore ? atoms[i] : NUM_ATOMS);
+            }
+        }
+        else {
+            barrier(CLK_LOCAL_MEM_FENCE);
+            if (get_local_id(0) == 0)
+                *numAtoms += sum[BUFFER_SIZE-1];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+        if (get_local_id(0) < *numAtoms && !storePartialTile)
+            atoms[get_local_id(0)] = atoms[tilesToStore*TILE_SIZE+get_local_id(0)];
+    }
+
+    if (numValid == 0 && *numAtoms > 0 && finish) {
+        // We didn't have any more tiles to process, but there were some atoms left over from a
+        // previous call to this function.  Save them now.
+
+        if (get_local_id(0) == 0)
+            *baseIndex = atom_add(interactionCount, 1);
+        barrier(CLK_LOCAL_MEM_FENCE);
+        if (*baseIndex < maxTiles) {
+            if (get_local_id(0) == 0)
+                interactingTiles[*baseIndex] = x;
+            if (get_local_id(0) < TILE_SIZE)
+                interactingAtoms[*baseIndex*TILE_SIZE+get_local_id(0)] = (get_local_id(0) < *numAtoms ? atoms[get_local_id(0)] : NUM_ATOMS);
+        }
+    }
+
+    // Reset the buffer for processing more tiles.
+
+    for (int i = get_local_id(0); i < BUFFER_SIZE; i += get_local_size(0))
+        buffer[i] = INVALID;
+    barrier(CLK_LOCAL_MEM_FENCE);
+}
+
+/**
+ * Compare the bounding boxes for each pair of blocks.  If they are sufficiently far apart,
+ * mark them as non-interacting.
+ */
+__kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
+        __global unsigned int* restrict interactionCount, __global int* restrict interactingTiles, __global unsigned int* restrict interactingAtoms,
+        __global const real4* restrict posq, unsigned int maxTiles, unsigned int startBlockIndex, unsigned int numBlocks, __global real2* restrict sortedBlocks,
+        __global const real4* restrict sortedBlockCenter, __global const real4* restrict sortedBlockBoundingBox,
+        __global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __global real4* restrict oldPositions,
+        __global const int* restrict rebuildNeighborList) {
+    __local unsigned short buffer[BUFFER_SIZE];
+    __local short sum[BUFFER_SIZE];
+    __local ushort2 temp[BUFFER_SIZE];
+    __local int atoms[BUFFER_SIZE+TILE_SIZE];
+    __local real4 posBuffer[TILE_SIZE];
+    __local int exclusionsForX[MAX_EXCLUSIONS];
+    __local int bufferFull;
+    __local int globalIndex;
+    __local int numAtoms;
+#ifdef AMD_ATOMIC_WORK_AROUND
+    // Do a byte write to force all memory accesses to interactionCount to use the complete path.
+    // This avoids the atomic access from causing all word accesses to other buffers from using the slow complete path.
+    // The IF actually causes the write to never be executed, its presence is all that is needed.
+    // AMD APP SDK 2.4 has this problem.
+    if (get_global_id(0) == get_local_id(0)+1)
+        ((__global char*)interactionCount)[sizeof(unsigned int)+1] = 0;
+#endif
+
+    if (rebuildNeighborList[0] == 0)
+        return; // The neighbor list doesn't need to be rebuilt.
+
+    int valuesInBuffer = 0;
+    if (get_local_id(0) == 0)
+        bufferFull = false;
+    for (int i = 0; i < BUFFER_GROUPS; ++i)
+        buffer[i*GROUP_SIZE+get_local_id(0)] = INVALID;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    
+    // Loop over blocks sorted by size.
+    
+    for (int i = startBlockIndex+get_group_id(0); i < startBlockIndex+numBlocks; i += get_num_groups(0)) {
+        if (get_local_id(0) == get_local_size(0)-1)
+            numAtoms = 0;
+        real2 sortedKey = sortedBlocks[i];
+        int x = (int) sortedKey.y;
+        real4 blockCenterX = sortedBlockCenter[i];
+        real4 blockSizeX = sortedBlockBoundingBox[i];
+
+        // Load exclusion data for block x.
+        
+        const int exclusionStart = exclusionRowIndices[x];
+        const int exclusionEnd = exclusionRowIndices[x+1];
+        const int numExclusions = exclusionEnd-exclusionStart;
+        for (int j = get_local_id(0); j < numExclusions; j += get_local_size(0))
+            exclusionsForX[j] = exclusionIndices[exclusionStart+j];
+        barrier(CLK_LOCAL_MEM_FENCE);
+        
+        // Compare it to other blocks after this one in sorted order.
+        
+        for (int base = i+1; base < NUM_BLOCKS; base += get_local_size(0)) {
+            int j = base+get_local_id(0);
+            real2 sortedKey2 = (j < NUM_BLOCKS ? sortedBlocks[j] : (real2) 0);
+            real4 blockCenterY = (j < NUM_BLOCKS ? sortedBlockCenter[j] : (real4) 0);
+            real4 blockSizeY = (j < NUM_BLOCKS ? sortedBlockBoundingBox[j] : (real4) 0);
+            unsigned short y = (unsigned short) sortedKey2.y;
+            real4 delta = blockCenterX-blockCenterY;
+#ifdef USE_PERIODIC
+            APPLY_PERIODIC_TO_DELTA(delta)
+#endif
+            delta.x = max((real) 0, fabs(delta.x)-blockSizeX.x-blockSizeY.x);
+            delta.y = max((real) 0, fabs(delta.y)-blockSizeX.y-blockSizeY.y);
+            delta.z = max((real) 0, fabs(delta.z)-blockSizeX.z-blockSizeY.z);
+            bool hasExclusions = false;
+            for (int k = 0; k < numExclusions; k++)
+                hasExclusions |= (exclusionsForX[k] == y);
+            if (j < NUM_BLOCKS && delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED && !hasExclusions) {
+                // Add this tile to the buffer.
+
+                int bufferIndex = valuesInBuffer*GROUP_SIZE+get_local_id(0);
+                buffer[bufferIndex] = y;
+                valuesInBuffer++;
+                if (!bufferFull && valuesInBuffer == BUFFER_GROUPS)
+                    bufferFull = true;
+            }
+            barrier(CLK_LOCAL_MEM_FENCE);
+            if (bufferFull) {
+                storeInteractionData(x, buffer, sum, temp, atoms, &numAtoms, &globalIndex, interactionCount, interactingTiles, interactingAtoms, periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ, posq, posBuffer, blockCenterX, blockSizeX, maxTiles, false);
+                valuesInBuffer = 0;
+                if (get_local_id(0) == 0)
+                    bufferFull = false;
+            }
+            barrier(CLK_LOCAL_MEM_FENCE);
+        }
+        storeInteractionData(x, buffer, sum, temp, atoms, &numAtoms, &globalIndex, interactionCount, interactingTiles, interactingAtoms, periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ, posq, posBuffer, blockCenterX, blockSizeX, maxTiles, true);
+    }
+    
+    // Record the positions the neighbor list is based on.
+    
+    for (int i = get_global_id(0); i < NUM_ATOMS; i += get_global_size(0))
+        oldPositions[i] = posq[i];
+}
+
+#endif
+
--- a/platforms/opencl/src/kernels/pme.cl
+++ b/platforms/opencl/src/kernels/pme.cl
@@ -138,35 +138,34 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
                    zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
                    int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex;
                    real add = pos.w*data[ix].x*data[iy].y*data[iz].z;
-#ifdef USE_DOUBLE_PRECISION
-                    atom_add(&pmeGrid[2*index], (long) (add*0x100000000));
+#ifdef USE_ALTERNATE_MEMORY_ACCESS_PATTERN
+                    // On Nvidia devices (at least Maxwell anyway), this split ordering produces much higher performance.  Why?
+                    // I have no idea!  And of course on AMD it produces slower performance.  GPUs are not meant to be understood.
+                    atom_add(&pmeGrid[index%2 == 0 ? index/2 : (index+GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z)/2], (long) (add*0x100000000));
 #else
                    atom_add(&pmeGrid[index], (long) (add*0x100000000));
 #endif
-
                }
            }
        }
    }
 }

-__kernel void finishSpreadCharge(__global long* restrict pmeGrid) {
-    __global real2* realGrid = (__global real2*) pmeGrid;
+__kernel void finishSpreadCharge(__global long* restrict fixedGrid, __global real* restrict realGrid) {
    const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
    real scale = EPSILON_FACTOR/(real) 0x100000000;
    for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
-#ifdef USE_DOUBLE_PRECISION
-        long value = pmeGrid[2*index];
+#ifdef USE_ALTERNATE_MEMORY_ACCESS_PATTERN
+        long value = fixedGrid[index%2 == 0 ? index/2 : (index+gridSize)/2];
 #else
-        long value = pmeGrid[index];
+        long value = fixedGrid[index];
 #endif
-        real2 realValue = (real2) ((real) (value*scale), 0);
-        realGrid[index] = realValue;
+        realGrid[index] = (real) (value*scale);
    }
 }
 #elif defined(DEVICE_IS_CPU)
 __kernel void gridSpreadCharge(__global const real4* restrict posq, __global const int2* restrict pmeAtomGridIndex, __global const int* restrict pmeAtomRange,
-        __global real2* restrict pmeGrid, __global const real4* restrict pmeBsplineTheta, real4 periodicBoxSize, real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ) {
+        __global real* restrict pmeGrid, __global const real4* restrict pmeBsplineTheta, real4 periodicBoxSize, real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ) {
    const int firstx = get_global_id(0)*GRID_SIZE_X/get_global_size(0);
    const int lastx = (get_global_id(0)+1)*GRID_SIZE_X/get_global_size(0);
    if (firstx == lastx)
@@ -230,7 +229,7 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
                    int zindex = gridIndex.z+iz;
                    zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
                    int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex;
-                    pmeGrid[index].x += EPSILON_FACTOR*pos.w*data[ix].x*data[iy].y*data[iz].z;
+                    pmeGrid[index] += EPSILON_FACTOR*pos.w*data[ix].x*data[iy].y*data[iz].z;
                }
            }
        }
@@ -238,7 +237,7 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
 }
 #else
 __kernel void gridSpreadCharge(__global const real4* restrict posq, __global const int2* restrict pmeAtomGridIndex, __global const int* restrict pmeAtomRange,
-        __global real2* restrict pmeGrid, __global const real4* restrict pmeBsplineTheta) {
+        __global real* restrict pmeGrid, __global const real4* restrict pmeBsplineTheta) {
    unsigned int numGridPoints = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
    for (int gridIndex = get_global_id(0); gridIndex < numGridPoints; gridIndex += get_global_size(0)) {
        // Compute the charge on a grid point.
@@ -290,22 +289,23 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
                }
            }
        }
-        pmeGrid[gridIndex] = (real2) (result*EPSILON_FACTOR, 0);
+        pmeGrid[gridIndex] = result*EPSILON_FACTOR;
    }
 }
 #endif

-__kernel void reciprocalConvolution(__global real2* restrict pmeGrid, __global real* restrict energyBuffer, __global const real* restrict pmeBsplineModuliX,
-        __global const real* restrict pmeBsplineModuliY, __global const real* restrict pmeBsplineModuliZ, real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ, real recipScaleFactor) {
-    const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
-    real energy = 0.0f;
+__kernel void reciprocalConvolution(__global real2* restrict pmeGrid, __global const real* restrict pmeBsplineModuliX,
+        __global const real* restrict pmeBsplineModuliY, __global const real* restrict pmeBsplineModuliZ, real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ) {
+    // R2C stores into a half complex matrix where the last dimension is cut by half
+    const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*(GRID_SIZE_Z/2+1);
+    const real recipScaleFactor = (1.0f/M_PI)*recipBoxVecX.x*recipBoxVecY.y*recipBoxVecZ.z;
+
    for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
-        int kx = index/(GRID_SIZE_Y*GRID_SIZE_Z);
-        int remainder = index-kx*GRID_SIZE_Y*GRID_SIZE_Z;
-        int ky = remainder/GRID_SIZE_Z;
-        int kz = remainder-ky*GRID_SIZE_Z;
-        if (kx == 0 && ky == 0 && kz == 0)
-            continue;
+        // real indices
+        int kx = index/(GRID_SIZE_Y*(GRID_SIZE_Z/2+1));
+        int remainder = index-kx*GRID_SIZE_Y*(GRID_SIZE_Z/2+1);
+        int ky = remainder/(GRID_SIZE_Z/2+1);
+        int kz = remainder-ky*(GRID_SIZE_Z/2+1);
        int mx = (kx < (GRID_SIZE_X+1)/2) ? kx : (kx-GRID_SIZE_X);
        int my = (ky < (GRID_SIZE_Y+1)/2) ? ky : (ky-GRID_SIZE_Y);
        int mz = (kz < (GRID_SIZE_Z+1)/2) ? kz : (kz-GRID_SIZE_Z);
@@ -319,13 +319,53 @@ __kernel void reciprocalConvolution(__global real2* restrict pmeGrid, __global r
        real m2 = mhx*mhx+mhy*mhy+mhz*mhz;
        real denom = m2*bx*by*bz;
        real eterm = recipScaleFactor*EXP(-RECIP_EXP_FACTOR*m2)/denom;
+        if (kx != 0 || ky != 0 || kz != 0) {
            pmeGrid[index] = (real2) (grid.x*eterm, grid.y*eterm);
+        }
+    }
+}
+
+__kernel void gridEvaluateEnergy(__global real2* restrict pmeGrid, __global real* restrict energyBuffer,
+                      __global const real* restrict pmeBsplineModuliX, __global const real* restrict pmeBsplineModuliY, __global const real* restrict pmeBsplineModuliZ,
+                      real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ) {
+    // R2C stores into a half complex matrix where the last dimension is cut by half
+    const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
+    const real recipScaleFactor = (1.0f/M_PI)*recipBoxVecX.x*recipBoxVecY.y*recipBoxVecZ.z;
+ 
+    real energy = 0;
+    for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
+        // real indices
+        int kx = index/(GRID_SIZE_Y*(GRID_SIZE_Z));
+        int remainder = index-kx*GRID_SIZE_Y*(GRID_SIZE_Z);
+        int ky = remainder/(GRID_SIZE_Z);
+        int kz = remainder-ky*(GRID_SIZE_Z);
+        int mx = (kx < (GRID_SIZE_X+1)/2) ? kx : (kx-GRID_SIZE_X);
+        int my = (ky < (GRID_SIZE_Y+1)/2) ? ky : (ky-GRID_SIZE_Y);
+        int mz = (kz < (GRID_SIZE_Z+1)/2) ? kz : (kz-GRID_SIZE_Z);
+        real mhx = mx*recipBoxVecX.x;
+        real mhy = mx*recipBoxVecY.x+my*recipBoxVecY.y;
+        real mhz = mx*recipBoxVecZ.x+my*recipBoxVecZ.y+mz*recipBoxVecZ.z;
+        real m2 = mhx*mhx+mhy*mhy+mhz*mhz;
+        real bx = pmeBsplineModuliX[kx];
+        real by = pmeBsplineModuliY[ky];
+        real bz = pmeBsplineModuliZ[kz];
+        real denom = m2*bx*by*bz;
+        real eterm = recipScaleFactor*EXP(-RECIP_EXP_FACTOR*m2)/denom;
+        if (kz >= (GRID_SIZE_Z/2+1)) {
+            kx = ((kx == 0) ? kx : GRID_SIZE_X-kx);
+            ky = ((ky == 0) ? ky : GRID_SIZE_Y-ky);
+            kz = GRID_SIZE_Z-kz;
+        } 
+        int indexInHalfComplexGrid = kz + ky*(GRID_SIZE_Z/2+1)+kx*(GRID_SIZE_Y*(GRID_SIZE_Z/2+1));
+        real2 grid = pmeGrid[indexInHalfComplexGrid];
+        if (kx != 0 || ky != 0 || kz != 0) {
            energy += eterm*(grid.x*grid.x + grid.y*grid.y);
        }
+    }
    energyBuffer[get_global_id(0)] += 0.5f*energy;
 }

-__kernel void gridInterpolateForce(__global const real4* restrict posq, __global real4* restrict forceBuffers, __global const real2* restrict pmeGrid,
+__kernel void gridInterpolateForce(__global const real4* restrict posq, __global real4* restrict forceBuffers, __global const real* restrict pmeGrid,
        real4 periodicBoxSize, real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ, __global int2* restrict pmeAtomGridIndex) {
    const real4 scale = 1/(real) (PME_ORDER-1);
    real4 data[PME_ORDER];
@@ -385,7 +425,7 @@ __kernel void gridInterpolateForce(__global const real4* restrict posq, __global
                    int zindex = gridIndex.z+iz;
                    zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
                    int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex;
-                    real gridvalue = pmeGrid[index].x;
+                    real gridvalue = pmeGrid[index];
                    force.x += ddata[ix].x*data[iy].y*data[iz].z*gridvalue;
                    force.y += data[ix].x*ddata[iy].y*data[iz].z*gridvalue;
                    force.z += data[ix].x*data[iy].y*ddata[iz].z*gridvalue;

--- a/platforms/opencl/src/kernels/sort.cl
+++ b/platforms/opencl/src/kernels/sort.cl
@@ -109,13 +109,7 @@ __kernel void assignElementsToBuckets(__global const DATA_TYPE* restrict data, u
    float maxValue = (float) (range[1]);
    float bucketWidth = (maxValue-minValue)/numBuckets;
    for (uint index = get_global_id(0); index < length; index += get_global_size(0)) {
-#if defined(MAC_AMD_WORKAROUND) && VALUE_IS_INT2
-        __global int* d = (__global int*) data;
-        int2 element = (int2) (d[2*index], d[2*index+1]);
-        float key = (float) getValue(element);
-#else
        float key = (float) getValue(data[index]);
-#endif
        uint bucketIndex = min((uint) ((key-minValue)/bucketWidth), numBuckets-1);
        offsetInBucket[index] = atom_inc(&bucketOffset[bucketIndex]);
        bucketOfElement[index] = bucketIndex;

--- a/platforms/opencl/tests/TestOpenCLCustomIntegrator.cpp
+++ b/platforms/opencl/tests/TestOpenCLCustomIntegrator.cpp
@@ -334,7 +334,7 @@ void testMonteCarlo() {
    integrator.addComputePerDof("oldx", "x");
    integrator.addComputePerDof("x", "x+dt*gaussian");
    integrator.addComputeGlobal("accept", "step(exp((oldE-energy)/kT)-uniform)");
-    integrator.addComputePerDof("x", "accept*x + (1-accept)*oldx");
+    integrator.addComputePerDof("x", "select(accept, x, oldx)");
    HarmonicBondForce* forceField = new HarmonicBondForce();
    forceField->addBond(0, 1, 2.0, 10.0);
    system.addForce(forceField);

--- a/platforms/opencl/tests/TestOpenCLFFT.cpp
+++ b/platforms/opencl/tests/TestOpenCLFFT.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2011 Stanford University and the Authors.           *
+ * Portions copyright (c) 2011-2015 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -51,7 +51,7 @@ using namespace std;
 static OpenCLPlatform platform;

 template <class Real2>
-void testTransform() {
+void testTransform(bool realToComplex, int xsize, int ysize, int zsize) {
    System system;
    system.addParticle(0.0);
    OpenCLPlatform::PlatformData platformData(system, "", "", platform.getPropertyDefaultValue("OpenCLPrecision"), "false");
@@ -59,7 +59,6 @@ void testTransform() {
    context.initialize();
    OpenMM_SFMT::SFMT sfmt;
    init_gen_rand(0, sfmt);
-    int xsize = 28, ysize = 25, zsize = 30;
    vector<Real2> original(xsize*ysize*zsize);
    vector<t_complex> reference(original.size());
    for (int i = 0; i < (int) original.size(); i++) {
@@ -67,10 +66,16 @@ void testTransform() {
        original[i] = value;
        reference[i] = t_complex(value.x, value.y);
    }
+    for (int i = 0; i < (int) reference.size(); i++) {
+        if (realToComplex)
+            reference[i] = t_complex(i%2 == 0 ? original[i/2].x : original[i/2].y, 0);
+        else
+            reference[i] = t_complex(original[i].x, original[i].y);
+    }
    OpenCLArray grid1(context, original.size(), sizeof(Real2), "grid1");
    OpenCLArray grid2(context, original.size(), sizeof(Real2), "grid2");
    grid1.upload(original);
-    OpenCLFFT3D fft(context, xsize, ysize, zsize);
+    OpenCLFFT3D fft(context, xsize, ysize, zsize, realToComplex);

    // Perform a forward FFT, then verify the result is correct.

@@ -80,9 +85,14 @@ void testTransform() {
    fftpack_t plan;
    fftpack_init_3d(&plan, xsize, ysize, zsize);
    fftpack_exec_3d(plan, FFTPACK_FORWARD, &reference[0], &reference[0]);
-    for (int i = 0; i < (int) result.size(); ++i) {
-        ASSERT_EQUAL_TOL(reference[i].re, result[i].x, 1e-3);
-        ASSERT_EQUAL_TOL(reference[i].im, result[i].y, 1e-3);
+    int outputZSize = (realToComplex ? zsize/2+1 : zsize);
+    for (int x = 0; x < xsize; x++)
+        for (int y = 0; y < ysize; y++)
+            for (int z = 0; z < outputZSize; z++) {
+                int index1 = x*ysize*zsize + y*zsize + z;
+                int index2 = x*ysize*outputZSize + y*outputZSize + z;
+                ASSERT_EQUAL_TOL(reference[index1].re, result[index2].x, 1e-3);
+                ASSERT_EQUAL_TOL(reference[index1].im, result[index2].y, 1e-3);
            }
    fftpack_destroy(plan);

@@ -91,7 +101,8 @@ void testTransform() {
    fft.execFFT(grid2, grid1, false);
    grid1.download(result);
    double scale = 1.0/(xsize*ysize*zsize);
-    for (int i = 0; i < (int) result.size(); ++i) {
+    int valuesToCheck = (realToComplex ? original.size()/2 : original.size());
+    for (int i = 0; i < valuesToCheck; ++i) {
        ASSERT_EQUAL_TOL(original[i].x, scale*result[i].x, 1e-4);
        ASSERT_EQUAL_TOL(original[i].y, scale*result[i].y, 1e-4);
    }
@@ -101,10 +112,20 @@ int main(int argc, char* argv[]) {
    try {
        if (argc > 1)
            platform.setPropertyDefaultValue("OpenCLPrecision", string(argv[1]));
-        if (platform.getPropertyDefaultValue("OpenCLPrecision") == "double")
-            testTransform<mm_double2>();
-        else
-            testTransform<mm_float2>();
+        if (platform.getPropertyDefaultValue("OpenCLPrecision") == "double") {
+            testTransform<mm_double2>(false, 28, 25, 30);
+            testTransform<mm_double2>(true, 28, 25, 25);
+            testTransform<mm_double2>(true, 25, 28, 25);
+            testTransform<mm_double2>(true, 25, 25, 28);
+            testTransform<mm_double2>(true, 21, 25, 27);
+        }
+        else {
+            testTransform<mm_float2>(false, 28, 25, 30);
+            testTransform<mm_float2>(true, 28, 25, 25);
+            testTransform<mm_float2>(true, 25, 28, 25);
+            testTransform<mm_float2>(true, 25, 25, 28);
+            testTransform<mm_float2>(true, 21, 25, 27);
+        }
    }
    catch(const exception& e) {
        cout << "exception: " << e.what() << endl;

--- a/platforms/opencl/tests/TestOpenCLLocalEnergyMinimizer.cpp
+++ b/platforms/opencl/tests/TestOpenCLLocalEnergyMinimizer.cpp
@@ -7,7 +7,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2010-2014 Stanford University and the Authors.      *
+ * Portions copyright (c) 2010-2015 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -81,7 +81,7 @@ void testLargeSystem() {
    const int numParticles = numMolecules*2;
    const double cutoff = 2.0;
    const double boxSize = 4.0;
-    const double tolerance = 5;
+    const double tolerance = 10;
    System system;
    system.setDefaultPeriodicBoxVectors(Vec3(boxSize, 0, 0), Vec3(0, boxSize, 0), Vec3(0, 0, boxSize));
    NonbondedForce* nonbonded = new NonbondedForce();
@@ -114,7 +114,7 @@ void testLargeSystem() {
    State finalState = context.getState(State::Forces | State::Energy | State::Positions);
    ASSERT(finalState.getPotentialEnergy() < initialState.getPotentialEnergy());

-    // Compute the force magnitude, substracting off any component parallel to a constraint, and
+    // Compute the force magnitude, subtracting off any component parallel to a constraint, and
    // check that it satisfies the requested tolerance.

    double forceNorm = 0.0;
@@ -129,8 +129,8 @@ void testLargeSystem() {
        f -= dir*dir.dot(f);
        forceNorm += f.dot(f);
    }
-    forceNorm = sqrt(forceNorm/(4*numMolecules));
-    ASSERT(forceNorm < 3*tolerance);
+    forceNorm = sqrt(forceNorm/(5*numMolecules));
+    ASSERT(forceNorm < 2*tolerance);
 }

 void testVirtualSites() {
@@ -138,7 +138,7 @@ void testVirtualSites() {
    const int numParticles = numMolecules*3;
    const double cutoff = 2.0;
    const double boxSize = 4.0;
-    const double tolerance = 5;
+    const double tolerance = 10;
    System system;
    system.setDefaultPeriodicBoxVectors(Vec3(boxSize, 0, 0), Vec3(0, boxSize, 0), Vec3(0, 0, boxSize));
    NonbondedForce* nonbonded = new NonbondedForce();
@@ -195,8 +195,8 @@ void testVirtualSites() {
        
        ASSERT_EQUAL_VEC((finalState.getPositions()[i+1]+finalState.getPositions()[i])*0.5, finalState.getPositions()[i+2], 1e-5);
    }
-    forceNorm = sqrt(forceNorm/(4*numMolecules));
-    ASSERT(forceNorm < 3*tolerance);
+    forceNorm = sqrt(forceNorm/(5*numMolecules));
+    ASSERT(forceNorm < 2*tolerance);
 }

 int main(int argc, char* argv[]) {

--- a/platforms/reference/src/SimTKReference/ObcParameters.cpp
+++ b/platforms/reference/src/SimTKReference/ObcParameters.cpp
@@ -382,9 +382,9 @@ void ObcParameters::setPeriodic(OpenMM::RealVec* vectors) {

    assert(_cutoff);

-    assert(boxSize[0][0] >= 2.0*_cutoffDistance);
-    assert(boxSize[1][1] >= 2.0*_cutoffDistance);
-    assert(boxSize[2][2] >= 2.0*_cutoffDistance);
+    assert(vectors[0][0] >= 2.0*_cutoffDistance);
+    assert(vectors[1][1] >= 2.0*_cutoffDistance);
+    assert(vectors[2][2] >= 2.0*_cutoffDistance);

    _periodic           = true;
    _periodicBoxVectors[0] = vectors[0];

--- a/platforms/reference/tests/TestReferenceCustomIntegrator.cpp
+++ b/platforms/reference/tests/TestReferenceCustomIntegrator.cpp
@@ -334,7 +334,7 @@ void testMonteCarlo() {
    integrator.addComputePerDof("oldx", "x");
    integrator.addComputePerDof("x", "x+dt*gaussian");
    integrator.addComputeGlobal("accept", "step(exp((oldE-energy)/kT)-uniform)");
-    integrator.addComputePerDof("x", "accept*x + (1-accept)*oldx");
+    integrator.addComputePerDof("x", "select(accept, x, oldx)");
    HarmonicBondForce* forceField = new HarmonicBondForce();
    forceField->addBond(0, 1, 2.0, 10.0);
    system.addForce(forceField);

--- a/platforms/reference/tests/TestReferenceLocalEnergyMinimizer.cpp
+++ b/platforms/reference/tests/TestReferenceLocalEnergyMinimizer.cpp
@@ -7,7 +7,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2010-2014 Stanford University and the Authors.      *
+ * Portions copyright (c) 2010-2015 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -129,8 +129,8 @@ void testLargeSystem() {
        f -= dir*dir.dot(f);
        forceNorm += f.dot(f);
    }
-    forceNorm = sqrt(forceNorm/(4*numMolecules));
-    ASSERT(forceNorm < 3*tolerance);
+    forceNorm = sqrt(forceNorm/(5*numMolecules));
+    ASSERT(forceNorm < 2*tolerance);
 }

 void testVirtualSites() {
@@ -195,8 +195,8 @@ void testVirtualSites() {
        
        ASSERT_EQUAL_VEC((finalState.getPositions()[i+1]+finalState.getPositions()[i])*0.5, finalState.getPositions()[i+2], 1e-5);
    }
-    forceNorm = sqrt(forceNorm/(4*numMolecules));
-    ASSERT(forceNorm < 3*tolerance);
+    forceNorm = sqrt(forceNorm/(5*numMolecules));
+    ASSERT(forceNorm < 2*tolerance);
 }

 int main() {