Merge remote-tracking branch 'upstream/master'

71d33617 · John Chodera (MSKCC) · eb232608 · 9da36463 · 71d33617 · 71d33617
Commit 71d33617 authored Apr 24, 2015 by John Chodera (MSKCC)
20 changed files
--- a/platforms/opencl/include/OpenCLParallelKernels.h
+++ b/platforms/opencl/include/OpenCLParallelKernels.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2011-2013 Stanford University and the Authors.      *
+ * Portions copyright (c) 2011-2015 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -344,6 +344,13 @@ public:
     * @return the potential energy due to the force
     */
    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
+    /**
+     * Copy changed parameters over to a context.
+     *
+     * @param context    the context to copy parameters to
+     * @param force      the CMAPTorsionForce to copy the parameters from
+     */
+    void copyParametersToContext(ContextImpl& context, const CMAPTorsionForce& force);
 private:
    class Task;
    OpenCLPlatform::PlatformData& data;

--- a/platforms/opencl/src/OpenCLContext.cpp
+++ b/platforms/opencl/src/OpenCLContext.cpp
@@ -106,9 +106,8 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
                // if they supplied a valid deviceIndex, we only look through that one
                if (i != deviceIndex && deviceIndex >= 0 && deviceIndex < (int) devices.size())
                    continue;
-
-                if (platformVendor == "Apple" && (devices[i].getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU || devices[i].getInfo<CL_DEVICE_VENDOR>() == "AMD"))
-                    continue; // The CPU device on OS X won't work correctly, and there are serious bugs using AMD GPUs.
+                if (platformVendor == "Apple" && (devices[i].getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU))
+                    continue; // The CPU device on OS X won't work correctly.
                int maxSize = devices[i].getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>()[0];
                int processingElementsPerComputeUnit = 8;
                if (devices[i].getInfo<CL_DEVICE_TYPE>() != CL_DEVICE_TYPE_GPU) {
@@ -170,6 +169,8 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
        compilationDefines["WORK_GROUP_SIZE"] = intToString(ThreadBlockSize);
        if (platformVendor.size() >= 5 && platformVendor.substr(0, 5) == "Intel")
            defaultOptimizationOptions = "";
+        else if (platformVendor == "Apple")
+            defaultOptimizationOptions = "-cl-mad-enable -cl-no-signed-zeros";
        else
            defaultOptimizationOptions = "-cl-fast-relaxed-math";
        supports64BitGlobalAtomics = (device.getInfo<CL_DEVICE_EXTENSIONS>().find("cl_khr_int64_base_atomics") != string::npos);
@@ -241,8 +242,6 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
        }
        else
            simdWidth = 1;
-        if (platformVendor == "Apple" && vendor == "AMD")
-            compilationDefines["MAC_AMD_WORKAROUND"] = "";
        if (supports64BitGlobalAtomics)
            compilationDefines["SUPPORTS_64_BIT_ATOMICS"] = "";
        if (supportsDoublePrecision)

--- a/platforms/opencl/src/OpenCLFFT3D.cpp
+++ b/platforms/opencl/src/OpenCLFFT3D.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2012 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2015 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -35,25 +35,109 @@
 using namespace OpenMM;
 using namespace std;

-OpenCLFFT3D::OpenCLFFT3D(OpenCLContext& context, int xsize, int ysize, int zsize) : context(context), xsize(xsize), ysize(ysize), zsize(zsize) {
-    zkernel = createKernel(xsize, ysize, zsize, zthreads);
-    xkernel = createKernel(ysize, zsize, xsize, xthreads);
-    ykernel = createKernel(zsize, xsize, ysize, ythreads);
+OpenCLFFT3D::OpenCLFFT3D(OpenCLContext& context, int xsize, int ysize, int zsize, bool realToComplex) :
+        context(context), xsize(xsize), ysize(ysize), zsize(zsize) {
+    packRealAsComplex = false;
+    int packedXSize = xsize;
+    int packedYSize = ysize;
+    int packedZSize = zsize;
+    if (realToComplex) {
+        // If any axis size is even, we can pack the real values into a complex grid that is only half as large.
+        // Look for an appropriate axis.
+        
+        packRealAsComplex = true;
+        int packedAxis, bufferSize;
+        if (xsize%2 == 0) {
+            packedAxis = 0;
+            packedXSize /= 2;
+            bufferSize = packedXSize;
+        }
+        else if (ysize%2 == 0) {
+            packedAxis = 1;
+            packedYSize /= 2;
+            bufferSize = packedYSize;
+        }
+        else if (zsize%2 == 0) {
+            packedAxis = 2;
+            packedZSize /= 2;
+            bufferSize = packedZSize;
+        }
+        else
+            packRealAsComplex = false;
+        if (packRealAsComplex) {
+            // Build the kernels for packing and unpacking the data.
+            
+            map<string, string> defines;
+            defines["XSIZE"] = context.intToString(xsize);
+            defines["YSIZE"] = context.intToString(ysize);
+            defines["ZSIZE"] = context.intToString(zsize);
+            defines["PACKED_AXIS"] = context.intToString(packedAxis);
+            defines["PACKED_XSIZE"] = context.intToString(packedXSize);
+            defines["PACKED_YSIZE"] = context.intToString(packedYSize);
+            defines["PACKED_ZSIZE"] = context.intToString(packedZSize);
+            defines["M_PI"] = context.doubleToString(M_PI);
+            cl::Program program = context.createProgram(OpenCLKernelSources::fftR2C, defines);
+            packForwardKernel = cl::Kernel(program, "packForwardData");
+            unpackForwardKernel = cl::Kernel(program, "unpackForwardData");
+            unpackForwardKernel.setArg(2, bufferSize*(context.getUseDoublePrecision() ? sizeof(mm_double2) : sizeof(mm_float2)), NULL);
+            packBackwardKernel = cl::Kernel(program, "packBackwardData");
+            packBackwardKernel.setArg(2, bufferSize*(context.getUseDoublePrecision() ? sizeof(mm_double2) : sizeof(mm_float2)), NULL);
+            unpackBackwardKernel = cl::Kernel(program, "unpackBackwardData");
+        }
+    }
+    bool inputIsReal = (realToComplex && !packRealAsComplex);
+    zkernel = createKernel(packedXSize, packedYSize, packedZSize, zthreads, 0, true, inputIsReal);
+    xkernel = createKernel(packedYSize, packedZSize, packedXSize, xthreads, 1, true, inputIsReal);
+    ykernel = createKernel(packedZSize, packedXSize, packedYSize, ythreads, 2, true, inputIsReal);
+    invzkernel = createKernel(packedXSize, packedYSize, packedZSize, zthreads, 0, false, inputIsReal);
+    invxkernel = createKernel(packedYSize, packedZSize, packedXSize, xthreads, 1, false, inputIsReal);
+    invykernel = createKernel(packedZSize, packedXSize, packedYSize, ythreads, 2, false, inputIsReal);
 }

 void OpenCLFFT3D::execFFT(OpenCLArray& in, OpenCLArray& out, bool forward) {
-    zkernel.setArg<cl::Buffer>(0, in.getDeviceBuffer());
-    zkernel.setArg<cl::Buffer>(1, out.getDeviceBuffer());
-    zkernel.setArg<cl_int>(2, forward ? 1 : -1);
-    context.executeKernel(zkernel, xsize*ysize*zsize, zthreads);
-    xkernel.setArg<cl::Buffer>(0, out.getDeviceBuffer());
-    xkernel.setArg<cl::Buffer>(1, in.getDeviceBuffer());
-    xkernel.setArg<cl_int>(2, forward ? 1 : -1);
-    context.executeKernel(xkernel, xsize*ysize*zsize, xthreads);
-    ykernel.setArg<cl::Buffer>(0, in.getDeviceBuffer());
-    ykernel.setArg<cl::Buffer>(1, out.getDeviceBuffer());
-    ykernel.setArg<cl_int>(2, forward ? 1 : -1);
-    context.executeKernel(ykernel, xsize*ysize*zsize, ythreads);
+    cl::Kernel kernel1 = (forward ? zkernel : invzkernel);
+    cl::Kernel kernel2 = (forward ? xkernel : invxkernel);
+    cl::Kernel kernel3 = (forward ? ykernel : invykernel);
+    if (packRealAsComplex) {
+        cl::Kernel packKernel = (forward ? packForwardKernel : packBackwardKernel);
+        cl::Kernel unpackKernel = (forward ? unpackForwardKernel : unpackBackwardKernel);
+        int gridSize = xsize*ysize*zsize/2;
+
+        // Pack the data into a half sized grid.
+        
+        packKernel.setArg<cl::Buffer>(0, in.getDeviceBuffer());
+        packKernel.setArg<cl::Buffer>(1, out.getDeviceBuffer());
+        context.executeKernel(packKernel, gridSize);
+        
+        // Perform the FFT.
+        
+        kernel1.setArg<cl::Buffer>(0, out.getDeviceBuffer());
+        kernel1.setArg<cl::Buffer>(1, in.getDeviceBuffer());
+        context.executeKernel(kernel1, gridSize, zthreads);
+        kernel2.setArg<cl::Buffer>(0, in.getDeviceBuffer());
+        kernel2.setArg<cl::Buffer>(1, out.getDeviceBuffer());
+        context.executeKernel(kernel2, gridSize, xthreads);
+        kernel3.setArg<cl::Buffer>(0, out.getDeviceBuffer());
+        kernel3.setArg<cl::Buffer>(1, in.getDeviceBuffer());
+        context.executeKernel(kernel3, gridSize, ythreads);
+        
+        // Unpack the data.
+        
+        unpackKernel.setArg<cl::Buffer>(0, in.getDeviceBuffer());
+        unpackKernel.setArg<cl::Buffer>(1, out.getDeviceBuffer());
+        context.executeKernel(unpackKernel, gridSize);
+    }
+    else {
+        kernel1.setArg<cl::Buffer>(0, in.getDeviceBuffer());
+        kernel1.setArg<cl::Buffer>(1, out.getDeviceBuffer());
+        context.executeKernel(kernel1, xsize*ysize*zsize, zthreads);
+        kernel2.setArg<cl::Buffer>(0, out.getDeviceBuffer());
+        kernel2.setArg<cl::Buffer>(1, in.getDeviceBuffer());
+        context.executeKernel(kernel2, xsize*ysize*zsize, xthreads);
+        kernel3.setArg<cl::Buffer>(0, in.getDeviceBuffer());
+        kernel3.setArg<cl::Buffer>(1, out.getDeviceBuffer());
+        context.executeKernel(kernel3, xsize*ysize*zsize, ythreads);
+    }
 }

 int OpenCLFFT3D::findLegalDimension(int minimum) {
@@ -73,8 +157,10 @@ int OpenCLFFT3D::findLegalDimension(int minimum) {
    }
 }

-cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threads) {
+cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threads, int axis, bool forward, bool inputIsReal) {
    int maxThreads = std::min(256, (int) context.getDevice().getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>());
+    while (maxThreads > 128 && maxThreads-64 >= zsize)
+        maxThreads -= 64;
    bool isCPU = context.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU;
    while (true) {
        bool loopRequired = (zsize > maxThreads || isCPU);
@@ -137,10 +223,10 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
                source<<"real2 b2 = "<<context.doubleToString((2*cos(2*M_PI/7)-cos(4*M_PI/7)-cos(6*M_PI/7))/3)<<"*(d0-d4);\n";
                source<<"real2 b3 = "<<context.doubleToString((cos(2*M_PI/7)-2*cos(4*M_PI/7)+cos(6*M_PI/7))/3)<<"*(d4-d2);\n";
                source<<"real2 b4 = "<<context.doubleToString((cos(2*M_PI/7)+cos(4*M_PI/7)-2*cos(6*M_PI/7))/3)<<"*(d2-d0);\n";
-                source<<"real2 b5 = -sign*"<<context.doubleToString((sin(2*M_PI/7)+sin(4*M_PI/7)-sin(6*M_PI/7))/3)<<"*(d7+d1);\n";
-                source<<"real2 b6 = -sign*"<<context.doubleToString((2*sin(2*M_PI/7)-sin(4*M_PI/7)+sin(6*M_PI/7))/3)<<"*(d1-d5);\n";
-                source<<"real2 b7 = -sign*"<<context.doubleToString((sin(2*M_PI/7)-2*sin(4*M_PI/7)-sin(6*M_PI/7))/3)<<"*(d5-d3);\n";
-                source<<"real2 b8 = -sign*"<<context.doubleToString((sin(2*M_PI/7)+sin(4*M_PI/7)+2*sin(6*M_PI/7))/3)<<"*(d3-d1);\n";
+                source<<"real2 b5 = -(SIGN)*"<<context.doubleToString((sin(2*M_PI/7)+sin(4*M_PI/7)-sin(6*M_PI/7))/3)<<"*(d7+d1);\n";
+                source<<"real2 b6 = -(SIGN)*"<<context.doubleToString((2*sin(2*M_PI/7)-sin(4*M_PI/7)+sin(6*M_PI/7))/3)<<"*(d1-d5);\n";
+                source<<"real2 b7 = -(SIGN)*"<<context.doubleToString((sin(2*M_PI/7)-2*sin(4*M_PI/7)-sin(6*M_PI/7))/3)<<"*(d5-d3);\n";
+                source<<"real2 b8 = -(SIGN)*"<<context.doubleToString((sin(2*M_PI/7)+sin(4*M_PI/7)+2*sin(6*M_PI/7))/3)<<"*(d3-d1);\n";
                source<<"real2 t0 = b0+b1;\n";
                source<<"real2 t1 = b2+b3;\n";
                source<<"real2 t2 = b4-b3;\n";
@@ -178,8 +264,8 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
                source<<"real2 d7 = d6+d5;\n";
                source<<"real2 d8 = d6-d5;\n";
                string coeff = context.doubleToString(sin(0.2*M_PI)/sin(0.4*M_PI));
-                source<<"real2 d9 = sign*(real2) (d2.y+"<<coeff<<"*d3.y, -d2.x-"<<coeff<<"*d3.x);\n";
-                source<<"real2 d10 = sign*(real2) ("<<coeff<<"*d2.y-d3.y, d3.x-"<<coeff<<"*d2.x);\n";
+                source<<"real2 d9 = (SIGN)*(real2) (d2.y+"<<coeff<<"*d3.y, -d2.x-"<<coeff<<"*d3.x);\n";
+                source<<"real2 d10 = (SIGN)*(real2) ("<<coeff<<"*d2.y-d3.y, d3.x-"<<coeff<<"*d2.x);\n";
                source<<"data"<<output<<"[base+4*j*"<<m<<"] = c0+d4;\n";
                source<<"data"<<output<<"[base+(4*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(5*L)<<"], d7+d9);\n";
                source<<"data"<<output<<"[base+(4*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(5*L)<<"], d8+d10);\n";
@@ -194,7 +280,7 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
                source<<"real2 d0 = c0+c2;\n";
                source<<"real2 d1 = c0-c2;\n";
                source<<"real2 d2 = c1+c3;\n";
-                source<<"real2 d3 = sign*(real2) (c1.y-c3.y, c3.x-c1.x);\n";
+                source<<"real2 d3 = (SIGN)*(real2) (c1.y-c3.y, c3.x-c1.x);\n";
                source<<"data"<<output<<"[base+3*j*"<<m<<"] = d0+d2;\n";
                source<<"data"<<output<<"[base+(3*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(4*L)<<"], d1+d3);\n";
                source<<"data"<<output<<"[base+(3*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(4*L)<<"], d0-d2);\n";
@@ -206,7 +292,7 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
                source<<"real2 c2 = data"<<input<<"[base+"<<(2*L*m)<<"];\n";
                source<<"real2 d0 = c1+c2;\n";
                source<<"real2 d1 = c0-0.5f*d0;\n";
-                source<<"real2 d2 = sign*"<<context.doubleToString(sin(M_PI/3.0))<<"*(real2) (c1.y-c2.y, c2.x-c1.x);\n";
+                source<<"real2 d2 = (SIGN)*"<<context.doubleToString(sin(M_PI/3.0))<<"*(real2) (c1.y-c2.y, c2.x-c1.x);\n";
                source<<"data"<<output<<"[base+2*j*"<<m<<"] = c0+d0;\n";
                source<<"data"<<output<<"[base+(2*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(3*L)<<"], d1+d2);\n";
                source<<"data"<<output<<"[base+(2*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(3*L)<<"], d1-d2);\n";
@@ -226,13 +312,27 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa

        // Create the kernel.

+        bool outputIsReal = (inputIsReal && axis == 2 && !forward);
+        bool outputIsPacked = (inputIsReal && axis == 2 && forward);
+        string outputSuffix = (outputIsReal ? ".x" : "");
        if (loopRequired) {
+            if (outputIsPacked)
+                source<<"if (x < XSIZE/2+1)\n";
            source<<"for (int z = get_local_id(0); z < ZSIZE; z += get_local_size(0))\n";
-            source<<"out[y*(ZSIZE*XSIZE)+z*XSIZE+x] = data"<<(stage%2)<<"[z];\n";
+            if (outputIsPacked)
+                source<<"out[y*(ZSIZE*(XSIZE/2+1))+z*(XSIZE/2+1)+x] = data"<<(stage%2)<<"[z]"<<outputSuffix<<";\n";
+            else
+                source<<"out[y*(ZSIZE*XSIZE)+z*XSIZE+x] = data"<<(stage%2)<<"[z]"<<outputSuffix<<";\n";
        }
        else {
-            source<<"if (index < XSIZE*YSIZE)\n";
-            source<<"out[y*(ZSIZE*XSIZE)+(get_local_id(0)%ZSIZE)*XSIZE+x] = data"<<(stage%2)<<"[get_local_id(0)];\n";
+            if (outputIsPacked) {
+                source<<"if (index < XSIZE*YSIZE && x < XSIZE/2+1)\n";
+                source<<"out[y*(ZSIZE*(XSIZE/2+1))+(get_local_id(0)%ZSIZE)*(XSIZE/2+1)+x] = data"<<(stage%2)<<"[get_local_id(0)]"<<outputSuffix<<";\n";
+            }
+            else {
+                source<<"if (index < XSIZE*YSIZE)\n";
+                source<<"out[y*(ZSIZE*XSIZE)+(get_local_id(0)%ZSIZE)*XSIZE+x] = data"<<(stage%2)<<"[get_local_id(0)]"<<outputSuffix<<";\n";
+            }
        }
        map<string, string> replacements;
        replacements["XSIZE"] = context.intToString(xsize);
@@ -242,6 +342,12 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
        replacements["M_PI"] = context.doubleToString(M_PI);
        replacements["COMPUTE_FFT"] = source.str();
        replacements["LOOP_REQUIRED"] = (loopRequired ? "1" : "0");
+        replacements["SIGN"] = (forward ? "1" : "-1");
+        replacements["INPUT_TYPE"] = (inputIsReal && axis == 0 && forward ? "real" : "real2");
+        replacements["OUTPUT_TYPE"] = (outputIsReal ? "real" : "real2");
+        replacements["INPUT_IS_REAL"] = (inputIsReal && axis == 0 && forward ? "1" : "0");
+        replacements["INPUT_IS_PACKED"] = (inputIsReal && axis == 0 && !forward ? "1" : "0");
+        replacements["OUTPUT_IS_PACKED"] = (outputIsPacked ? "1" : "0");
        cl::Program program = context.createProgram(context.replaceStrings(OpenCLKernelSources::fft, replacements));
        cl::Kernel kernel(program, "execFFT");
        threads = (isCPU ? 1 : blocksPerGroup*zsize);
@@ -253,9 +359,9 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
            continue;
        }
        int bufferSize = blocksPerGroup*zsize*(context.getUseDoublePrecision() ? sizeof(mm_double2) : sizeof(mm_float2));
+        kernel.setArg(2, bufferSize, NULL);
        kernel.setArg(3, bufferSize, NULL);
        kernel.setArg(4, bufferSize, NULL);
-        kernel.setArg(5, bufferSize, NULL);
        return kernel;
    }
 }
--- a/platforms/opencl/src/OpenCLIntegrationUtilities.cpp
+++ b/platforms/opencl/src/OpenCLIntegrationUtilities.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2014 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2015 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -32,6 +32,7 @@
 #include "openmm/VirtualSite.h"
 #include "quern.h"
 #include "OpenCLExpressionUtilities.h"
+#include "ReferenceCCMAAlgorithm.h"
 #include <algorithm>
 #include <cmath>
 #include <cstdlib>
@@ -323,157 +324,54 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c

    int numCCMA = (int) ccmaConstraints.size();
    if (numCCMA > 0) {
-        vector<vector<int> > atomConstraints(context.getNumAtoms());
+        // Record information needed by ReferenceCCMAAlgorithm.
+        
+        vector<pair<int, int> > refIndices(numCCMA);
+        vector<RealOpenMM> refDistance(numCCMA);
        for (int i = 0; i < numCCMA; i++) {
-            atomConstraints[atom1[ccmaConstraints[i]]].push_back(i);
-            atomConstraints[atom2[ccmaConstraints[i]]].push_back(i);
-        }
-        vector<vector<int> > linkedConstraints(numCCMA);
-        for (unsigned atom = 0; atom < atomConstraints.size(); atom++) {
-            for (unsigned i = 0; i < atomConstraints[atom].size(); i++)
-                for (unsigned j = 0; j < i; j++) {
-                    int c1 = atomConstraints[atom][i];
-                    int c2 = atomConstraints[atom][j];
-                    linkedConstraints[c1].push_back(c2);
-                    linkedConstraints[c2].push_back(c1);
-                }
+            int index = ccmaConstraints[i];
+            refIndices[i] = make_pair(atom1[index], atom2[index]);
+            refDistance[i] = distance[index];
        }
-        int maxLinks = 0;
-        for (unsigned i = 0; i < linkedConstraints.size(); i++)
-            maxLinks = max(maxLinks, (int) linkedConstraints[i].size());
-        int maxAtomConstraints = 0;
-        for (unsigned i = 0; i < atomConstraints.size(); i++)
-            maxAtomConstraints = max(maxAtomConstraints, (int) atomConstraints[i].size());
-
-        // Compute the constraint coupling matrix
-
-        vector<vector<int> > atomAngles(numAtoms);
-        HarmonicAngleForce const* angleForce = NULL;
-        for (int i = 0; i < system.getNumForces() && angleForce == NULL; i++)
-            angleForce = dynamic_cast<HarmonicAngleForce const*>(&system.getForce(i));
-        if (angleForce != NULL)
-            for (int i = 0; i < angleForce->getNumAngles(); i++) {
-                int particle1, particle2, particle3;
-                double angle, k;
-                angleForce->getAngleParameters(i, particle1, particle2, particle3, angle, k);
-                atomAngles[particle2].push_back(i);
-            }
-        vector<vector<pair<int, double> > > matrix(numCCMA);
-        for (int j = 0; j < numCCMA; j++) {
-            for (int k = 0; k < numCCMA; k++) {
-                if (j == k) {
-                    matrix[j].push_back(pair<int, double>(j, 1.0));
-                    continue;
-                }
-                double scale;
-                int cj = ccmaConstraints[j];
-                int ck = ccmaConstraints[k];
-                int atomj0 = atom1[cj];
-                int atomj1 = atom2[cj];
-                int atomk0 = atom1[ck];
-                int atomk1 = atom2[ck];
-                int atoma, atomb, atomc;
-                double imj0 = 1.0/system.getParticleMass(atomj0);
-                double imj1 = 1.0/system.getParticleMass(atomj1);
-                if (atomj0 == atomk0) {
-                    atoma = atomj1;
-                    atomb = atomj0;
-                    atomc = atomk1;
-                    scale = imj0/(imj0+imj1);
-                }
-                else if (atomj1 == atomk1) {
-                    atoma = atomj0;
-                    atomb = atomj1;
-                    atomc = atomk0;
-                    scale = imj1/(imj0+imj1);
-                }
-                else if (atomj0 == atomk1) {
-                    atoma = atomj1;
-                    atomb = atomj0;
-                    atomc = atomk0;
-                    scale = imj0/(imj0+imj1);
-                }
-                else if (atomj1 == atomk0) {
-                    atoma = atomj0;
-                    atomb = atomj1;
-                    atomc = atomk1;
-                    scale = imj1/(imj0+imj1);
-                }
-                else
-                    continue; // These constraints are not connected.
-
-                // Look for a third constraint forming a triangle with these two.
+        vector<RealOpenMM> refMasses(numAtoms);
+        for (int i = 0; i < numAtoms; ++i)
+            refMasses[i] = (RealOpenMM) system.getParticleMass(i);

-                bool foundConstraint = false;
-                for (int m = 0; m < numCCMA; m++) {
-                    int other = ccmaConstraints[m];
-                    if ((atom1[other] == atoma && atom2[other] == atomc) || (atom1[other] == atomc && atom2[other] == atoma)) {
-                        double d1 = distance[cj];
-                        double d2 = distance[ck];
-                        double d3 = distance[other];
-                        matrix[j].push_back(pair<int, double>(k, scale*(d1*d1+d2*d2-d3*d3)/(2.0*d1*d2)));
-                        foundConstraint = true;
-                        break;
-                    }
-                }
-                if (!foundConstraint && angleForce != NULL) {
-                    // We didn't find one, so look for an angle force field term.
-
-                    const vector<int>& angleCandidates = atomAngles[atomb];
-                    for (vector<int>::const_iterator iter = angleCandidates.begin(); iter != angleCandidates.end(); iter++) {
-                        int particle1, particle2, particle3;
-                        double angle, ka;
-                        angleForce->getAngleParameters(*iter, particle1, particle2, particle3, angle, ka);
-                        if ((particle1 == atoma && particle3 == atomc) || (particle3 == atoma && particle1 == atomc)) {
-                            matrix[j].push_back(pair<int, double>(k, scale*cos(angle)));
-                            break;
-                        }
-                    }
+        // Look up angles for CCMA.
+        
+        vector<ReferenceCCMAAlgorithm::AngleInfo> angles;
+        for (int i = 0; i < system.getNumForces(); i++) {
+            const HarmonicAngleForce* force = dynamic_cast<const HarmonicAngleForce*>(&system.getForce(i));
+            if (force != NULL) {
+                for (int j = 0; j < force->getNumAngles(); j++) {
+                    int atom1, atom2, atom3;
+                    double angle, k;
+                    force->getAngleParameters(j, atom1, atom2, atom3, angle, k);
+                    angles.push_back(ReferenceCCMAAlgorithm::AngleInfo(atom1, atom2, atom3, (RealOpenMM) angle));
                }
            }
        }
-
-        // Invert it using QR.
-
-        vector<int> matrixRowStart;
-        vector<int> matrixColIndex;
-        vector<double> matrixValue;
-        for (int i = 0; i < numCCMA; i++) {
-            matrixRowStart.push_back(matrixValue.size());
-            for (int j = 0; j < (int) matrix[i].size(); j++) {
-                pair<int, double> element = matrix[i][j];
-                matrixColIndex.push_back(element.first);
-                matrixValue.push_back(element.second);
-            }
-        }
-        matrixRowStart.push_back(matrixValue.size());
-        int *qRowStart, *qColIndex, *rRowStart, *rColIndex;
-        double *qValue, *rValue;
-        int result = QUERN_compute_qr(numCCMA, numCCMA, &matrixRowStart[0], &matrixColIndex[0], &matrixValue[0], NULL,
-                &qRowStart, &qColIndex, &qValue, &rRowStart, &rColIndex, &rValue);
-        vector<double> rhs(numCCMA);
-        matrix.clear();
-        matrix.resize(numCCMA);
-        for (int i = 0; i < numCCMA; i++) {
-            // Extract column i of the inverse matrix.
-
-            for (int j = 0; j < numCCMA; j++)
-                rhs[j] = (i == j ? 1.0 : 0.0);
-            result = QUERN_multiply_with_q_transpose(numCCMA, qRowStart, qColIndex, qValue, &rhs[0]);
-            result = QUERN_solve_with_r(numCCMA, rRowStart, rColIndex, rValue, &rhs[0], &rhs[0]);
-            for (int j = 0; j < numCCMA; j++) {
-                double value = rhs[j]*distance[ccmaConstraints[i]]/distance[ccmaConstraints[j]];
-                if (abs(value) > 0.1)
-                    matrix[j].push_back(pair<int, double>(i, value));
-            }
-        }
-        QUERN_free_result(qRowStart, qColIndex, qValue);
-        QUERN_free_result(rRowStart, rColIndex, rValue);
+        
+        // Create a ReferenceCCMAAlgorithm.  It will build and invert the constraint matrix for us.
+        
+        ReferenceCCMAAlgorithm ccma(numAtoms, numCCMA, refIndices, refDistance, refMasses, angles, 0.1);
+        vector<vector<pair<int, double> > > matrix = ccma.getMatrix();
        int maxRowElements = 0;
        for (unsigned i = 0; i < matrix.size(); i++)
            maxRowElements = max(maxRowElements, (int) matrix[i].size());
        maxRowElements++;

+        // Build the list of constraints for each atom.
+
+        vector<vector<int> > atomConstraints(context.getNumAtoms());
+        for (int i = 0; i < numCCMA; i++) {
+            atomConstraints[atom1[ccmaConstraints[i]]].push_back(i);
+            atomConstraints[atom2[ccmaConstraints[i]]].push_back(i);
+        }
+        int maxAtomConstraints = 0;
+        for (unsigned i = 0; i < atomConstraints.size(); i++)
+            maxAtomConstraints = max(maxAtomConstraints, (int) atomConstraints[i].size());
+
        // Sort the constraints.

        vector<int> constraintOrder(numCCMA);

--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
--- a/platforms/opencl/src/OpenCLParallelKernels.cpp
+++ b/platforms/opencl/src/OpenCLParallelKernels.cpp
@@ -492,6 +492,11 @@ double OpenCLParallelCalcCMAPTorsionForceKernel::execute(ContextImpl& context, b
    return 0.0;
 }

+void OpenCLParallelCalcCMAPTorsionForceKernel::copyParametersToContext(ContextImpl& context, const CMAPTorsionForce& force) {
+    for (int i = 0; i < (int) kernels.size(); i++)
+        getKernel(i).copyParametersToContext(context, force);
+}
+
 class OpenCLParallelCalcCustomTorsionForceKernel::Task : public OpenCLContext::WorkTask {
 public:
    Task(ContextImpl& context, OpenCLCalcCustomTorsionForceKernel& kernel, bool includeForce,

--- a/platforms/opencl/src/OpenCLPlatform.cpp
+++ b/platforms/opencl/src/OpenCLPlatform.cpp
@@ -109,7 +109,7 @@ bool OpenCLPlatform::supportsDoublePrecision() const {

 bool OpenCLPlatform::isPlatformSupported() {
    // Return false for OpenCL implementations that are known
-    // to be buggy (Apple OSX since 10.7.5)
+    // to be buggy (Apple OS X prior to 10.10).

 #ifdef __APPLE__
    char str[256];
@@ -122,12 +122,10 @@ bool OpenCLPlatform::isPlatformSupported() {
    if (sscanf(str, "%d.%d.%d", &major, &minor, &micro) != 3)
        return false;

-    if ((major > 11) || (major == 11 && minor > 4) || (major == 11 && minor == 4 && micro >= 2))
-        // 11.4.2 is the darwin release corresponding to OSX 10.7.5, which is the
-        // point at which a number of serious bugs were introduced into the
-        // Apple OpenCL libraries, resulting in catistrophically incorrect MD simulations
-        // (see https://github.com/SimTk/openmm/issues/395 for example). Once a fix is released,
-        // this version check should be updated.
+    if (major < 14 || (major == 14 && minor < 3))
+        // 14.3.0 is the darwin release corresponding to OS X 10.10.3. Versions prior to that
+        // contained a number of serious bugs in the Apple OpenCL libraries.
+        // (See https://github.com/SimTk/openmm/issues/395 for example.)
        return false;
 #endif


--- a/platforms/opencl/src/OpenCLSort.cpp
+++ b/platforms/opencl/src/OpenCLSort.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2010-2013 Stanford University and the Authors.      *
+ * Portions copyright (c) 2010-2015 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -42,7 +42,6 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le
    replacements["MIN_KEY"] = trait->getMinKey();
    replacements["MAX_KEY"] = trait->getMaxKey();
    replacements["MAX_VALUE"] = trait->getMaxValue();
-    replacements["VALUE_IS_INT2"] = (trait->getDataType() == std::string("int2") ? "1" : "0");
    cl::Program program = context.createProgram(context.replaceStrings(OpenCLKernelSources::sort, replacements));
    shortListKernel = cl::Kernel(program, "sortShortList");
    computeRangeKernel = cl::Kernel(program, "computeRange");
@@ -59,7 +58,11 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le
    unsigned int maxRangeSize = std::min(maxGroupSize, (unsigned int) computeRangeKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice()));
    unsigned int maxPositionsSize = std::min(maxGroupSize, (unsigned int) computeBucketPositionsKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice()));
    unsigned int maxShortListSize = shortListKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice());
-    isShortList = (length <= maxLocalBuffer && length < maxShortListSize);
+    // On Qualcomm's OpenCL, it's essential to check against maxShortListSize.  Otherwise you get a crash.
+    // But AMD's OpenCL returns an inappropriately small value for it that is much shorter than the actual
+    // maximum, so including the check hurts performance.  For the moment I'm going to just comment it out.
+    // If we officially support Qualcomm in the future, we'll need to do something better.
+    isShortList = (length <= maxLocalBuffer/* && length < maxShortListSize*/);
    for (rangeKernelSize = 1; rangeKernelSize*2 <= maxRangeSize; rangeKernelSize *= 2)
        ;
    positionsKernelSize = std::min(rangeKernelSize, maxPositionsSize);

--- a/platforms/opencl/src/kernels/customIntegratorGlobal.cl
+++ b/platforms/opencl/src/kernels/customIntegratorGlobal.cl
 __kernel void computeGlobal(__global mixed2* restrict dt, __global mixed* restrict globals, __global mixed* restrict params,
-        float uniform, float gaussian, __global const real* restrict energy) {
+        float uniform, float gaussian, const real energy) {
    COMPUTE_STEP
 }
--- a/platforms/opencl/src/kernels/customIntegratorPerDof.cl
+++ b/platforms/opencl/src/kernels/customIntegratorPerDof.cl
@@ -26,7 +26,7 @@ void storePos(__global real4* restrict posq, __global real4* restrict posqCorrec
 __kernel void computePerDof(__global real4* restrict posq, __global real4* restrict posqCorrection, __global mixed4* restrict posDelta,
        __global mixed4* restrict velm, __global const real4* restrict force, __global const mixed2* restrict dt, __global const mixed* restrict globals,
        __global const mixed* restrict params, __global mixed* restrict sum, __global const float4* restrict gaussianValues,
-        unsigned int gaussianBaseIndex, __global const float4* restrict uniformValues, __global const real* restrict energy
+        unsigned int gaussianBaseIndex, __global const float4* restrict uniformValues, const real energy
        PARAMETER_ARGUMENTS) {
    mixed stepSize = dt[0].y;
    int index = get_global_id(0);

--- a/platforms/opencl/src/kernels/customManyParticle.cl
+++ b/platforms/opencl/src/kernels/customManyParticle.cl
@@ -227,7 +227,9 @@ __kernel void findNeighbors(real4 periodicBoxSize, real4 invPeriodicBoxSize, rea
                    int start = block2*TILE_SIZE;
                    int included[TILE_SIZE];
                    int numIncluded = 0;
+                    SYNC_WARPS;
                    positionCache[get_local_id(0)] = posq[start+indexInWarp];
+                    SYNC_WARPS;
                    if (atom1 < NUM_ATOMS) {
                        for (int j = 0; j < 32; j++) {
                            int atom2 = start+j;
@@ -287,7 +289,7 @@ __kernel void computeNeighborStartIndices(__global int* restrict numNeighborsFor

        unsigned int globalIndex = startAtom+get_local_id(0);
        posBuffer[get_local_id(0)] = (globalIndex < NUM_ATOMS ? numNeighborsForAtom[globalIndex] : 0);
-    barrier(CLK_LOCAL_MEM_FENCE);
+        barrier(CLK_LOCAL_MEM_FENCE);

        // Perform a parallel prefix sum.


--- a/platforms/opencl/src/kernels/fft.cl
+++ b/platforms/opencl/src/kernels/fft.cl
@@ -2,26 +2,57 @@ real2 multiplyComplex(real2 c1, real2 c2) {
    return (real2) (c1.x*c2.x-c1.y*c2.y, c1.x*c2.y+c1.y*c2.x);
 }

+/**
+ * Load a value from the half-complex grid produces by a real-to-complex transform.
+ */
+real2 loadComplexValue(__global const real2* restrict in, int x, int y, int z) {
+    const int inputZSize = ZSIZE/2+1;
+    if (z < inputZSize)
+        return in[x*YSIZE*inputZSize+y*inputZSize+z];
+    int xp = (x == 0 ? 0 : XSIZE-x);
+    int yp = (y == 0 ? 0 : YSIZE-y);
+    real2 value = in[xp*YSIZE*inputZSize+yp*inputZSize+(ZSIZE-z)];
+    return (real2) (value.x, -value.y);
+}
+
 /**
 * Perform a 1D FFT on each row along one axis.
 */

-__kernel void execFFT(__global const real2* restrict in, __global real2* restrict out, int sign, __local real2* restrict w,
+__kernel void execFFT(__global const INPUT_TYPE* restrict in, __global OUTPUT_TYPE* restrict out, __local real2* restrict w,
        __local real2* restrict data0, __local real2* restrict data1) {
    for (int i = get_local_id(0); i < ZSIZE; i += get_local_size(0))
-        w[i] = (real2) (cos(-sign*i*2*M_PI/ZSIZE), sin(-sign*i*2*M_PI/ZSIZE));
+        w[i] = (real2) (cos(-(SIGN)*i*2*M_PI/ZSIZE), sin(-(SIGN)*i*2*M_PI/ZSIZE));
    barrier(CLK_LOCAL_MEM_FENCE);
    
    for (int baseIndex = get_group_id(0)*BLOCKS_PER_GROUP; baseIndex < XSIZE*YSIZE; baseIndex += get_num_groups(0)*BLOCKS_PER_GROUP) {
        int index = baseIndex+get_local_id(0)/ZSIZE;
        int x = index/YSIZE;
        int y = index-x*YSIZE;
+#if OUTPUT_IS_PACKED
+        if (x < XSIZE/2+1) {
+#endif
 #if LOOP_REQUIRED
        for (int z = get_local_id(0); z < ZSIZE; z += get_local_size(0))
+    #if INPUT_IS_REAL
+            data0[z] = (real2) (in[x*(YSIZE*ZSIZE)+y*ZSIZE+z], 0);
+    #elif INPUT_IS_PACKED
+            data0[z] = loadComplexValue(in, x, y, z);
+    #else
            data0[z] = in[x*(YSIZE*ZSIZE)+y*ZSIZE+z];
+    #endif
 #else
        if (index < XSIZE*YSIZE)
+    #if INPUT_IS_REAL
+            data0[get_local_id(0)] = (real2) (in[x*(YSIZE*ZSIZE)+y*ZSIZE+get_local_id(0)%ZSIZE], 0);
+    #elif INPUT_IS_PACKED
+            data0[get_local_id(0)] = loadComplexValue(in, x, y, get_local_id(0)%ZSIZE);
+    #else
            data0[get_local_id(0)] = in[x*(YSIZE*ZSIZE)+y*ZSIZE+get_local_id(0)%ZSIZE];
+    #endif
+#endif
+#if OUTPUT_IS_PACKED
+        }
 #endif
        barrier(CLK_LOCAL_MEM_FENCE);
        COMPUTE_FFT

--- a/platforms/opencl/src/kernels/fftR2C.cl
+++ b/platforms/opencl/src/kernels/fftR2C.cl
+/**
+ * Combine the two halves of a real grid into a complex grid that is half as large.
+ */
+__kernel void packForwardData(__global const real* restrict in, __global real2* restrict out) {
+    const int gridSize = PACKED_XSIZE*PACKED_YSIZE*PACKED_ZSIZE;
+    for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
+        int x = index/(PACKED_YSIZE*PACKED_ZSIZE);
+        int remainder = index-x*(PACKED_YSIZE*PACKED_ZSIZE);
+        int y = remainder/PACKED_ZSIZE;
+        int z = remainder-y*PACKED_ZSIZE;
+#if PACKED_AXIS == 0
+        real2 value = (real2) (in[2*x*YSIZE*ZSIZE+y*ZSIZE+z], in[(2*x+1)*YSIZE*ZSIZE+y*ZSIZE+z]);
+#elif PACKED_AXIS == 1
+        real2 value = (real2) (in[x*YSIZE*ZSIZE+2*y*ZSIZE+z], in[x*YSIZE*ZSIZE+(2*y+1)*ZSIZE+z]);
+#else
+        real2 value = (real2) (in[x*YSIZE*ZSIZE+y*ZSIZE+2*z], in[x*YSIZE*ZSIZE+y*ZSIZE+(2*z+1)]);
+#endif
+        out[index] = value;
+    }
+}
+
+/**
+ * Split the transformed data back into a full sized, symmetric grid.
+ */
+__kernel void unpackForwardData(__global const real2* restrict in, __global real2* restrict out, __local real2* restrict w) {
+    // Compute the phase factors.
+    
+#if PACKED_AXIS == 0
+    for (int i = get_local_id(0); i < PACKED_XSIZE; i += get_local_size(0))
+        w[i] = (real2) (sin(i*2*M_PI/XSIZE), cos(i*2*M_PI/XSIZE));
+#elif PACKED_AXIS == 1
+    for (int i = get_local_id(0); i < PACKED_YSIZE; i += get_local_size(0))
+        w[i] = (real2) (sin(i*2*M_PI/YSIZE), cos(i*2*M_PI/YSIZE));
+#else
+    for (int i = get_local_id(0); i < PACKED_ZSIZE; i += get_local_size(0))
+        w[i] = (real2) (sin(i*2*M_PI/ZSIZE), cos(i*2*M_PI/ZSIZE));
+#endif
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // Transform the data.
+    
+    const int gridSize = PACKED_XSIZE*PACKED_YSIZE*PACKED_ZSIZE;
+    const int outputZSize = ZSIZE/2+1;
+    for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
+        int x = index/(PACKED_YSIZE*PACKED_ZSIZE);
+        int remainder = index-x*(PACKED_YSIZE*PACKED_ZSIZE);
+        int y = remainder/PACKED_ZSIZE;
+        int z = remainder-y*PACKED_ZSIZE;
+        int xp = (x == 0 ? 0 : PACKED_XSIZE-x);
+        int yp = (y == 0 ? 0 : PACKED_YSIZE-y);
+        int zp = (z == 0 ? 0 : PACKED_ZSIZE-z);
+        real2 z1 = in[x*PACKED_YSIZE*PACKED_ZSIZE+y*PACKED_ZSIZE+z];
+        real2 z2 = in[xp*PACKED_YSIZE*PACKED_ZSIZE+yp*PACKED_ZSIZE+zp];
+#if PACKED_AXIS == 0
+        real2 wfac = w[x];
+#elif PACKED_AXIS == 1
+        real2 wfac = w[y];
+#else
+        real2 wfac = w[z];
+#endif
+        real2 output = (real2) ((z1.x+z2.x - wfac.x*(z1.x-z2.x) + wfac.y*(z1.y+z2.y))/2, (z1.y-z2.y - wfac.y*(z1.x-z2.x) - wfac.x*(z1.y+z2.y))/2);
+        if (z < outputZSize)
+            out[x*YSIZE*outputZSize+y*outputZSize+z] = output;
+        xp = (x == 0 ? 0 : XSIZE-x);
+        yp = (y == 0 ? 0 : YSIZE-y);
+        zp = (z == 0 ? 0 : ZSIZE-z);
+        if (zp < outputZSize) {
+#if PACKED_AXIS == 0
+            if (x == 0)
+                out[PACKED_XSIZE*YSIZE*outputZSize+yp*outputZSize+zp] = (real2) ((z1.x-z1.y+z2.x-z2.y)/2, (-z1.x-z1.y+z2.x+z2.y)/2);
+#elif PACKED_AXIS == 1
+            if (y == 0)
+                out[xp*YSIZE*outputZSize+PACKED_YSIZE*outputZSize+zp] = (real2) ((z1.x-z1.y+z2.x-z2.y)/2, (-z1.x-z1.y+z2.x+z2.y)/2);
+#else
+            if (z == 0)
+                out[xp*YSIZE*outputZSize+yp*outputZSize+PACKED_ZSIZE] = (real2) ((z1.x-z1.y+z2.x-z2.y)/2, (-z1.x-z1.y+z2.x+z2.y)/2);
+#endif
+            else
+                out[xp*YSIZE*outputZSize+yp*outputZSize+zp] = (real2) (output.x, -output.y);
+        }
+    }
+}
+
+/**
+ * Load a value from the half-complex grid produced by a real-to-complex transform.
+ */
+real2 loadComplexValue(__global const real2* restrict in, int x, int y, int z) {
+    const int inputZSize = ZSIZE/2+1;
+    if (z < inputZSize)
+        return in[x*YSIZE*inputZSize+y*inputZSize+z];
+    int xp = (x == 0 ? 0 : XSIZE-x);
+    int yp = (y == 0 ? 0 : YSIZE-y);
+    real2 value = in[xp*YSIZE*inputZSize+yp*inputZSize+(ZSIZE-z)];
+    return (real2) (value.x, -value.y);
+}
+
+/**
+ * Repack the symmetric complex grid into one half as large in preparation for doing an inverse complex-to-real transform.
+ */
+__kernel void packBackwardData(__global const real2* restrict in, __global real2* restrict out, __local real2* restrict w) {
+    // Compute the phase factors.
+    
+#if PACKED_AXIS == 0
+    for (int i = get_local_id(0); i < PACKED_XSIZE; i += get_local_size(0))
+        w[i] = (real2) (cos(i*2*M_PI/XSIZE), sin(i*2*M_PI/XSIZE));
+#elif PACKED_AXIS == 1
+    for (int i = get_local_id(0); i < PACKED_YSIZE; i += get_local_size(0))
+        w[i] = (real2) (cos(i*2*M_PI/YSIZE), sin(i*2*M_PI/YSIZE));
+#else
+    for (int i = get_local_id(0); i < PACKED_ZSIZE; i += get_local_size(0))
+        w[i] = (real2) (cos(i*2*M_PI/ZSIZE), sin(i*2*M_PI/ZSIZE));
+#endif
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // Transform the data.
+    
+    const int gridSize = PACKED_XSIZE*PACKED_YSIZE*PACKED_ZSIZE;
+    for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
+        int x = index/(PACKED_YSIZE*PACKED_ZSIZE);
+        int remainder = index-x*(PACKED_YSIZE*PACKED_ZSIZE);
+        int y = remainder/PACKED_ZSIZE;
+        int z = remainder-y*PACKED_ZSIZE;
+        int xp = (x == 0 ? 0 : PACKED_XSIZE-x);
+        int yp = (y == 0 ? 0 : PACKED_YSIZE-y);
+        int zp = (z == 0 ? 0 : PACKED_ZSIZE-z);
+        real2 z1 = loadComplexValue(in, x, y, z);
+#if PACKED_AXIS == 0
+        real2 wfac = w[x];
+        real2 z2 = loadComplexValue(in, PACKED_XSIZE-x, yp, zp);
+#elif PACKED_AXIS == 1
+        real2 wfac = w[y];
+        real2 z2 = loadComplexValue(in, xp, PACKED_YSIZE-y, zp);
+#else
+        real2 wfac = w[z];
+        real2 z2 = loadComplexValue(in, xp, yp, PACKED_ZSIZE-z);
+#endif
+        real2 even = (real2) ((z1.x+z2.x)/2, (z1.y-z2.y)/2);
+        real2 odd = (real2) ((z1.x-z2.x)/2, (z1.y+z2.y)/2);
+        odd = (real2) (odd.x*wfac.x-odd.y*wfac.y, odd.y*wfac.x+odd.x*wfac.y);
+        out[x*PACKED_YSIZE*PACKED_ZSIZE+y*PACKED_ZSIZE+z] = (real2) (even.x-odd.y, even.y+odd.x);
+    }
+}
+
+/**
+ * Split the data back into a full sized, real grid after an inverse transform.
+ */
+__kernel void unpackBackwardData(__global const real2* restrict in, __global real* restrict out) {
+    const int gridSize = PACKED_XSIZE*PACKED_YSIZE*PACKED_ZSIZE;
+    for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
+        int x = index/(PACKED_YSIZE*PACKED_ZSIZE);
+        int remainder = index-x*(PACKED_YSIZE*PACKED_ZSIZE);
+        int y = remainder/PACKED_ZSIZE;
+        int z = remainder-y*PACKED_ZSIZE;
+        real2 value = 2*in[index];
+#if PACKED_AXIS == 0
+        out[2*x*YSIZE*ZSIZE+y*ZSIZE+z] = value.x;
+        out[(2*x+1)*YSIZE*ZSIZE+y*ZSIZE+z] = value.y;
+#elif PACKED_AXIS == 1
+        out[x*YSIZE*ZSIZE+2*y*ZSIZE+z] = value.x;
+        out[x*YSIZE*ZSIZE+(2*y+1)*ZSIZE+z] = value.y;
+#else
+        out[x*YSIZE*ZSIZE+y*ZSIZE+2*z] = value.x;
+        out[x*YSIZE*ZSIZE+y*ZSIZE+(2*z+1)] = value.y;
+#endif
+    }
+}
--- a/platforms/opencl/src/kernels/findInteractingBlocks_cpu.cl
+++ b/platforms/opencl/src/kernels/findInteractingBlocks_cpu.cl
@@ -5,16 +5,15 @@
 /**
 * Find a bounding box for the atoms in each block.
 */
-__kernel void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, __global const real4* restrict posq,
-        __global real4* restrict blockCenter, __global real4* restrict blockBoundingBox, __global int* restrict rebuildNeighborList,
+__kernel void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
+        __global const real4* restrict posq, __global real4* restrict blockCenter, __global real4* restrict blockBoundingBox, __global int* restrict rebuildNeighborList,
        __global real2* restrict sortedBlocks) {
    int index = get_global_id(0);
    int base = index*TILE_SIZE;
    while (base < numAtoms) {
        real4 pos = posq[base];
 #ifdef USE_PERIODIC
-        pos.xyz -= floor(pos.xyz*invPeriodicBoxSize.xyz)*periodicBoxSize.xyz;
-        real4 firstPoint = pos;
+        APPLY_PERIODIC_TO_POS(pos)
 #endif
        real4 minPos = pos;
        real4 maxPos = pos;
@@ -22,7 +21,8 @@ __kernel void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeri
        for (int i = base+1; i < last; i++) {
            pos = posq[i];
 #ifdef USE_PERIODIC
-            pos.xyz -= floor((pos.xyz-firstPoint.xyz)*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
+            real4 center = 0.5f*(maxPos+minPos);
+            APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, center)
 #endif
            minPos = min(minPos, pos);
            maxPos = max(maxPos, pos);
@@ -44,7 +44,7 @@ __kernel void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeri
 __kernel void sortBoxData(__global const real2* restrict sortedBlock, __global const real4* restrict blockCenter,
        __global const real4* restrict blockBoundingBox, __global real4* restrict sortedBlockCenter,
        __global real4* restrict sortedBlockBoundingBox, __global const real4* restrict posq, __global const real4* restrict oldPositions,
-        __global unsigned int* restrict interactionCount, __global int* restrict rebuildNeighborList) {
+        __global unsigned int* restrict interactionCount, __global int* restrict rebuildNeighborList, int forceRebuild) {
    for (int i = get_global_id(0); i < NUM_BLOCKS; i += get_global_size(0)) {
        int index = (int) sortedBlock[i].y;
        sortedBlockCenter[i] = blockCenter[index];
@@ -53,7 +53,7 @@ __kernel void sortBoxData(__global const real2* restrict sortedBlock, __global c
    
    // Also check whether any atom has moved enough so that we really need to rebuild the neighbor list.

-    bool rebuild = false;
+    bool rebuild = forceRebuild;
    for (int i = get_global_id(0); i < NUM_ATOMS; i += get_global_size(0)) {
        real4 delta = oldPositions[i]-posq[i];
        if (delta.x*delta.x + delta.y*delta.y + delta.z*delta.z > 0.25f*PADDING*PADDING)
@@ -70,8 +70,8 @@ __kernel void sortBoxData(__global const real2* restrict sortedBlock, __global c
 * to global memory.
 */
 void storeInteractionData(unsigned short x, unsigned short* buffer, int* atoms, int* numAtoms, int numValid, __global unsigned int* interactionCount,
-            __global int* interactingTiles, __global unsigned int* interactingAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize,
-            __global real4* posq, real4 blockCenterX, real4 blockSizeX, unsigned int maxTiles, bool finish) {
+            __global int* interactingTiles, __global unsigned int* interactingAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX,
+            real4 periodicBoxVecY, real4 periodicBoxVecZ, __global const real4* posq, real4 blockCenterX, real4 blockSizeX, unsigned int maxTiles, bool finish) {
    real4 posBuffer[TILE_SIZE];
    const bool singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= PADDED_CUTOFF &&
                                     0.5f*periodicBoxSize.y-blockSizeX.y >= PADDED_CUTOFF &&
@@ -83,7 +83,7 @@ void storeInteractionData(unsigned short x, unsigned short* buffer, int* atoms,
            // The box is small enough that we can just translate all the atoms into a single periodic
            // box, then skip having to apply periodic boundary conditions later.
            
-            pos.xyz -= floor((pos.xyz-blockCenterX.xyz)*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
+            APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, blockCenterX)
        }
 #endif
        posBuffer[i] = pos;
@@ -99,14 +99,14 @@ void storeInteractionData(unsigned short x, unsigned short* buffer, int* atoms,
            real4 pos = posq[atom];
 #ifdef USE_PERIODIC
            if (singlePeriodicCopy)
-                pos.xyz -= floor((pos.xyz-blockCenterX.xyz)*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
+		APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, blockCenterX)
 #endif
            bool interacts = false;
 #ifdef USE_PERIODIC
            if (!singlePeriodicCopy) {
                for (int j = 0; j < TILE_SIZE && !interacts; j++) {
                    real4 delta = pos-posBuffer[j];
-                    delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
+                    APPLY_PERIODIC_TO_DELTA(delta)
                    interacts = (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED);
                }
            }
@@ -159,9 +159,10 @@ void storeInteractionData(unsigned short x, unsigned short* buffer, int* atoms,
 * Compare the bounding boxes for each pair of blocks.  If they are sufficiently far apart,
 * mark them as non-interacting.
 */
-__kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, __global unsigned int* restrict interactionCount,
-        __global int* restrict interactingTiles, __global unsigned int* restrict interactingAtoms, __global const real4* restrict posq, unsigned int maxTiles, unsigned int startBlockIndex,
-        unsigned int numBlocks, __global real2* restrict sortedBlocks, __global const real4* restrict sortedBlockCenter, __global const real4* restrict sortedBlockBoundingBox,
+__kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
+        __global unsigned int* restrict interactionCount, __global int* restrict interactingTiles, __global unsigned int* restrict interactingAtoms,
+        __global const real4* restrict posq, unsigned int maxTiles, unsigned int startBlockIndex, unsigned int numBlocks, __global real2* restrict sortedBlocks,
+        __global const real4* restrict sortedBlockCenter, __global const real4* restrict sortedBlockBoundingBox,
        __global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __global real4* restrict oldPositions,
        __global const int* restrict rebuildNeighborList) {
    if (rebuildNeighborList[0] == 0)
@@ -204,9 +205,7 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi
            real4 blockSizeY = sortedBlockBoundingBox[j];
            real4 delta = blockCenterX-blockCenterY;
 #ifdef USE_PERIODIC
-            delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
-            delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
-            delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+            APPLY_PERIODIC_TO_DELTA(delta)
 #endif
            delta.x = max((real) 0, fabs(delta.x)-blockSizeX.x-blockSizeY.x);
            delta.y = max((real) 0, fabs(delta.y)-blockSizeX.y-blockSizeY.y);
@@ -216,12 +215,12 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi

                buffer[valuesInBuffer++] = y;
                if (valuesInBuffer == BUFFER_SIZE) {
-                    storeInteractionData(x, buffer, atoms, &numAtoms, valuesInBuffer, interactionCount, interactingTiles, interactingAtoms, periodicBoxSize, invPeriodicBoxSize, posq, blockCenterX, blockSizeX, maxTiles, false);
+                    storeInteractionData(x, buffer, atoms, &numAtoms, valuesInBuffer, interactionCount, interactingTiles, interactingAtoms, periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ, posq, blockCenterX, blockSizeX, maxTiles, false);
                    valuesInBuffer = 0;
                }
            }
        }
-        storeInteractionData(x, buffer, atoms, &numAtoms, valuesInBuffer, interactionCount, interactingTiles, interactingAtoms, periodicBoxSize, invPeriodicBoxSize, posq, blockCenterX, blockSizeX, maxTiles, true);
+        storeInteractionData(x, buffer, atoms, &numAtoms, valuesInBuffer, interactionCount, interactingTiles, interactingAtoms, periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ, posq, blockCenterX, blockSizeX, maxTiles, true);
    }
    
    // Record the positions the neighbor list is based on.

--- a/platforms/opencl/src/kernels/gbsaObc_cpu.cl
+++ b/platforms/opencl/src/kernels/gbsaObc_cpu.cl
@@ -20,8 +20,9 @@ __kernel void computeBornSum(
 #endif
        __global const real4* restrict posq, __global const float2* restrict global_params,
 #ifdef USE_CUTOFF
-        __global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, 
-        unsigned int maxTiles, __global const real4* restrict blockCenter, __global const real4* restrict blockSize, __global const int* restrict interactingAtoms,
+        __global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
+        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, __global const real4* restrict blockCenter,
+        __global const real4* restrict blockSize, __global const int* restrict interactingAtoms,
 #else
        unsigned int numTiles,
 #endif
@@ -62,7 +63,7 @@ __kernel void computeBornSum(
                    real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
                    real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
 #ifdef USE_PERIODIC
-                    delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
+                    APPLY_PERIODIC_TO_DELTA(delta)
 #endif
                    real r2 = dot(delta.xyz, delta.xyz);
 #ifdef USE_CUTOFF
@@ -111,7 +112,7 @@ __kernel void computeBornSum(
                    real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
                    real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
 #ifdef USE_PERIODIC
-                    delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
+                    APPLY_PERIODIC_TO_DELTA(delta)
 #endif
                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
 #ifdef USE_CUTOFF
@@ -253,14 +254,13 @@ __kernel void computeBornSum(

                real4 blockCenterX = blockCenter[x];
                for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
-                    localData[tgx].x -= floor((localData[tgx].x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
-                    localData[tgx].y -= floor((localData[tgx].y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
-                    localData[tgx].z -= floor((localData[tgx].z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+                    APPLY_PERIODIC_TO_POS_WITH_CENTER(localData[tgx], blockCenterX)
                }
                for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
                    unsigned int atom1 = x*TILE_SIZE+tgx;
                    real bornSum = 0;
                    real4 posq1 = posq[atom1];
+                    APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX)
                    float2 params1 = global_params[atom1];
                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
                        real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
@@ -321,7 +321,7 @@ __kernel void computeBornSum(
                        real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
                        real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
 #ifdef USE_PERIODIC
-                        delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
+                        APPLY_PERIODIC_TO_DELTA(delta)
 #endif
                        real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                        int atom2 = atomIndices[j];
@@ -411,8 +411,9 @@ __kernel void computeGBSAForce1(
 #endif
        __global real* restrict energyBuffer, __global const real4* restrict posq, __global const real* restrict global_bornRadii,
 #ifdef USE_CUTOFF
-        __global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, 
-        unsigned int maxTiles, __global const real4* restrict blockCenter, __global const real4* restrict blockSize, __global const int* restrict interactingAtoms,
+        __global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
+        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, __global const real4* restrict blockCenter,
+        __global const real4* restrict blockSize, __global const int* restrict interactingAtoms,
 #else
        unsigned int numTiles,
 #endif
@@ -452,7 +453,7 @@ __kernel void computeGBSAForce1(
                    real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
                    real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
 #ifdef USE_PERIODIC
-                    delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
+                    APPLY_PERIODIC_TO_DELTA(delta)
 #endif
                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
 #ifdef USE_CUTOFF
@@ -516,7 +517,7 @@ __kernel void computeGBSAForce1(
                    real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
                    real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
 #ifdef USE_PERIODIC
-                    delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
+                    APPLY_PERIODIC_TO_DELTA(delta)
 #endif
                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
 #ifdef USE_CUTOFF
@@ -669,15 +670,13 @@ __kernel void computeGBSAForce1(

                real4 blockCenterX = blockCenter[x];
                for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
-                    localData[tgx].x -= floor((localData[tgx].x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
-                    localData[tgx].y -= floor((localData[tgx].y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
-                    localData[tgx].z -= floor((localData[tgx].z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+                    APPLY_PERIODIC_TO_POS_WITH_CENTER(localData[tgx], blockCenterX)
                }
                for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
                    unsigned int atom1 = x*TILE_SIZE+tgx;
                    real4 force = 0;
                    real4 posq1 = posq[atom1];
-                    posq1.xyz -= floor((posq1.xyz-blockCenterX.xyz)*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
+                    APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX)
                    float bornRadius1 = global_bornRadii[atom1];
                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
                        real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
@@ -740,7 +739,7 @@ __kernel void computeGBSAForce1(
                        real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
                        real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
 #ifdef USE_PERIODIC
-                        delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
+                        APPLY_PERIODIC_TO_DELTA(delta)
 #endif
                        real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                        int atom2 = atomIndices[j];

--- a/platforms/opencl/src/kernels/nonbonded_cpu.cl
+++ b/platforms/opencl/src/kernels/nonbonded_cpu.cl
@@ -23,7 +23,8 @@ __kernel void computeNonbonded(
        __global const ushort2* restrict exclusionTiles, unsigned int startTileIndex, unsigned int numTileIndices
 #ifdef USE_CUTOFF
        , __global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, 
-        unsigned int maxTiles, __global const real4* restrict blockCenter, __global const real4* restrict blockSize, __global const int* restrict interactingAtoms
+        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, __global const real4* restrict blockCenter,
+        __global const real4* restrict blockSize, __global const int* restrict interactingAtoms
 #endif
        PARAMETER_ARGUMENTS) {
    real energy = 0;
@@ -65,7 +66,7 @@ __kernel void computeNonbonded(
                    real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
                    real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
 #ifdef USE_PERIODIC
-                    delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
+                    APPLY_PERIODIC_TO_DELTA(delta)
 #endif
                    real r2 = dot(delta.xyz, delta.xyz);
 #ifdef USE_CUTOFF
@@ -133,7 +134,7 @@ __kernel void computeNonbonded(
                    real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
                    real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
 #ifdef USE_PERIODIC
-                    delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
+                    APPLY_PERIODIC_TO_DELTA(delta)
 #endif
                    real r2 = dot(delta.xyz, delta.xyz);
 #ifdef USE_CUTOFF
@@ -291,15 +292,13 @@ __kernel void computeNonbonded(

                real4 blockCenterX = blockCenter[x];
                for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
-                    localData[tgx].x -= floor((localData[tgx].x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
-                    localData[tgx].y -= floor((localData[tgx].y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
-                    localData[tgx].z -= floor((localData[tgx].z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+                    APPLY_PERIODIC_TO_POS_WITH_CENTER(localData[tgx], blockCenterX)
                }
                for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
                    unsigned int atom1 = x*TILE_SIZE+tgx;
                    real4 force = 0;
                    real4 posq1 = posq[atom1];
-                    posq1.xyz -= floor((posq1.xyz-blockCenterX.xyz)*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
+                    APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX)
                    LOAD_ATOM1_PARAMETERS
                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
                        real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
@@ -364,7 +363,7 @@ __kernel void computeNonbonded(
                        real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
                        real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
 #ifdef USE_PERIODIC
-                        delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
+                        APPLY_PERIODIC_TO_DELTA(delta)
 #endif
                        real r2 = dot(delta.xyz, delta.xyz);
 #ifdef USE_CUTOFF

--- a/platforms/opencl/src/kernels/pme.cl
+++ b/platforms/opencl/src/kernels/pme.cl
@@ -138,35 +138,34 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
                    zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
                    int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex;
                    real add = pos.w*data[ix].x*data[iy].y*data[iz].z;
-#ifdef USE_DOUBLE_PRECISION
-                    atom_add(&pmeGrid[2*index], (long) (add*0x100000000));
+#ifdef USE_ALTERNATE_MEMORY_ACCESS_PATTERN
+                    // On Nvidia devices (at least Maxwell anyway), this split ordering produces much higher performance.  Why?
+                    // I have no idea!  And of course on AMD it produces slower performance.  GPUs are not meant to be understood.
+                    atom_add(&pmeGrid[index%2 == 0 ? index/2 : (index+GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z)/2], (long) (add*0x100000000));
 #else
                    atom_add(&pmeGrid[index], (long) (add*0x100000000));
 #endif
-
                }
            }
        }
    }
 }

-__kernel void finishSpreadCharge(__global long* restrict pmeGrid) {
-    __global real2* realGrid = (__global real2*) pmeGrid;
+__kernel void finishSpreadCharge(__global long* restrict fixedGrid, __global real* restrict realGrid) {
    const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
    real scale = EPSILON_FACTOR/(real) 0x100000000;
    for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
-#ifdef USE_DOUBLE_PRECISION
-        long value = pmeGrid[2*index];
+#ifdef USE_ALTERNATE_MEMORY_ACCESS_PATTERN
+        long value = fixedGrid[index%2 == 0 ? index/2 : (index+gridSize)/2];
 #else
-        long value = pmeGrid[index];
+        long value = fixedGrid[index];
 #endif
-        real2 realValue = (real2) ((real) (value*scale), 0);
-        realGrid[index] = realValue;
+        realGrid[index] = (real) (value*scale);
    }
 }
 #elif defined(DEVICE_IS_CPU)
 __kernel void gridSpreadCharge(__global const real4* restrict posq, __global const int2* restrict pmeAtomGridIndex, __global const int* restrict pmeAtomRange,
-        __global real2* restrict pmeGrid, __global const real4* restrict pmeBsplineTheta, real4 periodicBoxSize, real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ) {
+        __global real* restrict pmeGrid, __global const real4* restrict pmeBsplineTheta, real4 periodicBoxSize, real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ) {
    const int firstx = get_global_id(0)*GRID_SIZE_X/get_global_size(0);
    const int lastx = (get_global_id(0)+1)*GRID_SIZE_X/get_global_size(0);
    if (firstx == lastx)
@@ -230,7 +229,7 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
                    int zindex = gridIndex.z+iz;
                    zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
                    int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex;
-                    pmeGrid[index].x += EPSILON_FACTOR*pos.w*data[ix].x*data[iy].y*data[iz].z;
+                    pmeGrid[index] += EPSILON_FACTOR*pos.w*data[ix].x*data[iy].y*data[iz].z;
                }
            }
        }
@@ -238,7 +237,7 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
 }
 #else
 __kernel void gridSpreadCharge(__global const real4* restrict posq, __global const int2* restrict pmeAtomGridIndex, __global const int* restrict pmeAtomRange,
-        __global real2* restrict pmeGrid, __global const real4* restrict pmeBsplineTheta) {
+        __global real* restrict pmeGrid, __global const real4* restrict pmeBsplineTheta) {
    unsigned int numGridPoints = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
    for (int gridIndex = get_global_id(0); gridIndex < numGridPoints; gridIndex += get_global_size(0)) {
        // Compute the charge on a grid point.
@@ -290,22 +289,23 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
                }
            }
        }
-        pmeGrid[gridIndex] = (real2) (result*EPSILON_FACTOR, 0);
+        pmeGrid[gridIndex] = result*EPSILON_FACTOR;
    }
 }
 #endif

-__kernel void reciprocalConvolution(__global real2* restrict pmeGrid, __global real* restrict energyBuffer, __global const real* restrict pmeBsplineModuliX,
-        __global const real* restrict pmeBsplineModuliY, __global const real* restrict pmeBsplineModuliZ, real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ, real recipScaleFactor) {
-    const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
-    real energy = 0.0f;
+__kernel void reciprocalConvolution(__global real2* restrict pmeGrid, __global const real* restrict pmeBsplineModuliX,
+        __global const real* restrict pmeBsplineModuliY, __global const real* restrict pmeBsplineModuliZ, real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ) {
+    // R2C stores into a half complex matrix where the last dimension is cut by half
+    const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*(GRID_SIZE_Z/2+1);
+    const real recipScaleFactor = (1.0f/M_PI)*recipBoxVecX.x*recipBoxVecY.y*recipBoxVecZ.z;
+
    for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
-        int kx = index/(GRID_SIZE_Y*GRID_SIZE_Z);
-        int remainder = index-kx*GRID_SIZE_Y*GRID_SIZE_Z;
-        int ky = remainder/GRID_SIZE_Z;
-        int kz = remainder-ky*GRID_SIZE_Z;
-        if (kx == 0 && ky == 0 && kz == 0)
-            continue;
+        // real indices
+        int kx = index/(GRID_SIZE_Y*(GRID_SIZE_Z/2+1));
+        int remainder = index-kx*GRID_SIZE_Y*(GRID_SIZE_Z/2+1);
+        int ky = remainder/(GRID_SIZE_Z/2+1);
+        int kz = remainder-ky*(GRID_SIZE_Z/2+1);
        int mx = (kx < (GRID_SIZE_X+1)/2) ? kx : (kx-GRID_SIZE_X);
        int my = (ky < (GRID_SIZE_Y+1)/2) ? ky : (ky-GRID_SIZE_Y);
        int mz = (kz < (GRID_SIZE_Z+1)/2) ? kz : (kz-GRID_SIZE_Z);
@@ -319,13 +319,53 @@ __kernel void reciprocalConvolution(__global real2* restrict pmeGrid, __global r
        real m2 = mhx*mhx+mhy*mhy+mhz*mhz;
        real denom = m2*bx*by*bz;
        real eterm = recipScaleFactor*EXP(-RECIP_EXP_FACTOR*m2)/denom;
-        pmeGrid[index] = (real2) (grid.x*eterm, grid.y*eterm);
-        energy += eterm*(grid.x*grid.x + grid.y*grid.y);
+        if (kx != 0 || ky != 0 || kz != 0) {
+            pmeGrid[index] = (real2) (grid.x*eterm, grid.y*eterm);
+        }
+    }
+}
+
+__kernel void gridEvaluateEnergy(__global real2* restrict pmeGrid, __global real* restrict energyBuffer,
+                      __global const real* restrict pmeBsplineModuliX, __global const real* restrict pmeBsplineModuliY, __global const real* restrict pmeBsplineModuliZ,
+                      real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ) {
+    // R2C stores into a half complex matrix where the last dimension is cut by half
+    const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
+    const real recipScaleFactor = (1.0f/M_PI)*recipBoxVecX.x*recipBoxVecY.y*recipBoxVecZ.z;
+ 
+    real energy = 0;
+    for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
+        // real indices
+        int kx = index/(GRID_SIZE_Y*(GRID_SIZE_Z));
+        int remainder = index-kx*GRID_SIZE_Y*(GRID_SIZE_Z);
+        int ky = remainder/(GRID_SIZE_Z);
+        int kz = remainder-ky*(GRID_SIZE_Z);
+        int mx = (kx < (GRID_SIZE_X+1)/2) ? kx : (kx-GRID_SIZE_X);
+        int my = (ky < (GRID_SIZE_Y+1)/2) ? ky : (ky-GRID_SIZE_Y);
+        int mz = (kz < (GRID_SIZE_Z+1)/2) ? kz : (kz-GRID_SIZE_Z);
+        real mhx = mx*recipBoxVecX.x;
+        real mhy = mx*recipBoxVecY.x+my*recipBoxVecY.y;
+        real mhz = mx*recipBoxVecZ.x+my*recipBoxVecZ.y+mz*recipBoxVecZ.z;
+        real m2 = mhx*mhx+mhy*mhy+mhz*mhz;
+        real bx = pmeBsplineModuliX[kx];
+        real by = pmeBsplineModuliY[ky];
+        real bz = pmeBsplineModuliZ[kz];
+        real denom = m2*bx*by*bz;
+        real eterm = recipScaleFactor*EXP(-RECIP_EXP_FACTOR*m2)/denom;
+        if (kz >= (GRID_SIZE_Z/2+1)) {
+            kx = ((kx == 0) ? kx : GRID_SIZE_X-kx);
+            ky = ((ky == 0) ? ky : GRID_SIZE_Y-ky);
+            kz = GRID_SIZE_Z-kz;
+        } 
+        int indexInHalfComplexGrid = kz + ky*(GRID_SIZE_Z/2+1)+kx*(GRID_SIZE_Y*(GRID_SIZE_Z/2+1));
+        real2 grid = pmeGrid[indexInHalfComplexGrid];
+        if (kx != 0 || ky != 0 || kz != 0) {
+            energy += eterm*(grid.x*grid.x + grid.y*grid.y);
+        }
    }
    energyBuffer[get_global_id(0)] += 0.5f*energy;
 }

-__kernel void gridInterpolateForce(__global const real4* restrict posq, __global real4* restrict forceBuffers, __global const real2* restrict pmeGrid,
+__kernel void gridInterpolateForce(__global const real4* restrict posq, __global real4* restrict forceBuffers, __global const real* restrict pmeGrid,
        real4 periodicBoxSize, real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ, __global int2* restrict pmeAtomGridIndex) {
    const real4 scale = 1/(real) (PME_ORDER-1);
    real4 data[PME_ORDER];
@@ -385,7 +425,7 @@ __kernel void gridInterpolateForce(__global const real4* restrict posq, __global
                    int zindex = gridIndex.z+iz;
                    zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
                    int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex;
-                    real gridvalue = pmeGrid[index].x;
+                    real gridvalue = pmeGrid[index];
                    force.x += ddata[ix].x*data[iy].y*data[iz].z*gridvalue;
                    force.y += data[ix].x*ddata[iy].y*data[iz].z*gridvalue;
                    force.z += data[ix].x*data[iy].y*ddata[iz].z*gridvalue;

--- a/platforms/opencl/src/kernels/sort.cl
+++ b/platforms/opencl/src/kernels/sort.cl
@@ -109,13 +109,7 @@ __kernel void assignElementsToBuckets(__global const DATA_TYPE* restrict data, u
    float maxValue = (float) (range[1]);
    float bucketWidth = (maxValue-minValue)/numBuckets;
    for (uint index = get_global_id(0); index < length; index += get_global_size(0)) {
-#if defined(MAC_AMD_WORKAROUND) && VALUE_IS_INT2
-        __global int* d = (__global int*) data;
-        int2 element = (int2) (d[2*index], d[2*index+1]);
-        float key = (float) getValue(element);
-#else
        float key = (float) getValue(data[index]);
-#endif
        uint bucketIndex = min((uint) ((key-minValue)/bucketWidth), numBuckets-1);
        offsetInBucket[index] = atom_inc(&bucketOffset[bucketIndex]);
        bucketOfElement[index] = bucketIndex;

--- a/platforms/opencl/tests/TestOpenCLCMAPTorsionForce.cpp
+++ b/platforms/opencl/tests/TestOpenCLCMAPTorsionForce.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2010 Stanford University and the Authors.           *
+ * Portions copyright (c) 2010-2015 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -105,11 +105,68 @@ void testCMAPTorsions() {
    }
 }

+void testChangingParameters() {
+    // Create a system with two maps and one torsion.
+
+    const int mapSize = 8;
+    System system;
+    for (int i = 0; i < 5; i++)
+        system.addParticle(1.0);
+    CMAPTorsionForce* cmap = new CMAPTorsionForce();
+    vector<double> mapEnergy1(mapSize*mapSize);
+    vector<double> mapEnergy2(mapSize*mapSize);
+    for (int i = 0; i < mapSize; i++) {
+        double angle1 = i*2*M_PI/mapSize;
+        double energy1 = cos(angle1);
+        for (int j = 0; j < mapSize; j++) {
+            double angle2 = j*2*M_PI/mapSize;
+            double energy2 = 10*sin(angle2);
+            mapEnergy1[i+j*mapSize] = energy1+energy2;
+            mapEnergy2[i+j*mapSize] = energy1-energy2;
+        }
+    }
+    cmap->addMap(mapSize, mapEnergy1);
+    cmap->addMap(mapSize, mapEnergy2);
+    cmap->addTorsion(0, 0, 1, 2, 3, 1, 2, 3, 4);
+    system.addForce(cmap);
+
+    // Set particle positions so angle1=0 and angle2=PI/4.
+
+    vector<Vec3> positions(5);
+    positions[0] = Vec3(0, 0, 1);
+    positions[1] = Vec3(0, 0, 0);
+    positions[2] = Vec3(1, 0, 0);
+    positions[3] = Vec3(1, 0, 1);
+    positions[4] = Vec3(0.5, -0.5, 1);
+    VerletIntegrator integrator(0.01);
+    Context context(system, integrator, platform);
+    context.setPositions(positions);
+
+    // Check that the energy is correct.
+
+    double energy = context.getState(State::Energy).getPotentialEnergy();
+    ASSERT_EQUAL_TOL(1+10*sin(M_PI/4), energy, 1e-5);
+
+    // Modify the parameters.
+
+    cmap->setTorsionParameters(0, 1, 0, 1, 2, 3, 1, 2, 3, 4);
+    for (int i = 0; i < mapSize*mapSize; i++)
+        mapEnergy2[i] *= 2.0;
+    cmap->setMapParameters(1, mapSize, mapEnergy2);
+    cmap->updateParametersInContext(context);
+
+    // See if the results are correct.
+
+    energy = context.getState(State::Energy).getPotentialEnergy();
+    ASSERT_EQUAL_TOL(2-20*sin(M_PI/4), energy, 1e-5);
+}
+
 int main(int argc, char* argv[]) {
    try {
        if (argc > 1)
            platform.setPropertyDefaultValue("OpenCLPrecision", string(argv[1]));
        testCMAPTorsions();
+        testChangingParameters();
    }
    catch(const exception& e) {
        cout << "exception: " << e.what() << endl;

--- a/platforms/opencl/tests/TestOpenCLFFT.cpp
+++ b/platforms/opencl/tests/TestOpenCLFFT.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2011 Stanford University and the Authors.           *
+ * Portions copyright (c) 2011-2015 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -51,7 +51,7 @@ using namespace std;
 static OpenCLPlatform platform;

 template <class Real2>
-void testTransform() {
+void testTransform(bool realToComplex, int xsize, int ysize, int zsize) {
    System system;
    system.addParticle(0.0);
    OpenCLPlatform::PlatformData platformData(system, "", "", platform.getPropertyDefaultValue("OpenCLPrecision"), "false");
@@ -59,7 +59,6 @@ void testTransform() {
    context.initialize();
    OpenMM_SFMT::SFMT sfmt;
    init_gen_rand(0, sfmt);
-    int xsize = 28, ysize = 25, zsize = 30;
    vector<Real2> original(xsize*ysize*zsize);
    vector<t_complex> reference(original.size());
    for (int i = 0; i < (int) original.size(); i++) {
@@ -67,10 +66,16 @@ void testTransform() {
        original[i] = value;
        reference[i] = t_complex(value.x, value.y);
    }
+    for (int i = 0; i < (int) reference.size(); i++) {
+        if (realToComplex)
+            reference[i] = t_complex(i%2 == 0 ? original[i/2].x : original[i/2].y, 0);
+        else
+            reference[i] = t_complex(original[i].x, original[i].y);
+    }
    OpenCLArray grid1(context, original.size(), sizeof(Real2), "grid1");
    OpenCLArray grid2(context, original.size(), sizeof(Real2), "grid2");
    grid1.upload(original);
-    OpenCLFFT3D fft(context, xsize, ysize, zsize);
+    OpenCLFFT3D fft(context, xsize, ysize, zsize, realToComplex);

    // Perform a forward FFT, then verify the result is correct.

@@ -80,10 +85,15 @@ void testTransform() {
    fftpack_t plan;
    fftpack_init_3d(&plan, xsize, ysize, zsize);
    fftpack_exec_3d(plan, FFTPACK_FORWARD, &reference[0], &reference[0]);
-    for (int i = 0; i < (int) result.size(); ++i) {
-        ASSERT_EQUAL_TOL(reference[i].re, result[i].x, 1e-3);
-        ASSERT_EQUAL_TOL(reference[i].im, result[i].y, 1e-3);
-    }
+    int outputZSize = (realToComplex ? zsize/2+1 : zsize);
+    for (int x = 0; x < xsize; x++)
+        for (int y = 0; y < ysize; y++)
+            for (int z = 0; z < outputZSize; z++) {
+                int index1 = x*ysize*zsize + y*zsize + z;
+                int index2 = x*ysize*outputZSize + y*outputZSize + z;
+                ASSERT_EQUAL_TOL(reference[index1].re, result[index2].x, 1e-3);
+                ASSERT_EQUAL_TOL(reference[index1].im, result[index2].y, 1e-3);
+            }
    fftpack_destroy(plan);

    // Perform a backward transform and see if we get the original values.
@@ -91,7 +101,8 @@ void testTransform() {
    fft.execFFT(grid2, grid1, false);
    grid1.download(result);
    double scale = 1.0/(xsize*ysize*zsize);
-    for (int i = 0; i < (int) result.size(); ++i) {
+    int valuesToCheck = (realToComplex ? original.size()/2 : original.size());
+    for (int i = 0; i < valuesToCheck; ++i) {
        ASSERT_EQUAL_TOL(original[i].x, scale*result[i].x, 1e-4);
        ASSERT_EQUAL_TOL(original[i].y, scale*result[i].y, 1e-4);
    }
@@ -101,10 +112,20 @@ int main(int argc, char* argv[]) {
    try {
        if (argc > 1)
            platform.setPropertyDefaultValue("OpenCLPrecision", string(argv[1]));
-        if (platform.getPropertyDefaultValue("OpenCLPrecision") == "double")
-            testTransform<mm_double2>();
-        else
-            testTransform<mm_float2>();
+        if (platform.getPropertyDefaultValue("OpenCLPrecision") == "double") {
+            testTransform<mm_double2>(false, 28, 25, 30);
+            testTransform<mm_double2>(true, 28, 25, 25);
+            testTransform<mm_double2>(true, 25, 28, 25);
+            testTransform<mm_double2>(true, 25, 25, 28);
+            testTransform<mm_double2>(true, 21, 25, 27);
+        }
+        else {
+            testTransform<mm_float2>(false, 28, 25, 30);
+            testTransform<mm_float2>(true, 28, 25, 25);
+            testTransform<mm_float2>(true, 25, 28, 25);
+            testTransform<mm_float2>(true, 25, 25, 28);
+            testTransform<mm_float2>(true, 21, 25, 27);
+        }
    }
    catch(const exception& e) {
        cout << "exception: " << e.what() << endl;