Merge pull request #869 from peastman/fft

OpenCL FFT supports real-to-complex transforms

Merge pull request #869 from peastman/fft
OpenCL FFT supports real-to-complex transforms
e531310f · peastman · f2a9c210 · 99ed3ba7 · e531310f · e531310f
Commit e531310f authored Apr 06, 2015 by peastman
6 changed files
--- a/platforms/opencl/include/OpenCLFFT3D.h
+++ b/platforms/opencl/include/OpenCLFFT3D.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Portions copyright (c) 2009-2015 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -40,7 +40,7 @@ namespace OpenMM {
 * 15, 207–228 (2000).
 * <p>
 * This class places certain restrictions on the allowed dimensions of the grid.  First,
- * the size of each dimension may have no prime factors other than 2, 3, and 5.  You
+ * the size of each dimension may have no prime factors other than 2, 3, 5, and 7.  You
 * can call findLegalDimension() to determine the smallest size that satisfies this
 * requirement and is greater than or equal to a specified minimum size.  Second, the size
 * of each dimension must be small enough to compute each 1D transform entirely in local
@@ -61,12 +61,14 @@ public:
     * @param xsize   the first dimension of the data sets on which FFTs will be performed
     * @param ysize   the second dimension of the data sets on which FFTs will be performed
     * @param zsize   the third dimension of the data sets on which FFTs will be performed
+     * @param realToComplex  if true, a real-to-complex transform will be done.  Otherwise, it is complex-to-complex.
     */
-    OpenCLFFT3D(OpenCLContext& context, int xsize, int ysize, int zsize);
+    OpenCLFFT3D(OpenCLContext& context, int xsize, int ysize, int zsize, bool realToComplex=false);
    /**
     * Perform a Fourier transform.  The transform cannot be done in-place: the input and output
     * arrays must be different.  Also, the input array is used as workspace, so its contents
-     * are destroyed.
+     * are destroyed.  This also means that both arrays must be large enough to hold complex values,
+     * even when performing a real-to-complex transform.
     *
     * @param in       the data to transform, ordered such that in[x*ysize*zsize + y*zsize + z] contains element (x, y, z)
     * @param out      on exit, this contains the transformed data
@@ -75,17 +77,19 @@ public:
    void execFFT(OpenCLArray& in, OpenCLArray& out, bool forward = true);
    /**
     * Get the smallest legal size for a dimension of the grid (that is, a size with no prime
-     * factors other than 2, 3, and 5).
+     * factors other than 2, 3, 5, and 7).
     *
     * @param minimum   the minimum size the return value must be greater than or equal to
     */
    static int findLegalDimension(int minimum);
 private:
-    cl::Kernel createKernel(int xsize, int ysize, int zsize, int& threads);
+    cl::Kernel createKernel(int xsize, int ysize, int zsize, int& threads, int axis, bool forward);
    int xsize, ysize, zsize;
    int xthreads, ythreads, zthreads;
+    bool realToComplex;
    OpenCLContext& context;
    cl::Kernel xkernel, ykernel, zkernel;
+    cl::Kernel invxkernel, invykernel, invzkernel;
 };

 } // namespace OpenMM

--- a/platforms/opencl/src/OpenCLFFT3D.cpp
+++ b/platforms/opencl/src/OpenCLFFT3D.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009-2012 Stanford University and the Authors.      *
+ * Portions copyright (c) 2009-2015 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -35,25 +35,29 @@
 using namespace OpenMM;
 using namespace std;

-OpenCLFFT3D::OpenCLFFT3D(OpenCLContext& context, int xsize, int ysize, int zsize) : context(context), xsize(xsize), ysize(ysize), zsize(zsize) {
-    zkernel = createKernel(xsize, ysize, zsize, zthreads);
-    xkernel = createKernel(ysize, zsize, xsize, xthreads);
-    ykernel = createKernel(zsize, xsize, ysize, ythreads);
+OpenCLFFT3D::OpenCLFFT3D(OpenCLContext& context, int xsize, int ysize, int zsize, bool realToComplex) :
+        context(context), xsize(xsize), ysize(ysize), zsize(zsize), realToComplex(realToComplex) {
+    zkernel = createKernel(xsize, ysize, zsize, zthreads, 0, true);
+    xkernel = createKernel(ysize, zsize, xsize, xthreads, 1, true);
+    ykernel = createKernel(zsize, xsize, ysize, ythreads, 2, true);
+    invzkernel = createKernel(xsize, ysize, zsize, zthreads, 0, false);
+    invxkernel = createKernel(ysize, zsize, xsize, xthreads, 1, false);
+    invykernel = createKernel(zsize, xsize, ysize, ythreads, 2, false);
 }

 void OpenCLFFT3D::execFFT(OpenCLArray& in, OpenCLArray& out, bool forward) {
-    zkernel.setArg<cl::Buffer>(0, in.getDeviceBuffer());
-    zkernel.setArg<cl::Buffer>(1, out.getDeviceBuffer());
-    zkernel.setArg<cl_int>(2, forward ? 1 : -1);
-    context.executeKernel(zkernel, xsize*ysize*zsize, zthreads);
-    xkernel.setArg<cl::Buffer>(0, out.getDeviceBuffer());
-    xkernel.setArg<cl::Buffer>(1, in.getDeviceBuffer());
-    xkernel.setArg<cl_int>(2, forward ? 1 : -1);
-    context.executeKernel(xkernel, xsize*ysize*zsize, xthreads);
-    ykernel.setArg<cl::Buffer>(0, in.getDeviceBuffer());
-    ykernel.setArg<cl::Buffer>(1, out.getDeviceBuffer());
-    ykernel.setArg<cl_int>(2, forward ? 1 : -1);
-    context.executeKernel(ykernel, xsize*ysize*zsize, ythreads);
+    cl::Kernel kernel1 = (forward ? zkernel : invzkernel);
+    cl::Kernel kernel2 = (forward ? xkernel : invxkernel);
+    cl::Kernel kernel3 = (forward ? ykernel : invykernel);
+    kernel1.setArg<cl::Buffer>(0, in.getDeviceBuffer());
+    kernel1.setArg<cl::Buffer>(1, out.getDeviceBuffer());
+    context.executeKernel(kernel1, xsize*ysize*zsize, zthreads);
+    kernel2.setArg<cl::Buffer>(0, out.getDeviceBuffer());
+    kernel2.setArg<cl::Buffer>(1, in.getDeviceBuffer());
+    context.executeKernel(kernel2, xsize*ysize*zsize, xthreads);
+    kernel3.setArg<cl::Buffer>(0, in.getDeviceBuffer());
+    kernel3.setArg<cl::Buffer>(1, out.getDeviceBuffer());
+    context.executeKernel(kernel3, xsize*ysize*zsize, ythreads);
 }

 int OpenCLFFT3D::findLegalDimension(int minimum) {
@@ -73,7 +77,7 @@ int OpenCLFFT3D::findLegalDimension(int minimum) {
    }
 }

-cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threads) {
+cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threads, int axis, bool forward) {
    int maxThreads = std::min(256, (int) context.getDevice().getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>());
    bool isCPU = context.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU;
    while (true) {
@@ -137,10 +141,10 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
                source<<"real2 b2 = "<<context.doubleToString((2*cos(2*M_PI/7)-cos(4*M_PI/7)-cos(6*M_PI/7))/3)<<"*(d0-d4);\n";
                source<<"real2 b3 = "<<context.doubleToString((cos(2*M_PI/7)-2*cos(4*M_PI/7)+cos(6*M_PI/7))/3)<<"*(d4-d2);\n";
                source<<"real2 b4 = "<<context.doubleToString((cos(2*M_PI/7)+cos(4*M_PI/7)-2*cos(6*M_PI/7))/3)<<"*(d2-d0);\n";
-                source<<"real2 b5 = -sign*"<<context.doubleToString((sin(2*M_PI/7)+sin(4*M_PI/7)-sin(6*M_PI/7))/3)<<"*(d7+d1);\n";
-                source<<"real2 b6 = -sign*"<<context.doubleToString((2*sin(2*M_PI/7)-sin(4*M_PI/7)+sin(6*M_PI/7))/3)<<"*(d1-d5);\n";
-                source<<"real2 b7 = -sign*"<<context.doubleToString((sin(2*M_PI/7)-2*sin(4*M_PI/7)-sin(6*M_PI/7))/3)<<"*(d5-d3);\n";
-                source<<"real2 b8 = -sign*"<<context.doubleToString((sin(2*M_PI/7)+sin(4*M_PI/7)+2*sin(6*M_PI/7))/3)<<"*(d3-d1);\n";
+                source<<"real2 b5 = -(SIGN)*"<<context.doubleToString((sin(2*M_PI/7)+sin(4*M_PI/7)-sin(6*M_PI/7))/3)<<"*(d7+d1);\n";
+                source<<"real2 b6 = -(SIGN)*"<<context.doubleToString((2*sin(2*M_PI/7)-sin(4*M_PI/7)+sin(6*M_PI/7))/3)<<"*(d1-d5);\n";
+                source<<"real2 b7 = -(SIGN)*"<<context.doubleToString((sin(2*M_PI/7)-2*sin(4*M_PI/7)-sin(6*M_PI/7))/3)<<"*(d5-d3);\n";
+                source<<"real2 b8 = -(SIGN)*"<<context.doubleToString((sin(2*M_PI/7)+sin(4*M_PI/7)+2*sin(6*M_PI/7))/3)<<"*(d3-d1);\n";
                source<<"real2 t0 = b0+b1;\n";
                source<<"real2 t1 = b2+b3;\n";
                source<<"real2 t2 = b4-b3;\n";
@@ -178,8 +182,8 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
                source<<"real2 d7 = d6+d5;\n";
                source<<"real2 d8 = d6-d5;\n";
                string coeff = context.doubleToString(sin(0.2*M_PI)/sin(0.4*M_PI));
-                source<<"real2 d9 = sign*(real2) (d2.y+"<<coeff<<"*d3.y, -d2.x-"<<coeff<<"*d3.x);\n";
-                source<<"real2 d10 = sign*(real2) ("<<coeff<<"*d2.y-d3.y, d3.x-"<<coeff<<"*d2.x);\n";
+                source<<"real2 d9 = (SIGN)*(real2) (d2.y+"<<coeff<<"*d3.y, -d2.x-"<<coeff<<"*d3.x);\n";
+                source<<"real2 d10 = (SIGN)*(real2) ("<<coeff<<"*d2.y-d3.y, d3.x-"<<coeff<<"*d2.x);\n";
                source<<"data"<<output<<"[base+4*j*"<<m<<"] = c0+d4;\n";
                source<<"data"<<output<<"[base+(4*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(5*L)<<"], d7+d9);\n";
                source<<"data"<<output<<"[base+(4*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(5*L)<<"], d8+d10);\n";
@@ -194,7 +198,7 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
                source<<"real2 d0 = c0+c2;\n";
                source<<"real2 d1 = c0-c2;\n";
                source<<"real2 d2 = c1+c3;\n";
-                source<<"real2 d3 = sign*(real2) (c1.y-c3.y, c3.x-c1.x);\n";
+                source<<"real2 d3 = (SIGN)*(real2) (c1.y-c3.y, c3.x-c1.x);\n";
                source<<"data"<<output<<"[base+3*j*"<<m<<"] = d0+d2;\n";
                source<<"data"<<output<<"[base+(3*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(4*L)<<"], d1+d3);\n";
                source<<"data"<<output<<"[base+(3*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(4*L)<<"], d0-d2);\n";
@@ -206,7 +210,7 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
                source<<"real2 c2 = data"<<input<<"[base+"<<(2*L*m)<<"];\n";
                source<<"real2 d0 = c1+c2;\n";
                source<<"real2 d1 = c0-0.5f*d0;\n";
-                source<<"real2 d2 = sign*"<<context.doubleToString(sin(M_PI/3.0))<<"*(real2) (c1.y-c2.y, c2.x-c1.x);\n";
+                source<<"real2 d2 = (SIGN)*"<<context.doubleToString(sin(M_PI/3.0))<<"*(real2) (c1.y-c2.y, c2.x-c1.x);\n";
                source<<"data"<<output<<"[base+2*j*"<<m<<"] = c0+d0;\n";
                source<<"data"<<output<<"[base+(2*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(3*L)<<"], d1+d2);\n";
                source<<"data"<<output<<"[base+(2*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(3*L)<<"], d1-d2);\n";
@@ -226,13 +230,15 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa

        // Create the kernel.

+        bool outputIsReal = (realToComplex && axis == 2 && !forward);
+        string outputSuffix = (outputIsReal ? ".x" : "");
        if (loopRequired) {
            source<<"for (int z = get_local_id(0); z < ZSIZE; z += get_local_size(0))\n";
-            source<<"out[y*(ZSIZE*XSIZE)+z*XSIZE+x] = data"<<(stage%2)<<"[z];\n";
+            source<<"out[y*(ZSIZE*XSIZE)+z*XSIZE+x] = data"<<(stage%2)<<"[z]"<<outputSuffix<<";\n";
        }
        else {
            source<<"if (index < XSIZE*YSIZE)\n";
-            source<<"out[y*(ZSIZE*XSIZE)+(get_local_id(0)%ZSIZE)*XSIZE+x] = data"<<(stage%2)<<"[get_local_id(0)];\n";
+            source<<"out[y*(ZSIZE*XSIZE)+(get_local_id(0)%ZSIZE)*XSIZE+x] = data"<<(stage%2)<<"[get_local_id(0)]"<<outputSuffix<<";\n";
        }
        map<string, string> replacements;
        replacements["XSIZE"] = context.intToString(xsize);
@@ -242,6 +248,10 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
        replacements["M_PI"] = context.doubleToString(M_PI);
        replacements["COMPUTE_FFT"] = source.str();
        replacements["LOOP_REQUIRED"] = (loopRequired ? "1" : "0");
+        replacements["SIGN"] = (forward ? "1" : "-1");
+        replacements["INPUT_TYPE"] = (realToComplex && axis == 0 && forward ? "real" : "real2");
+        replacements["OUTPUT_TYPE"] = (outputIsReal ? "real" : "real2");
+        replacements["INPUT_IS_REAL"] = (realToComplex && axis == 0 && forward ? "1" : "0");
        cl::Program program = context.createProgram(context.replaceStrings(OpenCLKernelSources::fft, replacements));
        cl::Kernel kernel(program, "execFFT");
        threads = (isCPU ? 1 : blocksPerGroup*zsize);
@@ -253,9 +263,9 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
            continue;
        }
        int bufferSize = blocksPerGroup*zsize*(context.getUseDoublePrecision() ? sizeof(mm_double2) : sizeof(mm_float2));
+        kernel.setArg(2, bufferSize, NULL);
        kernel.setArg(3, bufferSize, NULL);
        kernel.setArg(4, bufferSize, NULL);
-        kernel.setArg(5, bufferSize, NULL);
        return kernel;
    }
 }
--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
@@ -1652,8 +1652,11 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb

                int elementSize = (cl.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
                pmeGrid = new OpenCLArray(cl, gridSizeX*gridSizeY*gridSizeZ, 2*elementSize, "pmeGrid");
-                cl.addAutoclearBuffer(*pmeGrid);
                pmeGrid2 = new OpenCLArray(cl, gridSizeX*gridSizeY*gridSizeZ, 2*elementSize, "pmeGrid2");
+                if (cl.getSupports64BitGlobalAtomics())
+                    cl.addAutoclearBuffer(*pmeGrid2);
+                else
+                    cl.addAutoclearBuffer(*pmeGrid);
                pmeBsplineModuliX = new OpenCLArray(cl, gridSizeX, elementSize, "pmeBsplineModuliX");
                pmeBsplineModuliY = new OpenCLArray(cl, gridSizeY, elementSize, "pmeBsplineModuliY");
                pmeBsplineModuliZ = new OpenCLArray(cl, gridSizeZ, elementSize, "pmeBsplineModuliZ");
@@ -1661,9 +1664,12 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
                pmeAtomRange = OpenCLArray::create<cl_int>(cl, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange");
                pmeAtomGridIndex = OpenCLArray::create<mm_int2>(cl, numParticles, "pmeAtomGridIndex");
                sort = new OpenCLSort(cl, new SortTrait(), cl.getNumAtoms());
-                fft = new OpenCLFFT3D(cl, gridSizeX, gridSizeY, gridSizeZ);
+                fft = new OpenCLFFT3D(cl, gridSizeX, gridSizeY, gridSizeZ, true);
                string vendor = cl.getDevice().getInfo<CL_DEVICE_VENDOR>();
-                usePmeQueue = (vendor.size() >= 6 && vendor.substr(0, 6) == "NVIDIA");
+                bool isNvidia = (vendor.size() >= 6 && vendor.substr(0, 6) == "NVIDIA");
+                if (isNvidia)
+                    pmeDefines["USE_ALTERNATE_MEMORY_ACCESS_PATTERN"] = "1";
+                usePmeQueue = isNvidia;
                if (usePmeQueue) {
                    pmeQueue = cl::CommandQueue(cl.getContext(), cl.getDevice());
                    int recipForceGroup = force.getReciprocalSpaceForceGroup();
@@ -1814,7 +1820,10 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
            pmeSpreadChargeKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
            pmeSpreadChargeKernel.setArg<cl::Buffer>(1, pmeAtomGridIndex->getDeviceBuffer());
            pmeSpreadChargeKernel.setArg<cl::Buffer>(2, pmeAtomRange->getDeviceBuffer());
-            pmeSpreadChargeKernel.setArg<cl::Buffer>(3, pmeGrid->getDeviceBuffer());
+            if (cl.getSupports64BitGlobalAtomics())
+                pmeSpreadChargeKernel.setArg<cl::Buffer>(3, pmeGrid2->getDeviceBuffer());
+            else
+                pmeSpreadChargeKernel.setArg<cl::Buffer>(3, pmeGrid->getDeviceBuffer());
            pmeSpreadChargeKernel.setArg<cl::Buffer>(4, pmeBsplineTheta->getDeviceBuffer());
            pmeConvolutionKernel.setArg<cl::Buffer>(0, pmeGrid2->getDeviceBuffer());
            pmeConvolutionKernel.setArg<cl::Buffer>(1, cl.getEnergyBuffer().getDeviceBuffer());
@@ -1827,7 +1836,8 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
            pmeInterpolateForceKernel.setArg<cl::Buffer>(7, pmeAtomGridIndex->getDeviceBuffer());
            if (cl.getSupports64BitGlobalAtomics()) {
                pmeFinishSpreadChargeKernel = cl::Kernel(program, "finishSpreadCharge");
-                pmeFinishSpreadChargeKernel.setArg<cl::Buffer>(0, pmeGrid->getDeviceBuffer());
+                pmeFinishSpreadChargeKernel.setArg<cl::Buffer>(0, pmeGrid2->getDeviceBuffer());
+                pmeFinishSpreadChargeKernel.setArg<cl::Buffer>(1, pmeGrid->getDeviceBuffer());
            }
       }
    }

--- a/platforms/opencl/src/kernels/fft.cl
+++ b/platforms/opencl/src/kernels/fft.cl
@@ -6,10 +6,10 @@ real2 multiplyComplex(real2 c1, real2 c2) {
 * Perform a 1D FFT on each row along one axis.
 */

-__kernel void execFFT(__global const real2* restrict in, __global real2* restrict out, int sign, __local real2* restrict w,
+__kernel void execFFT(__global const INPUT_TYPE* restrict in, __global OUTPUT_TYPE* restrict out, __local real2* restrict w,
        __local real2* restrict data0, __local real2* restrict data1) {
    for (int i = get_local_id(0); i < ZSIZE; i += get_local_size(0))
-        w[i] = (real2) (cos(-sign*i*2*M_PI/ZSIZE), sin(-sign*i*2*M_PI/ZSIZE));
+        w[i] = (real2) (cos(-(SIGN)*i*2*M_PI/ZSIZE), sin(-(SIGN)*i*2*M_PI/ZSIZE));
    barrier(CLK_LOCAL_MEM_FENCE);
    
    for (int baseIndex = get_group_id(0)*BLOCKS_PER_GROUP; baseIndex < XSIZE*YSIZE; baseIndex += get_num_groups(0)*BLOCKS_PER_GROUP) {
@@ -18,10 +18,18 @@ __kernel void execFFT(__global const real2* restrict in, __global real2* restric
        int y = index-x*YSIZE;
 #if LOOP_REQUIRED
        for (int z = get_local_id(0); z < ZSIZE; z += get_local_size(0))
+    #if INPUT_IS_REAL
+            data0[z] = (real2) (in[x*(YSIZE*ZSIZE)+y*ZSIZE+z], 0);
+    #else
            data0[z] = in[x*(YSIZE*ZSIZE)+y*ZSIZE+z];
+    #endif
 #else
        if (index < XSIZE*YSIZE)
+    #if INPUT_IS_REAL
+            data0[get_local_id(0)] = (real2) (in[x*(YSIZE*ZSIZE)+y*ZSIZE+get_local_id(0)%ZSIZE], 0);
+    #else
            data0[get_local_id(0)] = in[x*(YSIZE*ZSIZE)+y*ZSIZE+get_local_id(0)%ZSIZE];
+    #endif
 #endif
        barrier(CLK_LOCAL_MEM_FENCE);
        COMPUTE_FFT

--- a/platforms/opencl/src/kernels/pme.cl
+++ b/platforms/opencl/src/kernels/pme.cl
@@ -138,35 +138,34 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
                    zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
                    int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex;
                    real add = pos.w*data[ix].x*data[iy].y*data[iz].z;
-#ifdef USE_DOUBLE_PRECISION
-                    atom_add(&pmeGrid[2*index], (long) (add*0x100000000));
+#ifdef USE_ALTERNATE_MEMORY_ACCESS_PATTERN
+                    // On Nvidia devices (at least Maxwell anyway), this split ordering produces much higher performance.  Why?
+                    // I have no idea!  And of course on AMD it produces slower performance.  GPUs are not meant to be understood.
+                    atom_add(&pmeGrid[index%2 == 0 ? index/2 : (index+GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z)/2], (long) (add*0x100000000));
 #else
                    atom_add(&pmeGrid[index], (long) (add*0x100000000));
 #endif
-
                }
            }
        }
    }
 }

-__kernel void finishSpreadCharge(__global long* restrict pmeGrid) {
-    __global real2* realGrid = (__global real2*) pmeGrid;
+__kernel void finishSpreadCharge(__global long* restrict fixedGrid, __global real* restrict realGrid) {
    const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
    real scale = EPSILON_FACTOR/(real) 0x100000000;
    for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
-#ifdef USE_DOUBLE_PRECISION
-        long value = pmeGrid[2*index];
+#ifdef USE_ALTERNATE_MEMORY_ACCESS_PATTERN
+        long value = fixedGrid[index%2 == 0 ? index/2 : (index+gridSize)/2];
 #else
-        long value = pmeGrid[index];
+        long value = fixedGrid[index];
 #endif
-        real2 realValue = (real2) ((real) (value*scale), 0);
-        realGrid[index] = realValue;
+        realGrid[index] = (real) (value*scale);
    }
 }
 #elif defined(DEVICE_IS_CPU)
 __kernel void gridSpreadCharge(__global const real4* restrict posq, __global const int2* restrict pmeAtomGridIndex, __global const int* restrict pmeAtomRange,
-        __global real2* restrict pmeGrid, __global const real4* restrict pmeBsplineTheta, real4 periodicBoxSize, real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ) {
+        __global real* restrict pmeGrid, __global const real4* restrict pmeBsplineTheta, real4 periodicBoxSize, real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ) {
    const int firstx = get_global_id(0)*GRID_SIZE_X/get_global_size(0);
    const int lastx = (get_global_id(0)+1)*GRID_SIZE_X/get_global_size(0);
    if (firstx == lastx)
@@ -230,7 +229,7 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
                    int zindex = gridIndex.z+iz;
                    zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
                    int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex;
-                    pmeGrid[index].x += EPSILON_FACTOR*pos.w*data[ix].x*data[iy].y*data[iz].z;
+                    pmeGrid[index] += EPSILON_FACTOR*pos.w*data[ix].x*data[iy].y*data[iz].z;
                }
            }
        }
@@ -238,7 +237,7 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
 }
 #else
 __kernel void gridSpreadCharge(__global const real4* restrict posq, __global const int2* restrict pmeAtomGridIndex, __global const int* restrict pmeAtomRange,
-        __global real2* restrict pmeGrid, __global const real4* restrict pmeBsplineTheta) {
+        __global real* restrict pmeGrid, __global const real4* restrict pmeBsplineTheta) {
    unsigned int numGridPoints = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
    for (int gridIndex = get_global_id(0); gridIndex < numGridPoints; gridIndex += get_global_size(0)) {
        // Compute the charge on a grid point.
@@ -290,7 +289,7 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
                }
            }
        }
-        pmeGrid[gridIndex] = (real2) (result*EPSILON_FACTOR, 0);
+        pmeGrid[gridIndex] = result*EPSILON_FACTOR;
    }
 }
 #endif
@@ -325,7 +324,7 @@ __kernel void reciprocalConvolution(__global real2* restrict pmeGrid, __global r
    energyBuffer[get_global_id(0)] += 0.5f*energy;
 }

-__kernel void gridInterpolateForce(__global const real4* restrict posq, __global real4* restrict forceBuffers, __global const real2* restrict pmeGrid,
+__kernel void gridInterpolateForce(__global const real4* restrict posq, __global real4* restrict forceBuffers, __global const real* restrict pmeGrid,
        real4 periodicBoxSize, real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ, __global int2* restrict pmeAtomGridIndex) {
    const real4 scale = 1/(real) (PME_ORDER-1);
    real4 data[PME_ORDER];
@@ -385,7 +384,7 @@ __kernel void gridInterpolateForce(__global const real4* restrict posq, __global
                    int zindex = gridIndex.z+iz;
                    zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
                    int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex;
-                    real gridvalue = pmeGrid[index].x;
+                    real gridvalue = pmeGrid[index];
                    force.x += ddata[ix].x*data[iy].y*data[iz].z*gridvalue;
                    force.y += data[ix].x*ddata[iy].y*data[iz].z*gridvalue;
                    force.z += data[ix].x*data[iy].y*ddata[iz].z*gridvalue;

--- a/platforms/opencl/tests/TestOpenCLFFT.cpp
+++ b/platforms/opencl/tests/TestOpenCLFFT.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2011 Stanford University and the Authors.           *
+ * Portions copyright (c) 2011-2015 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -51,7 +51,7 @@ using namespace std;
 static OpenCLPlatform platform;

 template <class Real2>
-void testTransform() {
+void testTransform(bool realToComplex) {
    System system;
    system.addParticle(0.0);
    OpenCLPlatform::PlatformData platformData(system, "", "", platform.getPropertyDefaultValue("OpenCLPrecision"), "false");
@@ -67,10 +67,16 @@ void testTransform() {
        original[i] = value;
        reference[i] = t_complex(value.x, value.y);
    }
+    for (int i = 0; i < (int) reference.size(); i++) {
+        if (realToComplex)
+            reference[i] = t_complex(i%2 == 0 ? original[i/2].x : original[i/2].y, 0);
+        else
+            reference[i] = t_complex(original[i].x, original[i].y);
+    }
    OpenCLArray grid1(context, original.size(), sizeof(Real2), "grid1");
    OpenCLArray grid2(context, original.size(), sizeof(Real2), "grid2");
    grid1.upload(original);
-    OpenCLFFT3D fft(context, xsize, ysize, zsize);
+    OpenCLFFT3D fft(context, xsize, ysize, zsize, realToComplex);

    // Perform a forward FFT, then verify the result is correct.

@@ -91,7 +97,8 @@ void testTransform() {
    fft.execFFT(grid2, grid1, false);
    grid1.download(result);
    double scale = 1.0/(xsize*ysize*zsize);
-    for (int i = 0; i < (int) result.size(); ++i) {
+    int valuesToCheck = (realToComplex ? original.size()/2 : original.size());
+    for (int i = 0; i < valuesToCheck; ++i) {
        ASSERT_EQUAL_TOL(original[i].x, scale*result[i].x, 1e-4);
        ASSERT_EQUAL_TOL(original[i].y, scale*result[i].y, 1e-4);
    }
@@ -101,10 +108,14 @@ int main(int argc, char* argv[]) {
    try {
        if (argc > 1)
            platform.setPropertyDefaultValue("OpenCLPrecision", string(argv[1]));
-        if (platform.getPropertyDefaultValue("OpenCLPrecision") == "double")
-            testTransform<mm_double2>();
-        else
-            testTransform<mm_float2>();
+        if (platform.getPropertyDefaultValue("OpenCLPrecision") == "double") {
+            testTransform<mm_double2>(false);
+            testTransform<mm_double2>(true);
+        }
+        else {
+            testTransform<mm_float2>(false);
+            testTransform<mm_float2>(true);
+        }
    }
    catch(const exception& e) {
        cout << "exception: " << e.what() << endl;