Merge pull request #874 from peastman/packedfft

OpenCL FFT stores only non-redundant elements

Merge pull request #874 from peastman/packedfft
OpenCL FFT stores only non-redundant elements
f0ef7b5b · peastman · 2db04610 · 63722d0a · f0ef7b5b · f0ef7b5b
Commit f0ef7b5b authored Apr 15, 2015 by peastman
8 changed files
--- a/platforms/opencl/include/OpenCLFFT3D.h
+++ b/platforms/opencl/include/OpenCLFFT3D.h
@@ -69,6 +69,9 @@ public:
     * arrays must be different.  Also, the input array is used as workspace, so its contents
     * are destroyed.  This also means that both arrays must be large enough to hold complex values,
     * even when performing a real-to-complex transform.
+     * <p>
+     * When performing a real-to-complex transform, the output data is of size xsize*ysize*(zsize/2+1)
+     * and contains only the non-redundant elements.
     *
     * @param in       the data to transform, ordered such that in[x*ysize*zsize + y*zsize + z] contains element (x, y, z)
     * @param out      on exit, this contains the transformed data

--- a/platforms/opencl/include/OpenCLKernels.h
+++ b/platforms/opencl/include/OpenCLKernels.h
@@ -639,6 +639,7 @@ private:
    cl::Kernel pmeSpreadChargeKernel;
    cl::Kernel pmeFinishSpreadChargeKernel;
    cl::Kernel pmeConvolutionKernel;
+    cl::Kernel pmeEvalEnergyKernel;
    cl::Kernel pmeInterpolateForceKernel;
    std::map<std::string, std::string> pmeDefines;
    std::vector<std::pair<int, int> > exceptionAtoms;

--- a/platforms/opencl/src/OpenCLFFT3D.cpp
+++ b/platforms/opencl/src/OpenCLFFT3D.cpp
@@ -310,14 +310,26 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
        // Create the kernel.

        bool outputIsReal = (inputIsReal && axis == 2 && !forward);
+        bool outputIsPacked = (inputIsReal && axis == 2 && forward);
        string outputSuffix = (outputIsReal ? ".x" : "");
        if (loopRequired) {
+            if (outputIsPacked)
+                source<<"if (x < XSIZE/2+1)\n";
            source<<"for (int z = get_local_id(0); z < ZSIZE; z += get_local_size(0))\n";
-            source<<"out[y*(ZSIZE*XSIZE)+z*XSIZE+x] = data"<<(stage%2)<<"[z]"<<outputSuffix<<";\n";
+            if (outputIsPacked)
+                source<<"out[y*(ZSIZE*(XSIZE/2+1))+z*(XSIZE/2+1)+x] = data"<<(stage%2)<<"[z]"<<outputSuffix<<";\n";
+            else
+                source<<"out[y*(ZSIZE*XSIZE)+z*XSIZE+x] = data"<<(stage%2)<<"[z]"<<outputSuffix<<";\n";
        }
        else {
-            source<<"if (index < XSIZE*YSIZE)\n";
-            source<<"out[y*(ZSIZE*XSIZE)+(get_local_id(0)%ZSIZE)*XSIZE+x] = data"<<(stage%2)<<"[get_local_id(0)]"<<outputSuffix<<";\n";
+            if (outputIsPacked) {
+                source<<"if (index < XSIZE*YSIZE && x < XSIZE/2+1)\n";
+                source<<"out[y*(ZSIZE*(XSIZE/2+1))+(get_local_id(0)%ZSIZE)*(XSIZE/2+1)+x] = data"<<(stage%2)<<"[get_local_id(0)]"<<outputSuffix<<";\n";
+            }
+            else {
+                source<<"if (index < XSIZE*YSIZE)\n";
+                source<<"out[y*(ZSIZE*XSIZE)+(get_local_id(0)%ZSIZE)*XSIZE+x] = data"<<(stage%2)<<"[get_local_id(0)]"<<outputSuffix<<";\n";
+            }
        }
        map<string, string> replacements;
        replacements["XSIZE"] = context.intToString(xsize);
@@ -331,6 +343,8 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
        replacements["INPUT_TYPE"] = (inputIsReal && axis == 0 && forward ? "real" : "real2");
        replacements["OUTPUT_TYPE"] = (outputIsReal ? "real" : "real2");
        replacements["INPUT_IS_REAL"] = (inputIsReal && axis == 0 && forward ? "1" : "0");
+        replacements["INPUT_IS_PACKED"] = (inputIsReal && axis == 0 && !forward ? "1" : "0");
+        replacements["OUTPUT_IS_PACKED"] = (outputIsPacked ? "1" : "0");
        cl::Program program = context.createProgram(context.replaceStrings(OpenCLKernelSources::fft, replacements));
        cl::Kernel kernel(program, "execFFT");
        threads = (isCPU ? 1 : blocksPerGroup*zsize);

--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
@@ -1806,6 +1806,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
            pmeZIndexKernel = cl::Kernel(program, "recordZIndex");
            pmeSpreadChargeKernel = cl::Kernel(program, "gridSpreadCharge");
            pmeConvolutionKernel = cl::Kernel(program, "reciprocalConvolution");
+            pmeEvalEnergyKernel = cl::Kernel(program, "gridEvaluateEnergy");
            pmeInterpolateForceKernel = cl::Kernel(program, "gridInterpolateForce");
            int elementSize = (cl.getUseDoublePrecision() ? sizeof(mm_double4) : sizeof(mm_float4));
            pmeUpdateBsplinesKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
@@ -1826,10 +1827,14 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
                pmeSpreadChargeKernel.setArg<cl::Buffer>(3, pmeGrid->getDeviceBuffer());
            pmeSpreadChargeKernel.setArg<cl::Buffer>(4, pmeBsplineTheta->getDeviceBuffer());
            pmeConvolutionKernel.setArg<cl::Buffer>(0, pmeGrid2->getDeviceBuffer());
-            pmeConvolutionKernel.setArg<cl::Buffer>(1, cl.getEnergyBuffer().getDeviceBuffer());
-            pmeConvolutionKernel.setArg<cl::Buffer>(2, pmeBsplineModuliX->getDeviceBuffer());
-            pmeConvolutionKernel.setArg<cl::Buffer>(3, pmeBsplineModuliY->getDeviceBuffer());
-            pmeConvolutionKernel.setArg<cl::Buffer>(4, pmeBsplineModuliZ->getDeviceBuffer());
+            pmeConvolutionKernel.setArg<cl::Buffer>(1, pmeBsplineModuliX->getDeviceBuffer());
+            pmeConvolutionKernel.setArg<cl::Buffer>(2, pmeBsplineModuliY->getDeviceBuffer());
+            pmeConvolutionKernel.setArg<cl::Buffer>(3, pmeBsplineModuliZ->getDeviceBuffer());
+            pmeEvalEnergyKernel.setArg<cl::Buffer>(0, pmeGrid2->getDeviceBuffer());
+            pmeEvalEnergyKernel.setArg<cl::Buffer>(1, cl.getEnergyBuffer().getDeviceBuffer());
+            pmeEvalEnergyKernel.setArg<cl::Buffer>(2, pmeBsplineModuliX->getDeviceBuffer());
+            pmeEvalEnergyKernel.setArg<cl::Buffer>(3, pmeBsplineModuliY->getDeviceBuffer());
+            pmeEvalEnergyKernel.setArg<cl::Buffer>(4, pmeBsplineModuliZ->getDeviceBuffer());
            pmeInterpolateForceKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
            pmeInterpolateForceKernel.setArg<cl::Buffer>(1, cl.getForceBuffers().getDeviceBuffer());
            pmeInterpolateForceKernel.setArg<cl::Buffer>(2, pmeGrid->getDeviceBuffer());
@@ -1861,7 +1866,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
        cl.executeKernel(ewaldForcesKernel, cl.getNumAtoms());
    }
    if (pmeGrid != NULL && includeReciprocal) {
-        if (usePmeQueue)
+        if (usePmeQueue && !includeEnergy)
            cl.setQueue(pmeQueue);
        
        // Invert the periodic box vectors.
@@ -1936,19 +1941,24 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
        }
        fft->execFFT(*pmeGrid, *pmeGrid2, true);
        mm_double4 boxSize = cl.getPeriodicBoxSizeDouble();
-        double scaleFactor = 1.0/(M_PI*boxSize.x*boxSize.y*boxSize.z);
        if (cl.getUseDoublePrecision()) {
-            pmeConvolutionKernel.setArg<mm_double4>(5, recipBoxVectors[0]);
-            pmeConvolutionKernel.setArg<mm_double4>(6, recipBoxVectors[1]);
-            pmeConvolutionKernel.setArg<mm_double4>(7, recipBoxVectors[2]);
-            pmeConvolutionKernel.setArg<cl_double>(8, scaleFactor);
+            pmeConvolutionKernel.setArg<mm_double4>(4, recipBoxVectors[0]);
+            pmeConvolutionKernel.setArg<mm_double4>(5, recipBoxVectors[1]);
+            pmeConvolutionKernel.setArg<mm_double4>(6, recipBoxVectors[2]);
+            pmeEvalEnergyKernel.setArg<mm_double4>(5, recipBoxVectors[0]);
+            pmeEvalEnergyKernel.setArg<mm_double4>(6, recipBoxVectors[1]);
+            pmeEvalEnergyKernel.setArg<mm_double4>(7, recipBoxVectors[2]);
        }
        else {
-            pmeConvolutionKernel.setArg<mm_float4>(5, recipBoxVectorsFloat[0]);
-            pmeConvolutionKernel.setArg<mm_float4>(6, recipBoxVectorsFloat[1]);
-            pmeConvolutionKernel.setArg<mm_float4>(7, recipBoxVectorsFloat[2]);
-            pmeConvolutionKernel.setArg<cl_float>(8, (float) scaleFactor);
-        }
+            pmeConvolutionKernel.setArg<mm_float4>(4, recipBoxVectorsFloat[0]);
+            pmeConvolutionKernel.setArg<mm_float4>(5, recipBoxVectorsFloat[1]);
+            pmeConvolutionKernel.setArg<mm_float4>(6, recipBoxVectorsFloat[2]);
+            pmeEvalEnergyKernel.setArg<mm_float4>(5, recipBoxVectorsFloat[0]);
+            pmeEvalEnergyKernel.setArg<mm_float4>(6, recipBoxVectorsFloat[1]);
+            pmeEvalEnergyKernel.setArg<mm_float4>(7, recipBoxVectorsFloat[2]);
+        }
+        if (includeEnergy)
+            cl.executeKernel(pmeEvalEnergyKernel, cl.getNumAtoms());
        cl.executeKernel(pmeConvolutionKernel, cl.getNumAtoms());
        fft->execFFT(*pmeGrid2, *pmeGrid, false);
        setPeriodicBoxSizeArg(cl, pmeInterpolateForceKernel, 3);

--- a/platforms/opencl/src/kernels/fft.cl
+++ b/platforms/opencl/src/kernels/fft.cl
@@ -2,6 +2,19 @@ real2 multiplyComplex(real2 c1, real2 c2) {
    return (real2) (c1.x*c2.x-c1.y*c2.y, c1.x*c2.y+c1.y*c2.x);
 }

+/**
+ * Load a value from the half-complex grid produces by a real-to-complex transform.
+ */
+real2 loadComplexValue(__global const real2* restrict in, int x, int y, int z) {
+    const int inputZSize = ZSIZE/2+1;
+    if (z < inputZSize)
+        return in[x*YSIZE*inputZSize+y*inputZSize+z];
+    int xp = (x == 0 ? 0 : XSIZE-x);
+    int yp = (y == 0 ? 0 : YSIZE-y);
+    real2 value = in[xp*YSIZE*inputZSize+yp*inputZSize+(ZSIZE-z)];
+    return (real2) (value.x, -value.y);
+}
+
 /**
 * Perform a 1D FFT on each row along one axis.
 */
@@ -16,10 +29,15 @@ __kernel void execFFT(__global const INPUT_TYPE* restrict in, __global OUTPUT_TY
        int index = baseIndex+get_local_id(0)/ZSIZE;
        int x = index/YSIZE;
        int y = index-x*YSIZE;
+#if OUTPUT_IS_PACKED
+        if (x < XSIZE/2+1) {
+#endif
 #if LOOP_REQUIRED
        for (int z = get_local_id(0); z < ZSIZE; z += get_local_size(0))
    #if INPUT_IS_REAL
            data0[z] = (real2) (in[x*(YSIZE*ZSIZE)+y*ZSIZE+z], 0);
+    #elif INPUT_IS_PACKED
+            data0[z] = loadComplexValue(in, x, y, z);
    #else
            data0[z] = in[x*(YSIZE*ZSIZE)+y*ZSIZE+z];
    #endif
@@ -27,11 +45,16 @@ __kernel void execFFT(__global const INPUT_TYPE* restrict in, __global OUTPUT_TY
        if (index < XSIZE*YSIZE)
    #if INPUT_IS_REAL
            data0[get_local_id(0)] = (real2) (in[x*(YSIZE*ZSIZE)+y*ZSIZE+get_local_id(0)%ZSIZE], 0);
+    #elif INPUT_IS_PACKED
+            data0[get_local_id(0)] = loadComplexValue(in, x, y, get_local_id(0)%ZSIZE);
    #else
            data0[get_local_id(0)] = in[x*(YSIZE*ZSIZE)+y*ZSIZE+get_local_id(0)%ZSIZE];
    #endif
 #endif
        barrier(CLK_LOCAL_MEM_FENCE);
+#if OUTPUT_IS_PACKED
+        }
+#endif
        COMPUTE_FFT
    }
 }
--- a/platforms/opencl/src/kernels/fftR2C.cl
+++ b/platforms/opencl/src/kernels/fftR2C.cl
@@ -40,6 +40,7 @@ __kernel void unpackForwardData(__global const real2* restrict in, __global real
    // Transform the data.
    
    const int gridSize = PACKED_XSIZE*PACKED_YSIZE*PACKED_ZSIZE;
+    const int outputZSize = ZSIZE/2+1;
    for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
        int x = index/(PACKED_YSIZE*PACKED_ZSIZE);
        int remainder = index-x*(PACKED_YSIZE*PACKED_ZSIZE);
@@ -58,25 +59,41 @@ __kernel void unpackForwardData(__global const real2* restrict in, __global real
        real2 wfac = w[z];
 #endif
        real2 output = (real2) ((z1.x+z2.x - wfac.x*(z1.x-z2.x) + wfac.y*(z1.y+z2.y))/2, (z1.y-z2.y - wfac.y*(z1.x-z2.x) - wfac.x*(z1.y+z2.y))/2);
-        out[x*YSIZE*ZSIZE+y*ZSIZE+z] = output;
+        if (z < outputZSize)
+            out[x*YSIZE*outputZSize+y*outputZSize+z] = output;
        xp = (x == 0 ? 0 : XSIZE-x);
        yp = (y == 0 ? 0 : YSIZE-y);
        zp = (z == 0 ? 0 : ZSIZE-z);
+        if (zp < outputZSize) {
 #if PACKED_AXIS == 0
-        if (x == 0)
-            out[PACKED_XSIZE*YSIZE*ZSIZE+yp*ZSIZE+zp] = (real2) ((z1.x-z1.y+z2.x-z2.y)/2, (-z1.x-z1.y+z2.x+z2.y)/2);
+            if (x == 0)
+                out[PACKED_XSIZE*YSIZE*outputZSize+yp*outputZSize+zp] = (real2) ((z1.x-z1.y+z2.x-z2.y)/2, (-z1.x-z1.y+z2.x+z2.y)/2);
 #elif PACKED_AXIS == 1
-        if (y == 0)
-            out[xp*YSIZE*ZSIZE+PACKED_YSIZE*ZSIZE+zp] = (real2) ((z1.x-z1.y+z2.x-z2.y)/2, (-z1.x-z1.y+z2.x+z2.y)/2);
+            if (y == 0)
+                out[xp*YSIZE*outputZSize+PACKED_YSIZE*outputZSize+zp] = (real2) ((z1.x-z1.y+z2.x-z2.y)/2, (-z1.x-z1.y+z2.x+z2.y)/2);
 #else
-        if (z == 0)
-            out[xp*YSIZE*ZSIZE+yp*ZSIZE+PACKED_ZSIZE] = (real2) ((z1.x-z1.y+z2.x-z2.y)/2, (-z1.x-z1.y+z2.x+z2.y)/2);
+            if (z == 0)
+                out[xp*YSIZE*outputZSize+yp*outputZSize+PACKED_ZSIZE] = (real2) ((z1.x-z1.y+z2.x-z2.y)/2, (-z1.x-z1.y+z2.x+z2.y)/2);
 #endif
-        else
-            out[xp*YSIZE*ZSIZE+yp*ZSIZE+zp] = (real2) (output.x, -output.y);
+            else
+                out[xp*YSIZE*outputZSize+yp*outputZSize+zp] = (real2) (output.x, -output.y);
+        }
    }
 }

+/**
+ * Load a value from the half-complex grid produced by a real-to-complex transform.
+ */
+real2 loadComplexValue(__global const real2* restrict in, int x, int y, int z) {
+    const int inputZSize = ZSIZE/2+1;
+    if (z < inputZSize)
+        return in[x*YSIZE*inputZSize+y*inputZSize+z];
+    int xp = (x == 0 ? 0 : XSIZE-x);
+    int yp = (y == 0 ? 0 : YSIZE-y);
+    real2 value = in[xp*YSIZE*inputZSize+yp*inputZSize+(ZSIZE-z)];
+    return (real2) (value.x, -value.y);
+}
+
 /**
 * Repack the symmetric complex grid into one half as large in preparation for doing an inverse complex-to-real transform.
 */
@@ -106,16 +123,16 @@ __kernel void packBackwardData(__global const real2* restrict in, __global real2
        int xp = (x == 0 ? 0 : PACKED_XSIZE-x);
        int yp = (y == 0 ? 0 : PACKED_YSIZE-y);
        int zp = (z == 0 ? 0 : PACKED_ZSIZE-z);
-        real2 z1 = in[x*YSIZE*ZSIZE+y*ZSIZE+z];
+        real2 z1 = loadComplexValue(in, x, y, z);
 #if PACKED_AXIS == 0
        real2 wfac = w[x];
-        real2 z2 = in[(PACKED_XSIZE-x)*YSIZE*ZSIZE+yp*ZSIZE+zp];
+        real2 z2 = loadComplexValue(in, PACKED_XSIZE-x, yp, zp);
 #elif PACKED_AXIS == 1
        real2 wfac = w[y];
-        real2 z2 = in[xp*YSIZE*ZSIZE+(PACKED_YSIZE-y)*ZSIZE+zp];
+        real2 z2 = loadComplexValue(in, xp, PACKED_YSIZE-y, zp);
 #else
        real2 wfac = w[z];
-        real2 z2 = in[xp*YSIZE*ZSIZE+yp*ZSIZE+(PACKED_ZSIZE-z)];
+        real2 z2 = loadComplexValue(in, xp, yp, PACKED_ZSIZE-z);
 #endif
        real2 even = (real2) ((z1.x+z2.x)/2, (z1.y-z2.y)/2);
        real2 odd = (real2) ((z1.x-z2.x)/2, (z1.y+z2.y)/2);

--- a/platforms/opencl/src/kernels/pme.cl
+++ b/platforms/opencl/src/kernels/pme.cl
@@ -294,17 +294,18 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
 }
 #endif

-__kernel void reciprocalConvolution(__global real2* restrict pmeGrid, __global real* restrict energyBuffer, __global const real* restrict pmeBsplineModuliX,
-        __global const real* restrict pmeBsplineModuliY, __global const real* restrict pmeBsplineModuliZ, real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ, real recipScaleFactor) {
-    const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
-    real energy = 0.0f;
+__kernel void reciprocalConvolution(__global real2* restrict pmeGrid, __global const real* restrict pmeBsplineModuliX,
+        __global const real* restrict pmeBsplineModuliY, __global const real* restrict pmeBsplineModuliZ, real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ) {
+    // R2C stores into a half complex matrix where the last dimension is cut by half
+    const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*(GRID_SIZE_Z/2+1);
+    const real recipScaleFactor = (1.0f/M_PI)*recipBoxVecX.x*recipBoxVecY.y*recipBoxVecZ.z;
+
    for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
-        int kx = index/(GRID_SIZE_Y*GRID_SIZE_Z);
-        int remainder = index-kx*GRID_SIZE_Y*GRID_SIZE_Z;
-        int ky = remainder/GRID_SIZE_Z;
-        int kz = remainder-ky*GRID_SIZE_Z;
-        if (kx == 0 && ky == 0 && kz == 0)
-            continue;
+        // real indices
+        int kx = index/(GRID_SIZE_Y*(GRID_SIZE_Z/2+1));
+        int remainder = index-kx*GRID_SIZE_Y*(GRID_SIZE_Z/2+1);
+        int ky = remainder/(GRID_SIZE_Z/2+1);
+        int kz = remainder-ky*(GRID_SIZE_Z/2+1);
        int mx = (kx < (GRID_SIZE_X+1)/2) ? kx : (kx-GRID_SIZE_X);
        int my = (ky < (GRID_SIZE_Y+1)/2) ? ky : (ky-GRID_SIZE_Y);
        int mz = (kz < (GRID_SIZE_Z+1)/2) ? kz : (kz-GRID_SIZE_Z);
@@ -318,8 +319,48 @@ __kernel void reciprocalConvolution(__global real2* restrict pmeGrid, __global r
        real m2 = mhx*mhx+mhy*mhy+mhz*mhz;
        real denom = m2*bx*by*bz;
        real eterm = recipScaleFactor*EXP(-RECIP_EXP_FACTOR*m2)/denom;
-        pmeGrid[index] = (real2) (grid.x*eterm, grid.y*eterm);
-        energy += eterm*(grid.x*grid.x + grid.y*grid.y);
+        if (kx != 0 || ky != 0 || kz != 0) {
+            pmeGrid[index] = (real2) (grid.x*eterm, grid.y*eterm);
+        }
+    }
+}
+
+__kernel void gridEvaluateEnergy(__global real2* restrict pmeGrid, __global real* restrict energyBuffer,
+                      __global const real* restrict pmeBsplineModuliX, __global const real* restrict pmeBsplineModuliY, __global const real* restrict pmeBsplineModuliZ,
+                      real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ) {
+    // R2C stores into a half complex matrix where the last dimension is cut by half
+    const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
+    const real recipScaleFactor = (1.0f/M_PI)*recipBoxVecX.x*recipBoxVecY.y*recipBoxVecZ.z;
+ 
+    real energy = 0;
+    for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
+        // real indices
+        int kx = index/(GRID_SIZE_Y*(GRID_SIZE_Z));
+        int remainder = index-kx*GRID_SIZE_Y*(GRID_SIZE_Z);
+        int ky = remainder/(GRID_SIZE_Z);
+        int kz = remainder-ky*(GRID_SIZE_Z);
+        int mx = (kx < (GRID_SIZE_X+1)/2) ? kx : (kx-GRID_SIZE_X);
+        int my = (ky < (GRID_SIZE_Y+1)/2) ? ky : (ky-GRID_SIZE_Y);
+        int mz = (kz < (GRID_SIZE_Z+1)/2) ? kz : (kz-GRID_SIZE_Z);
+        real mhx = mx*recipBoxVecX.x;
+        real mhy = mx*recipBoxVecY.x+my*recipBoxVecY.y;
+        real mhz = mx*recipBoxVecZ.x+my*recipBoxVecZ.y+mz*recipBoxVecZ.z;
+        real m2 = mhx*mhx+mhy*mhy+mhz*mhz;
+        real bx = pmeBsplineModuliX[kx];
+        real by = pmeBsplineModuliY[ky];
+        real bz = pmeBsplineModuliZ[kz];
+        real denom = m2*bx*by*bz;
+        real eterm = recipScaleFactor*EXP(-RECIP_EXP_FACTOR*m2)/denom;
+        if (kz >= (GRID_SIZE_Z/2+1)) {
+            kx = ((kx == 0) ? kx : GRID_SIZE_X-kx);
+            ky = ((ky == 0) ? ky : GRID_SIZE_Y-ky);
+            kz = GRID_SIZE_Z-kz;
+        } 
+        int indexInHalfComplexGrid = kz + ky*(GRID_SIZE_Z/2+1)+kx*(GRID_SIZE_Y*(GRID_SIZE_Z/2+1));
+        real2 grid = pmeGrid[indexInHalfComplexGrid];
+        if (kx != 0 || ky != 0 || kz != 0) {
+            energy += eterm*(grid.x*grid.x + grid.y*grid.y);
+        }
    }
    energyBuffer[get_global_id(0)] += 0.5f*energy;
 }

--- a/platforms/opencl/tests/TestOpenCLFFT.cpp
+++ b/platforms/opencl/tests/TestOpenCLFFT.cpp
@@ -85,10 +85,15 @@ void testTransform(bool realToComplex, int xsize, int ysize, int zsize) {
    fftpack_t plan;
    fftpack_init_3d(&plan, xsize, ysize, zsize);
    fftpack_exec_3d(plan, FFTPACK_FORWARD, &reference[0], &reference[0]);
-    for (int i = 0; i < (int) result.size(); ++i) {
-        ASSERT_EQUAL_TOL(reference[i].re, result[i].x, 1e-3);
-        ASSERT_EQUAL_TOL(reference[i].im, result[i].y, 1e-3);
-    }
+    int outputZSize = (realToComplex ? zsize/2+1 : zsize);
+    for (int x = 0; x < xsize; x++)
+        for (int y = 0; y < ysize; y++)
+            for (int z = 0; z < outputZSize; z++) {
+                int index1 = x*ysize*zsize + y*zsize + z;
+                int index2 = x*ysize*outputZSize + y*outputZSize + z;
+                ASSERT_EQUAL_TOL(reference[index1].re, result[index2].x, 1e-3);
+                ASSERT_EQUAL_TOL(reference[index1].im, result[index2].y, 1e-3);
+            }
    fftpack_destroy(plan);

    // Perform a backward transform and see if we get the original values.