Optimization for Nvidia

99ed3ba7 · Peter Eastman · 4959bb9c · 99ed3ba7 · 99ed3ba7
Commit 99ed3ba7 authored Apr 03, 2015 by Peter Eastman
Hide whitespace changes
Inline Side-by-side

Showing with 14 additions and 1 deletion

platforms/opencl/src/OpenCLKernels.cpp platforms/opencl/src/OpenCLKernels.cpp +4 -1

platforms/opencl/src/kernels/pme.cl platforms/opencl/src/kernels/pme.cl +10 -0

No files found.
--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
@@ -1666,7 +1666,10 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
                sort = new OpenCLSort(cl, new SortTrait(), cl.getNumAtoms());
                fft = new OpenCLFFT3D(cl, gridSizeX, gridSizeY, gridSizeZ, true);
                string vendor = cl.getDevice().getInfo<CL_DEVICE_VENDOR>();
-                usePmeQueue = (vendor.size() >= 6 && vendor.substr(0, 6) == "NVIDIA");
+                bool isNvidia = (vendor.size() >= 6 && vendor.substr(0, 6) == "NVIDIA");
+                if (isNvidia)
+                    pmeDefines["USE_ALTERNATE_MEMORY_ACCESS_PATTERN"] = "1";
+                usePmeQueue = isNvidia;
                if (usePmeQueue) {
                    pmeQueue = cl::CommandQueue(cl.getContext(), cl.getDevice());
                    int recipForceGroup = force.getReciprocalSpaceForceGroup();

--- a/platforms/opencl/src/kernels/pme.cl
+++ b/platforms/opencl/src/kernels/pme.cl
@@ -138,7 +138,13 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
                    zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
                    int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex;
                    real add = pos.w*data[ix].x*data[iy].y*data[iz].z;
+#ifdef USE_ALTERNATE_MEMORY_ACCESS_PATTERN
+                    // On Nvidia devices (at least Maxwell anyway), this split ordering produces much higher performance.  Why?
+                    // I have no idea!  And of course on AMD it produces slower performance.  GPUs are not meant to be understood.
+                    atom_add(&pmeGrid[index%2 == 0 ? index/2 : (index+GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z)/2], (long) (add*0x100000000));
+#else
                    atom_add(&pmeGrid[index], (long) (add*0x100000000));
+#endif
                }
            }
        }
@@ -149,7 +155,11 @@ __kernel void finishSpreadCharge(__global long* restrict fixedGrid, __global rea
    const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
    real scale = EPSILON_FACTOR/(real) 0x100000000;
    for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
+#ifdef USE_ALTERNATE_MEMORY_ACCESS_PATTERN
+        long value = fixedGrid[index%2 == 0 ? index/2 : (index+gridSize)/2];
+#else
        long value = fixedGrid[index];
+#endif
        realGrid[index] = (real) (value*scale);
    }
 }