Implemented faster version of charge spreading on GPUs that support 64 bit atomics

ec3ee586 · Peter Eastman · 548efdfb · ec3ee586 · ec3ee586 · ec3ee586
Commit ec3ee586 authored Jul 05, 2011 by Peter Eastman
5 changed files
--- a/platforms/opencl/src/OpenCLContext.cpp
+++ b/platforms/opencl/src/OpenCLContext.cpp
@@ -48,6 +48,9 @@ using namespace std;
 #ifndef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
  #define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000
 #endif
+#ifndef CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV
+  #define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001
+#endif

 const int OpenCLContext::ThreadBlockSize = 64;
 const int OpenCLContext::TileSize = 32;
@@ -90,10 +93,20 @@ OpenCLContext::OpenCLContext(int numParticles, int deviceIndex, OpenCLPlatform::
            throw OpenMMException("The specified OpenCL device is not compatible with OpenMM");
        compilationOptions = "-DWORK_GROUP_SIZE="+OpenCLExpressionUtilities::intToString(ThreadBlockSize);
        defaultOptimizationOptions = "-cl-fast-relaxed-math";
+        supports64BitGlobalAtomics = (device.getInfo<CL_DEVICE_EXTENSIONS>().find("cl_khr_int64_base_atomics") != string::npos);
        string vendor = device.getInfo<CL_DEVICE_VENDOR>();
        if (vendor.size() >= 6 && vendor.substr(0, 6) == "NVIDIA") {
            compilationOptions += " -DWARPS_ARE_ATOMIC";
            simdWidth = 32;
+
+            // Compute level 1.2 and later Nvidia GPUs support 64 bit atomics, even though they don't list the
+            // proper extension as supported.
+
+            cl_uint computeCapabilityMajor, computeCapabilityMinor;
+            clGetDeviceInfo(device(), CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, sizeof(cl_uint), &computeCapabilityMajor, NULL);
+            clGetDeviceInfo(device(), CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, sizeof(cl_uint), &computeCapabilityMinor, NULL);
+            if (computeCapabilityMajor > 1 || computeCapabilityMinor > 1)
+                supports64BitGlobalAtomics = true;
        }
        else if (vendor.size() >= 28 && vendor.substr(0, 28) == "Advanced Micro Devices, Inc.") {
            // AMD APP SDK 2.4 has a performance problem with atomics. Enable the work around.
@@ -104,6 +117,8 @@ OpenCLContext::OpenCLContext(int numParticles, int deviceIndex, OpenCLPlatform::
        }
        else
            simdWidth = 1;
+        if (supports64BitGlobalAtomics)
+            compilationOptions += " -DSUPPORTS_64_BIT_ATOMICS";
        queue = cl::CommandQueue(context, device);
        numAtoms = numParticles;
        paddedNumAtoms = TileSize*((numParticles+TileSize-1)/TileSize);

--- a/platforms/opencl/src/OpenCLContext.h
+++ b/platforms/opencl/src/OpenCLContext.h
@@ -399,6 +399,12 @@ public:
    int getSIMDWidth() const {
        return simdWidth;
    }
+    /**
+     * Get whether the device being used supports 64 bit atomic operations on global memory.
+     */
+    bool getSupports64BitGlobalAtomics() {
+        return supports64BitGlobalAtomics;
+    }
    /**
     * Get the size of the periodic box.
     */
@@ -458,6 +464,7 @@ private:
    int numThreadBlocks;
    int numForceBuffers;
    int simdWidth;
+    bool supports64BitGlobalAtomics;
    mm_float4 periodicBoxSize;
    mm_float4 invPeriodicBoxSize;
    std::string compilationOptions, defaultOptimizationOptions;

--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
@@ -1445,6 +1445,10 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
            pmeInterpolateForceKernel.setArg<cl::Buffer>(2, pmeBsplineTheta->getDeviceBuffer());
            pmeInterpolateForceKernel.setArg<cl::Buffer>(3, pmeBsplineDtheta->getDeviceBuffer());
            pmeInterpolateForceKernel.setArg<cl::Buffer>(4, pmeGrid->getDeviceBuffer());
+            if (cl.getSupports64BitGlobalAtomics()) {
+                pmeFinishSpreadChargeKernel = cl::Kernel(program, "finishSpreadCharge");
+                pmeFinishSpreadChargeKernel.setArg<cl::Buffer>(0, pmeGrid->getDeviceBuffer());
+            }
       }
    }
    if (exceptionIndices != NULL)
@@ -1476,7 +1480,15 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
            pmeAtomRangeKernel.setArg<mm_float4>(3, boxSize);
            pmeAtomRangeKernel.setArg<mm_float4>(4, invBoxSize);
            cl.executeKernel(pmeAtomRangeKernel, cl.getNumAtoms());
-            cl.executeKernel(pmeSpreadChargeKernel, cl.getNumAtoms());
+            if (cl.getSupports64BitGlobalAtomics()) {
+                cl.clearBuffer(pmeGrid->getDeviceBuffer(), pmeGrid->getSize()*2);
+                pmeSpreadChargeKernel.setArg<mm_float4>(5, boxSize);
+                pmeSpreadChargeKernel.setArg<mm_float4>(6, invBoxSize);
+                cl.executeKernel(pmeSpreadChargeKernel, cl.getNumAtoms(), PmeOrder*PmeOrder*PmeOrder);
+                cl.executeKernel(pmeFinishSpreadChargeKernel, pmeGrid->getSize());
+            }
+            else
+                cl.executeKernel(pmeSpreadChargeKernel, cl.getNumAtoms());
        }
        fft->execFFT(*pmeGrid, true);
        pmeConvolutionKernel.setArg<mm_float4>(5, invBoxSize);

--- a/platforms/opencl/src/OpenCLKernels.h
+++ b/platforms/opencl/src/OpenCLKernels.h
@@ -521,6 +521,7 @@ private:
    cl::Kernel pmeAtomRangeKernel;
    cl::Kernel pmeUpdateBsplinesKernel;
    cl::Kernel pmeSpreadChargeKernel;
+    cl::Kernel pmeFinishSpreadChargeKernel;
    cl::Kernel pmeConvolutionKernel;
    cl::Kernel pmeInterpolateForceKernel;
    std::map<std::string, std::string> pmeDefines;

--- a/platforms/opencl/src/kernels/pme.cl
+++ b/platforms/opencl/src/kernels/pme.cl
@@ -78,6 +78,44 @@ __kernel void findAtomRangeForGrid(__global int2* pmeAtomGridIndex, __global int
    }
 }

+#ifdef SUPPORTS_64_BIT_ATOMICS
+#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
+
+__kernel void gridSpreadCharge(__global float4* posq, __global int2* pmeAtomGridIndex, __global int* pmeAtomRange, __global long* pmeGrid, __global float4* pmeBsplineTheta, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
+    int ix = get_local_id(0)/(PME_ORDER*PME_ORDER);
+    int remainder = get_local_id(0)-ix*PME_ORDER*PME_ORDER;
+    int iy = remainder/PME_ORDER;
+    int iz = remainder-iy*PME_ORDER;
+    if (ix < PME_ORDER) {
+        for (int atomIndex = get_group_id(0); atomIndex < NUM_ATOMS; atomIndex += get_num_groups(0)) {
+            float4 pos = posq[atomIndex];
+            float atomCharge = pos.w;
+            float add = atomCharge*pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].x*pmeBsplineTheta[atomIndex+iy*NUM_ATOMS].y*pmeBsplineTheta[atomIndex+iz*NUM_ATOMS].z;
+            pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
+            pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
+            pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
+            int x = (int) ((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X)+ix;
+            int y = (int) ((pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y)+iy;
+            int z = (int) ((pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z)+iz;
+            x -= (x >= GRID_SIZE_X ? GRID_SIZE_X : 0);
+            y -= (y >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
+            z -= (z >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
+            atom_add(&pmeGrid[x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z], (long) (add*0xFFFFFFFF));
+        }
+    }
+}
+
+__kernel void finishSpreadCharge(__global long* pmeGrid) {
+    __global float2* floatGrid = (__global float2*) pmeGrid;
+    const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
+    float scale = EPSILON_FACTOR/(float) 0xFFFFFFFF;
+    for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
+        long value = pmeGrid[index];
+        float2 floatValue = (float2) ((float) (value*scale), 0.0f);
+        floatGrid[index] = floatValue;
+    }
+}
+#else
 __kernel void gridSpreadCharge(__global float4* posq, __global int2* pmeAtomGridIndex, __global int* pmeAtomRange, __global float2* pmeGrid, __global float4* pmeBsplineTheta) {
    unsigned int numGridPoints = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
    for (int gridIndex = get_global_id(0); gridIndex < numGridPoints; gridIndex += get_global_size(0)) {
@@ -91,7 +129,7 @@ __kernel void gridSpreadCharge(__global float4* posq, __global int2* pmeAtomGrid
        float result = 0.0f;

        // Loop over all atoms that affect this grid point.
-        
+
        for (int ix = 0; ix < PME_ORDER; ++ix) {
            int x = gridPoint.x-ix+(gridPoint.x >= ix ? 0 : GRID_SIZE_X);
            for (int iy = 0; iy < PME_ORDER; ++iy) {
@@ -133,6 +171,7 @@ __kernel void gridSpreadCharge(__global float4* posq, __global int2* pmeAtomGrid
        pmeGrid[gridIndex] = (float2) (result*EPSILON_FACTOR, 0.0f);
    }
 }
+#endif

 __kernel void reciprocalConvolution(__global float2* pmeGrid, __global float* energyBuffer, __global float* pmeBsplineModuliX,
        __global float* pmeBsplineModuliY, __global float* pmeBsplineModuliZ, float4 invPeriodicBoxSize, float recipScaleFactor) {