Created OpenCL implementation of NonbondedForce that works with non-Nvidia processors

e43f14bf · Peter Eastman · 2529b922 · e43f14bf · e43f14bf · e43f14bf
Commit e43f14bf authored Oct 21, 2009 by Peter Eastman
5 changed files
--- a/platforms/opencl/src/OpenCLContext.cpp
+++ b/platforms/opencl/src/OpenCLContext.cpp
@@ -64,8 +64,12 @@ OpenCLContext::OpenCLContext(int numParticles, int deviceIndex) : time(0.0), ste
            throw OpenMMException("The specified OpenCL device is not compatible with OpenMM");
        compilationOptions = "-cl-fast-relaxed-math";
        string vendor = device.getInfo<CL_DEVICE_VENDOR>();
-        if (vendor.size() >= 6 && vendor.substr(0, 6) == "NVIDIA")
+        if (vendor.size() >= 6 && vendor.substr(0, 6) == "NVIDIA") {
            compilationOptions += " -DWARPS_ARE_ATOMIC";
+            simdWidth = 32;
+        }
+        else
+            simdWidth = 1;
        queue = cl::CommandQueue(context, device);
        numAtoms = numParticles;
        paddedNumAtoms = TileSize*((numParticles+TileSize-1)/TileSize);

--- a/platforms/opencl/src/OpenCLContext.h
+++ b/platforms/opencl/src/OpenCLContext.h
@@ -251,6 +251,12 @@ public:
    int getNumForceBuffers() const {
        return numForceBuffers;
    }
+    /**
+     * Get the SIMD width of the device being used.
+     */
+    int getSIMDWidth() const {
+        return simdWidth;
+    }
    /**
     * Get the OpenCLIntegrationUtilities for this context.
     */
@@ -281,6 +287,7 @@ private:
    int numAtomBlocks;
    int numThreadBlocks;
    int numForceBuffers;
+    int simdWidth;
    std::string compilationOptions;
    cl::Context context;
    cl::Device device;

--- a/platforms/opencl/src/OpenCLNonbondedUtilities.cpp
+++ b/platforms/opencl/src/OpenCLNonbondedUtilities.cpp
@@ -346,7 +346,8 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
    padded << context.getPaddedNumAtoms();
    defines["NUM_ATOMS"] = natom.str();
    defines["PADDED_NUM_ATOMS"] = padded.str();
-    cl::Program program = context.createProgram(context.loadSourceFromFile("nonbonded.cl", replacements), defines);
+    string filename = (context.getSIMDWidth() == 32 ? "nonbonded_nvidia.cl" : "nonbonded_default.cl");
+    cl::Program program = context.createProgram(context.loadSourceFromFile(filename, replacements), defines);
    cl::Kernel kernel(program, "computeNonbonded");
    // Set arguments to the Kernel.
@@ -358,17 +359,17 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
    kernel.setArg<cl::Buffer>(4, exclusionIndex->getDeviceBuffer());
    kernel.setArg(5, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL);
    kernel.setArg(6, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL);
-    int paramBase = 9;
+    kernel.setArg(7, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL);
+    int paramBase = 10;
    if (useCutoff) {
        paramBase = 11;
-        kernel.setArg<cl::Buffer>(7, interactingTiles->getDeviceBuffer());
+        kernel.setArg<cl::Buffer>(8, interactingTiles->getDeviceBuffer());
-        kernel.setArg<cl::Buffer>(8, interactionFlags->getDeviceBuffer());
+        kernel.setArg<cl::Buffer>(9, interactionFlags->getDeviceBuffer());
-        kernel.setArg<cl::Buffer>(9, interactionCount->getDeviceBuffer());
+        kernel.setArg<cl::Buffer>(10, interactionCount->getDeviceBuffer());
-        kernel.setArg(10, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL);
    }
    else {
-        kernel.setArg<cl::Buffer>(7, tiles->getDeviceBuffer());
+        kernel.setArg<cl::Buffer>(8, tiles->getDeviceBuffer());
-        kernel.setArg<cl_uint>(8, tiles->getSize());
+        kernel.setArg<cl_uint>(9, tiles->getSize());
    }
    for (int i = 0; i < (int) params.size(); i++) {
        kernel.setArg<cl::Buffer>(i*2+paramBase, params[i].getBuffer());

--- a/platforms/opencl/src/kernels/nonbonded_default.cl
+++ b/platforms/opencl/src/kernels/nonbonded_default.cl
+const unsigned int TileSize = 32;
+/**
+ * Compute nonbonded interactions.
+ */
+__kernel void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions,
+        __global unsigned int* exclusionIndices, __local float4* local_posq, __local float4* local_force, __local float4* tempBuffer, __global unsigned int* tiles,
+#ifdef USE_CUTOFF
+        __global unsigned int* interactionFlags, __global unsigned int* interactionCount
+#else
+        unsigned int numTiles
+#endif
+        PARAMETER_ARGUMENTS) {
+#ifdef USE_CUTOFF
+    unsigned int numTiles = interactionCount[0];
+#endif
+    unsigned int pos = get_group_id(0)*numTiles/get_num_groups(0);
+    unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0);
+    float energy = 0.0f;
+    unsigned int lasty = 0xFFFFFFFF;
+    while (pos < end) {
+        // Extract the coordinates of this tile
+        unsigned int x = tiles[pos];
+        unsigned int y = ((x >> 2) & 0x7fff)*TileSize;
+        bool hasExclusions = (x & 0x1);
+        x = (x>>17)*TileSize;
+        unsigned int baseLocalAtom = (get_local_id(0) < TileSize ? 0 : TileSize/2);
+        unsigned int tgx = get_local_id(0) & (TileSize-1);
+        unsigned int forceBufferOffset = (tgx < TileSize/2 ? 0 : TileSize);
+        unsigned int atom1 = x + tgx;
+        float4 force = 0.0f;
+        float4 posq1 = posq[atom1];
+        LOAD_ATOM1_PARAMETERS
+        if (x == y) {
+            // This tile is on the diagonal.
+            local_posq[get_local_id(0)] = posq1;
+            LOAD_LOCAL_PARAMETERS_FROM_1
+            barrier(CLK_LOCAL_MEM_FENCE);
+            unsigned int xi = x/TileSize;
+            unsigned int tile = xi+xi*PADDED_NUM_ATOMS/TileSize-xi*(xi+1)/2;
+#ifdef USE_EXCLUSIONS
+            unsigned int excl = exclusions[exclusionIndices[tile]+tgx] >> baseLocalAtom;
+#endif
+            for (unsigned int j = 0; j < TileSize/2; j++) {
+#ifdef USE_EXCLUSIONS
+                bool isExcluded = !(excl & 0x1);
+#endif
+                int atom2 = baseLocalAtom+j;
+                float4 posq2 = local_posq[atom2];
+                float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
+#ifdef USE_PERIODIC
+                delta.x -= floor(delta.x/PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
+                delta.y -= floor(delta.y/PERIODIC_BOX_SIZE_Y+0.5f)*PERIODIC_BOX_SIZE_Y;
+                delta.z -= floor(delta.z/PERIODIC_BOX_SIZE_Z+0.5f)*PERIODIC_BOX_SIZE_Z;
+#endif
+                float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+                float r = sqrt(r2);
+                float invR = 1.0f/r;
+                LOAD_ATOM2_PARAMETERS
+                atom2 = y+baseLocalAtom+j;
+                float dEdR = 0.0f;
+                float tempEnergy = 0.0f;
+                COMPUTE_INTERACTION
+                energy += 0.5f*tempEnergy;
+                delta.xyz *= dEdR;
+                force.xyz -= delta.xyz;
+                excl >>= 1;
+            }
+            // Sum the forces and write results.
+            if (get_local_id(0) >= TileSize)
+                tempBuffer[get_local_id(0)] = force;
+            barrier(CLK_LOCAL_MEM_FENCE);
+            if (get_local_id(0) < TileSize) {
+#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
+                unsigned int offset = x + tgx + (x/TileSize)*PADDED_NUM_ATOMS;
+#else
+                unsigned int offset = x + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
+#endif
+                forceBuffers[offset].xyz += force.xyz+tempBuffer[get_local_id(0)+TileSize].xyz;
+            }
+        }
+        else {
+            // This is an off-diagonal tile.
+            if (lasty != y && get_local_id(0) < TileSize) {
+                unsigned int j = y + tgx;
+                local_posq[get_local_id(0)] = posq[j];
+                LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
+            }
+            local_force[get_local_id(0)] = 0.0f;
+            barrier(CLK_LOCAL_MEM_FENCE);
+#ifdef USE_CUTOFF
+            unsigned int flags = interactionFlags[pos];
+            if (!hasExclusions && flags == 0) {
+                // No interactions in this tile.
+            }
+            else
+#endif
+            {
+                // Compute the full set of interactions in this tile.
+                unsigned int xi = x/TileSize;
+                unsigned int yi = y/TileSize;
+                unsigned int tile = xi+yi*PADDED_NUM_ATOMS/TileSize-yi*(yi+1)/2;
+#ifdef USE_EXCLUSIONS
+                unsigned int excl = (hasExclusions ? exclusions[exclusionIndices[tile]+tgx] : 0xFFFFFFFF);
+                excl = (excl >> tgx) | (excl << (TileSize - tgx));
+                excl >>= baseLocalAtom;
+#endif
+                unsigned int tj = tgx%(TileSize/2);
+                for (unsigned int j = 0; j < TileSize/2; j++) {
+#ifdef USE_EXCLUSIONS
+                    bool isExcluded = !(excl & 0x1);
+#endif
+                    int atom2 = baseLocalAtom+tj;
+                    float4 posq2 = local_posq[atom2];
+                    float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
+#ifdef USE_PERIODIC
+                    delta.x -= floor(delta.x/PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
+                    delta.y -= floor(delta.y/PERIODIC_BOX_SIZE_Y+0.5f)*PERIODIC_BOX_SIZE_Y;
+                    delta.z -= floor(delta.z/PERIODIC_BOX_SIZE_Z+0.5f)*PERIODIC_BOX_SIZE_Z;
+#endif
+                    float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+                    float r = sqrt(r2);
+                    float invR = 1.0f/r;
+                    LOAD_ATOM2_PARAMETERS
+                    atom2 = y+baseLocalAtom+tj;
+                    float dEdR = 0.0f;
+                    float tempEnergy = 0.0f;
+                    COMPUTE_INTERACTION
+		    energy += tempEnergy;
+                    delta.xyz *= dEdR;
+                    force.xyz -= delta.xyz;
+                    local_force[baseLocalAtom+tj+forceBufferOffset].xyz += delta.xyz;
+                    barrier(CLK_LOCAL_MEM_FENCE);
+                    excl >>= 1;
+                    tj = (tj+1)%(TileSize/2);
+                }
+            }
+            // Sum the forces and write results.
+            if (get_local_id(0) >= TileSize)
+                tempBuffer[get_local_id(0)] = force;
+            barrier(CLK_LOCAL_MEM_FENCE);
+            if (get_local_id(0) < TileSize) {
+#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
+                unsigned int offset1 = x + tgx + (y/TileSize)*PADDED_NUM_ATOMS;
+                unsigned int offset2 = y + tgx + (x/TileSize)*PADDED_NUM_ATOMS;
+#else
+                unsigned int offset1 = x + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
+                unsigned int offset2 = y + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
+#endif
+                forceBuffers[offset1].xyz += force.xyz+tempBuffer[get_local_id(0)+TileSize].xyz;
+                forceBuffers[offset2].xyz += local_force[get_local_id(0)].xyz+local_force[get_local_id(0)+TileSize].xyz;
+            }
+            lasty = y;
+        }
+        pos++;
+    }
+    energyBuffer[get_global_id(0)] += energy;
+}
--- a/platforms/opencl/src/kernels/nonbonded.cl
+++ b/platforms/opencl/src/kernels/nonbonded.cl
@@ -4,10 +4,10 @@ const unsigned int TileSize = 32;
 * Compute nonbonded interactions.
 */
-__kernel void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq,
+__kernel void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions,
-        __global unsigned int* exclusions,  __global unsigned int* exclusionIndices, __local float4* local_posq, __local float4* local_force, __global unsigned int* tiles,
+        __global unsigned int* exclusionIndices, __local float4* local_posq, __local float4* local_force, __local float4* tempBuffer, __global unsigned int* tiles,
 #ifdef USE_CUTOFF
-        __global unsigned int* interactionFlags, __global unsigned int* interactionCount, __local float4* tempBuffer
+        __global unsigned int* interactionFlags, __global unsigned int* interactionCount
 #else
        unsigned int numTiles
 #endif
@@ -30,7 +30,6 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
        x = (x>>17)*TileSize;
        unsigned int tgx = get_local_id(0) & (TileSize-1);
        unsigned int tbx = get_local_id(0) - tgx;
-        unsigned int tj = tgx;
        unsigned int atom1 = x + tgx;
        float4 force = 0.0f;
        float4 posq1 = posq[atom1];
@@ -49,7 +48,9 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
 #ifdef USE_EXCLUSIONS
                bool isExcluded = !(excl & 0x1);
 #endif
-                float4 delta = (float4) (local_posq[tbx+j].xyz - posq1.xyz, 0.0f);
+                int atom2 = tbx+j;
+                float4 posq2 = local_posq[atom2];
+                float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
 #ifdef USE_PERIODIC
                delta.x -= floor(delta.x/PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
                delta.y -= floor(delta.y/PERIODIC_BOX_SIZE_Y+0.5f)*PERIODIC_BOX_SIZE_Y;
@@ -58,8 +59,6 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
                float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                float r = sqrt(r2);
                float invR = 1.0f/r;
-                int atom2 = tbx+j;
-                float4 posq2 = local_posq[atom2];
                LOAD_ATOM2_PARAMETERS
                atom2 = y+j;
                float dEdR = 0.0f;
@@ -100,7 +99,9 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
                    for (unsigned int j = 0; j < TileSize; j++) {
                        if ((flags&(1<<j)) != 0) {
                            bool isExcluded = false;
-                            float4 delta = (float4) (local_posq[tbx+j].xyz - posq1.xyz, 0.0f);
+                            int atom2 = tbx+j;
+                            float4 posq2 = local_posq[atom2];
+                            float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
 #ifdef USE_PERIODIC
                            delta.x -= floor(delta.x/PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
                            delta.y -= floor(delta.y/PERIODIC_BOX_SIZE_Y+0.5f)*PERIODIC_BOX_SIZE_Y;
@@ -109,8 +110,6 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
                            float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                            float r = sqrt(r2);
                            float invR = 1.0f/r;
-                            int atom2 = tbx+j;
-                            float4 posq2 = local_posq[atom2];
                            LOAD_ATOM2_PARAMETERS
                            atom2 = y+j;
                            float dEdR = 0.0f;
@@ -149,11 +148,14 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
                unsigned int excl = (hasExclusions ? exclusions[exclusionIndices[tile]+tgx] : 0xFFFFFFFF);
                excl = (excl >> tgx) | (excl << (TileSize - tgx));
 #endif
+                unsigned int tj = tgx;
                for (unsigned int j = 0; j < TileSize; j++) {
 #ifdef USE_EXCLUSIONS
                    bool isExcluded = !(excl & 0x1);
 #endif
-                    float4 delta = (float4) (local_posq[tbx+tj].xyz - posq1.xyz, 0.0f);
+                    int atom2 = tbx+tj;
+                    float4 posq2 = local_posq[atom2];
+                    float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
 #ifdef USE_PERIODIC
                    delta.x -= floor(delta.x/PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
                    delta.y -= floor(delta.y/PERIODIC_BOX_SIZE_Y+0.5f)*PERIODIC_BOX_SIZE_Y;
@@ -162,8 +164,6 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
                    float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                    float r = sqrt(r2);
                    float invR = 1.0f/r;
-                    int atom2 = tbx+tj;
-                    float4 posq2 = local_posq[atom2];
                    LOAD_ATOM2_PARAMETERS
                    atom2 = y+tj;
                    float dEdR = 0.0f;