Finished OpenCL implementation of interaction groups

4ea10969 · peastman · 54c0ca3f · 4ea10969 · 4ea10969
Commit 4ea10969 authored Aug 08, 2013 by peastman
Showing with 52 additions and 6 deletions

platforms/opencl/src/OpenCLKernels.cpp platforms/opencl/src/OpenCLKernels.cpp +14 -5

platforms/opencl/src/kernels/customNonbondedGroups.cl platforms/opencl/src/kernels/customNonbondedGroups.cl +38 -1

No files found.
--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
@@ -2178,11 +2178,19 @@ void OpenCLCalcCustomNonbondedForceKernel::initInteractionGroups(const CustomNon
    vector<mm_int4> groupData;
    for (int tileSet = 0; tileSet < numTileSets; tileSet++) {
        int indexInTileSet = 0;
+        int minSize = 0;
+        if (cl.getSIMDWidth() < 32) {
+            // We need to include a barrier inside the inner loop, so ensure that all
+            // threads will loop the same number of times.
+            
+            for (int i = tileSetStart[tileSet]; i < tileSetStart[tileSet+1]; i++)
+                minSize = max(minSize, (int) atomLists[tiles[tileOrder[i].second].first].size());
+        }
        for (int i = tileSetStart[tileSet]; i < tileSetStart[tileSet+1]; i++) {
            int tile = tileOrder[i].second;
            vector<int>& atoms1 = atomLists[tiles[tile].first];
            vector<int>& atoms2 = atomLists[tiles[tile].second];
-            int range = indexInTileSet + ((indexInTileSet+atoms1.size())<<16);
+            int range = indexInTileSet + ((indexInTileSet+max(minSize, (int) atoms1.size()))<<16);
            int allFlags = (1<<atoms2.size())-1;
            for (int j = 0; j < (int) atoms1.size(); j++) {
                int a1 = atoms1[j];
@@ -2193,7 +2201,7 @@ void OpenCLCalcCustomNonbondedForceKernel::initInteractionGroups(const CustomNon
            indexInTileSet += atoms1.size();
        }
        for (; indexInTileSet < 32; indexInTileSet++)
-            groupData.push_back(mm_int4(0, 0, 0, 0));
+            groupData.push_back(mm_int4(0, 0, minSize<<16, 0));
    }
    interactionGroupData = OpenCLArray::create<mm_int4>(cl, groupData.size(), "interactionGroupData");
    interactionGroupData->upload(groupData);
@@ -2242,7 +2250,7 @@ void OpenCLCalcCustomNonbondedForceKernel::initInteractionGroups(const CustomNon
        if (buffers[i].getNumComponents() == 1)
            load2<<buffers[i].getType()<<" params"<<(i+1)<<"2 = localData[localIndex].params"<<(i+1)<<";\n";
        else {
-            load2<<buffers[i].getType()<<" params"<<(i+1)<<"2 = make_"<<buffers[i].getType()<<"(";
+            load2<<buffers[i].getType()<<" params"<<(i+1)<<"2 = ("<<buffers[i].getType()<<") (";
            for (int j = 0; j < buffers[i].getNumComponents(); ++j) {
                if (j > 0)
                    load2<<", ";
@@ -2299,7 +2307,8 @@ double OpenCLCalcCustomNonbondedForceKernel::execute(ContextImpl& context, bool
        if (!hasInitializedKernel) {
            hasInitializedKernel = true;
            int index = 0;
-            interactionGroupKernel.setArg<cl::Buffer>(index++, cl.getLongForceBuffer().getDeviceBuffer());
+            bool useLong = cl.getSupports64BitGlobalAtomics();
+            interactionGroupKernel.setArg<cl::Buffer>(index++, (useLong ? cl.getLongForceBuffer() : cl.getForceBuffers()).getDeviceBuffer());
            interactionGroupKernel.setArg<cl::Buffer>(index++, cl.getEnergyBuffer().getDeviceBuffer());
            interactionGroupKernel.setArg<cl::Buffer>(index++, cl.getPosq().getDeviceBuffer());
            interactionGroupKernel.setArg<cl::Buffer>(index++, interactionGroupData->getDeviceBuffer());
@@ -2310,7 +2319,7 @@ double OpenCLCalcCustomNonbondedForceKernel::execute(ContextImpl& context, bool
            if (globals != NULL)
                interactionGroupKernel.setArg<cl::Buffer>(index++, globals->getDeviceBuffer());
        }
-        int forceThreadBlockSize = cl.getNonbondedUtilities().getForceThreadBlockSize();
+        int forceThreadBlockSize = max(32, cl.getNonbondedUtilities().getForceThreadBlockSize());
        cl.executeKernel(interactionGroupKernel, numGroupThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
    }
    mm_double4 boxSize = cl.getPeriodicBoxSizeDouble();

--- a/platforms/opencl/src/kernels/customNonbondedGroups.cl
+++ b/platforms/opencl/src/kernels/customNonbondedGroups.cl
@@ -14,8 +14,37 @@ typedef struct {
 #endif
 } AtomData;

+/**
+ * This function is used on devices that don't support 64 bit atomics.  Multiple threads within
+ * a single tile might have computed forces on the same atom.  This loops over them and makes sure
+ * that only one thread updates the force on any given atom.
+ */
+void writeForces(__global real4* forceBuffers,__local AtomData* localData, int atomIndex) {
+    localData[get_local_id(0)].x = atomIndex;
+    SYNC_WARPS;
+    real4 forceSum = (real4) 0;
+    int start = (get_local_id(0)/TILE_SIZE)*TILE_SIZE;
+    int end = start+32;
+    bool isFirst = true;
+    for (int i = start; i < end; i++)
+        if (localData[i].x == atomIndex) {
+            forceSum += (real4) (localData[i].fx, localData[i].fy, localData[i].fz, 0);
+            isFirst &= (i >= get_local_id(0));
+        }
+    const unsigned int warp = get_global_id(0)/TILE_SIZE;
+    unsigned int offset = atomIndex + warp*PADDED_NUM_ATOMS;
+    if (isFirst)
+        forceBuffers[offset] += forceSum;
+    SYNC_WARPS;
+}
+
 __kernel void computeInteractionGroups(
-        __global long* restrict forceBuffers, __global real* restrict energyBuffer, __global const real4* restrict posq,
+#ifdef SUPPORTS_64_BIT_ATOMICS
+        __global long* restrict forceBuffers,
+#else
+        __global real4* restrict forceBuffers,
+#endif
+        __global real* restrict energyBuffer, __global const real4* restrict posq,
        __global const int4* restrict groupData, real4 periodicBoxSize, real4 invPeriodicBoxSize
        PARAMETER_ARGUMENTS) {
    const unsigned int totalWarps = get_global_size(0)/TILE_SIZE;
@@ -78,6 +107,7 @@ __kernel void computeInteractionGroups(
            tj = (tj == rangeEnd-1 ? rangeStart : tj+1);
            SYNC_WARPS;
        }
+#ifdef SUPPORTS_64_BIT_ATOMICS
        if (exclusions != 0) {
            atom_add(&forceBuffers[atom1], (long) (force.x*0x100000000));
            atom_add(&forceBuffers[atom1+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
@@ -86,6 +116,13 @@ __kernel void computeInteractionGroups(
            atom_add(&forceBuffers[atom2+PADDED_NUM_ATOMS], (long) (localData[get_local_id(0)].fy*0x100000000));
            atom_add(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (long) (localData[get_local_id(0)].fz*0x100000000));
        }
+#else
+        writeForces(forceBuffers, localData, atom2);
+        localData[get_local_id(0)].fx = force.x;
+        localData[get_local_id(0)].fy = force.y;
+        localData[get_local_id(0)].fz = force.z;
+        writeForces(forceBuffers, localData, atom1);
+#endif
    }
    energyBuffer[get_global_id(0)] += energy;
 }
\ No newline at end of file