Optimizations on AMD suggested by Tony Tye

13ef0ee8 · Peter Eastman · e68471ec · 13ef0ee8 · 13ef0ee8 · 13ef0ee8
Commit 13ef0ee8 authored May 09, 2011 by Peter Eastman
3 changed files
--- a/platforms/opencl/src/OpenCLContext.cpp
+++ b/platforms/opencl/src/OpenCLContext.cpp
@@ -95,6 +95,13 @@ OpenCLContext::OpenCLContext(int numParticles, int deviceIndex, OpenCLPlatform::
            compilationOptions += " -DWARPS_ARE_ATOMIC";
            simdWidth = 32;
        }
+        else if (vendor.size() >= 28 && vendor.substr(0, 28) == "Advanced Micro Devices, Inc.") {
+            // AMD APP SDK 2.4 has a performance problem with atomics. Enable the work around.
+            compilationOptions += " -DAMD_ATOMIC_WORK_AROUND";
+            // AMD has both 32 and 64 width SIMDs. To determine need to create a kernel to query.
+            // For now default to 1 which will use the default kernels.
+            simdWidth = 1;
+        }
        else
            simdWidth = 1;
        queue = cl::CommandQueue(context, device);

--- a/platforms/opencl/src/kernels/findInteractingBlocks.cl
+++ b/platforms/opencl/src/kernels/findInteractingBlocks.cl
@@ -90,7 +90,7 @@ void storeInteractionData(__local ushort2* buffer, __local int* valid, __local s
    __local int* flag = sum;
    int lasty = -1;
    float4 center, boxSize, pos;
-    for (tile = 0; tile < numValid; tile++) {
+    for (tile = 0; tile < numValid; ) {
        int x = temp[tile].x;
        int y = temp[tile].y;
        if (x == y) {
@@ -167,6 +167,14 @@ __kernel void findBlocksWithInteractions(float cutoffSquared, float4 periodicBox
    __local ushort2 temp[BUFFER_SIZE];
    __local int bufferFull;
    __local int globalIndex;
+#ifdef AMD_ATOMIC_WORK_AROUND
+    // Do a byte write to force all memory accesses to interactionCount to use the complete path.
+    // This avoids the atomic access from causing all word accesses to other buffers from using the slow complete path.
+    // The IF actually causes the write to never be executed, its presence is all that is needed.
+    // AMD APP SDK 2.4 has this problem.
+    if (get_global_id(0) == get_local_id(0)+1)
+        ((__global char*)interactionCount)[sizeof(unsigned int)+1] = 0;
+#endif
    int valuesInBuffer = 0;
    if (get_local_id(0) == 0)
        bufferFull = false;

--- a/platforms/opencl/src/kernels/sort.cl
+++ b/platforms/opencl/src/kernels/sort.cl
@@ -47,6 +47,14 @@ __kernel void computeRange(__global TYPE* data, int length, __global float2* ran
 */
 __kernel void assignElementsToBuckets(__global TYPE* data, int length, int numBuckets, __global float2* range,
        __global int* bucketOffset, __global int* bucketOfElement, __global int* offsetInBucket) {
+#ifdef AMD_ATOMIC_WORK_AROUND
+    // Do a byte write to force all memory accesses to interactionCount to use the complete path.
+    // This avoids the atomic access from causing all word accesses to other buffers from using the slow complete path.
+    // The IF actually causes the write to never be executed, its presence is all that is needed.
+    // AMD APP SDK 2.4 has this problem.
+    if (get_global_id(0) == get_local_id(0)+1)
+        ((__global char*)bucketOffset)[sizeof(int)*numBuckets+1] = 0;
+#endif
    float2 dataRange = range[0];
    float minValue = dataRange.x;
    float maxValue = dataRange.y;