Commit 13ef0ee8 authored by Peter Eastman's avatar Peter Eastman
Browse files

Optimizations on AMD suggested by Tony Tye

parent e68471ec
......@@ -95,6 +95,13 @@ OpenCLContext::OpenCLContext(int numParticles, int deviceIndex, OpenCLPlatform::
compilationOptions += " -DWARPS_ARE_ATOMIC";
simdWidth = 32;
}
else if (vendor.size() >= 28 && vendor.substr(0, 28) == "Advanced Micro Devices, Inc.") {
// AMD APP SDK 2.4 has a performance problem with atomics. Enable the work around.
compilationOptions += " -DAMD_ATOMIC_WORK_AROUND";
// AMD has both 32 and 64 width SIMDs. To determine need to create a kernel to query.
// For now default to 1 which will use the default kernels.
simdWidth = 1;
}
else
simdWidth = 1;
queue = cl::CommandQueue(context, device);
......
......@@ -90,7 +90,7 @@ void storeInteractionData(__local ushort2* buffer, __local int* valid, __local s
__local int* flag = sum;
int lasty = -1;
float4 center, boxSize, pos;
for (tile = 0; tile < numValid; tile++) {
for (tile = 0; tile < numValid; ) {
int x = temp[tile].x;
int y = temp[tile].y;
if (x == y) {
......@@ -167,6 +167,14 @@ __kernel void findBlocksWithInteractions(float cutoffSquared, float4 periodicBox
__local ushort2 temp[BUFFER_SIZE];
__local int bufferFull;
__local int globalIndex;
#ifdef AMD_ATOMIC_WORK_AROUND
// Do a byte write to force all memory accesses to interactionCount to use the complete path.
// This avoids the atomic access from causing all word accesses to other buffers from using the slow complete path.
// The IF actually causes the write to never be executed, its presence is all that is needed.
// AMD APP SDK 2.4 has this problem.
if (get_global_id(0) == get_local_id(0)+1)
((__global char*)interactionCount)[sizeof(unsigned int)+1] = 0;
#endif
int valuesInBuffer = 0;
if (get_local_id(0) == 0)
bufferFull = false;
......
......@@ -47,6 +47,14 @@ __kernel void computeRange(__global TYPE* data, int length, __global float2* ran
*/
__kernel void assignElementsToBuckets(__global TYPE* data, int length, int numBuckets, __global float2* range,
__global int* bucketOffset, __global int* bucketOfElement, __global int* offsetInBucket) {
#ifdef AMD_ATOMIC_WORK_AROUND
// Do a byte write to force all memory accesses to interactionCount to use the complete path.
// This avoids the atomic access from causing all word accesses to other buffers from using the slow complete path.
// The IF actually causes the write to never be executed, its presence is all that is needed.
// AMD APP SDK 2.4 has this problem.
if (get_global_id(0) == get_local_id(0)+1)
((__global char*)bucketOffset)[sizeof(int)*numBuckets+1] = 0;
#endif
float2 dataRange = range[0];
float minValue = dataRange.x;
float maxValue = dataRange.y;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment