Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
13ef0ee8
Commit
13ef0ee8
authored
May 09, 2011
by
Peter Eastman
Browse files
Optimizations on AMD suggested by Tony Tye
parent
e68471ec
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
24 additions
and
1 deletion
+24
-1
platforms/opencl/src/OpenCLContext.cpp
platforms/opencl/src/OpenCLContext.cpp
+7
-0
platforms/opencl/src/kernels/findInteractingBlocks.cl
platforms/opencl/src/kernels/findInteractingBlocks.cl
+9
-1
platforms/opencl/src/kernels/sort.cl
platforms/opencl/src/kernels/sort.cl
+8
-0
No files found.
platforms/opencl/src/OpenCLContext.cpp
View file @
13ef0ee8
...
@@ -95,6 +95,13 @@ OpenCLContext::OpenCLContext(int numParticles, int deviceIndex, OpenCLPlatform::
...
@@ -95,6 +95,13 @@ OpenCLContext::OpenCLContext(int numParticles, int deviceIndex, OpenCLPlatform::
compilationOptions
+=
" -DWARPS_ARE_ATOMIC"
;
compilationOptions
+=
" -DWARPS_ARE_ATOMIC"
;
simdWidth
=
32
;
simdWidth
=
32
;
}
}
else
if
(
vendor
.
size
()
>=
28
&&
vendor
.
substr
(
0
,
28
)
==
"Advanced Micro Devices, Inc."
)
{
// AMD APP SDK 2.4 has a performance problem with atomics. Enable the work around.
compilationOptions
+=
" -DAMD_ATOMIC_WORK_AROUND"
;
// AMD has both 32 and 64 width SIMDs. To determine which one a device uses, we would need to create a kernel to query it.
// For now default to 1 which will use the default kernels.
simdWidth
=
1
;
}
else
else
simdWidth
=
1
;
simdWidth
=
1
;
queue
=
cl
::
CommandQueue
(
context
,
device
);
queue
=
cl
::
CommandQueue
(
context
,
device
);
...
...
platforms/opencl/src/kernels/findInteractingBlocks.cl
View file @
13ef0ee8
...
@@ -90,7 +90,7 @@ void storeInteractionData(__local ushort2* buffer, __local int* valid, __local s
...
@@ -90,7 +90,7 @@ void storeInteractionData(__local ushort2* buffer, __local int* valid, __local s
__local
int*
flag
=
sum
;
__local
int*
flag
=
sum
;
int
lasty
=
-1
;
int
lasty
=
-1
;
float4
center,
boxSize,
pos
;
float4
center,
boxSize,
pos
;
for
(
tile
=
0
; tile < numValid;
tile++
) {
for
(
tile
=
0
; tile < numValid; ) {
int
x
=
temp[tile].x
;
int
x
=
temp[tile].x
;
int
y
=
temp[tile].y
;
int
y
=
temp[tile].y
;
if
(
x
==
y
)
{
if
(
x
==
y
)
{
...
@@ -167,6 +167,14 @@ __kernel void findBlocksWithInteractions(float cutoffSquared, float4 periodicBox
...
@@ -167,6 +167,14 @@ __kernel void findBlocksWithInteractions(float cutoffSquared, float4 periodicBox
__local ushort2 temp[BUFFER_SIZE];
__local ushort2 temp[BUFFER_SIZE];
__local int bufferFull;
__local int bufferFull;
__local int globalIndex;
__local int globalIndex;
#ifdef AMD_ATOMIC_WORK_AROUND
// Do a byte write to force all memory accesses to interactionCount to use the complete path.
// This prevents the atomic access from forcing all word accesses to other buffers onto the slow complete path.
// The IF actually causes the write to never be executed; its presence is all that is needed.
// AMD APP SDK 2.4 has this problem.
if (get_global_id(0) == get_local_id(0)+1)
((__global char*)interactionCount)[sizeof(unsigned int)+1] = 0;
#endif
int valuesInBuffer = 0;
int valuesInBuffer = 0;
if (get_local_id(0) == 0)
if (get_local_id(0) == 0)
bufferFull = false;
bufferFull = false;
...
...
platforms/opencl/src/kernels/sort.cl
View file @
13ef0ee8
...
@@ -47,6 +47,14 @@ __kernel void computeRange(__global TYPE* data, int length, __global float2* ran
...
@@ -47,6 +47,14 @@ __kernel void computeRange(__global TYPE* data, int length, __global float2* ran
*/
*/
__kernel
void
assignElementsToBuckets
(
__global
TYPE*
data,
int
length,
int
numBuckets,
__global
float2*
range,
__kernel
void
assignElementsToBuckets
(
__global
TYPE*
data,
int
length,
int
numBuckets,
__global
float2*
range,
__global
int*
bucketOffset,
__global
int*
bucketOfElement,
__global
int*
offsetInBucket
)
{
__global
int*
bucketOffset,
__global
int*
bucketOfElement,
__global
int*
offsetInBucket
)
{
#
ifdef
AMD_ATOMIC_WORK_AROUND
//
Do
a
byte
write
to
force
all
memory
accesses
to
bucketOffset
to
use
the
complete
path.
//
This
avoids
the
atomic
access
from
causing
all
word
accesses
to
other
buffers
from
using
the
slow
complete
path.
//
The
IF
actually
causes
the
write
to
never
be
executed,
its
presence
is
all
that
is
needed.
//
AMD
APP
SDK
2.4
has
this
problem.
if
(
get_global_id
(
0
)
==
get_local_id
(
0
)
+1
)
((
__global
char*
)
bucketOffset
)
[sizeof
(
int
)
*numBuckets+1]
=
0
;
#
endif
float2
dataRange
=
range[0]
;
float2
dataRange
=
range[0]
;
float
minValue
=
dataRange.x
;
float
minValue
=
dataRange.x
;
float
maxValue
=
dataRange.y
;
float
maxValue
=
dataRange.y
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment