Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
7b67c27b
Commit
7b67c27b
authored
May 01, 2015
by
peastman
Browse files
Merge pull request #890 from peastman/amdinteractions
Optimization for AMD
parents
af51d751
460f016e
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
284 additions
and
4 deletions
+284
-4
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
+1
-0
platforms/opencl/src/kernels/findInteractingBlocks.cl
platforms/opencl/src/kernels/findInteractingBlocks.cl
+283
-4
No files found.
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
View file @
7b67c27b
...
...
@@ -303,6 +303,7 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
defines
[
"PADDED_CUTOFF_SQUARED"
]
=
context
.
doubleToString
(
paddedCutoff
*
paddedCutoff
);
defines
[
"NUM_TILES_WITH_EXCLUSIONS"
]
=
context
.
intToString
(
exclusionTiles
->
getSize
());
defines
[
"NUM_BLOCKS"
]
=
context
.
intToString
(
context
.
getNumAtomBlocks
());
defines
[
"SIMD_WIDTH"
]
=
context
.
intToString
(
context
.
getSIMDWidth
());
if
(
usePeriodic
)
defines
[
"USE_PERIODIC"
]
=
"1"
;
int
maxExclusions
=
0
;
...
...
platforms/opencl/src/kernels/findInteractingBlocks.cl
View file @
7b67c27b
#
pragma
OPENCL
EXTENSION
cl_khr_global_int32_base_atomics
:
enable
#
pragma
OPENCL
EXTENSION
cl_khr_byte_addressable_store
:
enable
#
define
BUFFER_SIZE
256
/**
*
Find
a
bounding
box
for
the
atoms
in
each
block.
...
...
@@ -65,6 +64,10 @@ __kernel void sortBoxData(__global const real2* restrict sortedBlock, __global c
}
}
#
if
SIMD_WIDTH
<=
32
#
define
BUFFER_SIZE
256
__kernel
void
findBlocksWithInteractions
(
real4
periodicBoxSize,
real4
invPeriodicBoxSize,
real4
periodicBoxVecX,
real4
periodicBoxVecY,
real4
periodicBoxVecZ,
__global
unsigned
int*
restrict
interactionCount,
__global
int*
restrict
interactingTiles,
__global
unsigned
int*
restrict
interactingAtoms,
__global
const
real4*
restrict
posq,
unsigned
int
maxTiles,
unsigned
int
startBlockIndex,
unsigned
int
numBlocks,
__global
real2*
restrict
sortedBlocks,
...
...
@@ -133,8 +136,8 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi
int
block2
=
block2Base+indexInWarp
;
bool
includeBlock2
=
(
block2
<
NUM_BLOCKS
)
;
if
(
includeBlock2
)
{
real4
blockCenterY
=
(
block2
<
NUM_BLOCKS
?
sortedBlockCenter[block2]
:
(
real4
)
(
0
))
;
real4
blockSizeY
=
(
block2
<
NUM_BLOCKS
?
sortedBlockBoundingBox[block2]
:
(
real4
)
(
0
))
;
real4
blockCenterY
=
sortedBlockCenter[block2]
;
real4
blockSizeY
=
sortedBlockBoundingBox[block2]
;
real4
blockDelta
=
blockCenterX-blockCenterY
;
#
ifdef
USE_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
blockDelta
)
...
...
@@ -252,3 +255,279 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi
for
(
int
i
=
get_global_id
(
0
)
; i < NUM_ATOMS; i += get_global_size(0))
oldPositions[i]
=
posq[i]
;
}
#
else
//
This
is
the
old
implementation
of
finding
interacting
blocks.
It
is
quite
a
bit
more
complicated,
//
and
slower
on
most
GPUs.
On
AMD,
however,
it
is
faster,
so
we
keep
it
around
to
use
there.
#
define
BUFFER_SIZE
BUFFER_GROUPS*GROUP_SIZE
#
define
WARP_SIZE
32
#
define
INVALID
0xFFFF
/**
*
Perform
a
parallel
prefix
sum
over
an
array.
The
input
values
are
all
assumed
to
be
0
or
1.
*/
void
prefixSum
(
__local
short*
sum,
__local
ushort2*
temp
)
{
for
(
int
i
=
get_local_id
(
0
)
; i < BUFFER_SIZE; i += get_local_size(0))
temp[i].x
=
sum[i]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
int
whichBuffer
=
0
;
for
(
int
offset
=
1
; offset < BUFFER_SIZE; offset *= 2) {
if
(
whichBuffer
==
0
)
for
(
int
i
=
get_local_id
(
0
)
; i < BUFFER_SIZE; i += get_local_size(0))
temp[i].y
=
(
i
<
offset
?
temp[i].x
:
temp[i].x+temp[i-offset].x
)
;
else
for
(
int
i
=
get_local_id
(
0
)
; i < BUFFER_SIZE; i += get_local_size(0))
temp[i].x
=
(
i
<
offset
?
temp[i].y
:
temp[i].y+temp[i-offset].y
)
;
whichBuffer
=
1-whichBuffer
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
if
(
whichBuffer
==
0
)
for
(
int
i
=
get_local_id
(
0
)
; i < BUFFER_SIZE; i += get_local_size(0))
sum[i]
=
temp[i].x
;
else
for
(
int
i
=
get_local_id
(
0
)
; i < BUFFER_SIZE; i += get_local_size(0))
sum[i]
=
temp[i].y
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
/**
*
This
is
called
by
findBlocksWithInteractions
()
.
It
compacts
the
list
of
blocks,
identifies
interactions
*
in
them,
and
writes
the
result
to
global
memory.
*/
void
storeInteractionData
(
int
x,
__local
unsigned
short*
buffer,
__local
short*
sum,
__local
ushort2*
temp,
__local
int*
atoms,
__local
int*
numAtoms,
__local
int*
baseIndex,
__global
unsigned
int*
interactionCount,
__global
int*
interactingTiles,
__global
unsigned
int*
interactingAtoms,
real4
periodicBoxSize,
real4
invPeriodicBoxSize,
real4
periodicBoxVecX,
real4
periodicBoxVecY,
real4
periodicBoxVecZ,
__global
const
real4*
posq,
__local
real4*
posBuffer,
real4
blockCenterX,
real4
blockSizeX,
unsigned
int
maxTiles,
bool
finish
)
{
const
bool
singlePeriodicCopy
=
(
0.5f*periodicBoxSize.x-blockSizeX.x
>=
PADDED_CUTOFF
&&
0.5f*periodicBoxSize.y-blockSizeX.y
>=
PADDED_CUTOFF
&&
0.5f*periodicBoxSize.z-blockSizeX.z
>=
PADDED_CUTOFF
)
;
if
(
get_local_id
(
0
)
<
TILE_SIZE
)
{
real4
pos
=
posq[x*TILE_SIZE+get_local_id
(
0
)
]
;
#
ifdef
USE_PERIODIC
if
(
singlePeriodicCopy
)
{
//
The
box
is
small
enough
that
we
can
just
translate
all
the
atoms
into
a
single
periodic
//
box,
then
skip
having
to
apply
periodic
boundary
conditions
later.
APPLY_PERIODIC_TO_POS_WITH_CENTER
(
pos,
blockCenterX
)
}
#
endif
posBuffer[get_local_id
(
0
)
]
=
pos
;
}
//
The
buffer
is
full,
so
we
need
to
compact
it
and
write
out
results.
Start
by
doing
a
parallel
prefix
sum.
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
for
(
int
i
=
get_local_id
(
0
)
; i < BUFFER_SIZE; i += get_local_size(0))
sum[i]
=
(
buffer[i]
==
INVALID
?
0
:
1
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
prefixSum
(
sum,
temp
)
;
int
numValid
=
sum[BUFFER_SIZE-1]
;
//
Compact
the
buffer.
for
(
int
i
=
get_local_id
(
0
)
; i < BUFFER_SIZE; i += get_local_size(0))
if
(
buffer[i]
!=
INVALID
)
temp[sum[i]-1].x
=
buffer[i]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
for
(
int
i
=
get_local_id
(
0
)
; i < BUFFER_SIZE; i += get_local_size(0))
buffer[i]
=
temp[i].x
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
//
Loop
over
the
tiles
and
find
specific
interactions
in
them.
const
int
indexInWarp
=
get_local_id
(
0
)
%WARP_SIZE
;
for
(
int
base
=
0
; base < numValid; base += BUFFER_SIZE/WARP_SIZE) {
for
(
int
i
=
get_local_id
(
0
)
/WARP_SIZE
; i < BUFFER_SIZE/WARP_SIZE && base+i < numValid; i += GROUP_SIZE/WARP_SIZE) {
//
Check
each
atom
in
block
Y
for
interactions.
real4
pos
=
posq[buffer[base+i]*TILE_SIZE+indexInWarp]
;
#
ifdef
USE_PERIODIC
if
(
singlePeriodicCopy
)
APPLY_PERIODIC_TO_POS_WITH_CENTER
(
pos,
blockCenterX
)
#
endif
bool
interacts
=
false
;
#
ifdef
USE_PERIODIC
if
(
!singlePeriodicCopy
)
{
for
(
int
j
=
0
; j < TILE_SIZE; j++) {
real4
delta
=
pos-posBuffer[j]
;
APPLY_PERIODIC_TO_DELTA
(
delta
)
interacts
|= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED);
}
}
else {
#endif
for (int j = 0; j < TILE_SIZE; j++) {
real4 delta = pos-posBuffer[j];
interacts |
=
(
delta.x*delta.x+delta.y*delta.y+delta.z*delta.z
<
PADDED_CUTOFF_SQUARED
)
;
}
#
ifdef
USE_PERIODIC
}
#
endif
sum[i*WARP_SIZE+indexInWarp]
=
(
interacts
?
1
:
0
)
;
}
for
(
int
i
=
numValid-base+get_local_id
(
0
)
/WARP_SIZE
; i < BUFFER_SIZE/WARP_SIZE; i += GROUP_SIZE/WARP_SIZE)
sum[i*WARP_SIZE+indexInWarp]
=
0
;
//
Compact
the
list
of
atoms.
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
prefixSum
(
sum,
temp
)
;
for
(
int
i
=
get_local_id
(
0
)
; i < BUFFER_SIZE; i += get_local_size(0))
if
(
sum[i]
!=
(
i
==
0
?
0
:
sum[i-1]
))
atoms[*numAtoms+sum[i]-1]
=
buffer[base+i/WARP_SIZE]*TILE_SIZE+indexInWarp
;
//
Store
them
to
global
memory.
int
atomsToStore
=
*numAtoms+sum[BUFFER_SIZE-1]
;
bool
storePartialTile
=
(
finish
&&
base
>=
numValid-BUFFER_SIZE/WARP_SIZE
)
;
int
tilesToStore
=
(
storePartialTile
?
(
atomsToStore+TILE_SIZE-1
)
/TILE_SIZE
:
atomsToStore/TILE_SIZE
)
;
if
(
tilesToStore
>
0
)
{
if
(
get_local_id
(
0
)
==
0
)
*baseIndex
=
atom_add
(
interactionCount,
tilesToStore
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
get_local_id
(
0
)
==
0
)
*numAtoms
=
atomsToStore-tilesToStore*TILE_SIZE
;
if
(
*baseIndex+tilesToStore
<=
maxTiles
)
{
if
(
get_local_id
(
0
)
<
tilesToStore
)
interactingTiles[*baseIndex+get_local_id
(
0
)
]
=
x
;
for
(
int
i
=
get_local_id
(
0
)
; i < tilesToStore*TILE_SIZE; i += get_local_size(0))
interactingAtoms[*baseIndex*TILE_SIZE+i]
=
(
i
<
atomsToStore
?
atoms[i]
:
NUM_ATOMS
)
;
}
}
else
{
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
get_local_id
(
0
)
==
0
)
*numAtoms
+=
sum[BUFFER_SIZE-1]
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
get_local_id
(
0
)
<
*numAtoms
&&
!storePartialTile
)
atoms[get_local_id
(
0
)
]
=
atoms[tilesToStore*TILE_SIZE+get_local_id
(
0
)
]
;
}
if
(
numValid
==
0
&&
*numAtoms
>
0
&&
finish
)
{
//
We
didn
't
have
any
more
tiles
to
process,
but
there
were
some
atoms
left
over
from
a
//
previous
call
to
this
function.
Save
them
now.
if
(
get_local_id
(
0
)
==
0
)
*baseIndex
=
atom_add
(
interactionCount,
1
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
*baseIndex
<
maxTiles
)
{
if
(
get_local_id
(
0
)
==
0
)
interactingTiles[*baseIndex]
=
x
;
if
(
get_local_id
(
0
)
<
TILE_SIZE
)
interactingAtoms[*baseIndex*TILE_SIZE+get_local_id
(
0
)
]
=
(
get_local_id
(
0
)
<
*numAtoms
?
atoms[get_local_id
(
0
)
]
:
NUM_ATOMS
)
;
}
}
//
Reset
the
buffer
for
processing
more
tiles.
for
(
int
i
=
get_local_id
(
0
)
; i < BUFFER_SIZE; i += get_local_size(0))
buffer[i]
=
INVALID
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
/**
*
Compare
the
bounding
boxes
for
each
pair
of
blocks.
If
they
are
sufficiently
far
apart,
*
mark
them
as
non-interacting.
*/
__kernel
void
findBlocksWithInteractions
(
real4
periodicBoxSize,
real4
invPeriodicBoxSize,
real4
periodicBoxVecX,
real4
periodicBoxVecY,
real4
periodicBoxVecZ,
__global
unsigned
int*
restrict
interactionCount,
__global
int*
restrict
interactingTiles,
__global
unsigned
int*
restrict
interactingAtoms,
__global
const
real4*
restrict
posq,
unsigned
int
maxTiles,
unsigned
int
startBlockIndex,
unsigned
int
numBlocks,
__global
real2*
restrict
sortedBlocks,
__global
const
real4*
restrict
sortedBlockCenter,
__global
const
real4*
restrict
sortedBlockBoundingBox,
__global
const
unsigned
int*
restrict
exclusionIndices,
__global
const
unsigned
int*
restrict
exclusionRowIndices,
__global
real4*
restrict
oldPositions,
__global
const
int*
restrict
rebuildNeighborList
)
{
__local
unsigned
short
buffer[BUFFER_SIZE]
;
__local
short
sum[BUFFER_SIZE]
;
__local
ushort2
temp[BUFFER_SIZE]
;
__local
int
atoms[BUFFER_SIZE+TILE_SIZE]
;
__local
real4
posBuffer[TILE_SIZE]
;
__local
int
exclusionsForX[MAX_EXCLUSIONS]
;
__local
int
bufferFull
;
__local
int
globalIndex
;
__local
int
numAtoms
;
#
ifdef
AMD_ATOMIC_WORK_AROUND
//
Do
a
byte
write
to
force
all
memory
accesses
to
interactionCount
to
use
the
complete
path.
//
This
avoids
the
atomic
access
from
causing
all
word
accesses
to
other
buffers
from
using
the
slow
complete
path.
//
The
IF
actually
causes
the
write
to
never
be
executed,
its
presence
is
all
that
is
needed.
//
AMD
APP
SDK
2.4
has
this
problem.
if
(
get_global_id
(
0
)
==
get_local_id
(
0
)
+1
)
((
__global
char*
)
interactionCount
)
[sizeof
(
unsigned
int
)
+1]
=
0
;
#
endif
if
(
rebuildNeighborList[0]
==
0
)
return
; // The neighbor list doesn't need to be rebuilt.
int
valuesInBuffer
=
0
;
if
(
get_local_id
(
0
)
==
0
)
bufferFull
=
false
;
for
(
int
i
=
0
; i < BUFFER_GROUPS; ++i)
buffer[i*GROUP_SIZE+get_local_id
(
0
)
]
=
INVALID
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
//
Loop
over
blocks
sorted
by
size.
for
(
int
i
=
startBlockIndex+get_group_id
(
0
)
; i < startBlockIndex+numBlocks; i += get_num_groups(0)) {
if
(
get_local_id
(
0
)
==
get_local_size
(
0
)
-1
)
numAtoms
=
0
;
real2
sortedKey
=
sortedBlocks[i]
;
int
x
=
(
int
)
sortedKey.y
;
real4
blockCenterX
=
sortedBlockCenter[i]
;
real4
blockSizeX
=
sortedBlockBoundingBox[i]
;
//
Load
exclusion
data
for
block
x.
const
int
exclusionStart
=
exclusionRowIndices[x]
;
const
int
exclusionEnd
=
exclusionRowIndices[x+1]
;
const
int
numExclusions
=
exclusionEnd-exclusionStart
;
for
(
int
j
=
get_local_id
(
0
)
; j < numExclusions; j += get_local_size(0))
exclusionsForX[j]
=
exclusionIndices[exclusionStart+j]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
//
Compare
it
to
other
blocks
after
this
one
in
sorted
order.
for
(
int
base
=
i+1
; base < NUM_BLOCKS; base += get_local_size(0)) {
int
j
=
base+get_local_id
(
0
)
;
real2
sortedKey2
=
(
j
<
NUM_BLOCKS
?
sortedBlocks[j]
:
(
real2
)
0
)
;
real4
blockCenterY
=
(
j
<
NUM_BLOCKS
?
sortedBlockCenter[j]
:
(
real4
)
0
)
;
real4
blockSizeY
=
(
j
<
NUM_BLOCKS
?
sortedBlockBoundingBox[j]
:
(
real4
)
0
)
;
unsigned
short
y
=
(
unsigned
short
)
sortedKey2.y
;
real4
delta
=
blockCenterX-blockCenterY
;
#
ifdef
USE_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
delta
)
#
endif
delta.x
=
max
((
real
)
0
,
fabs
(
delta.x
)
-blockSizeX.x-blockSizeY.x
)
;
delta.y
=
max
((
real
)
0
,
fabs
(
delta.y
)
-blockSizeX.y-blockSizeY.y
)
;
delta.z
=
max
((
real
)
0
,
fabs
(
delta.z
)
-blockSizeX.z-blockSizeY.z
)
;
bool
hasExclusions
=
false
;
for
(
int
k
=
0
; k < numExclusions; k++)
hasExclusions
|
=
(
exclusionsForX[k]
==
y
)
;
if
(
j
<
NUM_BLOCKS
&&
delta.x*delta.x+delta.y*delta.y+delta.z*delta.z
<
PADDED_CUTOFF_SQUARED
&&
!hasExclusions
)
{
//
Add
this
tile
to
the
buffer.
int
bufferIndex
=
valuesInBuffer*GROUP_SIZE+get_local_id
(
0
)
;
buffer[bufferIndex]
=
y
;
valuesInBuffer++
;
if
(
!bufferFull
&&
valuesInBuffer
==
BUFFER_GROUPS
)
bufferFull
=
true
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
bufferFull
)
{
storeInteractionData
(
x,
buffer,
sum,
temp,
atoms,
&numAtoms,
&globalIndex,
interactionCount,
interactingTiles,
interactingAtoms,
periodicBoxSize,
invPeriodicBoxSize,
periodicBoxVecX,
periodicBoxVecY,
periodicBoxVecZ,
posq,
posBuffer,
blockCenterX,
blockSizeX,
maxTiles,
false
)
;
valuesInBuffer
=
0
;
if
(
get_local_id
(
0
)
==
0
)
bufferFull
=
false
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
storeInteractionData
(
x,
buffer,
sum,
temp,
atoms,
&numAtoms,
&globalIndex,
interactionCount,
interactingTiles,
interactingAtoms,
periodicBoxSize,
invPeriodicBoxSize,
periodicBoxVecX,
periodicBoxVecY,
periodicBoxVecZ,
posq,
posBuffer,
blockCenterX,
blockSizeX,
maxTiles,
true
)
;
}
//
Record
the
positions
the
neighbor
list
is
based
on.
for
(
int
i
=
get_global_id
(
0
)
; i < NUM_ATOMS; i += get_global_size(0))
oldPositions[i]
=
posq[i]
;
}
#
endif
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment