Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
8a42f4f6
Commit
8a42f4f6
authored
Apr 20, 2018
by
peastman
Browse files
Fixed incorrect synchronization on Volta for OpenCL
parent
9f92ae40
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
32 additions
and
11 deletions
+32
-11
platforms/cuda/src/kernels/customNonbondedGroups.cu
platforms/cuda/src/kernels/customNonbondedGroups.cu
+8
-7
platforms/opencl/src/kernels/customNonbondedGroups.cl
platforms/opencl/src/kernels/customNonbondedGroups.cl
+24
-4
No files found.
platforms/cuda/src/kernels/customNonbondedGroups.cu
View file @
8a42f4f6
...
...
@@ -8,18 +8,19 @@ typedef struct {
#endif
}
AtomData
;
// Find the maximum of a value across all threads in a warp, and return that to
// every thread. This is only needed on Volta and later. On earlier architectures,
// we can just return the value that was passed in.
/**
* Find the maximum of a value across all threads in a warp, and return that to
* every thread. This is only needed on Volta and later. On earlier architectures,
* we can just return the value that was passed in.
*/
__device__
int
reduceMax
(
int
val
)
{
#if __CUDA_ARCH__ >= 700
for
(
int
mask
=
16
;
mask
>
0
;
mask
/=
2
)
val
=
max
(
val
,
__shfl_xor
(
val
,
mask
));
for
(
int
mask
=
16
;
mask
>
0
;
mask
/=
2
)
val
=
max
(
val
,
__shfl_xor
(
val
,
mask
));
#endif
return
val
;
return
val
;
}
extern
"C"
__global__
void
computeInteractionGroups
(
unsigned
long
long
*
__restrict__
forceBuffers
,
mixed
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
posq
,
const
int4
*
__restrict__
groupData
,
const
int
*
__restrict__
numGroupTiles
,
bool
useNeighborList
,
...
...
platforms/opencl/src/kernels/customNonbondedGroups.cl
View file @
8a42f4f6
...
...
@@ -12,6 +12,22 @@ typedef struct {
#
endif
}
AtomData
;
/**
*
Find
the
maximum
of
a
value
across
all
threads
in
a
warp,
and
return
that
to
*
every
thread.
*/
int
reduceMax
(
int
val,
__local
int*
temp
)
{
int
indexInWarp
=
get_local_id
(
0
)
%32
;
temp[get_local_id
(
0
)
]
=
val
;
SYNC_WARPS
;
for
(
int
offset
=
16
; offset > 0; offset /= 2) {
if
(
offset
<
indexInWarp
)
temp[get_local_id
(
0
)
]
=
max
(
temp[get_local_id
(
0
)
],
temp[get_local_id
(
0
)
+offset]
)
;
SYNC_WARPS
;
}
return
temp[get_local_id
(
0
)
-indexInWarp]
;
}
/**
*
This
function
is
used
on
devices
that
don
't
support
64
bit
atomics.
Multiple
threads
within
*
a
single
tile
might
have
computed
forces
on
the
same
atom.
This
loops
over
them
and
makes
sure
...
...
@@ -53,6 +69,7 @@ __kernel void computeInteractionGroups(
mixed
energy
=
0
;
INIT_DERIVATIVES
__local
AtomData
localData[LOCAL_MEMORY_SIZE]
;
__local
int
reductionBuffer[LOCAL_MEMORY_SIZE]
;
const
unsigned
int
startTile
=
(
useNeighborList
?
warp*numGroupTiles[0]/totalWarps
:
FIRST_TILE+warp*
(
LAST_TILE-FIRST_TILE
)
/totalWarps
)
;
const
unsigned
int
endTile
=
(
useNeighborList
?
(
warp+1
)
*numGroupTiles[0]/totalWarps
:
FIRST_TILE+
(
warp+1
)
*
(
LAST_TILE-FIRST_TILE
)
/totalWarps
)
;
...
...
@@ -76,9 +93,10 @@ __kernel void computeInteractionGroups(
localData[get_local_id
(
0
)
].fy
=
0.0f
;
localData[get_local_id
(
0
)
].fz
=
0.0f
;
int
tj
=
tgx
;
int
rangeStop
=
rangeStart
+
reduceMax
(
rangeEnd-rangeStart,
reductionBuffer
)
;
SYNC_WARPS
;
for
(
int
j
=
rangeStart
; j < range
End
; j++) {
if
(
t
j
<
rangeEnd
)
{
for
(
int
j
=
rangeStart
; j < range
Stop
; j++) {
if
(
j
<
rangeEnd
)
{
bool
isExcluded
=
(((
exclusions>>tj
)
&1
)
==
0
)
;
int
localIndex
=
tbx+tj
;
posq2
=
(
real4
)
(
localData[localIndex].x,
localData[localIndex].y,
localData[localIndex].z,
localData[localIndex].q
)
;
...
...
@@ -161,6 +179,7 @@ __kernel void buildNeighborList(__global int* restrict rebuildNeighborList, __gl
__local
real4
localPos[LOCAL_MEMORY_SIZE]
;
__local
volatile
bool
anyInteraction[WARPS_IN_BLOCK]
;
__local
volatile
int
tileIndex[WARPS_IN_BLOCK]
;
__local
int
reductionBuffer[LOCAL_MEMORY_SIZE]
;
const
unsigned
int
startTile
=
warp*NUM_TILES/totalWarps
;
const
unsigned
int
endTile
=
(
warp+1
)
*NUM_TILES/totalWarps
;
...
...
@@ -176,9 +195,10 @@ __kernel void buildNeighborList(__global int* restrict rebuildNeighborList, __gl
if
(
tgx
==
0
)
anyInteraction[local_warp]
=
false
;
int
tj
=
tgx
;
int
rangeStop
=
rangeStart
+
reduceMax
(
rangeEnd-rangeStart,
reductionBuffer
)
;
SYNC_WARPS
;
for
(
int
j
=
rangeStart
; j < range
End
&& !anyInteraction[local_warp]; j++) {
if
(
t
j
<
rangeEnd
)
{
for
(
int
j
=
rangeStart
; j < range
Stop
&& !anyInteraction[local_warp]; j++) {
if
(
j
<
rangeEnd
)
{
bool
isExcluded
=
(((
exclusions>>tj
)
&1
)
==
0
)
;
int
localIndex
=
tbx+tj
;
real4
delta
=
(
real4
)
(
localPos[localIndex].xyz
-
posq1.xyz,
0
)
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment