Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
8a42f4f6
"wrappers/python/tests/vscode:/vscode.git/clone" did not exist on "813772e87404467f871f7a4a3c176022a958e0e9"
Commit
8a42f4f6
authored
Apr 20, 2018
by
peastman
Browse files
Fixed incorrect synchronization on Volta for OpenCL
parent
9f92ae40
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
32 additions
and
11 deletions
+32
-11
platforms/cuda/src/kernels/customNonbondedGroups.cu
platforms/cuda/src/kernels/customNonbondedGroups.cu
+8
-7
platforms/opencl/src/kernels/customNonbondedGroups.cl
platforms/opencl/src/kernels/customNonbondedGroups.cl
+24
-4
No files found.
platforms/cuda/src/kernels/customNonbondedGroups.cu
View file @
8a42f4f6
...
@@ -8,9 +8,11 @@ typedef struct {
...
@@ -8,9 +8,11 @@ typedef struct {
#endif
#endif
}
AtomData
;
}
AtomData
;
// Find the maximum of a value across all threads in a warp, and return that to
/**
// every thread. This is only needed on Volta and later. On earlier architectures,
* Find the maximum of a value across all threads in a warp, and return that to
// we can just return the value that was passed in.
* every thread. This is only needed on Volta and later. On earlier architectures,
* we can just return the value that was passed in.
*/
__device__
int
reduceMax
(
int
val
)
{
__device__
int
reduceMax
(
int
val
)
{
#if __CUDA_ARCH__ >= 700
#if __CUDA_ARCH__ >= 700
for
(
int
mask
=
16
;
mask
>
0
;
mask
/=
2
)
for
(
int
mask
=
16
;
mask
>
0
;
mask
/=
2
)
...
@@ -19,7 +21,6 @@ __device__ int reduceMax(int val) {
...
@@ -19,7 +21,6 @@ __device__ int reduceMax(int val) {
return
val
;
return
val
;
}
}
extern
"C"
__global__
void
computeInteractionGroups
(
extern
"C"
__global__
void
computeInteractionGroups
(
unsigned
long
long
*
__restrict__
forceBuffers
,
mixed
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
posq
,
const
int4
*
__restrict__
groupData
,
unsigned
long
long
*
__restrict__
forceBuffers
,
mixed
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
posq
,
const
int4
*
__restrict__
groupData
,
const
int
*
__restrict__
numGroupTiles
,
bool
useNeighborList
,
const
int
*
__restrict__
numGroupTiles
,
bool
useNeighborList
,
...
...
platforms/opencl/src/kernels/customNonbondedGroups.cl
View file @
8a42f4f6
...
@@ -12,6 +12,22 @@ typedef struct {
...
@@ -12,6 +12,22 @@ typedef struct {
#
endif
#
endif
}
AtomData
;
}
AtomData
;
/**
*
Find
the
maximum
of
a
value
across
all
threads
in
a
warp,
and
return
that
to
*
every
thread.
*/
int
reduceMax
(
int
val,
__local
int*
temp
)
{
int
indexInWarp
=
get_local_id
(
0
)
%32
;
temp[get_local_id
(
0
)
]
=
val
;
SYNC_WARPS
;
for
(
int
offset
=
16
; offset > 0; offset /= 2) {
if
(
offset
<
indexInWarp
)
temp[get_local_id
(
0
)
]
=
max
(
temp[get_local_id
(
0
)
],
temp[get_local_id
(
0
)
+offset]
)
;
SYNC_WARPS
;
}
return
temp[get_local_id
(
0
)
-indexInWarp]
;
}
/**
/**
*
This
function
is
used
on
devices
that
don
't
support
64
bit
atomics.
Multiple
threads
within
*
This
function
is
used
on
devices
that
don
't
support
64
bit
atomics.
Multiple
threads
within
*
a
single
tile
might
have
computed
forces
on
the
same
atom.
This
loops
over
them
and
makes
sure
*
a
single
tile
might
have
computed
forces
on
the
same
atom.
This
loops
over
them
and
makes
sure
...
@@ -53,6 +69,7 @@ __kernel void computeInteractionGroups(
...
@@ -53,6 +69,7 @@ __kernel void computeInteractionGroups(
mixed
energy
=
0
;
mixed
energy
=
0
;
INIT_DERIVATIVES
INIT_DERIVATIVES
__local
AtomData
localData[LOCAL_MEMORY_SIZE]
;
__local
AtomData
localData[LOCAL_MEMORY_SIZE]
;
__local
int
reductionBuffer[LOCAL_MEMORY_SIZE]
;
const
unsigned
int
startTile
=
(
useNeighborList
?
warp*numGroupTiles[0]/totalWarps
:
FIRST_TILE+warp*
(
LAST_TILE-FIRST_TILE
)
/totalWarps
)
;
const
unsigned
int
startTile
=
(
useNeighborList
?
warp*numGroupTiles[0]/totalWarps
:
FIRST_TILE+warp*
(
LAST_TILE-FIRST_TILE
)
/totalWarps
)
;
const
unsigned
int
endTile
=
(
useNeighborList
?
(
warp+1
)
*numGroupTiles[0]/totalWarps
:
FIRST_TILE+
(
warp+1
)
*
(
LAST_TILE-FIRST_TILE
)
/totalWarps
)
;
const
unsigned
int
endTile
=
(
useNeighborList
?
(
warp+1
)
*numGroupTiles[0]/totalWarps
:
FIRST_TILE+
(
warp+1
)
*
(
LAST_TILE-FIRST_TILE
)
/totalWarps
)
;
...
@@ -76,9 +93,10 @@ __kernel void computeInteractionGroups(
...
@@ -76,9 +93,10 @@ __kernel void computeInteractionGroups(
localData[get_local_id
(
0
)
].fy
=
0.0f
;
localData[get_local_id
(
0
)
].fy
=
0.0f
;
localData[get_local_id
(
0
)
].fz
=
0.0f
;
localData[get_local_id
(
0
)
].fz
=
0.0f
;
int
tj
=
tgx
;
int
tj
=
tgx
;
int
rangeStop
=
rangeStart
+
reduceMax
(
rangeEnd-rangeStart,
reductionBuffer
)
;
SYNC_WARPS
;
SYNC_WARPS
;
for
(
int
j
=
rangeStart
; j < range
End
; j++) {
for
(
int
j
=
rangeStart
; j < range
Stop
; j++) {
if
(
t
j
<
rangeEnd
)
{
if
(
j
<
rangeEnd
)
{
bool
isExcluded
=
(((
exclusions>>tj
)
&1
)
==
0
)
;
bool
isExcluded
=
(((
exclusions>>tj
)
&1
)
==
0
)
;
int
localIndex
=
tbx+tj
;
int
localIndex
=
tbx+tj
;
posq2
=
(
real4
)
(
localData[localIndex].x,
localData[localIndex].y,
localData[localIndex].z,
localData[localIndex].q
)
;
posq2
=
(
real4
)
(
localData[localIndex].x,
localData[localIndex].y,
localData[localIndex].z,
localData[localIndex].q
)
;
...
@@ -161,6 +179,7 @@ __kernel void buildNeighborList(__global int* restrict rebuildNeighborList, __gl
...
@@ -161,6 +179,7 @@ __kernel void buildNeighborList(__global int* restrict rebuildNeighborList, __gl
__local
real4
localPos[LOCAL_MEMORY_SIZE]
;
__local
real4
localPos[LOCAL_MEMORY_SIZE]
;
__local
volatile
bool
anyInteraction[WARPS_IN_BLOCK]
;
__local
volatile
bool
anyInteraction[WARPS_IN_BLOCK]
;
__local
volatile
int
tileIndex[WARPS_IN_BLOCK]
;
__local
volatile
int
tileIndex[WARPS_IN_BLOCK]
;
__local
int
reductionBuffer[LOCAL_MEMORY_SIZE]
;
const
unsigned
int
startTile
=
warp*NUM_TILES/totalWarps
;
const
unsigned
int
startTile
=
warp*NUM_TILES/totalWarps
;
const
unsigned
int
endTile
=
(
warp+1
)
*NUM_TILES/totalWarps
;
const
unsigned
int
endTile
=
(
warp+1
)
*NUM_TILES/totalWarps
;
...
@@ -176,9 +195,10 @@ __kernel void buildNeighborList(__global int* restrict rebuildNeighborList, __gl
...
@@ -176,9 +195,10 @@ __kernel void buildNeighborList(__global int* restrict rebuildNeighborList, __gl
if
(
tgx
==
0
)
if
(
tgx
==
0
)
anyInteraction[local_warp]
=
false
;
anyInteraction[local_warp]
=
false
;
int
tj
=
tgx
;
int
tj
=
tgx
;
int
rangeStop
=
rangeStart
+
reduceMax
(
rangeEnd-rangeStart,
reductionBuffer
)
;
SYNC_WARPS
;
SYNC_WARPS
;
for
(
int
j
=
rangeStart
; j < range
End
&& !anyInteraction[local_warp]; j++) {
for
(
int
j
=
rangeStart
; j < range
Stop
&& !anyInteraction[local_warp]; j++) {
if
(
t
j
<
rangeEnd
)
{
if
(
j
<
rangeEnd
)
{
bool
isExcluded
=
(((
exclusions>>tj
)
&1
)
==
0
)
;
bool
isExcluded
=
(((
exclusions>>tj
)
&1
)
==
0
)
;
int
localIndex
=
tbx+tj
;
int
localIndex
=
tbx+tj
;
real4
delta
=
(
real4
)
(
localPos[localIndex].xyz
-
posq1.xyz,
0
)
;
real4
delta
=
(
real4
)
(
localPos[localIndex].xyz
-
posq1.xyz,
0
)
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment