Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
72def6fb
Unverified
Commit
72def6fb
authored
Apr 26, 2018
by
peastman
Committed by
GitHub
Apr 26, 2018
Browse files
Merge pull request #2051 from peastman/syncerror
Fixed incorrect synchronization on Volta
parents
c9eb7fb5
f1bbb8e7
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
74 additions
and
32 deletions
+74
-32
platforms/cuda/src/kernels/customNonbondedGroups.cu
platforms/cuda/src/kernels/customNonbondedGroups.cu
+49
-28
platforms/opencl/src/kernels/customNonbondedGroups.cl
platforms/opencl/src/kernels/customNonbondedGroups.cl
+25
-4
No files found.
platforms/cuda/src/kernels/customNonbondedGroups.cu
View file @
72def6fb
...
@@ -8,9 +8,22 @@ typedef struct {
...
@@ -8,9 +8,22 @@ typedef struct {
#endif
#endif
}
AtomData
;
}
AtomData
;
/**
 * Find the maximum of a value across all threads in a warp, and return that to
 * every thread.  This is only needed on Volta and later.  On earlier architectures,
 * we can just return the value that was passed in.
 *
 * On compute capability 7.0 and above the reduction is an XOR butterfly: at each
 * step (lane masks 16, 8, 4, 2, 1) every lane exchanges its running maximum with
 * the lane whose index differs in that bit, so after five steps all 32 lanes hold
 * the same result.
 *
 * The explicitly synchronizing shuffle is used with a full-warp member mask
 * (0xffffffff).  Every lane of the warp must therefore reach this call; invoking
 * it from a branch that only part of the warp executes is not supported.
 *
 * @param val  this thread's value
 * @return the maximum of val over the warp (arch >= 700), otherwise val unchanged
 */
__device__ int reduceMax(int val) {
#if __CUDA_ARCH__ >= 700
    // The legacy __shfl_xor() is deprecated and does not state which lanes take
    // part; __shfl_xor_sync() names the participants and converges them first.
    for (int mask = 16; mask > 0; mask /= 2)
        val = max(val, __shfl_xor_sync(0xffffffff, val, mask));
#endif
    return val;
}
extern
"C"
__global__
void
computeInteractionGroups
(
extern
"C"
__global__
void
computeInteractionGroups
(
unsigned
long
long
*
__restrict__
forceBuffers
,
mixed
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
posq
,
const
int4
*
__restrict__
groupData
,
unsigned
long
long
*
__restrict__
forceBuffers
,
mixed
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
posq
,
const
int4
*
__restrict__
groupData
,
int
*
__restrict__
numGroupTiles
,
bool
useNeighborList
,
const
int
*
__restrict__
numGroupTiles
,
bool
useNeighborList
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
PARAMETER_ARGUMENTS
)
{
PARAMETER_ARGUMENTS
)
{
const
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
const
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
...
@@ -43,37 +56,42 @@ extern "C" __global__ void computeInteractionGroups(
...
@@ -43,37 +56,42 @@ extern "C" __global__ void computeInteractionGroups(
localData
[
threadIdx
.
x
].
fy
=
0.0
f
;
localData
[
threadIdx
.
x
].
fy
=
0.0
f
;
localData
[
threadIdx
.
x
].
fz
=
0.0
f
;
localData
[
threadIdx
.
x
].
fz
=
0.0
f
;
int
tj
=
tgx
;
int
tj
=
tgx
;
for
(
int
j
=
rangeStart
;
j
<
rangeEnd
;
j
++
)
{
int
rangeStop
=
rangeStart
+
reduceMax
(
rangeEnd
-
rangeStart
);
bool
isExcluded
=
(((
exclusions
>>
tj
)
&
1
)
==
0
);
SYNC_WARPS
;
int
localIndex
=
tbx
+
tj
;
for
(
int
j
=
rangeStart
;
j
<
rangeStop
;
j
++
)
{
posq2
=
make_real4
(
localData
[
localIndex
].
x
,
localData
[
localIndex
].
y
,
localData
[
localIndex
].
z
,
localData
[
localIndex
].
q
);
if
(
j
<
rangeEnd
)
{
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
bool
isExcluded
=
(((
exclusions
>>
tj
)
&
1
)
==
0
);
int
localIndex
=
tbx
+
tj
;
posq2
=
make_real4
(
localData
[
localIndex
].
x
,
localData
[
localIndex
].
y
,
localData
[
localIndex
].
z
,
localData
[
localIndex
].
q
);
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
delta
)
APPLY_PERIODIC_TO_DELTA
(
delta
)
#endif
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
if
(
!
isExcluded
&&
r2
<
CUTOFF_SQUARED
)
{
if
(
!
isExcluded
&&
r2
<
CUTOFF_SQUARED
)
{
#endif
#endif
real
invR
=
RSQRT
(
r2
);
real
invR
=
RSQRT
(
r2
);
real
r
=
r2
*
invR
;
real
r
=
r2
*
invR
;
LOAD_ATOM2_PARAMETERS
LOAD_ATOM2_PARAMETERS
real
dEdR
=
0.0
f
;
real
dEdR
=
0.0
f
;
real
tempEnergy
=
0.0
f
;
real
tempEnergy
=
0.0
f
;
const
real
interactionScale
=
1.0
f
;
const
real
interactionScale
=
1.0
f
;
COMPUTE_INTERACTION
COMPUTE_INTERACTION
energy
+=
tempEnergy
;
energy
+=
tempEnergy
;
delta
*=
dEdR
;
delta
*=
dEdR
;
force
.
x
-=
delta
.
x
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
y
-=
delta
.
y
;
force
.
z
-=
delta
.
z
;
force
.
z
-=
delta
.
z
;
localData
[
localIndex
].
fx
+=
delta
.
x
;
localData
[
localIndex
].
fx
+=
delta
.
x
;
localData
[
localIndex
].
fy
+=
delta
.
y
;
localData
[
localIndex
].
fy
+=
delta
.
y
;
localData
[
localIndex
].
fz
+=
delta
.
z
;
localData
[
localIndex
].
fz
+=
delta
.
z
;
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
}
}
#endif
#endif
tj
=
(
tj
==
rangeEnd
-
1
?
rangeStart
:
tj
+
1
);
tj
=
(
tj
==
rangeEnd
-
1
?
rangeStart
:
tj
+
1
);
}
SYNC_WARPS
;
}
}
if
(
exclusions
!=
0
)
{
if
(
exclusions
!=
0
)
{
atomicAdd
(
&
forceBuffers
[
atom1
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom1
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
x
*
0x100000000
)));
...
@@ -83,6 +101,7 @@ extern "C" __global__ void computeInteractionGroups(
...
@@ -83,6 +101,7 @@ extern "C" __global__ void computeInteractionGroups(
atomicAdd
(
&
forceBuffers
[
atom2
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fx
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom2
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fx
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom2
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fy
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom2
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fy
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom2
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fz
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom2
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fz
*
0x100000000
)));
SYNC_WARPS
;
}
}
energyBuffer
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
energy
;
energyBuffer
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
energy
;
SAVE_DERIVATIVES
SAVE_DERIVATIVES
...
@@ -134,9 +153,11 @@ extern "C" __global__ void buildNeighborList(int* __restrict__ rebuildNeighborL
...
@@ -134,9 +153,11 @@ extern "C" __global__ void buildNeighborList(int* __restrict__ rebuildNeighborL
if
(
tgx
==
0
)
if
(
tgx
==
0
)
anyInteraction
[
local_warp
]
=
false
;
anyInteraction
[
local_warp
]
=
false
;
int
tj
=
tgx
;
int
tj
=
tgx
;
int
rangeStop
=
rangeStart
+
reduceMax
(
rangeEnd
-
rangeStart
);
SYNC_WARPS
;
SYNC_WARPS
;
for
(
int
j
=
rangeStart
;
j
<
rangeEnd
&&
!
anyInteraction
[
local_warp
];
j
++
)
{
for
(
int
j
=
rangeStart
;
j
<
rangeStop
&&
!
anyInteraction
[
local_warp
];
j
++
)
{
if
(
tj
<
rangeEnd
)
{
SYNC_WARPS
;
if
(
j
<
rangeEnd
&&
tj
<
rangeEnd
)
{
bool
isExcluded
=
(((
exclusions
>>
tj
)
&
1
)
==
0
);
bool
isExcluded
=
(((
exclusions
>>
tj
)
&
1
)
==
0
);
int
localIndex
=
tbx
+
tj
;
int
localIndex
=
tbx
+
tj
;
real3
delta
=
make_real3
(
localPos
[
localIndex
].
x
-
posq1
.
x
,
localPos
[
localIndex
].
y
-
posq1
.
y
,
localPos
[
localIndex
].
z
-
posq1
.
z
);
real3
delta
=
make_real3
(
localPos
[
localIndex
].
x
-
posq1
.
x
,
localPos
[
localIndex
].
y
-
posq1
.
y
,
localPos
[
localIndex
].
z
-
posq1
.
z
);
...
...
platforms/opencl/src/kernels/customNonbondedGroups.cl
View file @
72def6fb
...
@@ -12,6 +12,22 @@ typedef struct {
...
@@ -12,6 +12,22 @@ typedef struct {
#
endif
#
endif
}
AtomData
;
}
AtomData
;
/**
 * Find the maximum of a value across all threads in a warp, and return that to
 * every thread.
 *
 * A "warp" here is a run of 32 consecutive work-items (local id / 32).  Each
 * work-item stores its value in temp[local id]; a tree reduction then folds the
 * upper half of the active range onto the lower half (offsets 16, 8, 4, 2, 1),
 * leaving the maximum in the warp's first slot, which every work-item reads back.
 *
 * @param val   this work-item's value
 * @param temp  local-memory scratch space indexed by get_local_id(0); it must
 *              have one int per work-item in the work-group
 * @return the maximum of val over the 32 work-items of this warp
 */
int reduceMax(int val, __local int* temp) {
    int indexInWarp = get_local_id(0)%32;
    temp[get_local_id(0)] = val;
    SYNC_WARPS;
    for (int offset = 16; offset > 0; offset /= 2) {
        // Only the lower `offset` lanes accumulate.  They read slots
        // [offset, 2*offset), which no lane writes during this step, and never
        // index past the warp's own 32 entries.
        if (indexInWarp < offset)
            temp[get_local_id(0)] = max(temp[get_local_id(0)], temp[get_local_id(0)+offset]);
        SYNC_WARPS;
    }
    // Slot 0 of the warp now holds the result.
    return temp[get_local_id(0)-indexInWarp];
}
/**
/**
*
This
function
is
used
on
devices
that
don
't
support
64
bit
atomics.
Multiple
threads
within
*
This
function
is
used
on
devices
that
don
't
support
64
bit
atomics.
Multiple
threads
within
*
a
single
tile
might
have
computed
forces
on
the
same
atom.
This
loops
over
them
and
makes
sure
*
a
single
tile
might
have
computed
forces
on
the
same
atom.
This
loops
over
them
and
makes
sure
...
@@ -53,6 +69,7 @@ __kernel void computeInteractionGroups(
...
@@ -53,6 +69,7 @@ __kernel void computeInteractionGroups(
mixed
energy
=
0
;
mixed
energy
=
0
;
INIT_DERIVATIVES
INIT_DERIVATIVES
__local
AtomData
localData[LOCAL_MEMORY_SIZE]
;
__local
AtomData
localData[LOCAL_MEMORY_SIZE]
;
__local
int
reductionBuffer[LOCAL_MEMORY_SIZE]
;
const
unsigned
int
startTile
=
(
useNeighborList
?
warp*numGroupTiles[0]/totalWarps
:
FIRST_TILE+warp*
(
LAST_TILE-FIRST_TILE
)
/totalWarps
)
;
const
unsigned
int
startTile
=
(
useNeighborList
?
warp*numGroupTiles[0]/totalWarps
:
FIRST_TILE+warp*
(
LAST_TILE-FIRST_TILE
)
/totalWarps
)
;
const
unsigned
int
endTile
=
(
useNeighborList
?
(
warp+1
)
*numGroupTiles[0]/totalWarps
:
FIRST_TILE+
(
warp+1
)
*
(
LAST_TILE-FIRST_TILE
)
/totalWarps
)
;
const
unsigned
int
endTile
=
(
useNeighborList
?
(
warp+1
)
*numGroupTiles[0]/totalWarps
:
FIRST_TILE+
(
warp+1
)
*
(
LAST_TILE-FIRST_TILE
)
/totalWarps
)
;
...
@@ -76,9 +93,10 @@ __kernel void computeInteractionGroups(
...
@@ -76,9 +93,10 @@ __kernel void computeInteractionGroups(
localData[get_local_id
(
0
)
].fy
=
0.0f
;
localData[get_local_id
(
0
)
].fy
=
0.0f
;
localData[get_local_id
(
0
)
].fz
=
0.0f
;
localData[get_local_id
(
0
)
].fz
=
0.0f
;
int
tj
=
tgx
;
int
tj
=
tgx
;
int
rangeStop
=
rangeStart
+
reduceMax
(
rangeEnd-rangeStart,
reductionBuffer
)
;
SYNC_WARPS
;
SYNC_WARPS
;
for
(
int
j
=
rangeStart
; j < range
End
; j++) {
for
(
int
j
=
rangeStart
; j < range
Stop
; j++) {
if
(
t
j
<
rangeEnd
)
{
if
(
j
<
rangeEnd
)
{
bool
isExcluded
=
(((
exclusions>>tj
)
&1
)
==
0
)
;
bool
isExcluded
=
(((
exclusions>>tj
)
&1
)
==
0
)
;
int
localIndex
=
tbx+tj
;
int
localIndex
=
tbx+tj
;
posq2
=
(
real4
)
(
localData[localIndex].x,
localData[localIndex].y,
localData[localIndex].z,
localData[localIndex].q
)
;
posq2
=
(
real4
)
(
localData[localIndex].x,
localData[localIndex].y,
localData[localIndex].z,
localData[localIndex].q
)
;
...
@@ -161,6 +179,7 @@ __kernel void buildNeighborList(__global int* restrict rebuildNeighborList, __gl
...
@@ -161,6 +179,7 @@ __kernel void buildNeighborList(__global int* restrict rebuildNeighborList, __gl
__local
real4
localPos[LOCAL_MEMORY_SIZE]
;
__local
real4
localPos[LOCAL_MEMORY_SIZE]
;
__local
volatile
bool
anyInteraction[WARPS_IN_BLOCK]
;
__local
volatile
bool
anyInteraction[WARPS_IN_BLOCK]
;
__local
volatile
int
tileIndex[WARPS_IN_BLOCK]
;
__local
volatile
int
tileIndex[WARPS_IN_BLOCK]
;
__local
int
reductionBuffer[LOCAL_MEMORY_SIZE]
;
const
unsigned
int
startTile
=
warp*NUM_TILES/totalWarps
;
const
unsigned
int
startTile
=
warp*NUM_TILES/totalWarps
;
const
unsigned
int
endTile
=
(
warp+1
)
*NUM_TILES/totalWarps
;
const
unsigned
int
endTile
=
(
warp+1
)
*NUM_TILES/totalWarps
;
...
@@ -176,9 +195,11 @@ __kernel void buildNeighborList(__global int* restrict rebuildNeighborList, __gl
...
@@ -176,9 +195,11 @@ __kernel void buildNeighborList(__global int* restrict rebuildNeighborList, __gl
if
(
tgx
==
0
)
if
(
tgx
==
0
)
anyInteraction[local_warp]
=
false
;
anyInteraction[local_warp]
=
false
;
int
tj
=
tgx
;
int
tj
=
tgx
;
int
rangeStop
=
rangeStart
+
reduceMax
(
rangeEnd-rangeStart,
reductionBuffer
)
;
SYNC_WARPS
;
SYNC_WARPS
;
for
(
int
j
=
rangeStart
; j < rangeEnd && !anyInteraction[local_warp]; j++) {
for
(
int
j
=
rangeStart
; j < rangeStop && !anyInteraction[local_warp]; j++) {
if
(
tj
<
rangeEnd
)
{
SYNC_WARPS
;
if
(
j
<
rangeEnd
)
{
bool
isExcluded
=
(((
exclusions>>tj
)
&1
)
==
0
)
;
bool
isExcluded
=
(((
exclusions>>tj
)
&1
)
==
0
)
;
int
localIndex
=
tbx+tj
;
int
localIndex
=
tbx+tj
;
real4
delta
=
(
real4
)
(
localPos[localIndex].xyz
-
posq1.xyz,
0
)
;
real4
delta
=
(
real4
)
(
localPos[localIndex].xyz
-
posq1.xyz,
0
)
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment