Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
e7b18ca4
Commit
e7b18ca4
authored
Oct 20, 2010
by
Peter Eastman
Browse files
Began creating kernels optimized for CPU
parent
a33839b0
Changes
7
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
518 additions
and
45 deletions
+518
-45
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
+32
-20
platforms/opencl/src/OpenCLNonbondedUtilities.h
platforms/opencl/src/OpenCLNonbondedUtilities.h
+1
-1
platforms/opencl/src/kernels/findInteractingBlocks.cl
platforms/opencl/src/kernels/findInteractingBlocks.cl
+2
-1
platforms/opencl/src/kernels/findInteractingBlocks_cpu.cl
platforms/opencl/src/kernels/findInteractingBlocks_cpu.cl
+155
-0
platforms/opencl/src/kernels/nonbonded_cpu.cl
platforms/opencl/src/kernels/nonbonded_cpu.cl
+297
-0
platforms/opencl/src/kernels/nonbonded_default.cl
platforms/opencl/src/kernels/nonbonded_default.cl
+16
-12
platforms/opencl/src/kernels/nonbonded_nvidia.cl
platforms/opencl/src/kernels/nonbonded_nvidia.cl
+15
-11
No files found.
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
View file @
e7b18ca4
...
...
@@ -40,7 +40,11 @@ OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : con
interactionCount
(
NULL
),
blockCenter
(
NULL
),
blockBoundingBox
(
NULL
)
{
// Decide how many force buffers to use.
deviceIsCpu
=
(
context
.
getDevice
().
getInfo
<
CL_DEVICE_TYPE
>
()
==
CL_DEVICE_TYPE_CPU
);
forceBufferPerAtomBlock
=
false
;
if
(
deviceIsCpu
)
numForceBuffers
=
context
.
getNumThreadBlocks
();
else
{
numForceBuffers
=
context
.
getNumThreadBlocks
()
*
OpenCLContext
::
ThreadBlockSize
/
OpenCLContext
::
TileSize
;
if
(
numForceBuffers
>=
context
.
getNumAtomBlocks
())
{
// For small systems, it is more efficient to have one force buffer per block of 32 atoms instead of one per warp.
...
...
@@ -48,6 +52,7 @@ OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : con
forceBufferPerAtomBlock
=
true
;
numForceBuffers
=
context
.
getNumAtomBlocks
();
}
}
}
OpenCLNonbondedUtilities
::~
OpenCLNonbondedUtilities
()
{
...
...
@@ -210,8 +215,7 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
if
(
maxInteractingTiles
>
numTiles
)
maxInteractingTiles
=
numTiles
;
interactingTiles
=
new
OpenCLArray
<
mm_ushort2
>
(
context
,
maxInteractingTiles
,
"interactingTiles"
);
if
(
context
.
getSIMDWidth
()
==
32
)
interactionFlags
=
new
OpenCLArray
<
cl_uint
>
(
context
,
maxInteractingTiles
,
"interactionFlags"
);
interactionFlags
=
new
OpenCLArray
<
cl_uint
>
(
context
,
context
.
getSIMDWidth
()
==
32
||
deviceIsCpu
?
maxInteractingTiles
:
1
,
"interactionFlags"
);
interactionCount
=
new
OpenCLArray
<
cl_uint
>
(
context
,
1
,
"interactionCount"
,
true
);
blockCenter
=
new
OpenCLArray
<
mm_float4
>
(
context
,
numAtomBlocks
,
"blockCenter"
);
blockBoundingBox
=
new
OpenCLArray
<
mm_float4
>
(
context
,
numAtomBlocks
,
"blockBoundingBox"
);
...
...
@@ -229,7 +233,8 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
defines
[
"USE_OUTPUT_BUFFER_PER_BLOCK"
]
=
"1"
;
if
(
usePeriodic
)
defines
[
"USE_PERIODIC"
]
=
"1"
;
cl
::
Program
interactingBlocksProgram
=
context
.
createProgram
(
OpenCLKernelSources
::
findInteractingBlocks
,
defines
);
string
file
=
(
deviceIsCpu
?
OpenCLKernelSources
::
findInteractingBlocks_cpu
:
OpenCLKernelSources
::
findInteractingBlocks
);
cl
::
Program
interactingBlocksProgram
=
context
.
createProgram
(
file
,
defines
);
findBlockBoundsKernel
=
cl
::
Kernel
(
interactingBlocksProgram
,
"findBlockBounds"
);
findBlockBoundsKernel
.
setArg
<
cl_int
>
(
0
,
context
.
getNumAtoms
());
findBlockBoundsKernel
.
setArg
<
cl
::
Buffer
>
(
3
,
context
.
getPosq
().
getDeviceBuffer
());
...
...
@@ -242,9 +247,10 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
findInteractingBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
4
,
blockBoundingBox
->
getDeviceBuffer
());
findInteractingBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
5
,
interactionCount
->
getDeviceBuffer
());
findInteractingBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
6
,
interactingTiles
->
getDeviceBuffer
());
findInteractingBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
7
,
context
.
getPosq
().
getDeviceBuffer
());
findInteractingBlocksKernel
.
setArg
<
cl_uint
>
(
8
,
interactingTiles
->
getSize
());
if
(
context
.
getSIMDWidth
()
==
32
)
{
findInteractingBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
7
,
interactionFlags
->
getDeviceBuffer
());
findInteractingBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
8
,
context
.
getPosq
().
getDeviceBuffer
());
findInteractingBlocksKernel
.
setArg
<
cl_uint
>
(
9
,
interactingTiles
->
getSize
());
if
(
context
.
getSIMDWidth
()
==
32
&&
!
deviceIsCpu
)
{
findInteractionsWithinBlocksKernel
=
cl
::
Kernel
(
interactingBlocksProgram
,
"findInteractionsWithinBlocks"
);
findInteractionsWithinBlocksKernel
.
setArg
<
cl_float
>
(
0
,
(
cl_float
)
(
cutoff
*
cutoff
));
findInteractionsWithinBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
3
,
context
.
getPosq
().
getDeviceBuffer
());
...
...
@@ -279,8 +285,8 @@ void OpenCLNonbondedUtilities::prepareInteractions() {
context
.
executeKernel
(
findBlockBoundsKernel
,
context
.
getNumAtoms
());
findInteractingBlocksKernel
.
setArg
<
mm_float4
>
(
1
,
context
.
getPeriodicBoxSize
());
findInteractingBlocksKernel
.
setArg
<
mm_float4
>
(
2
,
context
.
getInvPeriodicBoxSize
());
context
.
executeKernel
(
findInteractingBlocksKernel
,
context
.
getNumAtoms
());
if
(
context
.
getSIMDWidth
()
==
32
)
{
context
.
executeKernel
(
findInteractingBlocksKernel
,
context
.
getNumAtoms
()
,
deviceIsCpu
?
1
:
-
1
);
if
(
context
.
getSIMDWidth
()
==
32
&&
!
deviceIsCpu
)
{
findInteractionsWithinBlocksKernel
.
setArg
<
mm_float4
>
(
1
,
context
.
getPeriodicBoxSize
());
findInteractionsWithinBlocksKernel
.
setArg
<
mm_float4
>
(
2
,
context
.
getInvPeriodicBoxSize
());
context
.
executeKernel
(
findInteractionsWithinBlocksKernel
,
context
.
getNumAtoms
());
...
...
@@ -293,7 +299,7 @@ void OpenCLNonbondedUtilities::computeInteractions() {
forceKernel
.
setArg
<
mm_float4
>
(
10
,
context
.
getPeriodicBoxSize
());
forceKernel
.
setArg
<
mm_float4
>
(
11
,
context
.
getInvPeriodicBoxSize
());
}
context
.
executeKernel
(
forceKernel
,
(
context
.
getNumAtomBlocks
()
*
(
context
.
getNumAtomBlocks
()
+
1
)
/
2
)
*
OpenCLContext
::
TileSize
);
context
.
executeKernel
(
forceKernel
,
(
context
.
getNumAtomBlocks
()
*
(
context
.
getNumAtomBlocks
()
+
1
)
/
2
)
*
OpenCLContext
::
TileSize
,
deviceIsCpu
?
1
:
-
1
);
}
}
...
...
@@ -373,23 +379,23 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
stringstream
loadLocal1
;
for
(
int
i
=
0
;
i
<
(
int
)
params
.
size
();
i
++
)
{
if
(
params
[
i
].
getNumComponents
()
==
1
)
{
loadLocal1
<<
"localData[
get_
local
_id(0)
]."
<<
params
[
i
].
getName
()
<<
" = "
<<
params
[
i
].
getName
()
<<
"1;
\n
"
;
loadLocal1
<<
"localData[local
AtomIndex
]."
<<
params
[
i
].
getName
()
<<
" = "
<<
params
[
i
].
getName
()
<<
"1;
\n
"
;
}
else
{
for
(
int
j
=
0
;
j
<
params
[
i
].
getNumComponents
();
++
j
)
loadLocal1
<<
"localData[
get_
local
_id(0)
]."
<<
params
[
i
].
getName
()
<<
"_"
<<
suffixes
[
j
]
<<
" = "
<<
params
[
i
].
getName
()
<<
"1."
<<
suffixes
[
j
]
<<
";
\n
"
;
loadLocal1
<<
"localData[local
AtomIndex
]."
<<
params
[
i
].
getName
()
<<
"_"
<<
suffixes
[
j
]
<<
" = "
<<
params
[
i
].
getName
()
<<
"1."
<<
suffixes
[
j
]
<<
";
\n
"
;
}
}
replacements
[
"LOAD_LOCAL_PARAMETERS_FROM_1"
]
=
loadLocal1
.
str
();
stringstream
loadLocal2
;
for
(
int
i
=
0
;
i
<
(
int
)
params
.
size
();
i
++
)
{
if
(
params
[
i
].
getNumComponents
()
==
1
)
{
loadLocal2
<<
"localData[
get_
local
_id(0)
]."
<<
params
[
i
].
getName
()
<<
" = global_"
<<
params
[
i
].
getName
()
<<
"[j];
\n
"
;
loadLocal2
<<
"localData[local
AtomIndex
]."
<<
params
[
i
].
getName
()
<<
" = global_"
<<
params
[
i
].
getName
()
<<
"[j];
\n
"
;
}
else
{
loadLocal2
<<
params
[
i
].
getType
()
<<
" temp_"
<<
params
[
i
].
getName
()
<<
" = global_"
<<
params
[
i
].
getName
()
<<
"[j];
\n
"
;
for
(
int
j
=
0
;
j
<
params
[
i
].
getNumComponents
();
++
j
)
loadLocal2
<<
"localData[
get_
local
_id(0)
]."
<<
params
[
i
].
getName
()
<<
"_"
<<
suffixes
[
j
]
<<
" = temp_"
<<
params
[
i
].
getName
()
<<
"."
<<
suffixes
[
j
]
<<
";
\n
"
;
loadLocal2
<<
"localData[local
AtomIndex
]."
<<
params
[
i
].
getName
()
<<
"_"
<<
suffixes
[
j
]
<<
" = temp_"
<<
params
[
i
].
getName
()
<<
"."
<<
suffixes
[
j
]
<<
";
\n
"
;
}
}
replacements
[
"LOAD_LOCAL_PARAMETERS_FROM_GLOBAL"
]
=
loadLocal2
.
str
();
...
...
@@ -434,7 +440,13 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
defines
[
"NUM_ATOMS"
]
=
OpenCLExpressionUtilities
::
intToString
(
context
.
getNumAtoms
());
defines
[
"PADDED_NUM_ATOMS"
]
=
OpenCLExpressionUtilities
::
intToString
(
context
.
getPaddedNumAtoms
());
defines
[
"NUM_BLOCKS"
]
=
OpenCLExpressionUtilities
::
intToString
(
context
.
getNumAtomBlocks
());
string
file
=
(
context
.
getSIMDWidth
()
==
32
?
OpenCLKernelSources
::
nonbonded_nvidia
:
OpenCLKernelSources
::
nonbonded_default
);
string
file
;
if
(
deviceIsCpu
)
file
=
OpenCLKernelSources
::
nonbonded_cpu
;
else
if
(
context
.
getSIMDWidth
()
==
32
)
file
=
OpenCLKernelSources
::
nonbonded_nvidia
;
else
file
=
OpenCLKernelSources
::
nonbonded_default
;
cl
::
Program
program
=
context
.
createProgram
(
context
.
replaceStrings
(
file
,
replacements
),
defines
);
cl
::
Kernel
kernel
(
program
,
"computeNonbonded"
);
...
...
@@ -454,7 +466,7 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
interactionCount
->
getDeviceBuffer
());
index
+=
2
;
// The periodic box size arguments are set when the kernel is executed.
kernel
.
setArg
<
cl_uint
>
(
index
++
,
interactingTiles
->
getSize
());
if
(
context
.
getSIMDWidth
()
==
32
)
if
(
context
.
getSIMDWidth
()
==
32
||
deviceIsCpu
)
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
interactionFlags
->
getDeviceBuffer
());
}
else
{
...
...
platforms/opencl/src/OpenCLNonbondedUtilities.h
View file @
e7b18ca4
...
...
@@ -211,7 +211,7 @@ private:
std
::
string
kernelSource
;
std
::
map
<
std
::
string
,
std
::
string
>
kernelDefines
;
double
cutoff
;
bool
useCutoff
,
usePeriodic
,
forceBufferPerAtomBlock
;
bool
useCutoff
,
usePeriodic
,
forceBufferPerAtomBlock
,
deviceIsCpu
;
int
numForceBuffers
;
};
...
...
platforms/opencl/src/kernels/findInteractingBlocks.cl
View file @
e7b18ca4
...
...
@@ -158,7 +158,8 @@ void storeInteractionData(__local ushort2* buffer, __local int* valid, __local s
*
mark
them
as
non-interacting.
*/
__kernel
void
findBlocksWithInteractions
(
float
cutoffSquared,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
__global
float4*
blockCenter,
__global
float4*
blockBoundingBox,
__global
unsigned
int*
interactionCount,
__global
ushort2*
interactingTiles,
__global
float4*
posq,
unsigned
int
maxTiles
)
{
__global
float4*
blockBoundingBox,
__global
unsigned
int*
interactionCount,
__global
ushort2*
interactingTiles,
__global
unsigned
int*
interactionFlags,
__global
float4*
posq,
unsigned
int
maxTiles
)
{
__local
ushort2
buffer[BUFFER_SIZE]
;
__local
int
valid[BUFFER_SIZE]
;
__local
short
sum[BUFFER_SIZE]
;
...
...
platforms/opencl/src/kernels/findInteractingBlocks_cpu.cl
0 → 100644
View file @
e7b18ca4
#
pragma
OPENCL
EXTENSION
cl_khr_global_int32_base_atomics
:
enable
#
pragma
OPENCL
EXTENSION
cl_khr_byte_addressable_store
:
enable
#
define
TILE_SIZE
32
#
define
GROUP_SIZE
64
#
define
BUFFER_GROUPS
4
#
define
BUFFER_SIZE
BUFFER_GROUPS*GROUP_SIZE
/**
*
Find
a
bounding
box
for
the
atoms
in
each
block.
*/
__kernel
void
findBlockBounds
(
int
numAtoms,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
__global
float4*
posq,
__global
float4*
blockCenter,
__global
float4*
blockBoundingBox,
__global
unsigned
int*
interactionCount
)
{
int
index
=
get_global_id
(
0
)
;
int
base
=
index*TILE_SIZE
;
while
(
base
<
numAtoms
)
{
float4
pos
=
posq[base]
;
#
ifdef
USE_PERIODIC
pos.x
-=
floor
(
pos.x*invPeriodicBoxSize.x
)
*periodicBoxSize.x
;
pos.y
-=
floor
(
pos.y*invPeriodicBoxSize.y
)
*periodicBoxSize.y
;
pos.z
-=
floor
(
pos.z*invPeriodicBoxSize.z
)
*periodicBoxSize.z
;
float4
firstPoint
=
pos
;
#
endif
float4
minPos
=
pos
;
float4
maxPos
=
pos
;
int
last
=
min
(
base+TILE_SIZE,
numAtoms
)
;
for
(
int
i
=
base+1
; i < last; i++) {
pos
=
posq[i]
;
#
ifdef
USE_PERIODIC
pos.x
-=
floor
((
pos.x-firstPoint.x
)
*invPeriodicBoxSize.x+0.5f
)
*periodicBoxSize.x
;
pos.y
-=
floor
((
pos.y-firstPoint.y
)
*invPeriodicBoxSize.y+0.5f
)
*periodicBoxSize.y
;
pos.z
-=
floor
((
pos.z-firstPoint.z
)
*invPeriodicBoxSize.z+0.5f
)
*periodicBoxSize.z
;
#
endif
minPos
=
min
(
minPos,
pos
)
;
maxPos
=
max
(
maxPos,
pos
)
;
}
blockBoundingBox[index]
=
0.5f*
(
maxPos-minPos
)
;
blockCenter[index]
=
0.5f*
(
maxPos+minPos
)
;
index
+=
get_global_size
(
0
)
;
base
=
index*TILE_SIZE
;
}
if
(
get_global_id
(
0
)
==
0
)
interactionCount[0]
=
0
;
}
/**
*
This
is
called
by
findBlocksWithInteractions
()
.
It
compacts
the
list
of
blocks
and
writes
them
*
to
global
memory.
*/
void
storeInteractionData
(
__local
ushort2*
buffer,
int
numValid,
__local
unsigned
int*
flagsBuffer,
__local
float4*
temp,
__global
unsigned
int*
interactionCount,
__global
ushort2*
interactingTiles,
__global
unsigned
int*
interactionFlags,
float
cutoffSquared,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
__global
float4*
posq,
__global
float4*
blockCenter,
__global
float4*
blockBoundingBox,
unsigned
int
maxTiles
)
{
//
Filter
the
list
of
tiles
by
comparing
the
distance
from
each
atom
to
the
other
bounding
box.
int
lasty
=
-1
;
for
(
int
tile
=
0
; tile < numValid; ) {
int
x
=
buffer[tile].x
;
int
y
=
buffer[tile].y
;
if
(
x
==
y
)
{
tile++
;
continue
;
}
//
Load
the
atom
positions
and
the
bounding
box
of
the
other
block.
float4
center
=
blockCenter[x]
;
float4
boxSize
=
blockBoundingBox[x]
;
if
(
y
!=
lasty
)
for
(
int
atom
=
0
; atom < TILE_SIZE; atom++)
temp[atom]
=
posq[y*TILE_SIZE+atom]
;
lasty
=
y
;
//
Find
the
distance
of
each
atom
from
the
bounding
box.
unsigned
int
flags
=
0
;
for
(
int
atom
=
0
; atom < TILE_SIZE; atom++) {
float4
delta
=
temp[atom]-center
;
#
ifdef
USE_PERIODIC
delta.x
-=
floor
(
delta.x*invPeriodicBoxSize.x+0.5f
)
*periodicBoxSize.x
;
delta.y
-=
floor
(
delta.y*invPeriodicBoxSize.y+0.5f
)
*periodicBoxSize.y
;
delta.z
-=
floor
(
delta.z*invPeriodicBoxSize.z+0.5f
)
*periodicBoxSize.z
;
#
endif
delta
=
max
((
float4
)
0.0f,
fabs
(
delta
)
-boxSize
)
;
if
(
delta.x*delta.x+delta.y*delta.y+delta.z*delta.z
<
cutoffSquared
)
flags
+=
1
<<
atom
;
}
if
(
flags
==
0
)
{
//
This
tile
contains
no
interactions.
numValid--
;
buffer[tile]
=
buffer[numValid]
;
}
else
{
flagsBuffer[tile]
=
flags
;
tile++
;
}
}
//
Store
it
to
global
memory.
int
baseIndex
=
atom_add
(
interactionCount,
numValid
)
;
if
(
baseIndex+numValid
<=
maxTiles
)
for
(
int
i
=
0
; i < numValid; i++) {
interactingTiles[baseIndex+i]
=
buffer[i]
;
interactionFlags[baseIndex+i]
=
flagsBuffer[i]
;
}
}
/**
*
Compare
the
bounding
boxes
for
each
pair
of
blocks.
If
they
are
sufficiently
far
apart,
*
mark
them
as
non-interacting.
*/
__kernel
void
findBlocksWithInteractions
(
float
cutoffSquared,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
__global
float4*
blockCenter,
__global
float4*
blockBoundingBox,
__global
unsigned
int*
interactionCount,
__global
ushort2*
interactingTiles,
__global
unsigned
int*
interactionFlags,
__global
float4*
posq,
unsigned
int
maxTiles
)
{
__local
ushort2
buffer[BUFFER_SIZE]
;
__local
unsigned
int
flagsBuffer[BUFFER_SIZE]
;
__local
float4
temp[TILE_SIZE]
;
int
valuesInBuffer
=
0
;
const
int
numTiles
=
(
NUM_BLOCKS*
(
NUM_BLOCKS+1
))
/2
;
unsigned
int
start
=
get_group_id
(
0
)
*numTiles/get_num_groups
(
0
)
;
unsigned
int
end
=
(
get_group_id
(
0
)
+1
)
*numTiles/get_num_groups
(
0
)
;
for
(
int
index
=
start
; index < end; index++) {
//
Identify
the
pair
of
blocks
to
compare.
unsigned
int
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS+0.5f-sqrt
((
NUM_BLOCKS+0.5f
)
*
(
NUM_BLOCKS+0.5f
)
-2*index
))
;
unsigned
int
x
=
(
index-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
if
(
x
>=
NUM_BLOCKS
)
{
//
Occasionally
happens
due
to
roundoff
error.
y++
;
x
=
(
index-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
}
//
Find
the
distance
between
the
bounding
boxes
of
the
two
cells.
float4
delta
=
blockCenter[x]-blockCenter[y]
;
#
ifdef
USE_PERIODIC
delta.x
-=
floor
(
delta.x*invPeriodicBoxSize.x+0.5f
)
*periodicBoxSize.x
;
delta.y
-=
floor
(
delta.y*invPeriodicBoxSize.y+0.5f
)
*periodicBoxSize.y
;
delta.z
-=
floor
(
delta.z*invPeriodicBoxSize.z+0.5f
)
*periodicBoxSize.z
;
#
endif
float4
boxSizea
=
blockBoundingBox[x]
;
float4
boxSizeb
=
blockBoundingBox[y]
;
delta.x
=
max
(
0.0f,
fabs
(
delta.x
)
-boxSizea.x-boxSizeb.x
)
;
delta.y
=
max
(
0.0f,
fabs
(
delta.y
)
-boxSizea.y-boxSizeb.y
)
;
delta.z
=
max
(
0.0f,
fabs
(
delta.z
)
-boxSizea.z-boxSizeb.z
)
;
if
(
delta.x*delta.x+delta.y*delta.y+delta.z*delta.z
<
cutoffSquared
)
{
//
Add
this
tile
to
the
buffer.
buffer[valuesInBuffer++]
=
(
ushort2
)
(
x,
y
)
;
if
(
valuesInBuffer
==
BUFFER_SIZE
)
{
storeInteractionData
(
buffer,
valuesInBuffer,
flagsBuffer,
temp,
interactionCount,
interactingTiles,
interactionFlags,
cutoffSquared,
periodicBoxSize,
invPeriodicBoxSize,
posq,
blockCenter,
blockBoundingBox,
maxTiles
)
;
valuesInBuffer
=
0
;
}
}
}
storeInteractionData
(
buffer,
valuesInBuffer,
flagsBuffer,
temp,
interactionCount,
interactingTiles,
interactionFlags,
cutoffSquared,
periodicBoxSize,
invPeriodicBoxSize,
posq,
blockCenter,
blockBoundingBox,
maxTiles
)
;
}
platforms/opencl/src/kernels/nonbonded_cpu.cl
0 → 100644
View file @
e7b18ca4
#
define
TILE_SIZE
32
typedef
struct
{
float
x,
y,
z
;
float
q
;
float
fx,
fy,
fz
;
ATOM_PARAMETER_DATA
}
AtomData
;
/**
*
Compute
nonbonded
interactions.
*/
__kernel
void
computeNonbonded
(
__global
float4*
forceBuffers,
__global
float*
energyBuffer,
__global
float4*
posq,
__global
unsigned
int*
exclusions,
__global
unsigned
int*
exclusionIndices,
__global
unsigned
int*
exclusionRowIndices,
__local
AtomData*
localData,
__local
float4*
tempBuffer,
#
ifdef
USE_CUTOFF
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
unsigned
int*
interactionFlags
#
else
unsigned
int
numTiles
#
endif
PARAMETER_ARGUMENTS
)
{
#
ifdef
USE_CUTOFF
unsigned
int
numTiles
=
interactionCount[0]
;
unsigned
int
pos
=
get_group_id
(
0
)
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS*
(
NUM_BLOCKS+1
)
/2
:
numTiles
)
/get_num_groups
(
0
)
;
unsigned
int
end
=
(
get_group_id
(
0
)
+1
)
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS*
(
NUM_BLOCKS+1
)
/2
:
numTiles
)
/get_num_groups
(
0
)
;
#
else
unsigned
int
pos
=
get_group_id
(
0
)
*numTiles/get_num_groups
(
0
)
;
unsigned
int
end
=
(
get_group_id
(
0
)
+1
)
*numTiles/get_num_groups
(
0
)
;
#
endif
float
energy
=
0.0f
;
unsigned
int
lasty
=
0xFFFFFFFF
;
while
(
pos
<
end
)
{
//
Extract
the
coordinates
of
this
tile
unsigned
int
x,
y
;
#
ifdef
USE_CUTOFF
if
(
numTiles
<=
maxTiles
)
{
ushort2
tileIndices
=
tiles[pos]
;
x
=
tileIndices.x
;
y
=
tileIndices.y
;
}
else
#
endif
{
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS+0.5f-sqrt
((
NUM_BLOCKS+0.5f
)
*
(
NUM_BLOCKS+0.5f
)
-2*pos
))
;
x
=
(
pos-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
if
(
x
>=
NUM_BLOCKS
)
{
//
Occasionally
happens
due
to
roundoff
error.
y++
;
x
=
(
pos-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
}
}
unsigned
int
tgx
=
get_local_id
(
0
)
&
(
TILE_SIZE-1
)
;
//
Locate
the
exclusion
data
for
this
tile.
#
ifdef
USE_EXCLUSIONS
unsigned
int
exclusionStart
=
exclusionRowIndices[x]
;
unsigned
int
exclusionEnd
=
exclusionRowIndices[x+1]
;
int
exclusionIndex
=
-1
;
for
(
int
i
=
exclusionStart
; i < exclusionEnd; i++)
if
(
exclusionIndices[i]
==
y
)
{
exclusionIndex
=
i*TILE_SIZE
;
break
;
}
bool
hasExclusions
=
(
exclusionIndex
>
-1
)
;
#
endif
//
Load
the
data
for
this
tile
if
we
don
't
already
have
it
cached.
if
(
lasty
!=
y
)
{
for
(
int
localAtomIndex
=
0
; localAtomIndex < TILE_SIZE; localAtomIndex++) {
unsigned
int
j
=
y*TILE_SIZE
+
localAtomIndex
;
float4
tempPosq
=
posq[j]
;
localData[localAtomIndex].x
=
tempPosq.x
;
localData[localAtomIndex].y
=
tempPosq.y
;
localData[localAtomIndex].z
=
tempPosq.z
;
localData[localAtomIndex].q
=
tempPosq.w
;
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
}
if
(
x
==
y
)
{
//
This
tile
is
on
the
diagonal.
for
(
unsigned
int
tgx
=
0
; tgx < TILE_SIZE; tgx++) {
#
ifdef
USE_EXCLUSIONS
unsigned
int
excl
=
exclusions[exclusionIndex+tgx]
;
#
endif
unsigned
int
atom1
=
x*TILE_SIZE+tgx
;
float4
force
=
0.0f
;
float4
posq1
=
posq[atom1]
;
LOAD_ATOM1_PARAMETERS
for
(
unsigned
int
j
=
0
; j < TILE_SIZE; j++) {
#
ifdef
USE_EXCLUSIONS
bool
isExcluded
=
!
(
excl
&
0x1
)
;
#
endif
float4
posq2
=
(
float4
)
(
localData[j].x,
localData[j].y,
localData[j].z,
localData[j].q
)
;
float4
delta
=
(
float4
)
(
posq2.xyz
-
posq1.xyz,
0.0f
)
;
#
ifdef
USE_PERIODIC
delta.x
-=
floor
(
delta.x*invPeriodicBoxSize.x+0.5f
)
*periodicBoxSize.x
;
delta.y
-=
floor
(
delta.y*invPeriodicBoxSize.y+0.5f
)
*periodicBoxSize.y
;
delta.z
-=
floor
(
delta.z*invPeriodicBoxSize.z+0.5f
)
*periodicBoxSize.z
;
#
endif
float
r2
=
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
;
#
ifdef
USE_CUTOFF
if
(
r2
<
CUTOFF_SQUARED
)
{
#
endif
float
invR
=
RSQRT
(
r2
)
;
float
r
=
RECIP
(
invR
)
;
unsigned
int
atom2
=
j
;
LOAD_ATOM2_PARAMETERS
atom2
=
y*TILE_SIZE+j
;
#
ifdef
USE_SYMMETRIC
float
dEdR
=
0.0f
;
#
else
float4
dEdR1
=
(
float4
)
0.0f
;
float4
dEdR2
=
(
float4
)
0.0f
;
#
endif
float
tempEnergy
=
0.0f
;
COMPUTE_INTERACTION
energy
+=
0.5f*tempEnergy
;
#
ifdef
USE_SYMMETRIC
force.xyz
-=
delta.xyz*dEdR
;
#
else
force.xyz
-=
dEdR1.xyz
;
#
endif
#
ifdef
USE_CUTOFF
}
#
endif
excl
>>=
1
;
}
//
Write
results.
unsigned
int
offset
=
x*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
forceBuffers[offset].xyz
=
forceBuffers[offset].xyz+force.xyz
;
}
}
else
{
//
This
is
an
off-diagonal
tile.
for
(
int
tgx
=
0
; tgx < TILE_SIZE; tgx++) {
localData[tgx].fx
=
0.0f
;
localData[tgx].fy
=
0.0f
;
localData[tgx].fz
=
0.0f
;
}
#
ifdef
USE_CUTOFF
unsigned
int
flags
=
(
numTiles
<=
maxTiles
?
interactionFlags[pos]
:
0xFFFFFFFF
)
;
if
(
!hasExclusions
&&
flags
!=
0xFFFFFFFF
)
{
if
(
flags
==
0
)
{
//
No
interactions
in
this
tile.
}
else
{
//
Compute
only
a
subset
of
the
interactions
in
this
tile.
for
(
unsigned
int
tgx
=
0
; tgx < TILE_SIZE; tgx++) {
unsigned
int
atom1
=
x*TILE_SIZE+tgx
;
float4
force
=
0.0f
;
float4
posq1
=
posq[atom1]
;
LOAD_ATOM1_PARAMETERS
for
(
unsigned
int
j
=
0
; j < TILE_SIZE; j++) {
if
((
flags&
(
1<<j
))
!=
0
)
{
bool
isExcluded
=
false
;
float4
posq2
=
(
float4
)
(
localData[j].x,
localData[j].y,
localData[j].z,
localData[j].q
)
;
float4
delta
=
(
float4
)
(
posq2.xyz
-
posq1.xyz,
0.0f
)
;
#
ifdef
USE_PERIODIC
delta.x
-=
floor
(
delta.x*invPeriodicBoxSize.x+0.5f
)
*periodicBoxSize.x
;
delta.y
-=
floor
(
delta.y*invPeriodicBoxSize.y+0.5f
)
*periodicBoxSize.y
;
delta.z
-=
floor
(
delta.z*invPeriodicBoxSize.z+0.5f
)
*periodicBoxSize.z
;
#
endif
float
r2
=
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
;
#
ifdef
USE_CUTOFF
if
(
r2
<
CUTOFF_SQUARED
)
{
#
endif
float
invR
=
RSQRT
(
r2
)
;
float
r
=
RECIP
(
invR
)
;
unsigned
int
atom2
=
j
;
LOAD_ATOM2_PARAMETERS
atom2
=
y*TILE_SIZE+j
;
#
ifdef
USE_SYMMETRIC
float
dEdR
=
0.0f
;
#
else
float4
dEdR1
=
(
float4
)
0.0f
;
float4
dEdR2
=
(
float4
)
0.0f
;
#
endif
float
tempEnergy
=
0.0f
;
COMPUTE_INTERACTION
energy
+=
tempEnergy
;
#
ifdef
USE_SYMMETRIC
delta.xyz
*=
dEdR
;
force.xyz
-=
delta.xyz
;
localData[j].fx
+=
delta.x
;
localData[j].fy
+=
delta.y
;
localData[j].fz
+=
delta.z
;
#
else
force.xyz
-=
dEdR1.xyz
;
localData[j].fx
+=
dEdR2.x
;
localData[j].fy
+=
dEdR2.y
;
localData[j].fz
+=
dEdR2.z
;
#
endif
#
ifdef
USE_CUTOFF
}
#
endif
}
}
//
Write
results
for
atom1.
unsigned
int
offset
=
atom1
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
forceBuffers[offset].xyz
=
forceBuffers[offset].xyz+force.xyz
;
}
}
}
else
#
endif
{
//
Compute
the
full
set
of
interactions
in
this
tile.
for
(
unsigned
int
tgx
=
0
; tgx < TILE_SIZE; tgx++) {
unsigned
int
atom1
=
x*TILE_SIZE+tgx
;
float4
force
=
0.0f
;
float4
posq1
=
posq[atom1]
;
LOAD_ATOM1_PARAMETERS
#
ifdef
USE_EXCLUSIONS
unsigned
int
excl
=
(
hasExclusions
?
exclusions[exclusionIndex+tgx]
:
0xFFFFFFFF
)
;
#
endif
for
(
unsigned
int
j
=
0
; j < TILE_SIZE; j++) {
#
ifdef
USE_EXCLUSIONS
bool
isExcluded
=
!
(
excl
&
0x1
)
;
#
endif
float4
posq2
=
(
float4
)
(
localData[j].x,
localData[j].y,
localData[j].z,
localData[j].q
)
;
float4
delta
=
(
float4
)
(
posq2.xyz
-
posq1.xyz,
0.0f
)
;
#
ifdef
USE_PERIODIC
delta.x
-=
floor
(
delta.x*invPeriodicBoxSize.x+0.5f
)
*periodicBoxSize.x
;
delta.y
-=
floor
(
delta.y*invPeriodicBoxSize.y+0.5f
)
*periodicBoxSize.y
;
delta.z
-=
floor
(
delta.z*invPeriodicBoxSize.z+0.5f
)
*periodicBoxSize.z
;
#
endif
float
r2
=
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
;
#
ifdef
USE_CUTOFF
if
(
r2
<
CUTOFF_SQUARED
)
{
#
endif
float
invR
=
RSQRT
(
r2
)
;
float
r
=
RECIP
(
invR
)
;
unsigned
int
atom2
=
j
;
LOAD_ATOM2_PARAMETERS
atom2
=
y*TILE_SIZE+j
;
#
ifdef
USE_SYMMETRIC
float
dEdR
=
0.0f
;
#
else
float4
dEdR1
=
(
float4
)
0.0f
;
float4
dEdR2
=
(
float4
)
0.0f
;
#
endif
float
tempEnergy
=
0.0f
;
COMPUTE_INTERACTION
energy
+=
tempEnergy
;
#
ifdef
USE_SYMMETRIC
delta.xyz
*=
dEdR
;
force.xyz
-=
delta.xyz
;
localData[j].fx
+=
delta.x
;
localData[j].fy
+=
delta.y
;
localData[j].fz
+=
delta.z
;
#
else
force.xyz
-=
dEdR1.xyz
;
localData[j].fx
+=
dEdR2.x
;
localData[j].fy
+=
dEdR2.y
;
localData[j].fz
+=
dEdR2.z
;
#
endif
#
ifdef
USE_CUTOFF
}
#
endif
#
ifdef
USE_EXCLUSIONS
excl
>>=
1
;
#
endif
}
//
Write
results
for
atom1.
unsigned
int
offset
=
atom1
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
forceBuffers[offset].xyz
=
forceBuffers[offset].xyz+force.xyz
;
}
}
//
Write
results.
for
(
int
tgx
=
0
; tgx < TILE_SIZE; tgx++) {
unsigned
int
offset
=
y*TILE_SIZE+tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
float4
f
=
forceBuffers[offset]
;
f.x
+=
localData[tgx].fx
;
f.y
+=
localData[tgx].fy
;
f.z
+=
localData[tgx].fz
;
forceBuffers[offset]
=
f
;
}
}
lasty
=
y
;
pos++
;
}
energyBuffer[get_global_id
(
0
)
]
+=
energy
;
}
platforms/opencl/src/kernels/nonbonded_default.cl
View file @
e7b18ca4
...
...
@@ -77,10 +77,11 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
if
(
x
==
y
)
{
//
This
tile
is
on
the
diagonal.
localData[get_local_id
(
0
)
].x
=
posq1.x
;
localData[get_local_id
(
0
)
].y
=
posq1.y
;
localData[get_local_id
(
0
)
].z
=
posq1.z
;
localData[get_local_id
(
0
)
].q
=
posq1.w
;
const
unsigned
int
localAtomIndex
=
get_local_id
(
0
)
;
localData[localAtomIndex].x
=
posq1.x
;
localData[localAtomIndex].y
=
posq1.y
;
localData[localAtomIndex].z
=
posq1.z
;
localData[localAtomIndex].q
=
posq1.w
;
LOAD_LOCAL_PARAMETERS_FROM_1
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
#
ifdef
USE_EXCLUSIONS
...
...
@@ -137,18 +138,19 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
else
{
//
This
is
an
off-diagonal
tile.
if
(
lasty
!=
y
&&
get_local_id
(
0
)
<
TILE_SIZE
)
{
const
unsigned
int
localAtomIndex
=
get_local_id
(
0
)
;
if
(
lasty
!=
y
&&
localAtomIndex
<
TILE_SIZE
)
{
unsigned
int
j
=
y*TILE_SIZE
+
tgx
;
float4
tempPosq
=
posq[j]
;
localData[
get_
local
_id
(
0
)
].x
=
tempPosq.x
;
localData[
get_
local
_id
(
0
)
].y
=
tempPosq.y
;
localData[
get_
local
_id
(
0
)
].z
=
tempPosq.z
;
localData[
get_
local
_id
(
0
)
].q
=
tempPosq.w
;
localData[local
AtomIndex
].x
=
tempPosq.x
;
localData[local
AtomIndex
].y
=
tempPosq.y
;
localData[local
AtomIndex
].z
=
tempPosq.z
;
localData[local
AtomIndex
].q
=
tempPosq.w
;
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
localData[
get_
local
_id
(
0
)
].fx
=
0.0f
;
localData[
get_
local
_id
(
0
)
].fy
=
0.0f
;
localData[
get_
local
_id
(
0
)
].fz
=
0.0f
;
localData[local
AtomIndex
].fx
=
0.0f
;
localData[local
AtomIndex
].fy
=
0.0f
;
localData[local
AtomIndex
].fz
=
0.0f
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
//
Compute
the
full
set
of
interactions
in
this
tile.
...
...
@@ -199,7 +201,9 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
localData[baseLocalAtom+tj+forceBufferOffset].fz
+=
dEdR2.z
;
#
endif
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
#
ifdef
USE_EXCLUSIONS
excl
>>=
1
;
#
endif
tj
=
(
tj+1
)
%
(
TILE_SIZE/2
)
;
}
...
...
platforms/opencl/src/kernels/nonbonded_nvidia.cl
View file @
e7b18ca4
...
...
@@ -79,10 +79,11 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
if
(
x
==
y
)
{
//
This
tile
is
on
the
diagonal.
localData[get_local_id
(
0
)
].x
=
posq1.x
;
localData[get_local_id
(
0
)
].y
=
posq1.y
;
localData[get_local_id
(
0
)
].z
=
posq1.z
;
localData[get_local_id
(
0
)
].q
=
posq1.w
;
const
unsigned
int
localAtomIndex
=
get_local_id
(
0
)
;
localData[localAtomIndex].x
=
posq1.x
;
localData[localAtomIndex].y
=
posq1.y
;
localData[localAtomIndex].z
=
posq1.z
;
localData[localAtomIndex].q
=
posq1.w
;
LOAD_LOCAL_PARAMETERS_FROM_1
#
ifdef
USE_EXCLUSIONS
unsigned
int
excl
=
exclusions[exclusionIndex[localGroupIndex]+tgx]
;
...
...
@@ -132,18 +133,19 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
else
{
//
This
is
an
off-diagonal
tile.
const
unsigned
int
localAtomIndex
=
get_local_id
(
0
)
;
if
(
lasty
!=
y
)
{
unsigned
int
j
=
y*TILE_SIZE
+
tgx
;
float4
tempPosq
=
posq[j]
;
localData[
get_
local
_id
(
0
)
].x
=
tempPosq.x
;
localData[
get_
local
_id
(
0
)
].y
=
tempPosq.y
;
localData[
get_
local
_id
(
0
)
].z
=
tempPosq.z
;
localData[
get_
local
_id
(
0
)
].q
=
tempPosq.w
;
localData[local
AtomIndex
].x
=
tempPosq.x
;
localData[local
AtomIndex
].y
=
tempPosq.y
;
localData[local
AtomIndex
].z
=
tempPosq.z
;
localData[local
AtomIndex
].q
=
tempPosq.w
;
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
localData[
get_
local
_id
(
0
)
].fx
=
0.0f
;
localData[
get_
local
_id
(
0
)
].fy
=
0.0f
;
localData[
get_
local
_id
(
0
)
].fz
=
0.0f
;
localData[local
AtomIndex
].fx
=
0.0f
;
localData[local
AtomIndex
].fy
=
0.0f
;
localData[local
AtomIndex
].fz
=
0.0f
;
#
ifdef
USE_CUTOFF
unsigned
int
flags
=
(
numTiles
<=
maxTiles
?
interactionFlags[pos]
:
0xFFFFFFFF
)
;
if
(
!hasExclusions
&&
flags
!=
0xFFFFFFFF
)
{
...
...
@@ -254,7 +256,9 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
localData[tbx+tj].fy
+=
dEdR2.y
;
localData[tbx+tj].fz
+=
dEdR2.z
;
#
endif
#
ifdef
USE_EXCLUSIONS
excl
>>=
1
;
#
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
)
;
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment