Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
4405b165
Commit
4405b165
authored
Oct 11, 2011
by
Peter Eastman
Browse files
Tony Tye's optimization to reduce local memory use
parent
69e75377
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
32 additions
and
17 deletions
+32
-17
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
+3
-5
platforms/opencl/src/kernels/nonbonded_cpu.cl
platforms/opencl/src/kernels/nonbonded_cpu.cl
+1
-1
platforms/opencl/src/kernels/nonbonded_default.cl
platforms/opencl/src/kernels/nonbonded_default.cl
+25
-10
platforms/opencl/src/kernels/nonbonded_nvidia.cl
platforms/opencl/src/kernels/nonbonded_nvidia.cl
+3
-1
No files found.
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
View file @
4405b165
...
@@ -326,8 +326,8 @@ void OpenCLNonbondedUtilities::prepareInteractions() {
...
@@ -326,8 +326,8 @@ void OpenCLNonbondedUtilities::prepareInteractions() {
void
OpenCLNonbondedUtilities
::
computeInteractions
()
{
void
OpenCLNonbondedUtilities
::
computeInteractions
()
{
if
(
cutoff
!=
-
1.0
)
{
if
(
cutoff
!=
-
1.0
)
{
if
(
useCutoff
)
{
if
(
useCutoff
)
{
forceKernel
.
setArg
<
mm_float4
>
(
12
,
context
.
getPeriodicBoxSize
());
forceKernel
.
setArg
<
mm_float4
>
(
11
,
context
.
getPeriodicBoxSize
());
forceKernel
.
setArg
<
mm_float4
>
(
13
,
context
.
getInvPeriodicBoxSize
());
forceKernel
.
setArg
<
mm_float4
>
(
12
,
context
.
getInvPeriodicBoxSize
());
}
}
context
.
executeKernel
(
forceKernel
,
numForceThreadBlocks
*
forceThreadBlockSize
,
forceThreadBlockSize
);
context
.
executeKernel
(
forceKernel
,
numForceThreadBlocks
*
forceThreadBlockSize
,
forceThreadBlockSize
);
}
}
...
@@ -480,8 +480,7 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
...
@@ -480,8 +480,7 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
defines
[
"USE_EXCLUSIONS"
]
=
"1"
;
defines
[
"USE_EXCLUSIONS"
]
=
"1"
;
if
(
isSymmetric
)
if
(
isSymmetric
)
defines
[
"USE_SYMMETRIC"
]
=
"1"
;
defines
[
"USE_SYMMETRIC"
]
=
"1"
;
if
(
context
.
getSIMDWidth
()
==
32
)
defines
[
"NONBONDED_WORK_GROUP_SIZE"
]
=
OpenCLExpressionUtilities
::
intToString
(
forceThreadBlockSize
);
defines
[
"WARPS_PER_GROUP"
]
=
OpenCLExpressionUtilities
::
intToString
(
forceThreadBlockSize
/
OpenCLContext
::
TileSize
);
defines
[
"CUTOFF_SQUARED"
]
=
OpenCLExpressionUtilities
::
doubleToString
(
cutoff
*
cutoff
);
defines
[
"CUTOFF_SQUARED"
]
=
OpenCLExpressionUtilities
::
doubleToString
(
cutoff
*
cutoff
);
defines
[
"NUM_ATOMS"
]
=
OpenCLExpressionUtilities
::
intToString
(
context
.
getNumAtoms
());
defines
[
"NUM_ATOMS"
]
=
OpenCLExpressionUtilities
::
intToString
(
context
.
getNumAtoms
());
defines
[
"PADDED_NUM_ATOMS"
]
=
OpenCLExpressionUtilities
::
intToString
(
context
.
getPaddedNumAtoms
());
defines
[
"PADDED_NUM_ATOMS"
]
=
OpenCLExpressionUtilities
::
intToString
(
context
.
getPaddedNumAtoms
());
...
@@ -509,7 +508,6 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
...
@@ -509,7 +508,6 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
exclusionIndices
->
getDeviceBuffer
());
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
exclusionIndices
->
getDeviceBuffer
());
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
exclusionRowIndices
->
getDeviceBuffer
());
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
exclusionRowIndices
->
getDeviceBuffer
());
kernel
.
setArg
(
index
++
,
(
deviceIsCpu
?
OpenCLContext
::
TileSize
*
localDataSize
:
forceThreadBlockSize
*
localDataSize
),
NULL
);
kernel
.
setArg
(
index
++
,
(
deviceIsCpu
?
OpenCLContext
::
TileSize
*
localDataSize
:
forceThreadBlockSize
*
localDataSize
),
NULL
);
kernel
.
setArg
(
index
++
,
4
*
forceThreadBlockSize
*
sizeof
(
cl_float
),
NULL
);
kernel
.
setArg
<
cl_uint
>
(
index
++
,
startTileIndex
);
kernel
.
setArg
<
cl_uint
>
(
index
++
,
startTileIndex
);
kernel
.
setArg
<
cl_uint
>
(
index
++
,
startTileIndex
+
numTiles
);
kernel
.
setArg
<
cl_uint
>
(
index
++
,
startTileIndex
+
numTiles
);
if
(
useCutoff
)
{
if
(
useCutoff
)
{
...
...
platforms/opencl/src/kernels/nonbonded_cpu.cl
View file @
4405b165
...
@@ -12,7 +12,7 @@ typedef struct {
...
@@ -12,7 +12,7 @@ typedef struct {
*/
*/
__kernel
void
computeNonbonded
(
__global
float4*
restrict
forceBuffers,
__global
float*
restrict
energyBuffer,
__global
const
float4*
restrict
posq,
__global
const
unsigned
int*
restrict
exclusions,
__kernel
void
computeNonbonded
(
__global
float4*
restrict
forceBuffers,
__global
float*
restrict
energyBuffer,
__global
const
float4*
restrict
posq,
__global
const
unsigned
int*
restrict
exclusions,
__global
const
unsigned
int*
restrict
exclusionIndices,
__global
const
unsigned
int*
restrict
exclusionRowIndices,
__local
AtomData*
restrict
localData,
__local
float4*
restrict
tempBuffer,
__global
const
unsigned
int*
restrict
exclusionIndices,
__global
const
unsigned
int*
restrict
exclusionRowIndices,
__local
AtomData*
restrict
localData,
unsigned
int
startTileIndex,
unsigned
int
endTileIndex,
unsigned
int
startTileIndex,
unsigned
int
endTileIndex,
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
__global
const
ushort2*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
const
unsigned
int*
restrict
interactionFlags
__global
const
ushort2*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
const
unsigned
int*
restrict
interactionFlags
...
...
platforms/opencl/src/kernels/nonbonded_default.cl
View file @
4405b165
...
@@ -11,9 +11,9 @@ typedef struct {
...
@@ -11,9 +11,9 @@ typedef struct {
*
Compute
nonbonded
interactions.
*
Compute
nonbonded
interactions.
*/
*/
__kernel
__attribute__
((
reqd_work_group_size
(
WORK_GROUP_SIZE,
1
,
1
)))
__kernel
__attribute__
((
reqd_work_group_size
(
NONBONDED_WORK_GROUP_SIZE,
1
,
1
)))
void
computeNonbonded
(
__global
float4*
restrict
forceBuffers,
__global
float*
restrict
energyBuffer,
__global
const
float4*
restrict
posq,
__global
const
unsigned
int*
restrict
exclusions,
void
computeNonbonded
(
__global
float4*
restrict
forceBuffers,
__global
float*
restrict
energyBuffer,
__global
const
float4*
restrict
posq,
__global
const
unsigned
int*
restrict
exclusions,
__global
const
unsigned
int*
restrict
exclusionIndices,
__global
const
unsigned
int*
restrict
exclusionRowIndices,
__local
AtomData*
restrict
localData,
__local
float4*
restrict
tempBuffer,
__global
const
unsigned
int*
restrict
exclusionIndices,
__global
const
unsigned
int*
restrict
exclusionRowIndices,
__local
AtomData*
restrict
localData,
unsigned
int
startTileIndex,
unsigned
int
endTileIndex,
unsigned
int
startTileIndex,
unsigned
int
endTileIndex,
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
__global
const
ushort2*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
const
unsigned
int*
restrict
interactionFlags
__global
const
ushort2*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
const
unsigned
int*
restrict
interactionFlags
...
@@ -31,6 +31,7 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
...
@@ -31,6 +31,7 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
#
endif
#
endif
float
energy
=
0.0f
;
float
energy
=
0.0f
;
unsigned
int
lasty
=
0xFFFFFFFF
;
unsigned
int
lasty
=
0xFFFFFFFF
;
__local
float
tempBuffer[3*
(
NONBONDED_WORK_GROUP_SIZE/2
)
]
;
__local
unsigned
int
exclusionRange[2]
;
__local
unsigned
int
exclusionRange[2]
;
__local
int
exclusionIndex[1]
;
__local
int
exclusionIndex[1]
;
...
@@ -124,8 +125,12 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
...
@@ -124,8 +125,12 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
// Sum the forces and write results.
// Sum the forces and write results.
if (get_local_id(0) >= TILE_SIZE)
int bufferIndex = 3*tgx;
tempBuffer[get_local_id(0)] = force;
if (get_local_id(0) >= TILE_SIZE) {
tempBuffer[bufferIndex] = force.x;
tempBuffer[bufferIndex+1] = force.y;
tempBuffer[bufferIndex+2] = force.z;
}
barrier(CLK_LOCAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) {
if (get_local_id(0) < TILE_SIZE) {
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
...
@@ -133,7 +138,9 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
...
@@ -133,7 +138,9 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
#else
#else
unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
#endif
forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz+tempBuffer[get_local_id(0)+TILE_SIZE].xyz;
float4 sum = forceBuffers[offset];
sum += force + (float4) (tempBuffer[bufferIndex], tempBuffer[bufferIndex+1], tempBuffer[bufferIndex+2], 0.0f);
forceBuffers[offset] = sum;
}
}
}
}
else {
else {
...
@@ -210,8 +217,12 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
...
@@ -210,8 +217,12 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
//
Sum
the
forces
and
write
results.
//
Sum
the
forces
and
write
results.
if
(
get_local_id
(
0
)
>=
TILE_SIZE
)
int
bufferIndex
=
3*tgx
;
tempBuffer[get_local_id
(
0
)
]
=
force
;
if
(
get_local_id
(
0
)
>=
TILE_SIZE
)
{
tempBuffer[bufferIndex]
=
force.x
;
tempBuffer[bufferIndex+1]
=
force.y
;
tempBuffer[bufferIndex+2]
=
force.z
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
get_local_id
(
0
)
<
TILE_SIZE
)
{
if
(
get_local_id
(
0
)
<
TILE_SIZE
)
{
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
...
@@ -221,9 +232,13 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
...
@@ -221,9 +232,13 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
unsigned
int
offset1
=
x*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset1
=
x*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
#
endif
#
endif
forceBuffers[offset1].xyz
=
forceBuffers[offset1].xyz+force.xyz+tempBuffer[get_local_id
(
0
)
+TILE_SIZE].xyz
;
//
Cheaper
to
load/store
float4
than
float3.
Do
both
loads
before
both
stores
to
minimize
store-load
waits.
float4
sum
=
(
float4
)
(
localData[get_local_id
(
0
)
].fx+localData[get_local_id
(
0
)
+TILE_SIZE].fx,
localData[get_local_id
(
0
)
].fy+localData[get_local_id
(
0
)
+TILE_SIZE].fy,
localData[get_local_id
(
0
)
].fz+localData[get_local_id
(
0
)
+TILE_SIZE].fz,
0.0f
)
;
float4
sum1
=
forceBuffers[offset1]
;
forceBuffers[offset2].xyz
=
forceBuffers[offset2].xyz+sum.xyz
;
float4
sum2
=
forceBuffers[offset2]
;
sum1
+=
force
+
(
float4
)
(
tempBuffer[bufferIndex],
tempBuffer[bufferIndex+1],
tempBuffer[bufferIndex+2],
0.0f
)
;
sum2
+=
(
float4
)
(
localData[get_local_id
(
0
)
].fx+localData[get_local_id
(
0
)
+TILE_SIZE].fx,
localData[get_local_id
(
0
)
].fy+localData[get_local_id
(
0
)
+TILE_SIZE].fy,
localData[get_local_id
(
0
)
].fz+localData[get_local_id
(
0
)
+TILE_SIZE].fz,
0.0f
)
;
forceBuffers[offset1]
=
sum1
;
forceBuffers[offset2]
=
sum2
;
}
}
}
}
lasty
=
y
;
lasty
=
y
;
...
...
platforms/opencl/src/kernels/nonbonded_nvidia.cl
View file @
4405b165
...
@@ -3,6 +3,7 @@
...
@@ -3,6 +3,7 @@
#
pragma
OPENCL
EXTENSION
cl_khr_int64_base_atomics
:
enable
#
pragma
OPENCL
EXTENSION
cl_khr_int64_base_atomics
:
enable
#
endif
#
endif
#
define
TILE_SIZE
32
#
define
TILE_SIZE
32
#
define
WARPS_PER_GROUP
(
NONBONDED_WORK_GROUP_SIZE/TILE_SIZE
)
typedef
struct
{
typedef
struct
{
float
x,
y,
z
;
float
x,
y,
z
;
...
@@ -21,7 +22,7 @@ __kernel void computeNonbonded(
...
@@ -21,7 +22,7 @@ __kernel void computeNonbonded(
__global
float4*
restrict
forceBuffers,
__global
float4*
restrict
forceBuffers,
#
endif
#
endif
__global
float*
restrict
energyBuffer,
__global
const
float4*
restrict
posq,
__global
const
unsigned
int*
restrict
exclusions,
__global
float*
restrict
energyBuffer,
__global
const
float4*
restrict
posq,
__global
const
unsigned
int*
restrict
exclusions,
__global
const
unsigned
int*
restrict
exclusionIndices,
__global
const
unsigned
int*
restrict
exclusionRowIndices,
__local
AtomData*
restrict
localData,
__local
float*
restrict
tempBuffer,
__global
const
unsigned
int*
restrict
exclusionIndices,
__global
const
unsigned
int*
restrict
exclusionRowIndices,
__local
AtomData*
restrict
localData,
unsigned
int
startTileIndex,
unsigned
int
endTileIndex,
unsigned
int
startTileIndex,
unsigned
int
endTileIndex,
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
__global
const
ushort2*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
const
unsigned
int*
restrict
interactionFlags
__global
const
ushort2*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
const
unsigned
int*
restrict
interactionFlags
...
@@ -40,6 +41,7 @@ __kernel void computeNonbonded(
...
@@ -40,6 +41,7 @@ __kernel void computeNonbonded(
unsigned
int
end
=
startTileIndex+
(
warp+1
)
*numTiles/totalWarps
;
unsigned
int
end
=
startTileIndex+
(
warp+1
)
*numTiles/totalWarps
;
#
endif
#
endif
float
energy
=
0.0f
;
float
energy
=
0.0f
;
__local
float
tempBuffer[3*NONBONDED_WORK_GROUP_SIZE]
;
__local
unsigned
int
exclusionRange[2*WARPS_PER_GROUP]
;
__local
unsigned
int
exclusionRange[2*WARPS_PER_GROUP]
;
__local
int
exclusionIndex[WARPS_PER_GROUP]
;
__local
int
exclusionIndex[WARPS_PER_GROUP]
;
__local
int2*
reservedBlocks
=
(
__local
int2*
)
exclusionRange
;
__local
int2*
reservedBlocks
=
(
__local
int2*
)
exclusionRange
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment