Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
4405b165
"platforms/reference/tests/TestReferenceRGForce.cpp" did not exist on "85da5e0f9018719e95350c76b483a2160c84d9d3"
Commit
4405b165
authored
Oct 11, 2011
by
Peter Eastman
Browse files
Tony Tye's optimization to reduce local memory use
parent
69e75377
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
32 additions
and
17 deletions
+32
-17
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
+3
-5
platforms/opencl/src/kernels/nonbonded_cpu.cl
platforms/opencl/src/kernels/nonbonded_cpu.cl
+1
-1
platforms/opencl/src/kernels/nonbonded_default.cl
platforms/opencl/src/kernels/nonbonded_default.cl
+25
-10
platforms/opencl/src/kernels/nonbonded_nvidia.cl
platforms/opencl/src/kernels/nonbonded_nvidia.cl
+3
-1
No files found.
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
View file @
4405b165
...
@@ -326,8 +326,8 @@ void OpenCLNonbondedUtilities::prepareInteractions() {
...
@@ -326,8 +326,8 @@ void OpenCLNonbondedUtilities::prepareInteractions() {
void
OpenCLNonbondedUtilities
::
computeInteractions
()
{
void
OpenCLNonbondedUtilities
::
computeInteractions
()
{
if
(
cutoff
!=
-
1.0
)
{
if
(
cutoff
!=
-
1.0
)
{
if
(
useCutoff
)
{
if
(
useCutoff
)
{
forceKernel
.
setArg
<
mm_float4
>
(
1
2
,
context
.
getPeriodicBoxSize
());
forceKernel
.
setArg
<
mm_float4
>
(
1
1
,
context
.
getPeriodicBoxSize
());
forceKernel
.
setArg
<
mm_float4
>
(
1
3
,
context
.
getInvPeriodicBoxSize
());
forceKernel
.
setArg
<
mm_float4
>
(
1
2
,
context
.
getInvPeriodicBoxSize
());
}
}
context
.
executeKernel
(
forceKernel
,
numForceThreadBlocks
*
forceThreadBlockSize
,
forceThreadBlockSize
);
context
.
executeKernel
(
forceKernel
,
numForceThreadBlocks
*
forceThreadBlockSize
,
forceThreadBlockSize
);
}
}
...
@@ -480,8 +480,7 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
...
@@ -480,8 +480,7 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
defines
[
"USE_EXCLUSIONS"
]
=
"1"
;
defines
[
"USE_EXCLUSIONS"
]
=
"1"
;
if
(
isSymmetric
)
if
(
isSymmetric
)
defines
[
"USE_SYMMETRIC"
]
=
"1"
;
defines
[
"USE_SYMMETRIC"
]
=
"1"
;
if
(
context
.
getSIMDWidth
()
==
32
)
defines
[
"NONBONDED_WORK_GROUP_SIZE"
]
=
OpenCLExpressionUtilities
::
intToString
(
forceThreadBlockSize
);
defines
[
"WARPS_PER_GROUP"
]
=
OpenCLExpressionUtilities
::
intToString
(
forceThreadBlockSize
/
OpenCLContext
::
TileSize
);
defines
[
"CUTOFF_SQUARED"
]
=
OpenCLExpressionUtilities
::
doubleToString
(
cutoff
*
cutoff
);
defines
[
"CUTOFF_SQUARED"
]
=
OpenCLExpressionUtilities
::
doubleToString
(
cutoff
*
cutoff
);
defines
[
"NUM_ATOMS"
]
=
OpenCLExpressionUtilities
::
intToString
(
context
.
getNumAtoms
());
defines
[
"NUM_ATOMS"
]
=
OpenCLExpressionUtilities
::
intToString
(
context
.
getNumAtoms
());
defines
[
"PADDED_NUM_ATOMS"
]
=
OpenCLExpressionUtilities
::
intToString
(
context
.
getPaddedNumAtoms
());
defines
[
"PADDED_NUM_ATOMS"
]
=
OpenCLExpressionUtilities
::
intToString
(
context
.
getPaddedNumAtoms
());
...
@@ -509,7 +508,6 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
...
@@ -509,7 +508,6 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
exclusionIndices
->
getDeviceBuffer
());
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
exclusionIndices
->
getDeviceBuffer
());
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
exclusionRowIndices
->
getDeviceBuffer
());
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
exclusionRowIndices
->
getDeviceBuffer
());
kernel
.
setArg
(
index
++
,
(
deviceIsCpu
?
OpenCLContext
::
TileSize
*
localDataSize
:
forceThreadBlockSize
*
localDataSize
),
NULL
);
kernel
.
setArg
(
index
++
,
(
deviceIsCpu
?
OpenCLContext
::
TileSize
*
localDataSize
:
forceThreadBlockSize
*
localDataSize
),
NULL
);
kernel
.
setArg
(
index
++
,
4
*
forceThreadBlockSize
*
sizeof
(
cl_float
),
NULL
);
kernel
.
setArg
<
cl_uint
>
(
index
++
,
startTileIndex
);
kernel
.
setArg
<
cl_uint
>
(
index
++
,
startTileIndex
);
kernel
.
setArg
<
cl_uint
>
(
index
++
,
startTileIndex
+
numTiles
);
kernel
.
setArg
<
cl_uint
>
(
index
++
,
startTileIndex
+
numTiles
);
if
(
useCutoff
)
{
if
(
useCutoff
)
{
...
...
platforms/opencl/src/kernels/nonbonded_cpu.cl
View file @
4405b165
...
@@ -12,7 +12,7 @@ typedef struct {
...
@@ -12,7 +12,7 @@ typedef struct {
*/
*/
__kernel
void
computeNonbonded
(
__global
float4*
restrict
forceBuffers,
__global
float*
restrict
energyBuffer,
__global
const
float4*
restrict
posq,
__global
const
unsigned
int*
restrict
exclusions,
__kernel
void
computeNonbonded
(
__global
float4*
restrict
forceBuffers,
__global
float*
restrict
energyBuffer,
__global
const
float4*
restrict
posq,
__global
const
unsigned
int*
restrict
exclusions,
__global
const
unsigned
int*
restrict
exclusionIndices,
__global
const
unsigned
int*
restrict
exclusionRowIndices,
__local
AtomData*
restrict
localData,
__local
float4*
restrict
tempBuffer,
__global
const
unsigned
int*
restrict
exclusionIndices,
__global
const
unsigned
int*
restrict
exclusionRowIndices,
__local
AtomData*
restrict
localData,
unsigned
int
startTileIndex,
unsigned
int
endTileIndex,
unsigned
int
startTileIndex,
unsigned
int
endTileIndex,
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
__global
const
ushort2*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
const
unsigned
int*
restrict
interactionFlags
__global
const
ushort2*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
const
unsigned
int*
restrict
interactionFlags
...
...
platforms/opencl/src/kernels/nonbonded_default.cl
View file @
4405b165
...
@@ -11,9 +11,9 @@ typedef struct {
...
@@ -11,9 +11,9 @@ typedef struct {
*
Compute
nonbonded
interactions.
*
Compute
nonbonded
interactions.
*/
*/
__kernel
__attribute__
((
reqd_work_group_size
(
WORK_GROUP_SIZE,
1
,
1
)))
__kernel
__attribute__
((
reqd_work_group_size
(
NONBONDED_
WORK_GROUP_SIZE,
1
,
1
)))
void
computeNonbonded
(
__global
float4*
restrict
forceBuffers,
__global
float*
restrict
energyBuffer,
__global
const
float4*
restrict
posq,
__global
const
unsigned
int*
restrict
exclusions,
void
computeNonbonded
(
__global
float4*
restrict
forceBuffers,
__global
float*
restrict
energyBuffer,
__global
const
float4*
restrict
posq,
__global
const
unsigned
int*
restrict
exclusions,
__global
const
unsigned
int*
restrict
exclusionIndices,
__global
const
unsigned
int*
restrict
exclusionRowIndices,
__local
AtomData*
restrict
localData,
__local
float4*
restrict
tempBuffer,
__global
const
unsigned
int*
restrict
exclusionIndices,
__global
const
unsigned
int*
restrict
exclusionRowIndices,
__local
AtomData*
restrict
localData,
unsigned
int
startTileIndex,
unsigned
int
endTileIndex,
unsigned
int
startTileIndex,
unsigned
int
endTileIndex,
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
__global
const
ushort2*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
const
unsigned
int*
restrict
interactionFlags
__global
const
ushort2*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
const
unsigned
int*
restrict
interactionFlags
...
@@ -31,6 +31,7 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
...
@@ -31,6 +31,7 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
#
endif
#
endif
float
energy
=
0.0f
;
float
energy
=
0.0f
;
unsigned
int
lasty
=
0xFFFFFFFF
;
unsigned
int
lasty
=
0xFFFFFFFF
;
__local
float
tempBuffer[3*
(
NONBONDED_WORK_GROUP_SIZE/2
)
]
;
__local
unsigned
int
exclusionRange[2]
;
__local
unsigned
int
exclusionRange[2]
;
__local
int
exclusionIndex[1]
;
__local
int
exclusionIndex[1]
;
...
@@ -124,8 +125,12 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
...
@@ -124,8 +125,12 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
// Sum the forces and write results.
// Sum the forces and write results.
if (get_local_id(0) >= TILE_SIZE)
int bufferIndex = 3*tgx;
tempBuffer[get_local_id(0)] = force;
if (get_local_id(0) >= TILE_SIZE) {
tempBuffer[bufferIndex] = force.x;
tempBuffer[bufferIndex+1] = force.y;
tempBuffer[bufferIndex+2] = force.z;
}
barrier(CLK_LOCAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) {
if (get_local_id(0) < TILE_SIZE) {
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
...
@@ -133,7 +138,9 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
...
@@ -133,7 +138,9 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
#else
#else
unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
#endif
forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz+tempBuffer[get_local_id(0)+TILE_SIZE].xyz;
float4 sum = forceBuffers[offset];
sum += force + (float4) (tempBuffer[bufferIndex], tempBuffer[bufferIndex+1], tempBuffer[bufferIndex+2], 0.0f);
forceBuffers[offset] = sum;
}
}
}
}
else {
else {
...
@@ -210,8 +217,12 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
...
@@ -210,8 +217,12 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
//
Sum
the
forces
and
write
results.
//
Sum
the
forces
and
write
results.
if
(
get_local_id
(
0
)
>=
TILE_SIZE
)
int
bufferIndex
=
3*tgx
;
tempBuffer[get_local_id
(
0
)
]
=
force
;
if
(
get_local_id
(
0
)
>=
TILE_SIZE
)
{
tempBuffer[bufferIndex]
=
force.x
;
tempBuffer[bufferIndex+1]
=
force.y
;
tempBuffer[bufferIndex+2]
=
force.z
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
get_local_id
(
0
)
<
TILE_SIZE
)
{
if
(
get_local_id
(
0
)
<
TILE_SIZE
)
{
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
...
@@ -221,9 +232,13 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
...
@@ -221,9 +232,13 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
unsigned
int
offset1
=
x*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset1
=
x*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
#
endif
#
endif
forceBuffers[offset1].xyz
=
forceBuffers[offset1].xyz+force.xyz+tempBuffer[get_local_id
(
0
)
+TILE_SIZE].xyz
;
//
Cheaper
to
load/store
float4
than
float3.
Do
both
loads
before
both
stores
to
minimize
store-load
waits.
float4
sum
=
(
float4
)
(
localData[get_local_id
(
0
)
].fx+localData[get_local_id
(
0
)
+TILE_SIZE].fx,
localData[get_local_id
(
0
)
].fy+localData[get_local_id
(
0
)
+TILE_SIZE].fy,
localData[get_local_id
(
0
)
].fz+localData[get_local_id
(
0
)
+TILE_SIZE].fz,
0.0f
)
;
float4
sum1
=
forceBuffers[offset1]
;
forceBuffers[offset2].xyz
=
forceBuffers[offset2].xyz+sum.xyz
;
float4
sum2
=
forceBuffers[offset2]
;
sum1
+=
force
+
(
float4
)
(
tempBuffer[bufferIndex],
tempBuffer[bufferIndex+1],
tempBuffer[bufferIndex+2],
0.0f
)
;
sum2
+=
(
float4
)
(
localData[get_local_id
(
0
)
].fx+localData[get_local_id
(
0
)
+TILE_SIZE].fx,
localData[get_local_id
(
0
)
].fy+localData[get_local_id
(
0
)
+TILE_SIZE].fy,
localData[get_local_id
(
0
)
].fz+localData[get_local_id
(
0
)
+TILE_SIZE].fz,
0.0f
)
;
forceBuffers[offset1]
=
sum1
;
forceBuffers[offset2]
=
sum2
;
}
}
}
}
lasty
=
y
;
lasty
=
y
;
...
...
platforms/opencl/src/kernels/nonbonded_nvidia.cl
View file @
4405b165
...
@@ -3,6 +3,7 @@
...
@@ -3,6 +3,7 @@
#
pragma
OPENCL
EXTENSION
cl_khr_int64_base_atomics
:
enable
#
pragma
OPENCL
EXTENSION
cl_khr_int64_base_atomics
:
enable
#
endif
#
endif
#
define
TILE_SIZE
32
#
define
TILE_SIZE
32
#
define
WARPS_PER_GROUP
(
NONBONDED_WORK_GROUP_SIZE/TILE_SIZE
)
typedef
struct
{
typedef
struct
{
float
x,
y,
z
;
float
x,
y,
z
;
...
@@ -21,7 +22,7 @@ __kernel void computeNonbonded(
...
@@ -21,7 +22,7 @@ __kernel void computeNonbonded(
__global
float4*
restrict
forceBuffers,
__global
float4*
restrict
forceBuffers,
#
endif
#
endif
__global
float*
restrict
energyBuffer,
__global
const
float4*
restrict
posq,
__global
const
unsigned
int*
restrict
exclusions,
__global
float*
restrict
energyBuffer,
__global
const
float4*
restrict
posq,
__global
const
unsigned
int*
restrict
exclusions,
__global
const
unsigned
int*
restrict
exclusionIndices,
__global
const
unsigned
int*
restrict
exclusionRowIndices,
__local
AtomData*
restrict
localData,
__local
float*
restrict
tempBuffer,
__global
const
unsigned
int*
restrict
exclusionIndices,
__global
const
unsigned
int*
restrict
exclusionRowIndices,
__local
AtomData*
restrict
localData,
unsigned
int
startTileIndex,
unsigned
int
endTileIndex,
unsigned
int
startTileIndex,
unsigned
int
endTileIndex,
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
__global
const
ushort2*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
const
unsigned
int*
restrict
interactionFlags
__global
const
ushort2*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
const
unsigned
int*
restrict
interactionFlags
...
@@ -40,6 +41,7 @@ __kernel void computeNonbonded(
...
@@ -40,6 +41,7 @@ __kernel void computeNonbonded(
unsigned
int
end
=
startTileIndex+
(
warp+1
)
*numTiles/totalWarps
;
unsigned
int
end
=
startTileIndex+
(
warp+1
)
*numTiles/totalWarps
;
#
endif
#
endif
float
energy
=
0.0f
;
float
energy
=
0.0f
;
__local
float
tempBuffer[3*NONBONDED_WORK_GROUP_SIZE]
;
__local
unsigned
int
exclusionRange[2*WARPS_PER_GROUP]
;
__local
unsigned
int
exclusionRange[2*WARPS_PER_GROUP]
;
__local
int
exclusionIndex[WARPS_PER_GROUP]
;
__local
int
exclusionIndex[WARPS_PER_GROUP]
;
__local
int2*
reservedBlocks
=
(
__local
int2*
)
exclusionRange
;
__local
int2*
reservedBlocks
=
(
__local
int2*
)
exclusionRange
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment