Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
fed50628
"wrappers/vscode:/vscode.git/clone" did not exist on "e6b89797a681c607f9aa0ba7d4568f56a8149f27"
Commit
fed50628
authored
Jan 27, 2012
by
Peter Eastman
Browse files
Tony's optimizations to reduce local memory use
parent
2dd09317
Changes
8
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
174 additions
and
131 deletions
+174
-131
platforms/opencl/src/OpenCLKernels.cpp
platforms/opencl/src/OpenCLKernels.cpp
+10
-12
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
+11
-14
platforms/opencl/src/kernels/gbsaObc_cpu.cl
platforms/opencl/src/kernels/gbsaObc_cpu.cl
+2
-2
platforms/opencl/src/kernels/gbsaObc_default.cl
platforms/opencl/src/kernels/gbsaObc_default.cl
+81
-59
platforms/opencl/src/kernels/gbsaObc_nvidia.cl
platforms/opencl/src/kernels/gbsaObc_nvidia.cl
+2
-2
platforms/opencl/src/kernels/nonbonded_cpu.cl
platforms/opencl/src/kernels/nonbonded_cpu.cl
+2
-1
platforms/opencl/src/kernels/nonbonded_default.cl
platforms/opencl/src/kernels/nonbonded_default.cl
+61
-40
platforms/opencl/src/kernels/nonbonded_nvidia.cl
platforms/opencl/src/kernels/nonbonded_nvidia.cl
+5
-1
No files found.
platforms/opencl/src/OpenCLKernels.cpp
View file @
fed50628
...
...
@@ -1563,7 +1563,6 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
(
useLong
?
longBornSum
->
getDeviceBuffer
()
:
bornSum
->
getDeviceBuffer
()));
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
cl
.
getPosq
().
getDeviceBuffer
());
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
params
->
getDeviceBuffer
());
computeBornSumKernel
.
setArg
(
index
++
,
(
deviceIsCpu
?
OpenCLContext
::
TileSize
:
nb
.
getForceThreadBlockSize
())
*
7
*
sizeof
(
cl_float
),
NULL
);
if
(
nb
.
getUseCutoff
())
{
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractingTiles
().
getDeviceBuffer
());
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractionCount
().
getDeviceBuffer
());
...
...
@@ -1585,7 +1584,6 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
cl
.
getEnergyBuffer
().
getDeviceBuffer
());
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
cl
.
getPosq
().
getDeviceBuffer
());
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
bornRadii
->
getDeviceBuffer
());
force1Kernel
.
setArg
(
index
++
,
(
deviceIsCpu
?
OpenCLContext
::
TileSize
:
nb
.
getForceThreadBlockSize
())
*
9
*
sizeof
(
cl_float
),
NULL
);
if
(
nb
.
getUseCutoff
())
{
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractingTiles
().
getDeviceBuffer
());
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractionCount
().
getDeviceBuffer
());
...
...
@@ -1624,19 +1622,19 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
reduceBornForceKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
obcChain
->
getDeviceBuffer
());
}
if
(
nb
.
getUseCutoff
())
{
computeBornSumKernel
.
setArg
<
mm_float4
>
(
6
,
cl
.
getPeriodicBoxSize
());
computeBornSumKernel
.
setArg
<
mm_float4
>
(
7
,
cl
.
getInvPeriodicBoxSize
());
force1Kernel
.
setArg
<
mm_float4
>
(
8
,
cl
.
getPeriodicBoxSize
());
force1Kernel
.
setArg
<
mm_float4
>
(
9
,
cl
.
getInvPeriodicBoxSize
());
computeBornSumKernel
.
setArg
<
mm_float4
>
(
5
,
cl
.
getPeriodicBoxSize
());
computeBornSumKernel
.
setArg
<
mm_float4
>
(
6
,
cl
.
getInvPeriodicBoxSize
());
force1Kernel
.
setArg
<
mm_float4
>
(
7
,
cl
.
getPeriodicBoxSize
());
force1Kernel
.
setArg
<
mm_float4
>
(
8
,
cl
.
getInvPeriodicBoxSize
());
if
(
maxTiles
<
nb
.
getInteractingTiles
().
getSize
())
{
maxTiles
=
nb
.
getInteractingTiles
().
getSize
();
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
5
,
nb
.
getInteractingTiles
().
getDeviceBuffer
());
computeBornSumKernel
.
setArg
<
cl_uint
>
(
8
,
maxTiles
);
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
6
,
nb
.
getInteractingTiles
().
getDeviceBuffer
());
force1Kernel
.
setArg
<
cl_uint
>
(
10
,
maxTiles
);
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
4
,
nb
.
getInteractingTiles
().
getDeviceBuffer
());
computeBornSumKernel
.
setArg
<
cl_uint
>
(
7
,
maxTiles
);
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
5
,
nb
.
getInteractingTiles
().
getDeviceBuffer
());
force1Kernel
.
setArg
<
cl_uint
>
(
9
,
maxTiles
);
if
(
cl
.
getSIMDWidth
()
==
32
||
deviceIsCpu
)
{
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
9
,
nb
.
getInteractionFlags
().
getDeviceBuffer
());
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
1
1
,
nb
.
getInteractionFlags
().
getDeviceBuffer
());
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
8
,
nb
.
getInteractionFlags
().
getDeviceBuffer
());
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
1
0
,
nb
.
getInteractionFlags
().
getDeviceBuffer
());
}
}
}
...
...
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
View file @
fed50628
...
...
@@ -326,8 +326,8 @@ void OpenCLNonbondedUtilities::prepareInteractions() {
void
OpenCLNonbondedUtilities
::
computeInteractions
()
{
if
(
cutoff
!=
-
1.0
)
{
if
(
useCutoff
)
{
forceKernel
.
setArg
<
mm_float4
>
(
1
1
,
context
.
getPeriodicBoxSize
());
forceKernel
.
setArg
<
mm_float4
>
(
1
2
,
context
.
getInvPeriodicBoxSize
());
forceKernel
.
setArg
<
mm_float4
>
(
1
0
,
context
.
getPeriodicBoxSize
());
forceKernel
.
setArg
<
mm_float4
>
(
1
1
,
context
.
getInvPeriodicBoxSize
());
}
context
.
executeKernel
(
forceKernel
,
numForceThreadBlocks
*
forceThreadBlockSize
,
forceThreadBlockSize
);
}
...
...
@@ -349,14 +349,14 @@ void OpenCLNonbondedUtilities::updateNeighborListSize() {
newSize
=
numTiles
;
delete
interactingTiles
;
interactingTiles
=
new
OpenCLArray
<
mm_ushort2
>
(
context
,
newSize
,
"interactingTiles"
);
forceKernel
.
setArg
<
cl
::
Buffer
>
(
9
,
interactingTiles
->
getDeviceBuffer
());
forceKernel
.
setArg
<
cl_uint
>
(
1
3
,
newSize
);
forceKernel
.
setArg
<
cl
::
Buffer
>
(
8
,
interactingTiles
->
getDeviceBuffer
());
forceKernel
.
setArg
<
cl_uint
>
(
1
2
,
newSize
);
findInteractingBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
6
,
interactingTiles
->
getDeviceBuffer
());
findInteractingBlocksKernel
.
setArg
<
cl_uint
>
(
9
,
newSize
);
if
(
context
.
getSIMDWidth
()
==
32
||
deviceIsCpu
)
{
delete
interactionFlags
;
interactionFlags
=
new
OpenCLArray
<
cl_uint
>
(
context
,
deviceIsCpu
?
2
*
newSize
:
newSize
,
"interactionFlags"
);
forceKernel
.
setArg
<
cl
::
Buffer
>
(
1
4
,
interactionFlags
->
getDeviceBuffer
());
forceKernel
.
setArg
<
cl
::
Buffer
>
(
1
3
,
interactionFlags
->
getDeviceBuffer
());
findInteractingBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
7
,
interactionFlags
->
getDeviceBuffer
());
findInteractionsWithinBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
4
,
interactingTiles
->
getDeviceBuffer
());
findInteractionsWithinBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
7
,
interactionFlags
->
getDeviceBuffer
());
...
...
@@ -369,22 +369,22 @@ void OpenCLNonbondedUtilities::setTileRange(int startTileIndex, int numTiles) {
this
->
numTiles
=
numTiles
;
if
(
cutoff
==
-
1.0
)
return
;
// There are no nonbonded interactions in the System.
forceKernel
.
setArg
<
cl_uint
>
(
7
,
startTileIndex
);
forceKernel
.
setArg
<
cl_uint
>
(
8
,
startTileIndex
+
numTiles
);
forceKernel
.
setArg
<
cl_uint
>
(
6
,
startTileIndex
);
forceKernel
.
setArg
<
cl_uint
>
(
7
,
startTileIndex
+
numTiles
);
if
(
useCutoff
)
{
findInteractingBlocksKernel
.
setArg
<
cl_uint
>
(
10
,
startTileIndex
);
findInteractingBlocksKernel
.
setArg
<
cl_uint
>
(
11
,
startTileIndex
+
numTiles
);
}
else
forceKernel
.
setArg
<
cl_uint
>
(
9
,
numTiles
);
forceKernel
.
setArg
<
cl_uint
>
(
8
,
numTiles
);
}
cl
::
Kernel
OpenCLNonbondedUtilities
::
createInteractionKernel
(
const
string
&
source
,
const
vector
<
ParameterInfo
>&
params
,
const
vector
<
ParameterInfo
>&
arguments
,
bool
useExclusions
,
bool
isSymmetric
)
const
{
map
<
string
,
string
>
replacements
;
replacements
[
"COMPUTE_INTERACTION"
]
=
source
;
int
localDataSize
=
7
*
sizeof
(
cl_float
);
const
string
suffixes
[]
=
{
"x"
,
"y"
,
"z"
,
"w"
};
stringstream
localData
;
int
localDataSize
=
0
;
for
(
int
i
=
0
;
i
<
(
int
)
params
.
size
();
i
++
)
{
if
(
params
[
i
].
getNumComponents
()
==
1
)
localData
<<
params
[
i
].
getType
()
<<
" "
<<
params
[
i
].
getName
()
<<
";
\n
"
;
...
...
@@ -394,10 +394,6 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
}
localDataSize
+=
params
[
i
].
getSize
();
}
if
((
localDataSize
/
4
)
%
2
==
0
)
{
localData
<<
"float padding;
\n
"
;
localDataSize
+=
4
;
}
replacements
[
"ATOM_PARAMETER_DATA"
]
=
localData
.
str
();
stringstream
args
;
for
(
int
i
=
0
;
i
<
(
int
)
params
.
size
();
i
++
)
{
...
...
@@ -487,6 +483,8 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
defines
[
"NUM_ATOMS"
]
=
OpenCLExpressionUtilities
::
intToString
(
context
.
getNumAtoms
());
defines
[
"PADDED_NUM_ATOMS"
]
=
OpenCLExpressionUtilities
::
intToString
(
context
.
getPaddedNumAtoms
());
defines
[
"NUM_BLOCKS"
]
=
OpenCLExpressionUtilities
::
intToString
(
context
.
getNumAtomBlocks
());
if
((
localDataSize
/
4
)
%
2
==
0
)
defines
[
"PARAMETER_SIZE_IS_EVEN"
]
=
"1"
;
string
file
;
if
(
deviceIsCpu
)
file
=
OpenCLKernelSources
::
nonbonded_cpu
;
...
...
@@ -509,7 +507,6 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
exclusions
->
getDeviceBuffer
());
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
exclusionIndices
->
getDeviceBuffer
());
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
exclusionRowIndices
->
getDeviceBuffer
());
kernel
.
setArg
(
index
++
,
(
deviceIsCpu
?
OpenCLContext
::
TileSize
*
localDataSize
:
forceThreadBlockSize
*
localDataSize
),
NULL
);
kernel
.
setArg
<
cl_uint
>
(
index
++
,
startTileIndex
);
kernel
.
setArg
<
cl_uint
>
(
index
++
,
startTileIndex
+
numTiles
);
if
(
useCutoff
)
{
...
...
platforms/opencl/src/kernels/gbsaObc_cpu.cl
View file @
fed50628
...
...
@@ -12,7 +12,6 @@ typedef struct {
*/
__kernel
void
computeBornSum
(
__global
float*
restrict
global_bornSum,
__global
const
float4*
restrict
posq,
__global
const
float2*
restrict
global_params,
__local
AtomData1*
restrict
localData,
#
ifdef
USE_CUTOFF
__global
const
ushort2*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
const
unsigned
int*
restrict
interactionFlags
)
{
#
else
...
...
@@ -27,6 +26,7 @@ __kernel void computeBornSum(__global float* restrict global_bornSum, __global c
unsigned
int
end
=
(
get_group_id
(
0
)
+1
)
*numTiles/get_num_groups
(
0
)
;
#
endif
unsigned
int
lasty
=
0xFFFFFFFF
;
__local
AtomData1
localData[TILE_SIZE]
;
while
(
pos
<
end
)
{
//
Extract
the
coordinates
of
this
tile
...
...
@@ -196,7 +196,6 @@ typedef struct {
__kernel void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* restrict global_bornForce,
__global float* restrict energyBuffer, __global const float4* restrict posq, __global const float* restrict global_bornRadii,
__local AtomData2* restrict localData,
#ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags) {
#else
...
...
@@ -212,6 +211,7 @@ __kernel void computeGBSAForce1(__global float4* restrict forceBuffers, __global
#endif
float energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF;
__local AtomData2 localData[TILE_SIZE];
while (pos < end) {
// Extract the coordinates of this tile
...
...
platforms/opencl/src/kernels/gbsaObc_default.cl
View file @
fed50628
...
...
@@ -2,9 +2,7 @@
typedef
struct
{
float
x,
y,
z
;
float
q
;
float
radius,
scaledRadius
;
float
bornSum
;
}
AtomData1
;
/**
...
...
@@ -13,7 +11,6 @@ typedef struct {
__kernel
__attribute__
((
reqd_work_group_size
(
FORCE_WORK_GROUP_SIZE,
1
,
1
)))
void
computeBornSum
(
__global
float*
restrict
global_bornSum,
__global
const
float4*
restrict
posq,
__global
const
float2*
restrict
global_params,
__local
AtomData1*
restrict
localData,
#
ifdef
USE_CUTOFF
__global
const
ushort2*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles
)
{
#
else
...
...
@@ -28,7 +25,9 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
unsigned
int
end
=
(
get_group_id
(
0
)
+1
)
*numTiles/get_num_groups
(
0
)
;
#
endif
unsigned
int
lasty
=
0xFFFFFFFF
;
__local
float
tempBuffer[FORCE_WORK_GROUP_SIZE/2]
;
__local
AtomData1
localData[TILE_SIZE]
;
__local
float
localBornSum[FORCE_WORK_GROUP_SIZE]
;
__local
float
localTemp[TILE_SIZE]
;
while
(
pos
<
end
)
{
//
Extract
the
coordinates
of
this
tile
...
...
@@ -51,7 +50,7 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
}
unsigned int baseLocalAtom = (get_local_id(0) < TILE_SIZE ? 0 : TILE_SIZE/2);
unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
unsigned int
forceBufferOffset = (tgx < TILE_SIZE/2 ? 0 :
TILE_SIZE);
unsigned int
localForceOffset = get_local_id(0) & ~(
TILE_SIZE
-1
);
unsigned int atom1 = x*TILE_SIZE + tgx;
float bornSum = 0.0f;
float4 posq1 = posq[atom1];
...
...
@@ -59,12 +58,13 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
if (x == y) {
// This tile is on the diagonal.
localData[get_local_id(0)].x = posq1.x;
localData[get_local_id(0)].y = posq1.y;
localData[get_local_id(0)].z = posq1.z;
localData[get_local_id(0)].q = posq1.w;
localData[get_local_id(0)].radius = params1.x;
localData[get_local_id(0)].scaledRadius = params1.y;
if (get_local_id(0) < TILE_SIZE) {
localData[get_local_id(0)].x = posq1.x;
localData[get_local_id(0)].y = posq1.y;
localData[get_local_id(0)].z = posq1.z;
localData[get_local_id(0)].radius = params1.x;
localData[get_local_id(0)].scaledRadius = params1.y;
}
barrier(CLK_LOCAL_MEM_FENCE);
for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
float4 delta = (float4) (localData[baseLocalAtom+j].x-posq1.x, localData[baseLocalAtom+j].y-posq1.y, localData[baseLocalAtom+j].z-posq1.z, 0.0f);
...
...
@@ -96,7 +96,7 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
// Sum the forces and write results.
if (get_local_id(0) >= TILE_SIZE)
tempBuffer
[tgx] = bornSum;
localTemp
[tgx] = bornSum;
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) {
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
...
...
@@ -104,8 +104,9 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
#else
unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
global_bornSum[offset] += bornSum+
tempBuffer
[tgx];
global_bornSum[offset] += bornSum+
localTemp
[tgx];
}
// barrier not required here as localTemp is not accessed before encountering another barrier.
}
else {
// This is an off-diagonal tile.
...
...
@@ -116,19 +117,18 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
localData[get_local_id(0)].x = tempPosq.x;
localData[get_local_id(0)].y = tempPosq.y;
localData[get_local_id(0)].z = tempPosq.z;
localData[get_local_id(0)].q = tempPosq.w;
float2 tempParams = global_params[j];
localData[get_local_id(0)].radius = tempParams.x;
localData[get_local_id(0)].scaledRadius = tempParams.y;
}
local
Data
[get_local_id(0)]
.bornSum
= 0.0f;
local
BornSum
[get_local_id(0)] = 0.0f;
barrier(CLK_LOCAL_MEM_FENCE);
// Compute the full set of interactions in this tile.
unsigned int tj = tgx
%
(TILE_SIZE
/2
);
unsigned int tj =
(
tgx
+baseLocalAtom) &
(TILE_SIZE
-1
);
for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
float4 delta = (float4) (localData[
baseLocalAtom+
tj].x-posq1.x, localData[
baseLocalAtom+
tj].y-posq1.y, localData[
baseLocalAtom+
tj].z-posq1.z, 0.0f);
float4 delta = (float4) (localData[tj].x-posq1.x, localData[tj].y-posq1.y, localData[tj].z-posq1.z, 0.0f);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
...
...
@@ -136,13 +136,13 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+
baseLocalAtom+
tj < NUM_ATOMS && r2 < CUTOFF_SQUARED);
unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS && r2 < CUTOFF_SQUARED);
#else
unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+
baseLocalAtom+
tj < NUM_ATOMS);
unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS);
#endif
float invR = RSQRT(r2);
float r = RECIP(invR);
float2 params2 = (float2) (localData[
baseLocalAtom+
tj].radius, localData[
baseLocalAtom+
tj].scaledRadius);
float2 params2 = (float2) (localData[tj].radius, localData[tj].scaledRadius);
float rScaledRadiusJ = r+params2.y;
{
float l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
...
...
@@ -165,16 +165,16 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
float term = l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params1.y*params1.y*invR)*(l_ij2-u_ij2);
term += select(0.0f, 2.0f*(RECIP(params2.x)-l_ij), params2.x < params1.y-r);
local
Data[baseLocalAtom+tj+forceBufferOffset].bornSum
+= select(0.0f, term, includeInteraction && params2.x < rScaledRadiusI);
local
BornSum[tj+localForceOffset]
+= select(0.0f, term, includeInteraction && params2.x < rScaledRadiusI);
}
barrier(CLK_LOCAL_MEM_FENCE);
tj = (tj+1)
%
(TILE_SIZE
/2
);
tj = (tj+1)
&
(TILE_SIZE
-1
);
}
// Sum the forces and write results.
if (get_local_id(0) >= TILE_SIZE)
tempBuffer
[tgx] = bornSum;
localTemp
[tgx] = bornSum;
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) {
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
...
...
@@ -187,22 +187,28 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
// Do both loads before both stores to minimize store-load waits.
float sum1 = global_bornSum[offset1];
float sum2 = global_bornSum[offset2];
sum1 += bornSum +
tempBuffer
[tgx];
sum2 += local
Data
[get_local_id(0)]
.bornSum
+ local
Data
[get_local_id(0)+TILE_SIZE]
.bornSum
;
sum1 += bornSum +
localTemp
[tgx];
sum2 += local
BornSum
[get_local_id(0)] + local
BornSum
[get_local_id(0)+TILE_SIZE];
global_bornSum[offset1] = sum1;
global_bornSum[offset2] = sum2;
}
barrier(CLK_LOCAL_MEM_FENCE);
}
lasty = y;
pos++;
}
}
typedef struct {
float x, y, z, w;
float padding;
} PaddedUnalignedFloat4;
typedef struct {
float x, y, z;
float q;
float fx, fy, fz, fw;
float bornRadius;
float temp_x, temp_y, temp_z, temp_w;
} AtomData2;
/**
...
...
@@ -212,7 +218,6 @@ typedef struct {
__kernel __attribute__((reqd_work_group_size(FORCE_WORK_GROUP_SIZE, 1, 1)))
void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* restrict global_bornForce,
__global float* restrict energyBuffer, __global const float4* restrict posq, __global const float* restrict global_bornRadii,
__local AtomData2* restrict localData,
#ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles) {
#else
...
...
@@ -228,7 +233,8 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
#endif
float energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF;
__local float4 tempBuffer[FORCE_WORK_GROUP_SIZE/2];
__local AtomData2 localData[TILE_SIZE];
__local PaddedUnalignedFloat4 localForce[FORCE_WORK_GROUP_SIZE];
while (pos < end) {
// Extract the coordinates of this tile
...
...
@@ -251,7 +257,7 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
}
unsigned
int
baseLocalAtom
=
(
get_local_id
(
0
)
<
TILE_SIZE
?
0
:
TILE_SIZE/2
)
;
unsigned
int
tgx
=
get_local_id
(
0
)
&
(
TILE_SIZE-1
)
;
unsigned
int
forceBufferOffset
=
(
tgx
<
TILE_SIZE/2
?
0
:
TILE_SIZE
)
;
unsigned
int
localForceOffset
=
get_local_id
(
0
)
&
~
(
TILE_SIZE
-1
)
;
unsigned
int
atom1
=
x*TILE_SIZE
+
tgx
;
float4
force
=
0.0f
;
float4
posq1
=
posq[atom1]
;
...
...
@@ -259,11 +265,13 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
if
(
x
==
y
)
{
//
This
tile
is
on
the
diagonal.
localData[get_local_id
(
0
)
].x
=
posq1.x
;
localData[get_local_id
(
0
)
].y
=
posq1.y
;
localData[get_local_id
(
0
)
].z
=
posq1.z
;
localData[get_local_id
(
0
)
].q
=
posq1.w
;
localData[get_local_id
(
0
)
].bornRadius
=
bornRadius1
;
if
(
get_local_id
(
0
)
<
TILE_SIZE
)
{
localData[get_local_id
(
0
)
].x
=
posq1.x
;
localData[get_local_id
(
0
)
].y
=
posq1.y
;
localData[get_local_id
(
0
)
].z
=
posq1.z
;
localData[get_local_id
(
0
)
].q
=
posq1.w
;
localData[get_local_id
(
0
)
].bornRadius
=
bornRadius1
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
for
(
unsigned
int
j
=
0
; j < TILE_SIZE/2; j++) {
unsigned
int
includeInteraction
=
(
atom1
<
NUM_ATOMS
&&
y*TILE_SIZE+baseLocalAtom+j
<
NUM_ATOMS
)
;
...
...
@@ -300,8 +308,12 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
//
Sum
the
forces
and
write
results.
if
(
get_local_id
(
0
)
>=
TILE_SIZE
)
tempBuffer[tgx]
=
force
;
if
(
get_local_id
(
0
)
>=
TILE_SIZE
)
{
localData[tgx].temp_x
=
force.x
;
localData[tgx].temp_y
=
force.y
;
localData[tgx].temp_z
=
force.z
;
localData[tgx].temp_w
=
force.w
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
get_local_id
(
0
)
<
TILE_SIZE
)
{
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
...
...
@@ -312,11 +324,14 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
//
Cheaper
to
load/store
float4
than
float3.
Do
all
loads
before
all
stores
to
minimize
store-load
waits.
float4
sum
=
forceBuffers[offset]
;
float
global_sum
=
global_bornForce[offset]
;
sum.xyz
+=
force.xyz
+
tempBuffer[tgx].xyz
;
global_sum
+=
force.w
+
tempBuffer[tgx].w
;
sum.x
+=
force.x
+
localData[tgx].temp_x
;
sum.y
+=
force.y
+
localData[tgx].temp_y
;
sum.z
+=
force.z
+
localData[tgx].temp_z
;
global_sum
+=
force.w
+
localData[tgx].temp_w
;
forceBuffers[offset]
=
sum
;
global_bornForce[offset]
=
global_sum
;
}
//
barrier
not
required
here
as
localData[*]/temp_*
is
not
accessed
before
encountering
another
barrier.
}
else
{
//
This
is
an
off-diagonal
tile.
...
...
@@ -330,18 +345,18 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
localData[get_local_id
(
0
)
].q
=
tempPosq.w
;
localData[get_local_id
(
0
)
].bornRadius
=
global_bornRadii[j]
;
}
local
Data
[get_local_id
(
0
)
].
f
x
=
0.0f
;
local
Data
[get_local_id
(
0
)
].
f
y
=
0.0f
;
local
Data
[get_local_id
(
0
)
].
f
z
=
0.0f
;
local
Data
[get_local_id
(
0
)
].
f
w
=
0.0f
;
local
Force
[get_local_id
(
0
)
].x
=
0.0f
;
local
Force
[get_local_id
(
0
)
].y
=
0.0f
;
local
Force
[get_local_id
(
0
)
].z
=
0.0f
;
local
Force
[get_local_id
(
0
)
].w
=
0.0f
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
//
Compute
the
full
set
of
interactions
in
this
tile.
unsigned
int
tj
=
tgx
%
(
TILE_SIZE
/2
)
;
unsigned
int
tj
=
(
tgx
+baseLocalAtom
)
&
(
TILE_SIZE
-1
)
;
for
(
unsigned
int
j
=
0
; j < TILE_SIZE/2; j++) {
unsigned
int
includeInteraction
=
(
atom1
<
NUM_ATOMS
&&
y*TILE_SIZE+
baseLocalAtom+
tj
<
NUM_ATOMS
)
;
float4
posq2
=
(
float4
)
(
localData[
baseLocalAtom+
tj].x,
localData[
baseLocalAtom+
tj].y,
localData[
baseLocalAtom+
tj].z,
localData[
baseLocalAtom+
tj].q
)
;
unsigned
int
includeInteraction
=
(
atom1
<
NUM_ATOMS
&&
y*TILE_SIZE+tj
<
NUM_ATOMS
)
;
float4
posq2
=
(
float4
)
(
localData[tj].x,
localData[tj].y,
localData[tj].z,
localData[tj].q
)
;
float4
delta
=
(
float4
)
(
posq2.xyz
-
posq1.xyz,
0.0f
)
;
#
ifdef
USE_PERIODIC
delta.x
-=
floor
(
delta.x*invPeriodicBoxSize.x+0.5f
)
*periodicBoxSize.x
;
...
...
@@ -351,7 +366,7 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
float
r2
=
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
;
float
invR
=
RSQRT
(
r2
)
;
float
r
=
RECIP
(
invR
)
;
float
bornRadius2
=
localData[
baseLocalAtom+
tj].bornRadius
;
float
bornRadius2
=
localData[tj].bornRadius
;
float
alpha2_ij
=
bornRadius1*bornRadius2
;
float
D_ij
=
r2*RECIP
(
4.0f*alpha2_ij
)
;
float
expTerm
=
EXP
(
-D_ij
)
;
...
...
@@ -370,18 +385,22 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
energy
+=
select
(
0.0f,
tempEnergy,
includeInteraction
)
;
delta.xyz
*=
select
(
0.0f,
dEdR,
includeInteraction
)
;
force.xyz
-=
delta.xyz
;
local
Data[baseLocalAtom+tj+forceBuffer
Offset].
f
x
+=
delta.x
;
local
Data[baseLocalAtom+tj+forceBuffer
Offset].
f
y
+=
delta.y
;
local
Data[baseLocalAtom+tj+forceBuffer
Offset].
f
z
+=
delta.z
;
local
Data[baseLocalAtom+tj+forceBuffer
Offset].
f
w
+=
select
(
0.0f,
dGpol_dalpha2_ij*bornRadius1,
includeInteraction
)
;
local
Force[tj+localForce
Offset].x
+=
delta.x
;
local
Force[tj+localForce
Offset].y
+=
delta.y
;
local
Force[tj+localForce
Offset].z
+=
delta.z
;
local
Force[tj+localForce
Offset].w
+=
select
(
0.0f,
dGpol_dalpha2_ij*bornRadius1,
includeInteraction
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
tj
=
(
tj+1
)
%
(
TILE_SIZE
/2
)
;
tj
=
(
tj+1
)
&
(
TILE_SIZE
-1
)
;
}
//
Sum
the
forces
and
write
results.
if
(
get_local_id
(
0
)
>=
TILE_SIZE
)
tempBuffer[tgx]
=
force
;
if
(
get_local_id
(
0
)
>=
TILE_SIZE
)
{
localData[tgx].temp_x
=
force.x
;
localData[tgx].temp_y
=
force.y
;
localData[tgx].temp_z
=
force.z
;
localData[tgx].temp_w
=
force.w
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
get_local_id
(
0
)
<
TILE_SIZE
)
{
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
...
...
@@ -396,17 +415,20 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
float4
sum2
=
forceBuffers[offset2]
;
float
global_sum1
=
global_bornForce[offset1]
;
float
global_sum2
=
global_bornForce[offset2]
;
sum1.xyz
+=
force.xyz
+
tempBuffer[tgx].xyz
;
global_sum1
+=
force.w
+
tempBuffer[tgx].w
;
sum2.x
+=
localData[get_local_id
(
0
)
].fx
+
localData[get_local_id
(
0
)
+TILE_SIZE].fx
;
sum2.y
+=
localData[get_local_id
(
0
)
].fy
+
localData[get_local_id
(
0
)
+TILE_SIZE].fy
;
sum2.z
+=
localData[get_local_id
(
0
)
].fz
+
localData[get_local_id
(
0
)
+TILE_SIZE].fz
;
global_sum2
+=
localData[get_local_id
(
0
)
].fw
+
localData[get_local_id
(
0
)
+TILE_SIZE].fw
;
sum1.x
+=
force.x
+
localData[tgx].temp_x
;
sum1.y
+=
force.y
+
localData[tgx].temp_y
;
sum1.z
+=
force.z
+
localData[tgx].temp_z
;
global_sum1
+=
force.w
+
localData[tgx].temp_w
;
sum2.x
+=
localForce[get_local_id
(
0
)
].x
+
localForce[get_local_id
(
0
)
+TILE_SIZE].x
;
sum2.y
+=
localForce[get_local_id
(
0
)
].y
+
localForce[get_local_id
(
0
)
+TILE_SIZE].y
;
sum2.z
+=
localForce[get_local_id
(
0
)
].z
+
localForce[get_local_id
(
0
)
+TILE_SIZE].z
;
global_sum2
+=
localForce[get_local_id
(
0
)
].w
+
localForce[get_local_id
(
0
)
+TILE_SIZE].w
;
forceBuffers[offset1]
=
sum1
;
forceBuffers[offset2]
=
sum2
;
global_bornForce[offset1]
=
global_sum1
;
global_bornForce[offset2]
=
global_sum2
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
lasty
=
y
;
pos++
;
...
...
platforms/opencl/src/kernels/gbsaObc_nvidia.cl
View file @
fed50628
...
...
@@ -22,7 +22,6 @@ __kernel void computeBornSum(
__global
float*
restrict
global_bornSum,
#
endif
__global
const
float4*
restrict
posq,
__global
const
float2*
restrict
global_params,
__local
AtomData1*
restrict
localData,
#
ifdef
USE_CUTOFF
__global
const
ushort2*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
const
unsigned
int*
restrict
interactionFlags,
#
else
...
...
@@ -40,6 +39,7 @@ __kernel void computeBornSum(
unsigned
int
end
=
(
warp+1
)
*numTiles/totalWarps
;
#
endif
unsigned
int
lasty
=
0xFFFFFFFF
;
__local
AtomData1
localData[FORCE_WORK_GROUP_SIZE]
;
__local
float
tempBuffer[FORCE_WORK_GROUP_SIZE]
;
__local
int2
reservedBlocks[WARPS_PER_GROUP]
;
__local
unsigned
int*
exclusionRange
=
(
__local
unsigned
int*
)
reservedBlocks
;
...
...
@@ -344,7 +344,6 @@ __kernel void computeGBSAForce1(
__global float4* restrict forceBuffers, __global float* restrict global_bornForce,
#endif
__global float* restrict energyBuffer, __global const float4* restrict posq, __global const float* restrict global_bornRadii,
__local AtomData2* restrict localData,
#ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags,
#else
...
...
@@ -363,6 +362,7 @@ __kernel void computeGBSAForce1(
#endif
float energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF;
__local AtomData2 localData[FORCE_WORK_GROUP_SIZE];
__local float4 tempBuffer[FORCE_WORK_GROUP_SIZE];
__local int2 reservedBlocks[WARPS_PER_GROUP];
__local unsigned int* exclusionRange = (__local unsigned int*) reservedBlocks;
...
...
platforms/opencl/src/kernels/nonbonded_cpu.cl
View file @
fed50628
...
...
@@ -12,7 +12,7 @@ typedef struct {
*/
__kernel
void
computeNonbonded
(
__global
float4*
restrict
forceBuffers,
__global
float*
restrict
energyBuffer,
__global
const
float4*
restrict
posq,
__global
const
unsigned
int*
restrict
exclusions,
__global
const
unsigned
int*
restrict
exclusionIndices,
__global
const
unsigned
int*
restrict
exclusionRowIndices,
__local
AtomData*
restrict
localData,
__global
const
unsigned
int*
restrict
exclusionIndices,
__global
const
unsigned
int*
restrict
exclusionRowIndices,
unsigned
int
startTileIndex,
unsigned
int
endTileIndex,
#
ifdef
USE_CUTOFF
__global
const
ushort2*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
const
unsigned
int*
restrict
interactionFlags
...
...
@@ -30,6 +30,7 @@ __kernel void computeNonbonded(__global float4* restrict forceBuffers, __global
#
endif
float
energy
=
0.0f
;
unsigned
int
lasty
=
0xFFFFFFFF
;
__local
AtomData
localData[TILE_SIZE]
;
while
(
pos
<
end
)
{
//
Extract
the
coordinates
of
this
tile
...
...
platforms/opencl/src/kernels/nonbonded_default.cl
View file @
fed50628
#
define
TILE_SIZE
32
//
Cannot
use
float3
as
OpenCL
defines
it
to
be
4
DWORD
aligned.
This
would
//
cause
every
element
of
array
to
have
DWORD
of
padding
to
make
it
4
DWORD
//
aligned
which
wastes
space
and
causes
LDS
bank
conflicts
as
stride
is
no
//
longer
odd
DWORDS.
typedef
struct
{
float
x,
y,
z
;
}
UnalignedFloat3
;
typedef
struct
{
float
x,
y,
z
;
float
q
;
float
fx,
fy,
fz
;
ATOM_PARAMETER_DATA
#
ifndef
PARAMETER_SIZE_IS_EVEN
float
padding
;
#
endif
}
AtomData
;
/**
...
...
@@ -13,7 +24,7 @@ typedef struct {
__kernel
__attribute__
((
reqd_work_group_size
(
FORCE_WORK_GROUP_SIZE,
1
,
1
)))
void
computeNonbonded
(
__global
float4*
restrict
forceBuffers,
__global
float*
restrict
energyBuffer,
__global
const
float4*
restrict
posq,
__global
const
unsigned
int*
restrict
exclusions,
__global
const
unsigned
int*
restrict
exclusionIndices,
__global
const
unsigned
int*
restrict
exclusionRowIndices,
__local
AtomData*
restrict
localData,
__global
const
unsigned
int*
restrict
exclusionIndices,
__global
const
unsigned
int*
restrict
exclusionRowIndices,
unsigned
int
startTileIndex,
unsigned
int
endTileIndex,
#
ifdef
USE_CUTOFF
__global
const
ushort2*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
const
unsigned
int*
restrict
interactionFlags
...
...
@@ -31,9 +42,12 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
#
endif
float
energy
=
0.0f
;
unsigned
int
lasty
=
0xFFFFFFFF
;
__local
float
tempBuffer[3*
(
FORCE_WORK_GROUP_SIZE/2
)
]
;
__local
AtomData
localData[TILE_SIZE]
;
__local
UnalignedFloat3
localForce[FORCE_WORK_GROUP_SIZE]
;
#
ifdef
USE_EXCLUSIONS
__local
unsigned
int
exclusionRange[2]
;
__local
int
exclusionIndex[1]
;
#
endif
while
(
pos
<
end
)
{
//
Extract
the
coordinates
of
this
tile
...
...
@@ -56,7 +70,7 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
}
unsigned int baseLocalAtom = (get_local_id(0) < TILE_SIZE ? 0 : TILE_SIZE/2);
unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
unsigned int
forceBufferOffset = (tgx < TILE_SIZE/2 ? 0 :
TILE_SIZE);
unsigned int
localForceOffset = get_local_id(0) & ~(
TILE_SIZE
-1
);
unsigned int atom1 = x*TILE_SIZE + tgx;
float4 force = 0.0f;
float4 posq1 = posq[atom1];
...
...
@@ -79,12 +93,14 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
if (x == y) {
// This tile is on the diagonal.
const unsigned int localAtomIndex = get_local_id(0);
localData[localAtomIndex].x = posq1.x;
localData[localAtomIndex].y = posq1.y;
localData[localAtomIndex].z = posq1.z;
localData[localAtomIndex].q = posq1.w;
LOAD_LOCAL_PARAMETERS_FROM_1
if (get_local_id(0) < TILE_SIZE) {
const unsigned int localAtomIndex = tgx;
localData[localAtomIndex].x = posq1.x;
localData[localAtomIndex].y = posq1.y;
localData[localAtomIndex].z = posq1.z;
localData[localAtomIndex].q = posq1.w;
LOAD_LOCAL_PARAMETERS_FROM_1
}
barrier(CLK_LOCAL_MEM_FENCE);
#ifdef USE_EXCLUSIONS
unsigned int excl = exclusions[exclusionIndex[0]+tgx] >> baseLocalAtom;
...
...
@@ -93,7 +109,7 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
#endif
int atom2 = baseLocalAtom+j;
unsigned
int atom2 = baseLocalAtom+j;
float4 posq2 = (float4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
...
...
@@ -125,14 +141,16 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
// Sum the forces and write results.
int bufferIndex = 3*tgx;
if (get_local_id(0) >= TILE_SIZE) {
tempBuffer[bufferIndex]
= force.x;
tempBuffer[bufferIndex+1]
= force.y;
tempBuffer[bufferIndex+2]
= force.z;
localData[tgx].fx
= force.x;
localData[tgx].fy
= force.y;
localData[tgx].fz
= force.z;
}
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) {
force.x += localData[tgx].fx;
force.y += localData[tgx].fy;
force.z += localData[tgx].fz;
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset = x*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else
...
...
@@ -140,15 +158,16 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
#endif
// Cheaper to load/store float4 than float3.
float4 sum = forceBuffers[offset];
sum += force
+ (float4) (tempBuffer[bufferIndex], tempBuffer[bufferIndex+1], tempBuffer[bufferIndex+2], 0.0f)
;
sum
.xyz
+= force
.xyz
;
forceBuffers[offset] = sum;
}
// barrier not required here as localData[*].temp is not accessed before encountering another barrier.
}
else {
// This is an off-diagonal tile.
const unsigned int localAtomIndex =
get_local_id(0)
;
if (lasty != y &&
localAtomIndex
< TILE_SIZE) {
if (lasty != y &&
get_local_id(0)
< TILE_SIZE) {
const unsigned int
localAtomIndex
= tgx;
unsigned int j = y*TILE_SIZE + tgx;
float4 tempPosq = posq[j];
localData[localAtomIndex].x = tempPosq.x;
...
...
@@ -157,26 +176,23 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
localData[localAtomIndex].q = tempPosq.w;
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
local
Data[localAtomIndex
].
f
x = 0.0f;
local
Data[localAtomIndex
].
f
y = 0.0f;
local
Data[localAtomIndex
].
f
z = 0.0f;
local
Force[get_local_id(0)
].x = 0.0f;
local
Force[get_local_id(0)
].y = 0.0f;
local
Force[get_local_id(0)
].z = 0.0f;
barrier(CLK_LOCAL_MEM_FENCE);
// Compute the full set of interactions in this tile.
unsigned int tj = (tgx+baseLocalAtom) & (TILE_SIZE-1);
#ifdef USE_EXCLUSIONS
unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[0]+tgx] : 0xFFFFFFFF);
excl = (excl >> baseLocalAtom) & 0xFFFF;
excl += excl << 16;
excl = (excl >> tgx) |
(
excl
<<
(
TILE_SIZE
-
tgx
))
;
excl = (excl >> tj) |
(
excl
<<
(
TILE_SIZE
-
tj
))
;
#
endif
unsigned
int
tj
=
tgx%
(
TILE_SIZE/2
)
;
for
(
unsigned
int
j
=
0
; j < TILE_SIZE/2; j++) {
#
ifdef
USE_EXCLUSIONS
bool
isExcluded
=
!
(
excl
&
0x1
)
;
#
endif
int
atom2
=
baseLocalAtom+tj
;
float4
posq2
=
(
float4
)
(
localData[atom2].x,
localData[atom2].y,
localData[atom2].z,
localData[atom2].q
)
;
float4
posq2
=
(
float4
)
(
localData[tj].x,
localData[tj].y,
localData[tj].z,
localData[tj].q
)
;
float4
delta
=
(
float4
)
(
posq2.xyz
-
posq1.xyz,
0.0f
)
;
#
ifdef
USE_PERIODIC
delta.x
-=
floor
(
delta.x*invPeriodicBoxSize.x+0.5f
)
*periodicBoxSize.x
;
...
...
@@ -186,8 +202,9 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
float
r2
=
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
;
float
invR
=
RSQRT
(
r2
)
;
float
r
=
RECIP
(
invR
)
;
int
atom2
=
tj
;
LOAD_ATOM2_PARAMETERS
atom2
=
y*TILE_SIZE+
baseLocalAtom+
tj
;
atom2
=
y*TILE_SIZE+tj
;
#
ifdef
USE_SYMMETRIC
float
dEdR
=
0.0f
;
#
else
...
...
@@ -200,29 +217,28 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
#
ifdef
USE_SYMMETRIC
delta.xyz
*=
dEdR
;
force.xyz
-=
delta.xyz
;
local
Data[baseLocalAtom+tj+forceBuffer
Offset].
f
x
+=
delta.x
;
local
Data[baseLocalAtom+tj+forceBuffer
Offset].
f
y
+=
delta.y
;
local
Data[baseLocalAtom+tj+forceBuffer
Offset].
f
z
+=
delta.z
;
local
Force[tj+localForce
Offset].x
+=
delta.x
;
local
Force[tj+localForce
Offset].y
+=
delta.y
;
local
Force[tj+localForce
Offset].z
+=
delta.z
;
#
else
force.xyz
-=
dEdR1.xyz
;
local
Data[baseLocalAtom+tj+forceBuffer
Offset].
f
x
+=
dEdR2.x
;
local
Data[baseLocalAtom+tj+forceBuffer
Offset].
f
y
+=
dEdR2.y
;
local
Data[baseLocalAtom+tj+forceBuffer
Offset].
f
z
+=
dEdR2.z
;
local
Force[tj+localForce
Offset].x
+=
dEdR2.x
;
local
Force[tj+localForce
Offset].y
+=
dEdR2.y
;
local
Force[tj+localForce
Offset].z
+=
dEdR2.z
;
#
endif
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
#
ifdef
USE_EXCLUSIONS
excl
>>=
1
;
#
endif
tj
=
(
tj+1
)
%
(
TILE_SIZE
/2
)
;
tj
=
(
tj+1
)
&
(
TILE_SIZE
-1
)
;
}
//
Sum
the
forces
and
write
results.
int
bufferIndex
=
3*tgx
;
if
(
get_local_id
(
0
)
>=
TILE_SIZE
)
{
tempBuffer[bufferIndex]
=
force.x
;
tempBuffer[bufferIndex+1]
=
force.y
;
tempBuffer[bufferIndex+2]
=
force.z
;
localData[tgx].fx
=
force.x
;
localData[tgx].fy
=
force.y
;
localData[tgx].fz
=
force.z
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
get_local_id
(
0
)
<
TILE_SIZE
)
{
...
...
@@ -236,11 +252,16 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
//
Cheaper
to
load/store
float4
than
float3.
Do
all
loads
before
all
stores
to
minimize
store-load
waits.
float4
sum1
=
forceBuffers[offset1]
;
float4
sum2
=
forceBuffers[offset2]
;
sum1
+=
force
+
(
float4
)
(
tempBuffer[bufferIndex],
tempBuffer[bufferIndex+1],
tempBuffer[bufferIndex+2],
0.0f
)
;
sum2
+=
(
float4
)
(
localData[get_local_id
(
0
)
].fx+localData[get_local_id
(
0
)
+TILE_SIZE].fx,
localData[get_local_id
(
0
)
].fy+localData[get_local_id
(
0
)
+TILE_SIZE].fy,
localData[get_local_id
(
0
)
].fz+localData[get_local_id
(
0
)
+TILE_SIZE].fz,
0.0f
)
;
sum1.x
+=
localData[tgx].fx
+
force.x
;
sum1.y
+=
localData[tgx].fy
+
force.y
;
sum1.z
+=
localData[tgx].fz
+
force.z
;
sum2.x
+=
localForce[tgx].x
+
localForce[tgx+TILE_SIZE].x
;
sum2.y
+=
localForce[tgx].y
+
localForce[tgx+TILE_SIZE].y
;
sum2.z
+=
localForce[tgx].z
+
localForce[tgx+TILE_SIZE].z
;
forceBuffers[offset1]
=
sum1
;
forceBuffers[offset2]
=
sum2
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
lasty
=
y
;
pos++
;
...
...
platforms/opencl/src/kernels/nonbonded_nvidia.cl
View file @
fed50628
...
...
@@ -10,6 +10,9 @@ typedef struct {
float
q
;
float
fx,
fy,
fz
;
ATOM_PARAMETER_DATA
#
ifndef
PARAMETER_SIZE_IS_EVEN
float
padding
;
#
endif
}
AtomData
;
/**
...
...
@@ -22,7 +25,7 @@ __kernel void computeNonbonded(
__global
float4*
restrict
forceBuffers,
#
endif
__global
float*
restrict
energyBuffer,
__global
const
float4*
restrict
posq,
__global
const
unsigned
int*
restrict
exclusions,
__global
const
unsigned
int*
restrict
exclusionIndices,
__global
const
unsigned
int*
restrict
exclusionRowIndices,
__local
AtomData*
restrict
localData,
__global
const
unsigned
int*
restrict
exclusionIndices,
__global
const
unsigned
int*
restrict
exclusionRowIndices,
unsigned
int
startTileIndex,
unsigned
int
endTileIndex,
#
ifdef
USE_CUTOFF
__global
const
ushort2*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
const
unsigned
int*
restrict
interactionFlags
...
...
@@ -41,6 +44,7 @@ __kernel void computeNonbonded(
unsigned
int
end
=
startTileIndex+
(
warp+1
)
*numTiles/totalWarps
;
#
endif
float
energy
=
0.0f
;
__local
AtomData
localData[FORCE_WORK_GROUP_SIZE]
;
__local
float
tempBuffer[3*FORCE_WORK_GROUP_SIZE]
;
__local
unsigned
int
exclusionRange[2*WARPS_PER_GROUP]
;
__local
int
exclusionIndex[WARPS_PER_GROUP]
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment