Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
fed50628
Commit
fed50628
authored
Jan 27, 2012
by
Peter Eastman
Browse files
Tony's optimizations to reduce local memory use
parent
2dd09317
Changes
8
Show whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
174 additions
and
131 deletions
+174
-131
platforms/opencl/src/OpenCLKernels.cpp
platforms/opencl/src/OpenCLKernels.cpp
+10
-12
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
+11
-14
platforms/opencl/src/kernels/gbsaObc_cpu.cl
platforms/opencl/src/kernels/gbsaObc_cpu.cl
+2
-2
platforms/opencl/src/kernels/gbsaObc_default.cl
platforms/opencl/src/kernels/gbsaObc_default.cl
+81
-59
platforms/opencl/src/kernels/gbsaObc_nvidia.cl
platforms/opencl/src/kernels/gbsaObc_nvidia.cl
+2
-2
platforms/opencl/src/kernels/nonbonded_cpu.cl
platforms/opencl/src/kernels/nonbonded_cpu.cl
+2
-1
platforms/opencl/src/kernels/nonbonded_default.cl
platforms/opencl/src/kernels/nonbonded_default.cl
+61
-40
platforms/opencl/src/kernels/nonbonded_nvidia.cl
platforms/opencl/src/kernels/nonbonded_nvidia.cl
+5
-1
No files found.
platforms/opencl/src/OpenCLKernels.cpp
View file @
fed50628
...
@@ -1563,7 +1563,6 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
...
@@ -1563,7 +1563,6 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
(
useLong
?
longBornSum
->
getDeviceBuffer
()
:
bornSum
->
getDeviceBuffer
()));
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
(
useLong
?
longBornSum
->
getDeviceBuffer
()
:
bornSum
->
getDeviceBuffer
()));
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
cl
.
getPosq
().
getDeviceBuffer
());
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
cl
.
getPosq
().
getDeviceBuffer
());
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
params
->
getDeviceBuffer
());
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
params
->
getDeviceBuffer
());
computeBornSumKernel
.
setArg
(
index
++
,
(
deviceIsCpu
?
OpenCLContext
::
TileSize
:
nb
.
getForceThreadBlockSize
())
*
7
*
sizeof
(
cl_float
),
NULL
);
if
(
nb
.
getUseCutoff
())
{
if
(
nb
.
getUseCutoff
())
{
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractingTiles
().
getDeviceBuffer
());
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractingTiles
().
getDeviceBuffer
());
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractionCount
().
getDeviceBuffer
());
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractionCount
().
getDeviceBuffer
());
...
@@ -1585,7 +1584,6 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
...
@@ -1585,7 +1584,6 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
cl
.
getEnergyBuffer
().
getDeviceBuffer
());
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
cl
.
getEnergyBuffer
().
getDeviceBuffer
());
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
cl
.
getPosq
().
getDeviceBuffer
());
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
cl
.
getPosq
().
getDeviceBuffer
());
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
bornRadii
->
getDeviceBuffer
());
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
bornRadii
->
getDeviceBuffer
());
force1Kernel
.
setArg
(
index
++
,
(
deviceIsCpu
?
OpenCLContext
::
TileSize
:
nb
.
getForceThreadBlockSize
())
*
9
*
sizeof
(
cl_float
),
NULL
);
if
(
nb
.
getUseCutoff
())
{
if
(
nb
.
getUseCutoff
())
{
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractingTiles
().
getDeviceBuffer
());
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractingTiles
().
getDeviceBuffer
());
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractionCount
().
getDeviceBuffer
());
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractionCount
().
getDeviceBuffer
());
...
@@ -1624,19 +1622,19 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
...
@@ -1624,19 +1622,19 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
reduceBornForceKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
obcChain
->
getDeviceBuffer
());
reduceBornForceKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
obcChain
->
getDeviceBuffer
());
}
}
if
(
nb
.
getUseCutoff
())
{
if
(
nb
.
getUseCutoff
())
{
computeBornSumKernel
.
setArg
<
mm_float4
>
(
6
,
cl
.
getPeriodicBoxSize
());
computeBornSumKernel
.
setArg
<
mm_float4
>
(
5
,
cl
.
getPeriodicBoxSize
());
computeBornSumKernel
.
setArg
<
mm_float4
>
(
7
,
cl
.
getInvPeriodicBoxSize
());
computeBornSumKernel
.
setArg
<
mm_float4
>
(
6
,
cl
.
getInvPeriodicBoxSize
());
force1Kernel
.
setArg
<
mm_float4
>
(
8
,
cl
.
getPeriodicBoxSize
());
force1Kernel
.
setArg
<
mm_float4
>
(
7
,
cl
.
getPeriodicBoxSize
());
force1Kernel
.
setArg
<
mm_float4
>
(
9
,
cl
.
getInvPeriodicBoxSize
());
force1Kernel
.
setArg
<
mm_float4
>
(
8
,
cl
.
getInvPeriodicBoxSize
());
if
(
maxTiles
<
nb
.
getInteractingTiles
().
getSize
())
{
if
(
maxTiles
<
nb
.
getInteractingTiles
().
getSize
())
{
maxTiles
=
nb
.
getInteractingTiles
().
getSize
();
maxTiles
=
nb
.
getInteractingTiles
().
getSize
();
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
5
,
nb
.
getInteractingTiles
().
getDeviceBuffer
());
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
4
,
nb
.
getInteractingTiles
().
getDeviceBuffer
());
computeBornSumKernel
.
setArg
<
cl_uint
>
(
8
,
maxTiles
);
computeBornSumKernel
.
setArg
<
cl_uint
>
(
7
,
maxTiles
);
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
6
,
nb
.
getInteractingTiles
().
getDeviceBuffer
());
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
5
,
nb
.
getInteractingTiles
().
getDeviceBuffer
());
force1Kernel
.
setArg
<
cl_uint
>
(
10
,
maxTiles
);
force1Kernel
.
setArg
<
cl_uint
>
(
9
,
maxTiles
);
if
(
cl
.
getSIMDWidth
()
==
32
||
deviceIsCpu
)
{
if
(
cl
.
getSIMDWidth
()
==
32
||
deviceIsCpu
)
{
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
9
,
nb
.
getInteractionFlags
().
getDeviceBuffer
());
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
8
,
nb
.
getInteractionFlags
().
getDeviceBuffer
());
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
1
1
,
nb
.
getInteractionFlags
().
getDeviceBuffer
());
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
1
0
,
nb
.
getInteractionFlags
().
getDeviceBuffer
());
}
}
}
}
}
}
...
...
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
View file @
fed50628
...
@@ -326,8 +326,8 @@ void OpenCLNonbondedUtilities::prepareInteractions() {
...
@@ -326,8 +326,8 @@ void OpenCLNonbondedUtilities::prepareInteractions() {
void
OpenCLNonbondedUtilities
::
computeInteractions
()
{
void
OpenCLNonbondedUtilities
::
computeInteractions
()
{
if
(
cutoff
!=
-
1.0
)
{
if
(
cutoff
!=
-
1.0
)
{
if
(
useCutoff
)
{
if
(
useCutoff
)
{
forceKernel
.
setArg
<
mm_float4
>
(
1
1
,
context
.
getPeriodicBoxSize
());
forceKernel
.
setArg
<
mm_float4
>
(
1
0
,
context
.
getPeriodicBoxSize
());
forceKernel
.
setArg
<
mm_float4
>
(
1
2
,
context
.
getInvPeriodicBoxSize
());
forceKernel
.
setArg
<
mm_float4
>
(
1
1
,
context
.
getInvPeriodicBoxSize
());
}
}
context
.
executeKernel
(
forceKernel
,
numForceThreadBlocks
*
forceThreadBlockSize
,
forceThreadBlockSize
);
context
.
executeKernel
(
forceKernel
,
numForceThreadBlocks
*
forceThreadBlockSize
,
forceThreadBlockSize
);
}
}
...
@@ -349,14 +349,14 @@ void OpenCLNonbondedUtilities::updateNeighborListSize() {
...
@@ -349,14 +349,14 @@ void OpenCLNonbondedUtilities::updateNeighborListSize() {
newSize
=
numTiles
;
newSize
=
numTiles
;
delete
interactingTiles
;
delete
interactingTiles
;
interactingTiles
=
new
OpenCLArray
<
mm_ushort2
>
(
context
,
newSize
,
"interactingTiles"
);
interactingTiles
=
new
OpenCLArray
<
mm_ushort2
>
(
context
,
newSize
,
"interactingTiles"
);
forceKernel
.
setArg
<
cl
::
Buffer
>
(
9
,
interactingTiles
->
getDeviceBuffer
());
forceKernel
.
setArg
<
cl
::
Buffer
>
(
8
,
interactingTiles
->
getDeviceBuffer
());
forceKernel
.
setArg
<
cl_uint
>
(
1
3
,
newSize
);
forceKernel
.
setArg
<
cl_uint
>
(
1
2
,
newSize
);
findInteractingBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
6
,
interactingTiles
->
getDeviceBuffer
());
findInteractingBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
6
,
interactingTiles
->
getDeviceBuffer
());
findInteractingBlocksKernel
.
setArg
<
cl_uint
>
(
9
,
newSize
);
findInteractingBlocksKernel
.
setArg
<
cl_uint
>
(
9
,
newSize
);
if
(
context
.
getSIMDWidth
()
==
32
||
deviceIsCpu
)
{
if
(
context
.
getSIMDWidth
()
==
32
||
deviceIsCpu
)
{
delete
interactionFlags
;
delete
interactionFlags
;
interactionFlags
=
new
OpenCLArray
<
cl_uint
>
(
context
,
deviceIsCpu
?
2
*
newSize
:
newSize
,
"interactionFlags"
);
interactionFlags
=
new
OpenCLArray
<
cl_uint
>
(
context
,
deviceIsCpu
?
2
*
newSize
:
newSize
,
"interactionFlags"
);
forceKernel
.
setArg
<
cl
::
Buffer
>
(
1
4
,
interactionFlags
->
getDeviceBuffer
());
forceKernel
.
setArg
<
cl
::
Buffer
>
(
1
3
,
interactionFlags
->
getDeviceBuffer
());
findInteractingBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
7
,
interactionFlags
->
getDeviceBuffer
());
findInteractingBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
7
,
interactionFlags
->
getDeviceBuffer
());
findInteractionsWithinBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
4
,
interactingTiles
->
getDeviceBuffer
());
findInteractionsWithinBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
4
,
interactingTiles
->
getDeviceBuffer
());
findInteractionsWithinBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
7
,
interactionFlags
->
getDeviceBuffer
());
findInteractionsWithinBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
7
,
interactionFlags
->
getDeviceBuffer
());
...
@@ -369,22 +369,22 @@ void OpenCLNonbondedUtilities::setTileRange(int startTileIndex, int numTiles) {
...
@@ -369,22 +369,22 @@ void OpenCLNonbondedUtilities::setTileRange(int startTileIndex, int numTiles) {
this
->
numTiles
=
numTiles
;
this
->
numTiles
=
numTiles
;
if
(
cutoff
==
-
1.0
)
if
(
cutoff
==
-
1.0
)
return
;
// There are no nonbonded interactions in the System.
return
;
// There are no nonbonded interactions in the System.
forceKernel
.
setArg
<
cl_uint
>
(
7
,
startTileIndex
);
forceKernel
.
setArg
<
cl_uint
>
(
6
,
startTileIndex
);
forceKernel
.
setArg
<
cl_uint
>
(
8
,
startTileIndex
+
numTiles
);
forceKernel
.
setArg
<
cl_uint
>
(
7
,
startTileIndex
+
numTiles
);
if
(
useCutoff
)
{
if
(
useCutoff
)
{
findInteractingBlocksKernel
.
setArg
<
cl_uint
>
(
10
,
startTileIndex
);
findInteractingBlocksKernel
.
setArg
<
cl_uint
>
(
10
,
startTileIndex
);
findInteractingBlocksKernel
.
setArg
<
cl_uint
>
(
11
,
startTileIndex
+
numTiles
);
findInteractingBlocksKernel
.
setArg
<
cl_uint
>
(
11
,
startTileIndex
+
numTiles
);
}
}
else
else
forceKernel
.
setArg
<
cl_uint
>
(
9
,
numTiles
);
forceKernel
.
setArg
<
cl_uint
>
(
8
,
numTiles
);
}
}
cl
::
Kernel
OpenCLNonbondedUtilities
::
createInteractionKernel
(
const
string
&
source
,
const
vector
<
ParameterInfo
>&
params
,
const
vector
<
ParameterInfo
>&
arguments
,
bool
useExclusions
,
bool
isSymmetric
)
const
{
cl
::
Kernel
OpenCLNonbondedUtilities
::
createInteractionKernel
(
const
string
&
source
,
const
vector
<
ParameterInfo
>&
params
,
const
vector
<
ParameterInfo
>&
arguments
,
bool
useExclusions
,
bool
isSymmetric
)
const
{
map
<
string
,
string
>
replacements
;
map
<
string
,
string
>
replacements
;
replacements
[
"COMPUTE_INTERACTION"
]
=
source
;
replacements
[
"COMPUTE_INTERACTION"
]
=
source
;
int
localDataSize
=
7
*
sizeof
(
cl_float
);
const
string
suffixes
[]
=
{
"x"
,
"y"
,
"z"
,
"w"
};
const
string
suffixes
[]
=
{
"x"
,
"y"
,
"z"
,
"w"
};
stringstream
localData
;
stringstream
localData
;
int
localDataSize
=
0
;
for
(
int
i
=
0
;
i
<
(
int
)
params
.
size
();
i
++
)
{
for
(
int
i
=
0
;
i
<
(
int
)
params
.
size
();
i
++
)
{
if
(
params
[
i
].
getNumComponents
()
==
1
)
if
(
params
[
i
].
getNumComponents
()
==
1
)
localData
<<
params
[
i
].
getType
()
<<
" "
<<
params
[
i
].
getName
()
<<
";
\n
"
;
localData
<<
params
[
i
].
getType
()
<<
" "
<<
params
[
i
].
getName
()
<<
";
\n
"
;
...
@@ -394,10 +394,6 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
...
@@ -394,10 +394,6 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
}
}
localDataSize
+=
params
[
i
].
getSize
();
localDataSize
+=
params
[
i
].
getSize
();
}
}
if
((
localDataSize
/
4
)
%
2
==
0
)
{
localData
<<
"float padding;
\n
"
;
localDataSize
+=
4
;
}
replacements
[
"ATOM_PARAMETER_DATA"
]
=
localData
.
str
();
replacements
[
"ATOM_PARAMETER_DATA"
]
=
localData
.
str
();
stringstream
args
;
stringstream
args
;
for
(
int
i
=
0
;
i
<
(
int
)
params
.
size
();
i
++
)
{
for
(
int
i
=
0
;
i
<
(
int
)
params
.
size
();
i
++
)
{
...
@@ -487,6 +483,8 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
...
@@ -487,6 +483,8 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
defines
[
"NUM_ATOMS"
]
=
OpenCLExpressionUtilities
::
intToString
(
context
.
getNumAtoms
());
defines
[
"NUM_ATOMS"
]
=
OpenCLExpressionUtilities
::
intToString
(
context
.
getNumAtoms
());
defines
[
"PADDED_NUM_ATOMS"
]
=
OpenCLExpressionUtilities
::
intToString
(
context
.
getPaddedNumAtoms
());
defines
[
"PADDED_NUM_ATOMS"
]
=
OpenCLExpressionUtilities
::
intToString
(
context
.
getPaddedNumAtoms
());
defines
[
"NUM_BLOCKS"
]
=
OpenCLExpressionUtilities
::
intToString
(
context
.
getNumAtomBlocks
());
defines
[
"NUM_BLOCKS"
]
=
OpenCLExpressionUtilities
::
intToString
(
context
.
getNumAtomBlocks
());
if
((
localDataSize
/
4
)
%
2
==
0
)
defines
[
"PARAMETER_SIZE_IS_EVEN"
]
=
"1"
;
string
file
;
string
file
;
if
(
deviceIsCpu
)
if
(
deviceIsCpu
)
file
=
OpenCLKernelSources
::
nonbonded_cpu
;
file
=
OpenCLKernelSources
::
nonbonded_cpu
;
...
@@ -509,7 +507,6 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
...
@@ -509,7 +507,6 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
exclusions
->
getDeviceBuffer
());
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
exclusions
->
getDeviceBuffer
());
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
exclusionIndices
->
getDeviceBuffer
());
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
exclusionIndices
->
getDeviceBuffer
());
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
exclusionRowIndices
->
getDeviceBuffer
());
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
exclusionRowIndices
->
getDeviceBuffer
());
kernel
.
setArg
(
index
++
,
(
deviceIsCpu
?
OpenCLContext
::
TileSize
*
localDataSize
:
forceThreadBlockSize
*
localDataSize
),
NULL
);
kernel
.
setArg
<
cl_uint
>
(
index
++
,
startTileIndex
);
kernel
.
setArg
<
cl_uint
>
(
index
++
,
startTileIndex
);
kernel
.
setArg
<
cl_uint
>
(
index
++
,
startTileIndex
+
numTiles
);
kernel
.
setArg
<
cl_uint
>
(
index
++
,
startTileIndex
+
numTiles
);
if
(
useCutoff
)
{
if
(
useCutoff
)
{
...
...
platforms/opencl/src/kernels/gbsaObc_cpu.cl
View file @
fed50628
...
@@ -12,7 +12,6 @@ typedef struct {
...
@@ -12,7 +12,6 @@ typedef struct {
*/
*/
__kernel
void
computeBornSum
(
__global
float*
restrict
global_bornSum,
__global
const
float4*
restrict
posq,
__global
const
float2*
restrict
global_params,
__kernel
void
computeBornSum
(
__global
float*
restrict
global_bornSum,
__global
const
float4*
restrict
posq,
__global
const
float2*
restrict
global_params,
__local
AtomData1*
restrict
localData,
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
__global
const
ushort2*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
const
unsigned
int*
restrict
interactionFlags
)
{
__global
const
ushort2*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
const
unsigned
int*
restrict
interactionFlags
)
{
#
else
#
else
...
@@ -27,6 +26,7 @@ __kernel void computeBornSum(__global float* restrict global_bornSum, __global c
...
@@ -27,6 +26,7 @@ __kernel void computeBornSum(__global float* restrict global_bornSum, __global c
unsigned
int
end
=
(
get_group_id
(
0
)
+1
)
*numTiles/get_num_groups
(
0
)
;
unsigned
int
end
=
(
get_group_id
(
0
)
+1
)
*numTiles/get_num_groups
(
0
)
;
#
endif
#
endif
unsigned
int
lasty
=
0xFFFFFFFF
;
unsigned
int
lasty
=
0xFFFFFFFF
;
__local
AtomData1
localData[TILE_SIZE]
;
while
(
pos
<
end
)
{
while
(
pos
<
end
)
{
//
Extract
the
coordinates
of
this
tile
//
Extract
the
coordinates
of
this
tile
...
@@ -196,7 +196,6 @@ typedef struct {
...
@@ -196,7 +196,6 @@ typedef struct {
__kernel void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* restrict global_bornForce,
__kernel void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* restrict global_bornForce,
__global float* restrict energyBuffer, __global const float4* restrict posq, __global const float* restrict global_bornRadii,
__global float* restrict energyBuffer, __global const float4* restrict posq, __global const float* restrict global_bornRadii,
__local AtomData2* restrict localData,
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags) {
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags) {
#else
#else
...
@@ -212,6 +211,7 @@ __kernel void computeGBSAForce1(__global float4* restrict forceBuffers, __global
...
@@ -212,6 +211,7 @@ __kernel void computeGBSAForce1(__global float4* restrict forceBuffers, __global
#endif
#endif
float energy = 0.0f;
float energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF;
unsigned int lasty = 0xFFFFFFFF;
__local AtomData2 localData[TILE_SIZE];
while (pos < end) {
while (pos < end) {
// Extract the coordinates of this tile
// Extract the coordinates of this tile
...
...
platforms/opencl/src/kernels/gbsaObc_default.cl
View file @
fed50628
...
@@ -2,9 +2,7 @@
...
@@ -2,9 +2,7 @@
typedef
struct
{
typedef
struct
{
float
x,
y,
z
;
float
x,
y,
z
;
float
q
;
float
radius,
scaledRadius
;
float
radius,
scaledRadius
;
float
bornSum
;
}
AtomData1
;
}
AtomData1
;
/**
/**
...
@@ -13,7 +11,6 @@ typedef struct {
...
@@ -13,7 +11,6 @@ typedef struct {
__kernel
__attribute__
((
reqd_work_group_size
(
FORCE_WORK_GROUP_SIZE,
1
,
1
)))
__kernel
__attribute__
((
reqd_work_group_size
(
FORCE_WORK_GROUP_SIZE,
1
,
1
)))
void
computeBornSum
(
__global
float*
restrict
global_bornSum,
__global
const
float4*
restrict
posq,
__global
const
float2*
restrict
global_params,
void
computeBornSum
(
__global
float*
restrict
global_bornSum,
__global
const
float4*
restrict
posq,
__global
const
float2*
restrict
global_params,
__local
AtomData1*
restrict
localData,
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
__global
const
ushort2*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles
)
{
__global
const
ushort2*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles
)
{
#
else
#
else
...
@@ -28,7 +25,9 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
...
@@ -28,7 +25,9 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
unsigned
int
end
=
(
get_group_id
(
0
)
+1
)
*numTiles/get_num_groups
(
0
)
;
unsigned
int
end
=
(
get_group_id
(
0
)
+1
)
*numTiles/get_num_groups
(
0
)
;
#
endif
#
endif
unsigned
int
lasty
=
0xFFFFFFFF
;
unsigned
int
lasty
=
0xFFFFFFFF
;
__local
float
tempBuffer[FORCE_WORK_GROUP_SIZE/2]
;
__local
AtomData1
localData[TILE_SIZE]
;
__local
float
localBornSum[FORCE_WORK_GROUP_SIZE]
;
__local
float
localTemp[TILE_SIZE]
;
while
(
pos
<
end
)
{
while
(
pos
<
end
)
{
//
Extract
the
coordinates
of
this
tile
//
Extract
the
coordinates
of
this
tile
...
@@ -51,7 +50,7 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
...
@@ -51,7 +50,7 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
}
}
unsigned int baseLocalAtom = (get_local_id(0) < TILE_SIZE ? 0 : TILE_SIZE/2);
unsigned int baseLocalAtom = (get_local_id(0) < TILE_SIZE ? 0 : TILE_SIZE/2);
unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
unsigned int
forceBufferOffset = (tgx < TILE_SIZE/2 ? 0 :
TILE_SIZE);
unsigned int
localForceOffset = get_local_id(0) & ~(
TILE_SIZE
-1
);
unsigned int atom1 = x*TILE_SIZE + tgx;
unsigned int atom1 = x*TILE_SIZE + tgx;
float bornSum = 0.0f;
float bornSum = 0.0f;
float4 posq1 = posq[atom1];
float4 posq1 = posq[atom1];
...
@@ -59,12 +58,13 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
...
@@ -59,12 +58,13 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
if (x == y) {
if (x == y) {
// This tile is on the diagonal.
// This tile is on the diagonal.
if (get_local_id(0) < TILE_SIZE) {
localData[get_local_id(0)].x = posq1.x;
localData[get_local_id(0)].x = posq1.x;
localData[get_local_id(0)].y = posq1.y;
localData[get_local_id(0)].y = posq1.y;
localData[get_local_id(0)].z = posq1.z;
localData[get_local_id(0)].z = posq1.z;
localData[get_local_id(0)].q = posq1.w;
localData[get_local_id(0)].radius = params1.x;
localData[get_local_id(0)].radius = params1.x;
localData[get_local_id(0)].scaledRadius = params1.y;
localData[get_local_id(0)].scaledRadius = params1.y;
}
barrier(CLK_LOCAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
float4 delta = (float4) (localData[baseLocalAtom+j].x-posq1.x, localData[baseLocalAtom+j].y-posq1.y, localData[baseLocalAtom+j].z-posq1.z, 0.0f);
float4 delta = (float4) (localData[baseLocalAtom+j].x-posq1.x, localData[baseLocalAtom+j].y-posq1.y, localData[baseLocalAtom+j].z-posq1.z, 0.0f);
...
@@ -96,7 +96,7 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
...
@@ -96,7 +96,7 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
// Sum the forces and write results.
// Sum the forces and write results.
if (get_local_id(0) >= TILE_SIZE)
if (get_local_id(0) >= TILE_SIZE)
tempBuffer
[tgx] = bornSum;
localTemp
[tgx] = bornSum;
barrier(CLK_LOCAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) {
if (get_local_id(0) < TILE_SIZE) {
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
...
@@ -104,8 +104,9 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
...
@@ -104,8 +104,9 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
#else
#else
unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
#endif
global_bornSum[offset] += bornSum+
tempBuffer
[tgx];
global_bornSum[offset] += bornSum+
localTemp
[tgx];
}
}
// barrier not required here as localTemp is not accessed before encountering another barrier.
}
}
else {
else {
// This is an off-diagonal tile.
// This is an off-diagonal tile.
...
@@ -116,19 +117,18 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
...
@@ -116,19 +117,18 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
localData[get_local_id(0)].x = tempPosq.x;
localData[get_local_id(0)].x = tempPosq.x;
localData[get_local_id(0)].y = tempPosq.y;
localData[get_local_id(0)].y = tempPosq.y;
localData[get_local_id(0)].z = tempPosq.z;
localData[get_local_id(0)].z = tempPosq.z;
localData[get_local_id(0)].q = tempPosq.w;
float2 tempParams = global_params[j];
float2 tempParams = global_params[j];
localData[get_local_id(0)].radius = tempParams.x;
localData[get_local_id(0)].radius = tempParams.x;
localData[get_local_id(0)].scaledRadius = tempParams.y;
localData[get_local_id(0)].scaledRadius = tempParams.y;
}
}
local
Data
[get_local_id(0)]
.bornSum
= 0.0f;
local
BornSum
[get_local_id(0)] = 0.0f;
barrier(CLK_LOCAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
// Compute the full set of interactions in this tile.
// Compute the full set of interactions in this tile.
unsigned int tj = tgx
%
(TILE_SIZE
/2
);
unsigned int tj =
(
tgx
+baseLocalAtom) &
(TILE_SIZE
-1
);
for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
float4 delta = (float4) (localData[
baseLocalAtom+
tj].x-posq1.x, localData[
baseLocalAtom+
tj].y-posq1.y, localData[
baseLocalAtom+
tj].z-posq1.z, 0.0f);
float4 delta = (float4) (localData[tj].x-posq1.x, localData[tj].y-posq1.y, localData[tj].z-posq1.z, 0.0f);
#ifdef USE_PERIODIC
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
...
@@ -136,13 +136,13 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
...
@@ -136,13 +136,13 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
#endif
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+
baseLocalAtom+
tj < NUM_ATOMS && r2 < CUTOFF_SQUARED);
unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS && r2 < CUTOFF_SQUARED);
#else
#else
unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+
baseLocalAtom+
tj < NUM_ATOMS);
unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS);
#endif
#endif
float invR = RSQRT(r2);
float invR = RSQRT(r2);
float r = RECIP(invR);
float r = RECIP(invR);
float2 params2 = (float2) (localData[
baseLocalAtom+
tj].radius, localData[
baseLocalAtom+
tj].scaledRadius);
float2 params2 = (float2) (localData[tj].radius, localData[tj].scaledRadius);
float rScaledRadiusJ = r+params2.y;
float rScaledRadiusJ = r+params2.y;
{
{
float l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
float l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
...
@@ -165,16 +165,16 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
...
@@ -165,16 +165,16 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
float term = l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
float term = l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params1.y*params1.y*invR)*(l_ij2-u_ij2);
(0.25f*params1.y*params1.y*invR)*(l_ij2-u_ij2);
term += select(0.0f, 2.0f*(RECIP(params2.x)-l_ij), params2.x < params1.y-r);
term += select(0.0f, 2.0f*(RECIP(params2.x)-l_ij), params2.x < params1.y-r);
local
Data[baseLocalAtom+tj+forceBufferOffset].bornSum
+= select(0.0f, term, includeInteraction && params2.x < rScaledRadiusI);
local
BornSum[tj+localForceOffset]
+= select(0.0f, term, includeInteraction && params2.x < rScaledRadiusI);
}
}
barrier(CLK_LOCAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
tj = (tj+1)
%
(TILE_SIZE
/2
);
tj = (tj+1)
&
(TILE_SIZE
-1
);
}
}
// Sum the forces and write results.
// Sum the forces and write results.
if (get_local_id(0) >= TILE_SIZE)
if (get_local_id(0) >= TILE_SIZE)
tempBuffer
[tgx] = bornSum;
localTemp
[tgx] = bornSum;
barrier(CLK_LOCAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) {
if (get_local_id(0) < TILE_SIZE) {
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
...
@@ -187,22 +187,28 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
...
@@ -187,22 +187,28 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
// Do both loads before both stores to minimize store-load waits.
// Do both loads before both stores to minimize store-load waits.
float sum1 = global_bornSum[offset1];
float sum1 = global_bornSum[offset1];
float sum2 = global_bornSum[offset2];
float sum2 = global_bornSum[offset2];
sum1 += bornSum +
tempBuffer
[tgx];
sum1 += bornSum +
localTemp
[tgx];
sum2 += local
Data
[get_local_id(0)]
.bornSum
+ local
Data
[get_local_id(0)+TILE_SIZE]
.bornSum
;
sum2 += local
BornSum
[get_local_id(0)] + local
BornSum
[get_local_id(0)+TILE_SIZE];
global_bornSum[offset1] = sum1;
global_bornSum[offset1] = sum1;
global_bornSum[offset2] = sum2;
global_bornSum[offset2] = sum2;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}
}
lasty = y;
lasty = y;
pos++;
pos++;
}
}
}
}
typedef struct {
float x, y, z, w;
float padding;
} PaddedUnalignedFloat4;
typedef struct {
typedef struct {
float x, y, z;
float x, y, z;
float q;
float q;
float fx, fy, fz, fw;
float bornRadius;
float bornRadius;
float temp_x, temp_y, temp_z, temp_w;
} AtomData2;
} AtomData2;
/**
/**
...
@@ -212,7 +218,6 @@ typedef struct {
...
@@ -212,7 +218,6 @@ typedef struct {
__kernel __attribute__((reqd_work_group_size(FORCE_WORK_GROUP_SIZE, 1, 1)))
__kernel __attribute__((reqd_work_group_size(FORCE_WORK_GROUP_SIZE, 1, 1)))
void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* restrict global_bornForce,
void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* restrict global_bornForce,
__global float* restrict energyBuffer, __global const float4* restrict posq, __global const float* restrict global_bornRadii,
__global float* restrict energyBuffer, __global const float4* restrict posq, __global const float* restrict global_bornRadii,
__local AtomData2* restrict localData,
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles) {
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles) {
#else
#else
...
@@ -228,7 +233,8 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
...
@@ -228,7 +233,8 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
#endif
#endif
float energy = 0.0f;
float energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF;
unsigned int lasty = 0xFFFFFFFF;
__local float4 tempBuffer[FORCE_WORK_GROUP_SIZE/2];
__local AtomData2 localData[TILE_SIZE];
__local PaddedUnalignedFloat4 localForce[FORCE_WORK_GROUP_SIZE];
while (pos < end) {
while (pos < end) {
// Extract the coordinates of this tile
// Extract the coordinates of this tile
...
@@ -251,7 +257,7 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
...
@@ -251,7 +257,7 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
}
}
unsigned
int
baseLocalAtom
=
(
get_local_id
(
0
)
<
TILE_SIZE
?
0
:
TILE_SIZE/2
)
;
unsigned
int
baseLocalAtom
=
(
get_local_id
(
0
)
<
TILE_SIZE
?
0
:
TILE_SIZE/2
)
;
unsigned
int
tgx
=
get_local_id
(
0
)
&
(
TILE_SIZE-1
)
;
unsigned
int
tgx
=
get_local_id
(
0
)
&
(
TILE_SIZE-1
)
;
unsigned
int
forceBufferOffset
=
(
tgx
<
TILE_SIZE/2
?
0
:
TILE_SIZE
)
;
unsigned
int
localForceOffset
=
get_local_id
(
0
)
&
~
(
TILE_SIZE
-1
)
;
unsigned
int
atom1
=
x*TILE_SIZE
+
tgx
;
unsigned
int
atom1
=
x*TILE_SIZE
+
tgx
;
float4
force
=
0.0f
;
float4
force
=
0.0f
;
float4
posq1
=
posq[atom1]
;
float4
posq1
=
posq[atom1]
;
...
@@ -259,11 +265,13 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
...
@@ -259,11 +265,13 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
if
(
x
==
y
)
{
if
(
x
==
y
)
{
//
This
tile
is
on
the
diagonal.
//
This
tile
is
on
the
diagonal.
if
(
get_local_id
(
0
)
<
TILE_SIZE
)
{
localData[get_local_id
(
0
)
].x
=
posq1.x
;
localData[get_local_id
(
0
)
].x
=
posq1.x
;
localData[get_local_id
(
0
)
].y
=
posq1.y
;
localData[get_local_id
(
0
)
].y
=
posq1.y
;
localData[get_local_id
(
0
)
].z
=
posq1.z
;
localData[get_local_id
(
0
)
].z
=
posq1.z
;
localData[get_local_id
(
0
)
].q
=
posq1.w
;
localData[get_local_id
(
0
)
].q
=
posq1.w
;
localData[get_local_id
(
0
)
].bornRadius
=
bornRadius1
;
localData[get_local_id
(
0
)
].bornRadius
=
bornRadius1
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
for
(
unsigned
int
j
=
0
; j < TILE_SIZE/2; j++) {
for
(
unsigned
int
j
=
0
; j < TILE_SIZE/2; j++) {
unsigned
int
includeInteraction
=
(
atom1
<
NUM_ATOMS
&&
y*TILE_SIZE+baseLocalAtom+j
<
NUM_ATOMS
)
;
unsigned
int
includeInteraction
=
(
atom1
<
NUM_ATOMS
&&
y*TILE_SIZE+baseLocalAtom+j
<
NUM_ATOMS
)
;
...
@@ -300,8 +308,12 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
...
@@ -300,8 +308,12 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
//
Sum
the
forces
and
write
results.
//
Sum
the
forces
and
write
results.
if
(
get_local_id
(
0
)
>=
TILE_SIZE
)
if
(
get_local_id
(
0
)
>=
TILE_SIZE
)
{
tempBuffer[tgx]
=
force
;
localData[tgx].temp_x
=
force.x
;
localData[tgx].temp_y
=
force.y
;
localData[tgx].temp_z
=
force.z
;
localData[tgx].temp_w
=
force.w
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
get_local_id
(
0
)
<
TILE_SIZE
)
{
if
(
get_local_id
(
0
)
<
TILE_SIZE
)
{
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
...
@@ -312,11 +324,14 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
...
@@ -312,11 +324,14 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
//
Cheaper
to
load/store
float4
than
float3.
Do
all
loads
before
all
stores
to
minimize
store-load
waits.
//
Cheaper
to
load/store
float4
than
float3.
Do
all
loads
before
all
stores
to
minimize
store-load
waits.
float4
sum
=
forceBuffers[offset]
;
float4
sum
=
forceBuffers[offset]
;
float
global_sum
=
global_bornForce[offset]
;
float
global_sum
=
global_bornForce[offset]
;
sum.xyz
+=
force.xyz
+
tempBuffer[tgx].xyz
;
sum.x
+=
force.x
+
localData[tgx].temp_x
;
global_sum
+=
force.w
+
tempBuffer[tgx].w
;
sum.y
+=
force.y
+
localData[tgx].temp_y
;
sum.z
+=
force.z
+
localData[tgx].temp_z
;
global_sum
+=
force.w
+
localData[tgx].temp_w
;
forceBuffers[offset]
=
sum
;
forceBuffers[offset]
=
sum
;
global_bornForce[offset]
=
global_sum
;
global_bornForce[offset]
=
global_sum
;
}
}
//
barrier
not
required
here
as
localData[*]/temp_*
is
not
accessed
before
encountering
another
barrier.
}
}
else
{
else
{
//
This
is
an
off-diagonal
tile.
//
This
is
an
off-diagonal
tile.
...
@@ -330,18 +345,18 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
...
@@ -330,18 +345,18 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
localData[get_local_id
(
0
)
].q
=
tempPosq.w
;
localData[get_local_id
(
0
)
].q
=
tempPosq.w
;
localData[get_local_id
(
0
)
].bornRadius
=
global_bornRadii[j]
;
localData[get_local_id
(
0
)
].bornRadius
=
global_bornRadii[j]
;
}
}
local
Data
[get_local_id
(
0
)
].
f
x
=
0.0f
;
local
Force
[get_local_id
(
0
)
].x
=
0.0f
;
local
Data
[get_local_id
(
0
)
].
f
y
=
0.0f
;
local
Force
[get_local_id
(
0
)
].y
=
0.0f
;
local
Data
[get_local_id
(
0
)
].
f
z
=
0.0f
;
local
Force
[get_local_id
(
0
)
].z
=
0.0f
;
local
Data
[get_local_id
(
0
)
].
f
w
=
0.0f
;
local
Force
[get_local_id
(
0
)
].w
=
0.0f
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
//
Compute
the
full
set
of
interactions
in
this
tile.
//
Compute
the
full
set
of
interactions
in
this
tile.
unsigned
int
tj
=
tgx
%
(
TILE_SIZE
/2
)
;
unsigned
int
tj
=
(
tgx
+baseLocalAtom
)
&
(
TILE_SIZE
-1
)
;
for
(
unsigned
int
j
=
0
; j < TILE_SIZE/2; j++) {
for
(
unsigned
int
j
=
0
; j < TILE_SIZE/2; j++) {
unsigned
int
includeInteraction
=
(
atom1
<
NUM_ATOMS
&&
y*TILE_SIZE+
baseLocalAtom+
tj
<
NUM_ATOMS
)
;
unsigned
int
includeInteraction
=
(
atom1
<
NUM_ATOMS
&&
y*TILE_SIZE+tj
<
NUM_ATOMS
)
;
float4
posq2
=
(
float4
)
(
localData[
baseLocalAtom+
tj].x,
localData[
baseLocalAtom+
tj].y,
localData[
baseLocalAtom+
tj].z,
localData[
baseLocalAtom+
tj].q
)
;
float4
posq2
=
(
float4
)
(
localData[tj].x,
localData[tj].y,
localData[tj].z,
localData[tj].q
)
;
float4
delta
=
(
float4
)
(
posq2.xyz
-
posq1.xyz,
0.0f
)
;
float4
delta
=
(
float4
)
(
posq2.xyz
-
posq1.xyz,
0.0f
)
;
#
ifdef
USE_PERIODIC
#
ifdef
USE_PERIODIC
delta.x
-=
floor
(
delta.x*invPeriodicBoxSize.x+0.5f
)
*periodicBoxSize.x
;
delta.x
-=
floor
(
delta.x*invPeriodicBoxSize.x+0.5f
)
*periodicBoxSize.x
;
...
@@ -351,7 +366,7 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
...
@@ -351,7 +366,7 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
float
r2
=
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
;
float
r2
=
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
;
float
invR
=
RSQRT
(
r2
)
;
float
invR
=
RSQRT
(
r2
)
;
float
r
=
RECIP
(
invR
)
;
float
r
=
RECIP
(
invR
)
;
float
bornRadius2
=
localData[
baseLocalAtom+
tj].bornRadius
;
float
bornRadius2
=
localData[tj].bornRadius
;
float
alpha2_ij
=
bornRadius1*bornRadius2
;
float
alpha2_ij
=
bornRadius1*bornRadius2
;
float
D_ij
=
r2*RECIP
(
4.0f*alpha2_ij
)
;
float
D_ij
=
r2*RECIP
(
4.0f*alpha2_ij
)
;
float
expTerm
=
EXP
(
-D_ij
)
;
float
expTerm
=
EXP
(
-D_ij
)
;
...
@@ -370,18 +385,22 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
...
@@ -370,18 +385,22 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
energy
+=
select
(
0.0f,
tempEnergy,
includeInteraction
)
;
energy
+=
select
(
0.0f,
tempEnergy,
includeInteraction
)
;
delta.xyz
*=
select
(
0.0f,
dEdR,
includeInteraction
)
;
delta.xyz
*=
select
(
0.0f,
dEdR,
includeInteraction
)
;
force.xyz
-=
delta.xyz
;
force.xyz
-=
delta.xyz
;
local
Data[baseLocalAtom+tj+forceBuffer
Offset].
f
x
+=
delta.x
;
local
Force[tj+localForce
Offset].x
+=
delta.x
;
local
Data[baseLocalAtom+tj+forceBuffer
Offset].
f
y
+=
delta.y
;
local
Force[tj+localForce
Offset].y
+=
delta.y
;
local
Data[baseLocalAtom+tj+forceBuffer
Offset].
f
z
+=
delta.z
;
local
Force[tj+localForce
Offset].z
+=
delta.z
;
local
Data[baseLocalAtom+tj+forceBuffer
Offset].
f
w
+=
select
(
0.0f,
dGpol_dalpha2_ij*bornRadius1,
includeInteraction
)
;
local
Force[tj+localForce
Offset].w
+=
select
(
0.0f,
dGpol_dalpha2_ij*bornRadius1,
includeInteraction
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
tj
=
(
tj+1
)
%
(
TILE_SIZE
/2
)
;
tj
=
(
tj+1
)
&
(
TILE_SIZE
-1
)
;
}
}
//
Sum
the
forces
and
write
results.
//
Sum
the
forces
and
write
results.
if
(
get_local_id
(
0
)
>=
TILE_SIZE
)
if
(
get_local_id
(
0
)
>=
TILE_SIZE
)
{
tempBuffer[tgx]
=
force
;
localData[tgx].temp_x
=
force.x
;
localData[tgx].temp_y
=
force.y
;
localData[tgx].temp_z
=
force.z
;
localData[tgx].temp_w
=
force.w
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
get_local_id
(
0
)
<
TILE_SIZE
)
{
if
(
get_local_id
(
0
)
<
TILE_SIZE
)
{
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
...
@@ -396,17 +415,20 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
...
@@ -396,17 +415,20 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
float4
sum2
=
forceBuffers[offset2]
;
float4
sum2
=
forceBuffers[offset2]
;
float
global_sum1
=
global_bornForce[offset1]
;
float
global_sum1
=
global_bornForce[offset1]
;
float
global_sum2
=
global_bornForce[offset2]
;
float
global_sum2
=
global_bornForce[offset2]
;
sum1.xyz
+=
force.xyz
+
tempBuffer[tgx].xyz
;
sum1.x
+=
force.x
+
localData[tgx].temp_x
;
global_sum1
+=
force.w
+
tempBuffer[tgx].w
;
sum1.y
+=
force.y
+
localData[tgx].temp_y
;
sum2.x
+=
localData[get_local_id
(
0
)
].fx
+
localData[get_local_id
(
0
)
+TILE_SIZE].fx
;
sum1.z
+=
force.z
+
localData[tgx].temp_z
;
sum2.y
+=
localData[get_local_id
(
0
)
].fy
+
localData[get_local_id
(
0
)
+TILE_SIZE].fy
;
global_sum1
+=
force.w
+
localData[tgx].temp_w
;
sum2.z
+=
localData[get_local_id
(
0
)
].fz
+
localData[get_local_id
(
0
)
+TILE_SIZE].fz
;
sum2.x
+=
localForce[get_local_id
(
0
)
].x
+
localForce[get_local_id
(
0
)
+TILE_SIZE].x
;
global_sum2
+=
localData[get_local_id
(
0
)
].fw
+
localData[get_local_id
(
0
)
+TILE_SIZE].fw
;
sum2.y
+=
localForce[get_local_id
(
0
)
].y
+
localForce[get_local_id
(
0
)
+TILE_SIZE].y
;
sum2.z
+=
localForce[get_local_id
(
0
)
].z
+
localForce[get_local_id
(
0
)
+TILE_SIZE].z
;
global_sum2
+=
localForce[get_local_id
(
0
)
].w
+
localForce[get_local_id
(
0
)
+TILE_SIZE].w
;
forceBuffers[offset1]
=
sum1
;
forceBuffers[offset1]
=
sum1
;
forceBuffers[offset2]
=
sum2
;
forceBuffers[offset2]
=
sum2
;
global_bornForce[offset1]
=
global_sum1
;
global_bornForce[offset1]
=
global_sum1
;
global_bornForce[offset2]
=
global_sum2
;
global_bornForce[offset2]
=
global_sum2
;
}
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
}
lasty
=
y
;
lasty
=
y
;
pos++
;
pos++
;
...
...
platforms/opencl/src/kernels/gbsaObc_nvidia.cl
View file @
fed50628
...
@@ -22,7 +22,6 @@ __kernel void computeBornSum(
...
@@ -22,7 +22,6 @@ __kernel void computeBornSum(
__global
float*
restrict
global_bornSum,
__global
float*
restrict
global_bornSum,
#
endif
#
endif
__global
const
float4*
restrict
posq,
__global
const
float2*
restrict
global_params,
__global
const
float4*
restrict
posq,
__global
const
float2*
restrict
global_params,
__local
AtomData1*
restrict
localData,
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
__global
const
ushort2*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
const
unsigned
int*
restrict
interactionFlags,
__global
const
ushort2*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
const
unsigned
int*
restrict
interactionFlags,
#
else
#
else
...
@@ -40,6 +39,7 @@ __kernel void computeBornSum(
...
@@ -40,6 +39,7 @@ __kernel void computeBornSum(
unsigned
int
end
=
(
warp+1
)
*numTiles/totalWarps
;
unsigned
int
end
=
(
warp+1
)
*numTiles/totalWarps
;
#
endif
#
endif
unsigned
int
lasty
=
0xFFFFFFFF
;
unsigned
int
lasty
=
0xFFFFFFFF
;
__local
AtomData1
localData[FORCE_WORK_GROUP_SIZE]
;
__local
float
tempBuffer[FORCE_WORK_GROUP_SIZE]
;
__local
float
tempBuffer[FORCE_WORK_GROUP_SIZE]
;
__local
int2
reservedBlocks[WARPS_PER_GROUP]
;
__local
int2
reservedBlocks[WARPS_PER_GROUP]
;
__local
unsigned
int*
exclusionRange
=
(
__local
unsigned
int*
)
reservedBlocks
;
__local
unsigned
int*
exclusionRange
=
(
__local
unsigned
int*
)
reservedBlocks
;
...
@@ -344,7 +344,6 @@ __kernel void computeGBSAForce1(
...
@@ -344,7 +344,6 @@ __kernel void computeGBSAForce1(
__global float4* restrict forceBuffers, __global float* restrict global_bornForce,
__global float4* restrict forceBuffers, __global float* restrict global_bornForce,
#endif
#endif
__global float* restrict energyBuffer, __global const float4* restrict posq, __global const float* restrict global_bornRadii,
__global float* restrict energyBuffer, __global const float4* restrict posq, __global const float* restrict global_bornRadii,
__local AtomData2* restrict localData,
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags,
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags,
#else
#else
...
@@ -363,6 +362,7 @@ __kernel void computeGBSAForce1(
...
@@ -363,6 +362,7 @@ __kernel void computeGBSAForce1(
#endif
#endif
float energy = 0.0f;
float energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF;
unsigned int lasty = 0xFFFFFFFF;
__local AtomData2 localData[FORCE_WORK_GROUP_SIZE];
__local float4 tempBuffer[FORCE_WORK_GROUP_SIZE];
__local float4 tempBuffer[FORCE_WORK_GROUP_SIZE];
__local int2 reservedBlocks[WARPS_PER_GROUP];
__local int2 reservedBlocks[WARPS_PER_GROUP];
__local unsigned int* exclusionRange = (__local unsigned int*) reservedBlocks;
__local unsigned int* exclusionRange = (__local unsigned int*) reservedBlocks;
...
...
platforms/opencl/src/kernels/nonbonded_cpu.cl
View file @
fed50628
...
@@ -12,7 +12,7 @@ typedef struct {
...
@@ -12,7 +12,7 @@ typedef struct {
*/
*/
__kernel
void
computeNonbonded
(
__global
float4*
restrict
forceBuffers,
__global
float*
restrict
energyBuffer,
__global
const
float4*
restrict
posq,
__global
const
unsigned
int*
restrict
exclusions,
__kernel
void
computeNonbonded
(
__global
float4*
restrict
forceBuffers,
__global
float*
restrict
energyBuffer,
__global
const
float4*
restrict
posq,
__global
const
unsigned
int*
restrict
exclusions,
__global
const
unsigned
int*
restrict
exclusionIndices,
__global
const
unsigned
int*
restrict
exclusionRowIndices,
__local
AtomData*
restrict
localData,
__global
const
unsigned
int*
restrict
exclusionIndices,
__global
const
unsigned
int*
restrict
exclusionRowIndices,
unsigned
int
startTileIndex,
unsigned
int
endTileIndex,
unsigned
int
startTileIndex,
unsigned
int
endTileIndex,
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
__global
const
ushort2*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
const
unsigned
int*
restrict
interactionFlags
__global
const
ushort2*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
const
unsigned
int*
restrict
interactionFlags
...
@@ -30,6 +30,7 @@ __kernel void computeNonbonded(__global float4* restrict forceBuffers, __global
...
@@ -30,6 +30,7 @@ __kernel void computeNonbonded(__global float4* restrict forceBuffers, __global
#
endif
#
endif
float
energy
=
0.0f
;
float
energy
=
0.0f
;
unsigned
int
lasty
=
0xFFFFFFFF
;
unsigned
int
lasty
=
0xFFFFFFFF
;
__local
AtomData
localData[TILE_SIZE]
;
while
(
pos
<
end
)
{
while
(
pos
<
end
)
{
//
Extract
the
coordinates
of
this
tile
//
Extract
the
coordinates
of
this
tile
...
...
platforms/opencl/src/kernels/nonbonded_default.cl
View file @
fed50628
#
define
TILE_SIZE
32
#
define
TILE_SIZE
32
//
Cannot
use
float3
as
OpenCL
defines
it
to
be
4
DWORD
aligned.
This
would
//
cause
every
element
of
array
to
have
DWORD
of
padding
to
make
it
4
DWORD
//
aligned
which
wastes
space
and
causes
LDS
bank
conflicts
as
stride
is
no
//
longer
odd
DWORDS.
typedef
struct
{
float
x,
y,
z
;
}
UnalignedFloat3
;
typedef
struct
{
typedef
struct
{
float
x,
y,
z
;
float
x,
y,
z
;
float
q
;
float
q
;
float
fx,
fy,
fz
;
float
fx,
fy,
fz
;
ATOM_PARAMETER_DATA
ATOM_PARAMETER_DATA
#
ifndef
PARAMETER_SIZE_IS_EVEN
float
padding
;
#
endif
}
AtomData
;
}
AtomData
;
/**
/**
...
@@ -13,7 +24,7 @@ typedef struct {
...
@@ -13,7 +24,7 @@ typedef struct {
__kernel
__attribute__
((
reqd_work_group_size
(
FORCE_WORK_GROUP_SIZE,
1
,
1
)))
__kernel
__attribute__
((
reqd_work_group_size
(
FORCE_WORK_GROUP_SIZE,
1
,
1
)))
void
computeNonbonded
(
__global
float4*
restrict
forceBuffers,
__global
float*
restrict
energyBuffer,
__global
const
float4*
restrict
posq,
__global
const
unsigned
int*
restrict
exclusions,
void
computeNonbonded
(
__global
float4*
restrict
forceBuffers,
__global
float*
restrict
energyBuffer,
__global
const
float4*
restrict
posq,
__global
const
unsigned
int*
restrict
exclusions,
__global
const
unsigned
int*
restrict
exclusionIndices,
__global
const
unsigned
int*
restrict
exclusionRowIndices,
__local
AtomData*
restrict
localData,
__global
const
unsigned
int*
restrict
exclusionIndices,
__global
const
unsigned
int*
restrict
exclusionRowIndices,
unsigned
int
startTileIndex,
unsigned
int
endTileIndex,
unsigned
int
startTileIndex,
unsigned
int
endTileIndex,
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
__global
const
ushort2*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
const
unsigned
int*
restrict
interactionFlags
__global
const
ushort2*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
const
unsigned
int*
restrict
interactionFlags
...
@@ -31,9 +42,12 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
...
@@ -31,9 +42,12 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
#
endif
#
endif
float
energy
=
0.0f
;
float
energy
=
0.0f
;
unsigned
int
lasty
=
0xFFFFFFFF
;
unsigned
int
lasty
=
0xFFFFFFFF
;
__local
float
tempBuffer[3*
(
FORCE_WORK_GROUP_SIZE/2
)
]
;
__local
AtomData
localData[TILE_SIZE]
;
__local
UnalignedFloat3
localForce[FORCE_WORK_GROUP_SIZE]
;
#
ifdef
USE_EXCLUSIONS
__local
unsigned
int
exclusionRange[2]
;
__local
unsigned
int
exclusionRange[2]
;
__local
int
exclusionIndex[1]
;
__local
int
exclusionIndex[1]
;
#
endif
while
(
pos
<
end
)
{
while
(
pos
<
end
)
{
//
Extract
the
coordinates
of
this
tile
//
Extract
the
coordinates
of
this
tile
...
@@ -56,7 +70,7 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
...
@@ -56,7 +70,7 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
}
}
unsigned int baseLocalAtom = (get_local_id(0) < TILE_SIZE ? 0 : TILE_SIZE/2);
unsigned int baseLocalAtom = (get_local_id(0) < TILE_SIZE ? 0 : TILE_SIZE/2);
unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
unsigned int
forceBufferOffset = (tgx < TILE_SIZE/2 ? 0 :
TILE_SIZE);
unsigned int
localForceOffset = get_local_id(0) & ~(
TILE_SIZE
-1
);
unsigned int atom1 = x*TILE_SIZE + tgx;
unsigned int atom1 = x*TILE_SIZE + tgx;
float4 force = 0.0f;
float4 force = 0.0f;
float4 posq1 = posq[atom1];
float4 posq1 = posq[atom1];
...
@@ -79,12 +93,14 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
...
@@ -79,12 +93,14 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
if (x == y) {
if (x == y) {
// This tile is on the diagonal.
// This tile is on the diagonal.
const unsigned int localAtomIndex = get_local_id(0);
if (get_local_id(0) < TILE_SIZE) {
const unsigned int localAtomIndex = tgx;
localData[localAtomIndex].x = posq1.x;
localData[localAtomIndex].x = posq1.x;
localData[localAtomIndex].y = posq1.y;
localData[localAtomIndex].y = posq1.y;
localData[localAtomIndex].z = posq1.z;
localData[localAtomIndex].z = posq1.z;
localData[localAtomIndex].q = posq1.w;
localData[localAtomIndex].q = posq1.w;
LOAD_LOCAL_PARAMETERS_FROM_1
LOAD_LOCAL_PARAMETERS_FROM_1
}
barrier(CLK_LOCAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
#ifdef USE_EXCLUSIONS
#ifdef USE_EXCLUSIONS
unsigned int excl = exclusions[exclusionIndex[0]+tgx] >> baseLocalAtom;
unsigned int excl = exclusions[exclusionIndex[0]+tgx] >> baseLocalAtom;
...
@@ -93,7 +109,7 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
...
@@ -93,7 +109,7 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
#ifdef USE_EXCLUSIONS
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
bool isExcluded = !(excl & 0x1);
#endif
#endif
int atom2 = baseLocalAtom+j;
unsigned
int atom2 = baseLocalAtom+j;
float4 posq2 = (float4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
float4 posq2 = (float4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
#ifdef USE_PERIODIC
...
@@ -125,14 +141,16 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
...
@@ -125,14 +141,16 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
// Sum the forces and write results.
// Sum the forces and write results.
int bufferIndex = 3*tgx;
if (get_local_id(0) >= TILE_SIZE) {
if (get_local_id(0) >= TILE_SIZE) {
tempBuffer[bufferIndex]
= force.x;
localData[tgx].fx
= force.x;
tempBuffer[bufferIndex+1]
= force.y;
localData[tgx].fy
= force.y;
tempBuffer[bufferIndex+2]
= force.z;
localData[tgx].fz
= force.z;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) {
if (get_local_id(0) < TILE_SIZE) {
force.x += localData[tgx].fx;
force.y += localData[tgx].fy;
force.z += localData[tgx].fz;
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset = x*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
unsigned int offset = x*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else
#else
...
@@ -140,15 +158,16 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
...
@@ -140,15 +158,16 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
#endif
#endif
// Cheaper to load/store float4 than float3.
// Cheaper to load/store float4 than float3.
float4 sum = forceBuffers[offset];
float4 sum = forceBuffers[offset];
sum += force
+ (float4) (tempBuffer[bufferIndex], tempBuffer[bufferIndex+1], tempBuffer[bufferIndex+2], 0.0f)
;
sum
.xyz
+= force
.xyz
;
forceBuffers[offset] = sum;
forceBuffers[offset] = sum;
}
}
// barrier not required here as localData[*].temp is not accessed before encountering another barrier.
}
}
else {
else {
// This is an off-diagonal tile.
// This is an off-diagonal tile.
const unsigned int localAtomIndex =
get_local_id(0)
;
if (lasty != y &&
get_local_id(0)
< TILE_SIZE) {
if (lasty != y &&
localAtomIndex
< TILE_SIZE) {
const unsigned int
localAtomIndex
= tgx;
unsigned int j = y*TILE_SIZE + tgx;
unsigned int j = y*TILE_SIZE + tgx;
float4 tempPosq = posq[j];
float4 tempPosq = posq[j];
localData[localAtomIndex].x = tempPosq.x;
localData[localAtomIndex].x = tempPosq.x;
...
@@ -157,26 +176,23 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
...
@@ -157,26 +176,23 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
localData[localAtomIndex].q = tempPosq.w;
localData[localAtomIndex].q = tempPosq.w;
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
}
local
Data[localAtomIndex
].
f
x = 0.0f;
local
Force[get_local_id(0)
].x = 0.0f;
local
Data[localAtomIndex
].
f
y = 0.0f;
local
Force[get_local_id(0)
].y = 0.0f;
local
Data[localAtomIndex
].
f
z = 0.0f;
local
Force[get_local_id(0)
].z = 0.0f;
barrier(CLK_LOCAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
// Compute the full set of interactions in this tile.
// Compute the full set of interactions in this tile.
unsigned int tj = (tgx+baseLocalAtom) & (TILE_SIZE-1);
#ifdef USE_EXCLUSIONS
#ifdef USE_EXCLUSIONS
unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[0]+tgx] : 0xFFFFFFFF);
unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[0]+tgx] : 0xFFFFFFFF);
excl = (excl >> baseLocalAtom) & 0xFFFF;
excl = (excl >> tj) |
(
excl
<<
(
TILE_SIZE
-
tj
))
;
excl += excl << 16;
excl = (excl >> tgx) |
(
excl
<<
(
TILE_SIZE
-
tgx
))
;
#
endif
#
endif
unsigned
int
tj
=
tgx%
(
TILE_SIZE/2
)
;
for
(
unsigned
int
j
=
0
; j < TILE_SIZE/2; j++) {
for
(
unsigned
int
j
=
0
; j < TILE_SIZE/2; j++) {
#
ifdef
USE_EXCLUSIONS
#
ifdef
USE_EXCLUSIONS
bool
isExcluded
=
!
(
excl
&
0x1
)
;
bool
isExcluded
=
!
(
excl
&
0x1
)
;
#
endif
#
endif
int
atom2
=
baseLocalAtom+tj
;
float4
posq2
=
(
float4
)
(
localData[tj].x,
localData[tj].y,
localData[tj].z,
localData[tj].q
)
;
float4
posq2
=
(
float4
)
(
localData[atom2].x,
localData[atom2].y,
localData[atom2].z,
localData[atom2].q
)
;
float4
delta
=
(
float4
)
(
posq2.xyz
-
posq1.xyz,
0.0f
)
;
float4
delta
=
(
float4
)
(
posq2.xyz
-
posq1.xyz,
0.0f
)
;
#
ifdef
USE_PERIODIC
#
ifdef
USE_PERIODIC
delta.x
-=
floor
(
delta.x*invPeriodicBoxSize.x+0.5f
)
*periodicBoxSize.x
;
delta.x
-=
floor
(
delta.x*invPeriodicBoxSize.x+0.5f
)
*periodicBoxSize.x
;
...
@@ -186,8 +202,9 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
...
@@ -186,8 +202,9 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
float
r2
=
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
;
float
r2
=
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
;
float
invR
=
RSQRT
(
r2
)
;
float
invR
=
RSQRT
(
r2
)
;
float
r
=
RECIP
(
invR
)
;
float
r
=
RECIP
(
invR
)
;
int
atom2
=
tj
;
LOAD_ATOM2_PARAMETERS
LOAD_ATOM2_PARAMETERS
atom2
=
y*TILE_SIZE+
baseLocalAtom+
tj
;
atom2
=
y*TILE_SIZE+tj
;
#
ifdef
USE_SYMMETRIC
#
ifdef
USE_SYMMETRIC
float
dEdR
=
0.0f
;
float
dEdR
=
0.0f
;
#
else
#
else
...
@@ -200,29 +217,28 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
...
@@ -200,29 +217,28 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
#
ifdef
USE_SYMMETRIC
#
ifdef
USE_SYMMETRIC
delta.xyz
*=
dEdR
;
delta.xyz
*=
dEdR
;
force.xyz
-=
delta.xyz
;
force.xyz
-=
delta.xyz
;
local
Data[baseLocalAtom+tj+forceBuffer
Offset].
f
x
+=
delta.x
;
local
Force[tj+localForce
Offset].x
+=
delta.x
;
local
Data[baseLocalAtom+tj+forceBuffer
Offset].
f
y
+=
delta.y
;
local
Force[tj+localForce
Offset].y
+=
delta.y
;
local
Data[baseLocalAtom+tj+forceBuffer
Offset].
f
z
+=
delta.z
;
local
Force[tj+localForce
Offset].z
+=
delta.z
;
#
else
#
else
force.xyz
-=
dEdR1.xyz
;
force.xyz
-=
dEdR1.xyz
;
local
Data[baseLocalAtom+tj+forceBuffer
Offset].
f
x
+=
dEdR2.x
;
local
Force[tj+localForce
Offset].x
+=
dEdR2.x
;
local
Data[baseLocalAtom+tj+forceBuffer
Offset].
f
y
+=
dEdR2.y
;
local
Force[tj+localForce
Offset].y
+=
dEdR2.y
;
local
Data[baseLocalAtom+tj+forceBuffer
Offset].
f
z
+=
dEdR2.z
;
local
Force[tj+localForce
Offset].z
+=
dEdR2.z
;
#
endif
#
endif
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
#
ifdef
USE_EXCLUSIONS
#
ifdef
USE_EXCLUSIONS
excl
>>=
1
;
excl
>>=
1
;
#
endif
#
endif
tj
=
(
tj+1
)
%
(
TILE_SIZE
/2
)
;
tj
=
(
tj+1
)
&
(
TILE_SIZE
-1
)
;
}
}
//
Sum
the
forces
and
write
results.
//
Sum
the
forces
and
write
results.
int
bufferIndex
=
3*tgx
;
if
(
get_local_id
(
0
)
>=
TILE_SIZE
)
{
if
(
get_local_id
(
0
)
>=
TILE_SIZE
)
{
tempBuffer[bufferIndex]
=
force.x
;
localData[tgx].fx
=
force.x
;
tempBuffer[bufferIndex+1]
=
force.y
;
localData[tgx].fy
=
force.y
;
tempBuffer[bufferIndex+2]
=
force.z
;
localData[tgx].fz
=
force.z
;
}
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
get_local_id
(
0
)
<
TILE_SIZE
)
{
if
(
get_local_id
(
0
)
<
TILE_SIZE
)
{
...
@@ -236,11 +252,16 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
...
@@ -236,11 +252,16 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
//
Cheaper
to
load/store
float4
than
float3.
Do
all
loads
before
all
stores
to
minimize
store-load
waits.
//
Cheaper
to
load/store
float4
than
float3.
Do
all
loads
before
all
stores
to
minimize
store-load
waits.
float4
sum1
=
forceBuffers[offset1]
;
float4
sum1
=
forceBuffers[offset1]
;
float4
sum2
=
forceBuffers[offset2]
;
float4
sum2
=
forceBuffers[offset2]
;
sum1
+=
force
+
(
float4
)
(
tempBuffer[bufferIndex],
tempBuffer[bufferIndex+1],
tempBuffer[bufferIndex+2],
0.0f
)
;
sum1.x
+=
localData[tgx].fx
+
force.x
;
sum2
+=
(
float4
)
(
localData[get_local_id
(
0
)
].fx+localData[get_local_id
(
0
)
+TILE_SIZE].fx,
localData[get_local_id
(
0
)
].fy+localData[get_local_id
(
0
)
+TILE_SIZE].fy,
localData[get_local_id
(
0
)
].fz+localData[get_local_id
(
0
)
+TILE_SIZE].fz,
0.0f
)
;
sum1.y
+=
localData[tgx].fy
+
force.y
;
sum1.z
+=
localData[tgx].fz
+
force.z
;
sum2.x
+=
localForce[tgx].x
+
localForce[tgx+TILE_SIZE].x
;
sum2.y
+=
localForce[tgx].y
+
localForce[tgx+TILE_SIZE].y
;
sum2.z
+=
localForce[tgx].z
+
localForce[tgx+TILE_SIZE].z
;
forceBuffers[offset1]
=
sum1
;
forceBuffers[offset1]
=
sum1
;
forceBuffers[offset2]
=
sum2
;
forceBuffers[offset2]
=
sum2
;
}
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
}
lasty
=
y
;
lasty
=
y
;
pos++
;
pos++
;
...
...
platforms/opencl/src/kernels/nonbonded_nvidia.cl
View file @
fed50628
...
@@ -10,6 +10,9 @@ typedef struct {
...
@@ -10,6 +10,9 @@ typedef struct {
float
q
;
float
q
;
float
fx,
fy,
fz
;
float
fx,
fy,
fz
;
ATOM_PARAMETER_DATA
ATOM_PARAMETER_DATA
#
ifndef
PARAMETER_SIZE_IS_EVEN
float
padding
;
#
endif
}
AtomData
;
}
AtomData
;
/**
/**
...
@@ -22,7 +25,7 @@ __kernel void computeNonbonded(
...
@@ -22,7 +25,7 @@ __kernel void computeNonbonded(
__global
float4*
restrict
forceBuffers,
__global
float4*
restrict
forceBuffers,
#
endif
#
endif
__global
float*
restrict
energyBuffer,
__global
const
float4*
restrict
posq,
__global
const
unsigned
int*
restrict
exclusions,
__global
float*
restrict
energyBuffer,
__global
const
float4*
restrict
posq,
__global
const
unsigned
int*
restrict
exclusions,
__global
const
unsigned
int*
restrict
exclusionIndices,
__global
const
unsigned
int*
restrict
exclusionRowIndices,
__local
AtomData*
restrict
localData,
__global
const
unsigned
int*
restrict
exclusionIndices,
__global
const
unsigned
int*
restrict
exclusionRowIndices,
unsigned
int
startTileIndex,
unsigned
int
endTileIndex,
unsigned
int
startTileIndex,
unsigned
int
endTileIndex,
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
__global
const
ushort2*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
const
unsigned
int*
restrict
interactionFlags
__global
const
ushort2*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
const
unsigned
int*
restrict
interactionFlags
...
@@ -41,6 +44,7 @@ __kernel void computeNonbonded(
...
@@ -41,6 +44,7 @@ __kernel void computeNonbonded(
unsigned
int
end
=
startTileIndex+
(
warp+1
)
*numTiles/totalWarps
;
unsigned
int
end
=
startTileIndex+
(
warp+1
)
*numTiles/totalWarps
;
#
endif
#
endif
float
energy
=
0.0f
;
float
energy
=
0.0f
;
__local
AtomData
localData[FORCE_WORK_GROUP_SIZE]
;
__local
float
tempBuffer[3*FORCE_WORK_GROUP_SIZE]
;
__local
float
tempBuffer[3*FORCE_WORK_GROUP_SIZE]
;
__local
unsigned
int
exclusionRange[2*WARPS_PER_GROUP]
;
__local
unsigned
int
exclusionRange[2*WARPS_PER_GROUP]
;
__local
int
exclusionIndex[WARPS_PER_GROUP]
;
__local
int
exclusionIndex[WARPS_PER_GROUP]
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment