Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
9a4e2b02
Commit
9a4e2b02
authored
Aug 18, 2010
by
Peter Eastman
Browse files
Beginning of changes to reduce memory use for large systems
parent
2bdb84bd
Changes
11
Hide whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
418 additions
and
232 deletions
+418
-232
platforms/opencl/src/OpenCLKernels.cpp
platforms/opencl/src/OpenCLKernels.cpp
+9
-4
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
+56
-22
platforms/opencl/src/OpenCLNonbondedUtilities.h
platforms/opencl/src/OpenCLNonbondedUtilities.h
+10
-2
platforms/opencl/src/kernels/customGBEnergyN2_default.cl
platforms/opencl/src/kernels/customGBEnergyN2_default.cl
+41
-21
platforms/opencl/src/kernels/customGBEnergyN2_nvidia.cl
platforms/opencl/src/kernels/customGBEnergyN2_nvidia.cl
+42
-21
platforms/opencl/src/kernels/customGBValueN2_default.cl
platforms/opencl/src/kernels/customGBValueN2_default.cl
+41
-21
platforms/opencl/src/kernels/customGBValueN2_nvidia.cl
platforms/opencl/src/kernels/customGBValueN2_nvidia.cl
+43
-22
platforms/opencl/src/kernels/gbsaObc_default.cl
platforms/opencl/src/kernels/gbsaObc_default.cl
+44
-36
platforms/opencl/src/kernels/gbsaObc_nvidia.cl
platforms/opencl/src/kernels/gbsaObc_nvidia.cl
+48
-40
platforms/opencl/src/kernels/nonbonded_default.cl
platforms/opencl/src/kernels/nonbonded_default.cl
+41
-21
platforms/opencl/src/kernels/nonbonded_nvidia.cl
platforms/opencl/src/kernels/nonbonded_nvidia.cl
+43
-22
No files found.
platforms/opencl/src/OpenCLKernels.cpp
View file @
9a4e2b02
...
@@ -1690,6 +1690,7 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
...
@@ -1690,6 +1690,7 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
defines
[
"PREFACTOR"
]
=
doubleToString
(
prefactor
);
defines
[
"PREFACTOR"
]
=
doubleToString
(
prefactor
);
defines
[
"NUM_ATOMS"
]
=
intToString
(
cl
.
getNumAtoms
());
defines
[
"NUM_ATOMS"
]
=
intToString
(
cl
.
getNumAtoms
());
defines
[
"PADDED_NUM_ATOMS"
]
=
intToString
(
cl
.
getPaddedNumAtoms
());
defines
[
"PADDED_NUM_ATOMS"
]
=
intToString
(
cl
.
getPaddedNumAtoms
());
defines
[
"NUM_BLOCKS"
]
=
OpenCLExpressionUtilities
::
intToString
(
cl
.
getNumAtomBlocks
());
string
file
=
(
cl
.
getSIMDWidth
()
==
32
?
OpenCLKernelSources
::
gbsaObc_nvidia
:
OpenCLKernelSources
::
gbsaObc_default
);
string
file
=
(
cl
.
getSIMDWidth
()
==
32
?
OpenCLKernelSources
::
gbsaObc_nvidia
:
OpenCLKernelSources
::
gbsaObc_default
);
cl
::
Program
program
=
cl
.
createProgram
(
file
,
defines
);
cl
::
Program
program
=
cl
.
createProgram
(
file
,
defines
);
int
index
=
0
;
int
index
=
0
;
...
@@ -1985,6 +1986,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
...
@@ -1985,6 +1986,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
defines
[
"CUTOFF_SQUARED"
]
=
doubleToString
(
force
.
getCutoffDistance
()
*
force
.
getCutoffDistance
());
defines
[
"CUTOFF_SQUARED"
]
=
doubleToString
(
force
.
getCutoffDistance
()
*
force
.
getCutoffDistance
());
defines
[
"NUM_ATOMS"
]
=
intToString
(
cl
.
getNumAtoms
());
defines
[
"NUM_ATOMS"
]
=
intToString
(
cl
.
getNumAtoms
());
defines
[
"PADDED_NUM_ATOMS"
]
=
intToString
(
cl
.
getPaddedNumAtoms
());
defines
[
"PADDED_NUM_ATOMS"
]
=
intToString
(
cl
.
getPaddedNumAtoms
());
defines
[
"NUM_BLOCKS"
]
=
OpenCLExpressionUtilities
::
intToString
(
cl
.
getNumAtomBlocks
());
string
file
=
(
cl
.
getSIMDWidth
()
==
32
?
OpenCLKernelSources
::
customGBValueN2_nvidia
:
OpenCLKernelSources
::
customGBValueN2_default
);
string
file
=
(
cl
.
getSIMDWidth
()
==
32
?
OpenCLKernelSources
::
customGBValueN2_nvidia
:
OpenCLKernelSources
::
customGBValueN2_default
);
cl
::
Program
program
=
cl
.
createProgram
(
cl
.
replaceStrings
(
file
,
replacements
),
defines
);
cl
::
Program
program
=
cl
.
createProgram
(
cl
.
replaceStrings
(
file
,
replacements
),
defines
);
pairValueKernel
=
cl
::
Kernel
(
program
,
"computeN2Value"
);
pairValueKernel
=
cl
::
Kernel
(
program
,
"computeN2Value"
);
...
@@ -2131,6 +2133,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
...
@@ -2131,6 +2133,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
defines
[
"CUTOFF_SQUARED"
]
=
doubleToString
(
force
.
getCutoffDistance
()
*
force
.
getCutoffDistance
());
defines
[
"CUTOFF_SQUARED"
]
=
doubleToString
(
force
.
getCutoffDistance
()
*
force
.
getCutoffDistance
());
defines
[
"NUM_ATOMS"
]
=
intToString
(
cl
.
getNumAtoms
());
defines
[
"NUM_ATOMS"
]
=
intToString
(
cl
.
getNumAtoms
());
defines
[
"PADDED_NUM_ATOMS"
]
=
intToString
(
cl
.
getPaddedNumAtoms
());
defines
[
"PADDED_NUM_ATOMS"
]
=
intToString
(
cl
.
getPaddedNumAtoms
());
defines
[
"NUM_BLOCKS"
]
=
OpenCLExpressionUtilities
::
intToString
(
cl
.
getNumAtomBlocks
());
string
file
=
(
cl
.
getSIMDWidth
()
==
32
?
OpenCLKernelSources
::
customGBEnergyN2_nvidia
:
OpenCLKernelSources
::
customGBEnergyN2_default
);
string
file
=
(
cl
.
getSIMDWidth
()
==
32
?
OpenCLKernelSources
::
customGBEnergyN2_nvidia
:
OpenCLKernelSources
::
customGBEnergyN2_default
);
cl
::
Program
program
=
cl
.
createProgram
(
cl
.
replaceStrings
(
file
,
replacements
),
defines
);
cl
::
Program
program
=
cl
.
createProgram
(
cl
.
replaceStrings
(
file
,
replacements
),
defines
);
pairEnergyKernel
=
cl
::
Kernel
(
program
,
"computeN2Energy"
);
pairEnergyKernel
=
cl
::
Kernel
(
program
,
"computeN2Energy"
);
...
@@ -2393,6 +2396,7 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
...
@@ -2393,6 +2396,7 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
pairValueKernel
.
setArg
(
index
++
,
OpenCLContext
::
ThreadBlockSize
*
sizeof
(
cl_float4
),
NULL
);
pairValueKernel
.
setArg
(
index
++
,
OpenCLContext
::
ThreadBlockSize
*
sizeof
(
cl_float4
),
NULL
);
pairValueKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
cl
.
getNonbondedUtilities
().
getExclusions
().
getDeviceBuffer
());
pairValueKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
cl
.
getNonbondedUtilities
().
getExclusions
().
getDeviceBuffer
());
pairValueKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
cl
.
getNonbondedUtilities
().
getExclusionIndices
().
getDeviceBuffer
());
pairValueKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
cl
.
getNonbondedUtilities
().
getExclusionIndices
().
getDeviceBuffer
());
pairValueKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
cl
.
getNonbondedUtilities
().
getExclusionRowIndices
().
getDeviceBuffer
());
pairValueKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
valueBuffers
->
getDeviceBuffer
());
pairValueKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
valueBuffers
->
getDeviceBuffer
());
pairValueKernel
.
setArg
(
index
++
,
OpenCLContext
::
ThreadBlockSize
*
sizeof
(
cl_float
),
NULL
);
pairValueKernel
.
setArg
(
index
++
,
OpenCLContext
::
ThreadBlockSize
*
sizeof
(
cl_float
),
NULL
);
pairValueKernel
.
setArg
(
index
++
,
OpenCLContext
::
ThreadBlockSize
*
sizeof
(
cl_float
),
NULL
);
pairValueKernel
.
setArg
(
index
++
,
OpenCLContext
::
ThreadBlockSize
*
sizeof
(
cl_float
),
NULL
);
...
@@ -2442,6 +2446,7 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
...
@@ -2442,6 +2446,7 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
pairEnergyKernel
.
setArg
(
index
++
,
OpenCLContext
::
ThreadBlockSize
*
sizeof
(
cl_float4
),
NULL
);
pairEnergyKernel
.
setArg
(
index
++
,
OpenCLContext
::
ThreadBlockSize
*
sizeof
(
cl_float4
),
NULL
);
pairEnergyKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
cl
.
getNonbondedUtilities
().
getExclusions
().
getDeviceBuffer
());
pairEnergyKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
cl
.
getNonbondedUtilities
().
getExclusions
().
getDeviceBuffer
());
pairEnergyKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
cl
.
getNonbondedUtilities
().
getExclusionIndices
().
getDeviceBuffer
());
pairEnergyKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
cl
.
getNonbondedUtilities
().
getExclusionIndices
().
getDeviceBuffer
());
pairEnergyKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
cl
.
getNonbondedUtilities
().
getExclusionRowIndices
().
getDeviceBuffer
());
pairEnergyKernel
.
setArg
(
index
++
,
OpenCLContext
::
ThreadBlockSize
*
sizeof
(
cl_float4
),
NULL
);
pairEnergyKernel
.
setArg
(
index
++
,
OpenCLContext
::
ThreadBlockSize
*
sizeof
(
cl_float4
),
NULL
);
if
(
nb
.
getUseCutoff
())
{
if
(
nb
.
getUseCutoff
())
{
pairEnergyKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractingTiles
().
getDeviceBuffer
());
pairEnergyKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractingTiles
().
getDeviceBuffer
());
...
@@ -2520,10 +2525,10 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
...
@@ -2520,10 +2525,10 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
globals
->
upload
(
globalParamValues
);
globals
->
upload
(
globalParamValues
);
}
}
if
(
nb
.
getUseCutoff
())
{
if
(
nb
.
getUseCutoff
())
{
pairValueKernel
.
setArg
<
mm_float4
>
(
1
0
,
cl
.
getPeriodicBoxSize
());
pairValueKernel
.
setArg
<
mm_float4
>
(
1
1
,
cl
.
getPeriodicBoxSize
());
pairValueKernel
.
setArg
<
mm_float4
>
(
1
1
,
cl
.
getInvPeriodicBoxSize
());
pairValueKernel
.
setArg
<
mm_float4
>
(
1
2
,
cl
.
getInvPeriodicBoxSize
());
pairEnergyKernel
.
setArg
<
mm_float4
>
(
1
1
,
cl
.
getPeriodicBoxSize
());
pairEnergyKernel
.
setArg
<
mm_float4
>
(
1
2
,
cl
.
getPeriodicBoxSize
());
pairEnergyKernel
.
setArg
<
mm_float4
>
(
1
2
,
cl
.
getInvPeriodicBoxSize
());
pairEnergyKernel
.
setArg
<
mm_float4
>
(
1
3
,
cl
.
getInvPeriodicBoxSize
());
}
}
cl
.
executeKernel
(
pairValueKernel
,
nb
.
getTiles
().
getSize
()
*
OpenCLContext
::
TileSize
);
cl
.
executeKernel
(
pairValueKernel
,
nb
.
getTiles
().
getSize
()
*
OpenCLContext
::
TileSize
);
cl
.
executeKernel
(
perParticleValueKernel
,
cl
.
getPaddedNumAtoms
());
cl
.
executeKernel
(
perParticleValueKernel
,
cl
.
getPaddedNumAtoms
());
...
...
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
View file @
9a4e2b02
...
@@ -30,12 +30,14 @@
...
@@ -30,12 +30,14 @@
#include "OpenCLKernelSources.h"
#include "OpenCLKernelSources.h"
#include "OpenCLExpressionUtilities.h"
#include "OpenCLExpressionUtilities.h"
#include <map>
#include <map>
#include <set>
#include <utility>
using
namespace
OpenMM
;
using
namespace
OpenMM
;
using
namespace
std
;
using
namespace
std
;
OpenCLNonbondedUtilities
::
OpenCLNonbondedUtilities
(
OpenCLContext
&
context
)
:
context
(
context
),
cutoff
(
-
1.0
),
useCutoff
(
false
),
OpenCLNonbondedUtilities
::
OpenCLNonbondedUtilities
(
OpenCLContext
&
context
)
:
context
(
context
),
cutoff
(
-
1.0
),
useCutoff
(
false
),
numForceBuffers
(
0
),
tiles
(
NULL
),
exclusionInd
ex
(
NULL
),
exclusions
(
NULL
),
interactingTiles
(
NULL
),
interactionFlags
(
NULL
),
numForceBuffers
(
0
),
tiles
(
NULL
),
exclusionInd
ices
(
NULL
),
exclusionRowIndices
(
NULL
),
exclusions
(
NULL
),
interactingTiles
(
NULL
),
interactionFlags
(
NULL
),
interactionCount
(
NULL
),
blockCenter
(
NULL
),
blockBoundingBox
(
NULL
),
compact
(
NULL
)
{
interactionCount
(
NULL
),
blockCenter
(
NULL
),
blockBoundingBox
(
NULL
),
compact
(
NULL
)
{
// Decide how many force buffers to use.
// Decide how many force buffers to use.
...
@@ -52,8 +54,10 @@ OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : con
...
@@ -52,8 +54,10 @@ OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : con
OpenCLNonbondedUtilities
::~
OpenCLNonbondedUtilities
()
{
OpenCLNonbondedUtilities
::~
OpenCLNonbondedUtilities
()
{
if
(
tiles
!=
NULL
)
if
(
tiles
!=
NULL
)
delete
tiles
;
delete
tiles
;
if
(
exclusionIndex
!=
NULL
)
if
(
exclusionIndices
!=
NULL
)
delete
exclusionIndex
;
delete
exclusionIndices
;
if
(
exclusionRowIndices
!=
NULL
)
delete
exclusionRowIndices
;
if
(
exclusions
!=
NULL
)
if
(
exclusions
!=
NULL
)
delete
exclusions
;
delete
exclusions
;
if
(
interactingTiles
!=
NULL
)
if
(
interactingTiles
!=
NULL
)
...
@@ -153,16 +157,36 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
...
@@ -153,16 +157,36 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
// Build a list of indices for the tiles with exclusions.
// Build a list of indices for the tiles with exclusions.
exclusionIndex
=
new
OpenCLArray
<
cl_uint
>
(
context
,
numTiles
,
"exclusionIndex"
);
set
<
pair
<
int
,
int
>
>
tilesWithExclusions
;
vector
<
cl_uint
>
exclusionIndexVec
(
exclusionIndex
->
getSize
());
for
(
int
atom1
=
0
;
atom1
<
(
int
)
atomExclusions
.
size
();
++
atom1
)
{
int
numWithExclusions
=
0
;
int
x
=
atom1
/
OpenCLContext
::
TileSize
;
for
(
int
i
=
0
;
i
<
numTiles
;
++
i
)
for
(
int
j
=
0
;
j
<
(
int
)
atomExclusions
[
atom1
].
size
();
++
j
)
{
if
((
tileVec
[
i
]
&
1
)
==
1
)
int
atom2
=
atomExclusions
[
atom1
][
j
];
exclusionIndexVec
[
i
]
=
(
numWithExclusions
++
)
*
OpenCLContext
::
TileSize
;
int
y
=
atom2
/
OpenCLContext
::
TileSize
;
tilesWithExclusions
.
insert
(
make_pair
(
max
(
x
,
y
),
min
(
x
,
y
)));
}
}
if
(
context
.
getPaddedNumAtoms
()
>
context
.
getNumAtoms
())
{
for
(
int
i
=
0
;
i
<
numAtomBlocks
;
++
i
)
tilesWithExclusions
.
insert
(
make_pair
(
numAtomBlocks
-
1
,
i
));
}
vector
<
cl_uint
>
exclusionRowIndicesVec
(
numAtomBlocks
+
1
,
0
);
vector
<
cl_uint
>
exclusionIndicesVec
;
int
currentRow
=
0
;
for
(
set
<
pair
<
int
,
int
>
>::
const_iterator
iter
=
tilesWithExclusions
.
begin
();
iter
!=
tilesWithExclusions
.
end
();
++
iter
)
{
while
(
iter
->
first
!=
currentRow
)
exclusionRowIndicesVec
[
++
currentRow
]
=
exclusionIndicesVec
.
size
();
exclusionIndicesVec
.
push_back
(
iter
->
second
);
}
exclusionRowIndicesVec
[
++
currentRow
]
=
exclusionIndicesVec
.
size
();
exclusionIndices
=
new
OpenCLArray
<
cl_uint
>
(
context
,
exclusionIndicesVec
.
size
(),
"exclusionIndices"
);
exclusionRowIndices
=
new
OpenCLArray
<
cl_uint
>
(
context
,
exclusionRowIndicesVec
.
size
(),
"exclusionRowIndices"
);
exclusionIndices
->
upload
(
exclusionIndicesVec
);
exclusionRowIndices
->
upload
(
exclusionRowIndicesVec
);
// Record the exclusion data.
// Record the exclusion data.
exclusions
=
new
OpenCLArray
<
cl_uint
>
(
context
,
num
WithExclusions
*
OpenCLContext
::
TileSize
,
"exclusions"
);
exclusions
=
new
OpenCLArray
<
cl_uint
>
(
context
,
tiles
WithExclusions
.
size
()
*
OpenCLContext
::
TileSize
,
"exclusions"
);
vector
<
cl_uint
>
exclusionVec
(
exclusions
->
getSize
());
vector
<
cl_uint
>
exclusionVec
(
exclusions
->
getSize
());
for
(
int
i
=
0
;
i
<
exclusions
->
getSize
();
++
i
)
for
(
int
i
=
0
;
i
<
exclusions
->
getSize
();
++
i
)
exclusionVec
[
i
]
=
0xFFFFFFFF
;
exclusionVec
[
i
]
=
0xFFFFFFFF
;
...
@@ -174,12 +198,12 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
...
@@ -174,12 +198,12 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
int
y
=
atom2
/
OpenCLContext
::
TileSize
;
int
y
=
atom2
/
OpenCLContext
::
TileSize
;
int
offset2
=
atom2
-
y
*
OpenCLContext
::
TileSize
;
int
offset2
=
atom2
-
y
*
OpenCLContext
::
TileSize
;
if
(
x
>
y
)
{
if
(
x
>
y
)
{
int
tile
=
x
+
y
*
numAtomBlocks
-
y
*
(
y
+
1
)
/
2
;
int
index
=
findExclusionIndex
(
x
,
y
,
exclusionIndicesVec
,
exclusionRowIndicesVec
)
;
exclusionVec
[
exclusionIndexVec
[
tile
]
+
offset1
]
&=
0xFFFFFFFF
-
(
1
<<
offset2
);
exclusionVec
[
index
+
offset1
]
&=
0xFFFFFFFF
-
(
1
<<
offset2
);
}
}
else
{
else
{
int
tile
=
y
+
x
*
numAtomBlocks
-
x
*
(
x
+
1
)
/
2
;
int
index
=
findExclusionIndex
(
y
,
x
,
exclusionIndicesVec
,
exclusionRowIndicesVec
)
;
exclusionVec
[
exclusionIndexVec
[
tile
]
+
offset2
]
&=
0xFFFFFFFF
-
(
1
<<
offset1
);
exclusionVec
[
index
+
offset2
]
&=
0xFFFFFFFF
-
(
1
<<
offset1
);
}
}
}
}
}
}
...
@@ -193,19 +217,18 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
...
@@ -193,19 +217,18 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
int
y
=
atom2
/
OpenCLContext
::
TileSize
;
int
y
=
atom2
/
OpenCLContext
::
TileSize
;
int
offset2
=
atom2
-
y
*
OpenCLContext
::
TileSize
;
int
offset2
=
atom2
-
y
*
OpenCLContext
::
TileSize
;
if
(
x
>=
y
)
{
if
(
x
>=
y
)
{
int
tile
=
x
+
y
*
numAtomBlocks
-
y
*
(
y
+
1
)
/
2
;
int
index
=
findExclusionIndex
(
x
,
y
,
exclusionIndicesVec
,
exclusionRowIndicesVec
)
;
exclusionVec
[
exclusionIndexVec
[
tile
]
+
offset1
]
&=
0xFFFFFFFF
-
(
1
<<
offset2
);
exclusionVec
[
index
+
offset1
]
&=
0xFFFFFFFF
-
(
1
<<
offset2
);
}
}
if
(
y
>=
x
)
{
if
(
y
>=
x
)
{
int
tile
=
y
+
x
*
numAtomBlocks
-
x
*
(
x
+
1
)
/
2
;
int
index
=
findExclusionIndex
(
y
,
x
,
exclusionIndicesVec
,
exclusionRowIndicesVec
)
;
exclusionVec
[
exclusionIndexVec
[
tile
]
+
offset2
]
&=
0xFFFFFFFF
-
(
1
<<
offset1
);
exclusionVec
[
index
+
offset2
]
&=
0xFFFFFFFF
-
(
1
<<
offset1
);
}
}
}
}
}
}
atomExclusions
.
clear
();
// We won't use this again, so free the memory it used
atomExclusions
.
clear
();
// We won't use this again, so free the memory it used
tiles
->
upload
(
tileVec
);
tiles
->
upload
(
tileVec
);
exclusions
->
upload
(
exclusionVec
);
exclusions
->
upload
(
exclusionVec
);
exclusionIndex
->
upload
(
exclusionIndexVec
);
// Create data structures for the neighbor list.
// Create data structures for the neighbor list.
...
@@ -252,6 +275,15 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
...
@@ -252,6 +275,15 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
}
}
}
}
int
OpenCLNonbondedUtilities
::
findExclusionIndex
(
int
x
,
int
y
,
const
vector
<
cl_uint
>&
exclusionIndices
,
const
vector
<
cl_uint
>&
exclusionRowIndices
)
{
int
start
=
exclusionRowIndices
[
x
];
int
end
=
exclusionRowIndices
[
x
+
1
];
for
(
int
i
=
start
;
i
<
end
;
i
++
)
if
(
exclusionIndices
[
i
]
==
y
)
return
i
*
OpenCLContext
::
TileSize
;
throw
OpenMMException
(
"Internal error: exclusion in unexpected tile"
);
}
void
OpenCLNonbondedUtilities
::
prepareInteractions
()
{
void
OpenCLNonbondedUtilities
::
prepareInteractions
()
{
if
(
!
useCutoff
)
if
(
!
useCutoff
)
return
;
return
;
...
@@ -275,8 +307,8 @@ void OpenCLNonbondedUtilities::prepareInteractions() {
...
@@ -275,8 +307,8 @@ void OpenCLNonbondedUtilities::prepareInteractions() {
void
OpenCLNonbondedUtilities
::
computeInteractions
()
{
void
OpenCLNonbondedUtilities
::
computeInteractions
()
{
if
(
tiles
!=
NULL
)
{
if
(
tiles
!=
NULL
)
{
if
(
useCutoff
)
{
if
(
useCutoff
)
{
forceKernel
.
setArg
<
mm_float4
>
(
1
0
,
context
.
getPeriodicBoxSize
());
forceKernel
.
setArg
<
mm_float4
>
(
1
1
,
context
.
getPeriodicBoxSize
());
forceKernel
.
setArg
<
mm_float4
>
(
1
1
,
context
.
getInvPeriodicBoxSize
());
forceKernel
.
setArg
<
mm_float4
>
(
1
2
,
context
.
getInvPeriodicBoxSize
());
}
}
context
.
executeKernel
(
forceKernel
,
tiles
->
getSize
()
*
OpenCLContext
::
TileSize
);
context
.
executeKernel
(
forceKernel
,
tiles
->
getSize
()
*
OpenCLContext
::
TileSize
);
}
}
...
@@ -388,6 +420,7 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
...
@@ -388,6 +420,7 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
defines
[
"CUTOFF_SQUARED"
]
=
OpenCLExpressionUtilities
::
doubleToString
(
cutoff
*
cutoff
);
defines
[
"CUTOFF_SQUARED"
]
=
OpenCLExpressionUtilities
::
doubleToString
(
cutoff
*
cutoff
);
defines
[
"NUM_ATOMS"
]
=
OpenCLExpressionUtilities
::
intToString
(
context
.
getNumAtoms
());
defines
[
"NUM_ATOMS"
]
=
OpenCLExpressionUtilities
::
intToString
(
context
.
getNumAtoms
());
defines
[
"PADDED_NUM_ATOMS"
]
=
OpenCLExpressionUtilities
::
intToString
(
context
.
getPaddedNumAtoms
());
defines
[
"PADDED_NUM_ATOMS"
]
=
OpenCLExpressionUtilities
::
intToString
(
context
.
getPaddedNumAtoms
());
defines
[
"NUM_BLOCKS"
]
=
OpenCLExpressionUtilities
::
intToString
(
context
.
getNumAtomBlocks
());
string
file
=
(
context
.
getSIMDWidth
()
==
32
?
OpenCLKernelSources
::
nonbonded_nvidia
:
OpenCLKernelSources
::
nonbonded_default
);
string
file
=
(
context
.
getSIMDWidth
()
==
32
?
OpenCLKernelSources
::
nonbonded_nvidia
:
OpenCLKernelSources
::
nonbonded_default
);
cl
::
Program
program
=
context
.
createProgram
(
context
.
replaceStrings
(
file
,
replacements
),
defines
);
cl
::
Program
program
=
context
.
createProgram
(
context
.
replaceStrings
(
file
,
replacements
),
defines
);
cl
::
Kernel
kernel
(
program
,
"computeNonbonded"
);
cl
::
Kernel
kernel
(
program
,
"computeNonbonded"
);
...
@@ -399,7 +432,8 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
...
@@ -399,7 +432,8 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
context
.
getEnergyBuffer
().
getDeviceBuffer
());
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
context
.
getEnergyBuffer
().
getDeviceBuffer
());
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
context
.
getPosq
().
getDeviceBuffer
());
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
context
.
getPosq
().
getDeviceBuffer
());
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
exclusions
->
getDeviceBuffer
());
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
exclusions
->
getDeviceBuffer
());
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
exclusionIndex
->
getDeviceBuffer
());
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
exclusionIndices
->
getDeviceBuffer
());
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
exclusionRowIndices
->
getDeviceBuffer
());
kernel
.
setArg
(
index
++
,
OpenCLContext
::
ThreadBlockSize
*
localDataSize
,
NULL
);
kernel
.
setArg
(
index
++
,
OpenCLContext
::
ThreadBlockSize
*
localDataSize
,
NULL
);
kernel
.
setArg
(
index
++
,
OpenCLContext
::
ThreadBlockSize
*
sizeof
(
cl_float4
),
NULL
);
kernel
.
setArg
(
index
++
,
OpenCLContext
::
ThreadBlockSize
*
sizeof
(
cl_float4
),
NULL
);
if
(
useCutoff
)
{
if
(
useCutoff
)
{
...
...
platforms/opencl/src/OpenCLNonbondedUtilities.h
View file @
9a4e2b02
...
@@ -175,7 +175,13 @@ public:
...
@@ -175,7 +175,13 @@ public:
* Get the array containing the index into the exclusion array for each tile.
* Get the array containing the index into the exclusion array for each tile.
*/
*/
OpenCLArray
<
cl_uint
>&
getExclusionIndices
()
{
OpenCLArray
<
cl_uint
>&
getExclusionIndices
()
{
return
*
exclusionIndex
;
return
*
exclusionIndices
;
}
/**
* Get the array listing where the exclusion data starts for each row.
*/
OpenCLArray
<
cl_uint
>&
getExclusionRowIndices
()
{
return
*
exclusionRowIndices
;
}
}
/**
/**
* Create a Kernel for evaluating a nonbonded interaction. Cutoffs and periodic boundary conditions
* Create a Kernel for evaluating a nonbonded interaction. Cutoffs and periodic boundary conditions
...
@@ -190,14 +196,16 @@ public:
...
@@ -190,14 +196,16 @@ public:
*/
*/
cl
::
Kernel
createInteractionKernel
(
const
std
::
string
&
source
,
const
std
::
vector
<
ParameterInfo
>&
params
,
const
std
::
vector
<
ParameterInfo
>&
arguments
,
bool
useExclusions
,
bool
isSymmetric
)
const
;
cl
::
Kernel
createInteractionKernel
(
const
std
::
string
&
source
,
const
std
::
vector
<
ParameterInfo
>&
params
,
const
std
::
vector
<
ParameterInfo
>&
arguments
,
bool
useExclusions
,
bool
isSymmetric
)
const
;
private:
private:
static
int
findExclusionIndex
(
int
x
,
int
y
,
const
std
::
vector
<
cl_uint
>&
exclusionIndices
,
const
std
::
vector
<
cl_uint
>&
exclusionRowIndices
);
OpenCLContext
&
context
;
OpenCLContext
&
context
;
cl
::
Kernel
forceKernel
;
cl
::
Kernel
forceKernel
;
cl
::
Kernel
findBlockBoundsKernel
;
cl
::
Kernel
findBlockBoundsKernel
;
cl
::
Kernel
findInteractingBlocksKernel
;
cl
::
Kernel
findInteractingBlocksKernel
;
cl
::
Kernel
findInteractionsWithinBlocksKernel
;
cl
::
Kernel
findInteractionsWithinBlocksKernel
;
OpenCLArray
<
cl_uint
>*
tiles
;
OpenCLArray
<
cl_uint
>*
tiles
;
OpenCLArray
<
cl_uint
>*
exclusionIndex
;
OpenCLArray
<
cl_uint
>*
exclusions
;
OpenCLArray
<
cl_uint
>*
exclusions
;
OpenCLArray
<
cl_uint
>*
exclusionIndices
;
OpenCLArray
<
cl_uint
>*
exclusionRowIndices
;
OpenCLArray
<
cl_uint
>*
interactingTiles
;
OpenCLArray
<
cl_uint
>*
interactingTiles
;
OpenCLArray
<
cl_uint
>*
interactionFlags
;
OpenCLArray
<
cl_uint
>*
interactionFlags
;
OpenCLArray
<
cl_uint
>*
interactionCount
;
OpenCLArray
<
cl_uint
>*
interactionCount
;
...
...
platforms/opencl/src/kernels/customGBEnergyN2_default.cl
View file @
9a4e2b02
...
@@ -9,7 +9,7 @@
...
@@ -9,7 +9,7 @@
__kernel
__attribute__
((
reqd_work_group_size
(
WORK_GROUP_SIZE,
1
,
1
)))
__kernel
__attribute__
((
reqd_work_group_size
(
WORK_GROUP_SIZE,
1
,
1
)))
void
computeN2Energy
(
__global
float4*
forceBuffers,
__global
float*
energyBuffer,
__local
float4*
local_force,
void
computeN2Energy
(
__global
float4*
forceBuffers,
__global
float*
energyBuffer,
__local
float4*
local_force,
__global
float4*
posq,
__local
float4*
local_posq,
__global
unsigned
int*
exclusions,
__global
unsigned
int*
exclusionIndices,
__global
float4*
posq,
__local
float4*
local_posq,
__global
unsigned
int*
exclusions,
__global
unsigned
int*
exclusionIndices,
__local
float4*
tempForceBuffer,
__global
unsigned
int*
tiles,
__global
unsigned
int*
exclusionRowIndices,
__local
float4*
tempForceBuffer,
__global
unsigned
int*
tiles,
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
__global
unsigned
int*
interactionFlags,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize
__global
unsigned
int*
interactionFlags,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize
#
else
#
else
...
@@ -23,31 +23,54 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
...
@@ -23,31 +23,54 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
unsigned
int
end
=
(
get_group_id
(
0
)
+1
)
*numTiles/get_num_groups
(
0
)
;
unsigned
int
end
=
(
get_group_id
(
0
)
+1
)
*numTiles/get_num_groups
(
0
)
;
float
energy
=
0.0f
;
float
energy
=
0.0f
;
unsigned
int
lasty
=
0xFFFFFFFF
;
unsigned
int
lasty
=
0xFFFFFFFF
;
__local
unsigned
int
exclusionRange[2]
;
__local
int
exclusionIndex[1]
;
DECLARE_TEMP_BUFFERS
DECLARE_TEMP_BUFFERS
while
(
pos
<
end
)
{
while
(
pos
<
end
)
{
//
Extract
the
coordinates
of
this
tile
//
Extract
the
coordinates
of
this
tile
#
ifdef
USE_CUTOFF
unsigned
int
x
=
tiles[pos]
;
unsigned
int
x
=
tiles[pos]
;
unsigned
int
y
=
((
x
>>
2
)
&
0x7fff
)
*TILE_SIZE
;
unsigned
int
y
=
((
x
>>
2
)
&
0x7fff
)
;
bool
hasExclusions
=
(
x
&
0x1
)
;
x
=
(
x>>17
)
;
x
=
(
x>>17
)
*TILE_SIZE
;
#
else
unsigned
int
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS+0.5f-sqrt
((
NUM_BLOCKS+0.5f
)
*
(
NUM_BLOCKS+0.5f
)
-2*pos
))
;
unsigned
int
x
=
(
pos-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
if
(
x
>=
NUM_BLOCKS
)
{
//
Occasionally
happens
due
to
roundoff
error.
y++
;
x
=
(
pos-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
}
#
endif
unsigned
int
baseLocalAtom
=
(
get_local_id
(
0
)
<
TILE_SIZE
?
0
:
TILE_SIZE/2
)
;
unsigned
int
baseLocalAtom
=
(
get_local_id
(
0
)
<
TILE_SIZE
?
0
:
TILE_SIZE/2
)
;
unsigned
int
tgx
=
get_local_id
(
0
)
&
(
TILE_SIZE-1
)
;
unsigned
int
tgx
=
get_local_id
(
0
)
&
(
TILE_SIZE-1
)
;
unsigned
int
forceBufferOffset
=
(
tgx
<
TILE_SIZE/2
?
0
:
TILE_SIZE
)
;
unsigned
int
forceBufferOffset
=
(
tgx
<
TILE_SIZE/2
?
0
:
TILE_SIZE
)
;
unsigned
int
atom1
=
x
+
tgx
;
unsigned
int
atom1
=
x
*TILE_SIZE
+
tgx
;
float4
force
=
0.0f
;
float4
force
=
0.0f
;
float4
posq1
=
posq[atom1]
;
float4
posq1
=
posq[atom1]
;
LOAD_ATOM1_PARAMETERS
LOAD_ATOM1_PARAMETERS
//
Locate
the
exclusion
data
for
this
tile.
#
ifdef
USE_EXCLUSIONS
if
(
get_local_id
(
0
)
<
2
)
exclusionRange[get_local_id
(
0
)
]
=
exclusionRowIndices[x+get_local_id
(
0
)
]
;
if
(
tgx
==
0
)
exclusionIndex[0]
=
-1
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
for
(
int
i
=
exclusionRange[0]+tgx
; i < exclusionRange[1]; i += TILE_SIZE)
if
(
exclusionIndices[i]
==
y
)
exclusionIndex[0]
=
i*TILE_SIZE
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
bool
hasExclusions
=
(
exclusionIndex[0]
>
-1
)
;
#
endif
if
(
x
==
y
)
{
if
(
x
==
y
)
{
//
This
tile
is
on
the
diagonal.
//
This
tile
is
on
the
diagonal.
local_posq[get_local_id
(
0
)
]
=
posq1
;
local_posq[get_local_id
(
0
)
]
=
posq1
;
LOAD_LOCAL_PARAMETERS_FROM_1
LOAD_LOCAL_PARAMETERS_FROM_1
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
unsigned
int
xi
=
x/TILE_SIZE
;
unsigned
int
tile
=
xi+xi*PADDED_NUM_ATOMS/TILE_SIZE-xi*
(
xi+1
)
/2
;
#
ifdef
USE_EXCLUSIONS
#
ifdef
USE_EXCLUSIONS
unsigned
int
excl
=
exclusions[exclusionInd
ices[tile
]+tgx]
>>
baseLocalAtom
;
unsigned
int
excl
=
exclusions[exclusionInd
ex[0
]+tgx]
>>
baseLocalAtom
;
#
endif
#
endif
for
(
unsigned
int
j
=
0
; j < TILE_SIZE/2; j++) {
for
(
unsigned
int
j
=
0
; j < TILE_SIZE/2; j++) {
#
ifdef
USE_EXCLUSIONS
#
ifdef
USE_EXCLUSIONS
...
@@ -67,7 +90,7 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
...
@@ -67,7 +90,7 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
#
endif
#
endif
float
r
=
SQRT
(
r2
)
;
float
r
=
SQRT
(
r2
)
;
LOAD_ATOM2_PARAMETERS
LOAD_ATOM2_PARAMETERS
atom2
=
y+baseLocalAtom+j
;
atom2
=
y
*TILE_SIZE
+baseLocalAtom+j
;
float
dEdR
=
0.0f
;
float
dEdR
=
0.0f
;
float
tempEnergy
=
0.0f
;
float
tempEnergy
=
0.0f
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
&&
atom1
!=
atom2
)
{
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
&&
atom1
!=
atom2
)
{
...
@@ -94,9 +117,9 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
...
@@ -94,9 +117,9 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
get_local_id
(
0
)
<
TILE_SIZE
)
{
if
(
get_local_id
(
0
)
<
TILE_SIZE
)
{
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
unsigned
int
offset1
=
x
+
tgx
+
(
x/TILE_SIZE
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset1
=
x
*TILE_SIZE
+
tgx
+
x
*PADDED_NUM_ATOMS
;
#
else
#
else
unsigned
int
offset1
=
x
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset1
=
x
*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
#
endif
#
endif
forceBuffers[offset1].xyz
+=
force.xyz+tempForceBuffer[get_local_id
(
0
)
+TILE_SIZE].xyz
;
forceBuffers[offset1].xyz
+=
force.xyz+tempForceBuffer[get_local_id
(
0
)
+TILE_SIZE].xyz
;
STORE_DERIVATIVES_1
STORE_DERIVATIVES_1
...
@@ -106,7 +129,7 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
...
@@ -106,7 +129,7 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
//
This
is
an
off-diagonal
tile.
//
This
is
an
off-diagonal
tile.
if
(
lasty
!=
y
&&
get_local_id
(
0
)
<
TILE_SIZE
)
{
if
(
lasty
!=
y
&&
get_local_id
(
0
)
<
TILE_SIZE
)
{
unsigned
int
j
=
y
+
tgx
;
unsigned
int
j
=
y
*TILE_SIZE
+
tgx
;
local_posq[get_local_id
(
0
)
]
=
posq[j]
;
local_posq[get_local_id
(
0
)
]
=
posq[j]
;
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
}
...
@@ -116,11 +139,8 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
...
@@ -116,11 +139,8 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
//
Compute
the
full
set
of
interactions
in
this
tile.
//
Compute
the
full
set
of
interactions
in
this
tile.
unsigned
int
xi
=
x/TILE_SIZE
;
unsigned
int
yi
=
y/TILE_SIZE
;
unsigned
int
tile
=
xi+yi*PADDED_NUM_ATOMS/TILE_SIZE-yi*
(
yi+1
)
/2
;
#
ifdef
USE_EXCLUSIONS
#
ifdef
USE_EXCLUSIONS
unsigned
int
excl
=
(
hasExclusions
?
exclusions[exclusionInd
ices[tile
]+tgx]
:
0xFFFFFFFF
)
;
unsigned
int
excl
=
(
hasExclusions
?
exclusions[exclusionInd
ex[0
]+tgx]
:
0xFFFFFFFF
)
;
excl
=
(
excl
>>
baseLocalAtom
)
&
0xFFFF
;
excl
=
(
excl
>>
baseLocalAtom
)
&
0xFFFF
;
excl
+=
excl
<<
16
;
excl
+=
excl
<<
16
;
excl
=
(
excl
>>
tgx
)
|
(
excl
<<
(
TILE_SIZE
-
tgx
))
;
excl
=
(
excl
>>
tgx
)
|
(
excl
<<
(
TILE_SIZE
-
tgx
))
;
...
@@ -144,7 +164,7 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
...
@@ -144,7 +164,7 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
#
endif
#
endif
float
r
=
SQRT
(
r2
)
;
float
r
=
SQRT
(
r2
)
;
LOAD_ATOM2_PARAMETERS
LOAD_ATOM2_PARAMETERS
atom2
=
y+baseLocalAtom+tj
;
atom2
=
y
*TILE_SIZE
+baseLocalAtom+tj
;
float
dEdR
=
0.0f
;
float
dEdR
=
0.0f
;
float
tempEnergy
=
0.0f
;
float
tempEnergy
=
0.0f
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
...
@@ -176,11 +196,11 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
...
@@ -176,11 +196,11 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
get_local_id
(
0
)
<
TILE_SIZE
)
{
if
(
get_local_id
(
0
)
<
TILE_SIZE
)
{
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
unsigned
int
offset1
=
x
+
tgx
+
(
y/TILE_SIZE
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset1
=
x
*TILE_SIZE
+
tgx
+
y
*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
+
tgx
+
(
x/TILE_SIZE
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
*TILE_SIZE
+
tgx
+
x
*PADDED_NUM_ATOMS
;
#
else
#
else
unsigned
int
offset1
=
x
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset1
=
x
*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
#
endif
#
endif
forceBuffers[offset1].xyz
+=
force.xyz+tempForceBuffer[get_local_id
(
0
)
+TILE_SIZE].xyz
;
forceBuffers[offset1].xyz
+=
force.xyz+tempForceBuffer[get_local_id
(
0
)
+TILE_SIZE].xyz
;
forceBuffers[offset2].xyz
+=
local_force[get_local_id
(
0
)
].xyz+local_force[get_local_id
(
0
)
+TILE_SIZE].xyz
;
forceBuffers[offset2].xyz
+=
local_force[get_local_id
(
0
)
].xyz+local_force[get_local_id
(
0
)
+TILE_SIZE].xyz
;
...
...
platforms/opencl/src/kernels/customGBEnergyN2_nvidia.cl
View file @
9a4e2b02
...
@@ -9,7 +9,7 @@
...
@@ -9,7 +9,7 @@
__kernel
__attribute__
((
reqd_work_group_size
(
WORK_GROUP_SIZE,
1
,
1
)))
__kernel
__attribute__
((
reqd_work_group_size
(
WORK_GROUP_SIZE,
1
,
1
)))
void
computeN2Energy
(
__global
float4*
forceBuffers,
__global
float*
energyBuffer,
__local
float4*
local_force,
void
computeN2Energy
(
__global
float4*
forceBuffers,
__global
float*
energyBuffer,
__local
float4*
local_force,
__global
float4*
posq,
__local
float4*
local_posq,
__global
unsigned
int*
exclusions,
__global
unsigned
int*
exclusionIndices,
__global
float4*
posq,
__local
float4*
local_posq,
__global
unsigned
int*
exclusions,
__global
unsigned
int*
exclusionIndices,
__local
float4*
tempBuffer,
__global
unsigned
int*
tiles,
__global
unsigned
int*
exclusionRowIndices,
__local
float4*
tempBuffer,
__global
unsigned
int*
tiles,
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
__global
unsigned
int*
interactionFlags,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize
__global
unsigned
int*
interactionFlags,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize
#
else
#
else
...
@@ -25,28 +25,52 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
...
@@ -25,28 +25,52 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
unsigned
int
end
=
(
warp+1
)
*numTiles/totalWarps
;
unsigned
int
end
=
(
warp+1
)
*numTiles/totalWarps
;
float
energy
=
0.0f
;
float
energy
=
0.0f
;
unsigned
int
lasty
=
0xFFFFFFFF
;
unsigned
int
lasty
=
0xFFFFFFFF
;
__local
unsigned
int
exclusionRange[4]
;
__local
int
exclusionIndex[2]
;
while
(
pos
<
end
)
{
while
(
pos
<
end
)
{
//
Extract
the
coordinates
of
this
tile
//
Extract
the
coordinates
of
this
tile
#
ifdef
USE_CUTOFF
unsigned
int
x
=
tiles[pos]
;
unsigned
int
x
=
tiles[pos]
;
unsigned
int
y
=
((
x
>>
2
)
&
0x7fff
)
*TILE_SIZE
;
unsigned
int
y
=
((
x
>>
2
)
&
0x7fff
)
;
bool
hasExclusions
=
(
x
&
0x1
)
;
x
=
(
x>>17
)
;
x
=
(
x>>17
)
*TILE_SIZE
;
#
else
unsigned
int
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS+0.5f-sqrt
((
NUM_BLOCKS+0.5f
)
*
(
NUM_BLOCKS+0.5f
)
-2*pos
))
;
unsigned
int
x
=
(
pos-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
if
(
x
>=
NUM_BLOCKS
)
{
//
Occasionally
happens
due
to
roundoff
error.
y++
;
x
=
(
pos-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
}
#
endif
unsigned
int
tgx
=
get_local_id
(
0
)
&
(
TILE_SIZE-1
)
;
unsigned
int
tgx
=
get_local_id
(
0
)
&
(
TILE_SIZE-1
)
;
unsigned
int
tbx
=
get_local_id
(
0
)
-
tgx
;
unsigned
int
tbx
=
get_local_id
(
0
)
-
tgx
;
unsigned
int
atom1
=
x
+
tgx
;
unsigned
int
atom1
=
x
*TILE_SIZE
+
tgx
;
float4
force
=
0.0f
;
float4
force
=
0.0f
;
float4
posq1
=
posq[atom1]
;
float4
posq1
=
posq[atom1]
;
LOAD_ATOM1_PARAMETERS
LOAD_ATOM1_PARAMETERS
//
Locate
the
exclusion
data
for
this
tile.
#
ifdef
USE_EXCLUSIONS
int
localGroupIndex
=
get_local_id
(
0
)
/TILE_SIZE
;
if
(
tgx
<
2
)
exclusionRange[2*localGroupIndex+tgx]
=
exclusionRowIndices[x+tgx]
;
if
(
tgx
==
0
)
exclusionIndex[localGroupIndex]
=
-1
;
for
(
int
i
=
exclusionRange[2*localGroupIndex]+tgx
; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
if
(
exclusionIndices[i]
==
y
)
exclusionIndex[localGroupIndex]
=
i*TILE_SIZE
;
bool
hasExclusions
=
(
exclusionIndex[localGroupIndex]
>
-1
)
;
#
else
bool
hasExclusions
=
false
;
#
endif
if
(
x
==
y
)
{
if
(
x
==
y
)
{
//
This
tile
is
on
the
diagonal.
//
This
tile
is
on
the
diagonal.
local_posq[get_local_id
(
0
)
]
=
posq1
;
local_posq[get_local_id
(
0
)
]
=
posq1
;
LOAD_LOCAL_PARAMETERS_FROM_1
LOAD_LOCAL_PARAMETERS_FROM_1
unsigned
int
xi
=
x/TILE_SIZE
;
unsigned
int
tile
=
xi+xi*PADDED_NUM_ATOMS/TILE_SIZE-xi*
(
xi+1
)
/2
;
#
ifdef
USE_EXCLUSIONS
#
ifdef
USE_EXCLUSIONS
unsigned
int
excl
=
exclusions[exclusionInd
ices[tile
]+tgx]
;
unsigned
int
excl
=
exclusions[exclusionInd
ex[localGroupIndex
]+tgx]
;
#
endif
#
endif
for
(
unsigned
int
j
=
0
; j < TILE_SIZE; j++) {
for
(
unsigned
int
j
=
0
; j < TILE_SIZE; j++) {
#
ifdef
USE_EXCLUSIONS
#
ifdef
USE_EXCLUSIONS
...
@@ -66,7 +90,7 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
...
@@ -66,7 +90,7 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
#
endif
#
endif
float
r
=
SQRT
(
r2
)
;
float
r
=
SQRT
(
r2
)
;
LOAD_ATOM2_PARAMETERS
LOAD_ATOM2_PARAMETERS
atom2
=
y+j
;
atom2
=
y
*TILE_SIZE
+j
;
float
dEdR
=
0.0f
;
float
dEdR
=
0.0f
;
float
tempEnergy
=
0.0f
;
float
tempEnergy
=
0.0f
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
&&
atom1
!=
atom2
)
{
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
&&
atom1
!=
atom2
)
{
...
@@ -86,9 +110,9 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
...
@@ -86,9 +110,9 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
//
Write
results
//
Write
results
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
unsigned
int
offset1
=
x
+
tgx
+
(
x/TILE_SIZE
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset1
=
x
*TILE_SIZE
+
tgx
+
x
*PADDED_NUM_ATOMS
;
#
else
#
else
unsigned
int
offset1
=
x
+
tgx
+
warp*PADDED_NUM_ATOMS
;
unsigned
int
offset1
=
x
*TILE_SIZE
+
tgx
+
warp*PADDED_NUM_ATOMS
;
#
endif
#
endif
forceBuffers[offset1].xyz
+=
force.xyz
;
forceBuffers[offset1].xyz
+=
force.xyz
;
STORE_DERIVATIVES_1
STORE_DERIVATIVES_1
...
@@ -97,7 +121,7 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
...
@@ -97,7 +121,7 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
//
This
is
an
off-diagonal
tile.
//
This
is
an
off-diagonal
tile.
if
(
lasty
!=
y
)
{
if
(
lasty
!=
y
)
{
unsigned
int
j
=
y
+
tgx
;
unsigned
int
j
=
y
*TILE_SIZE
+
tgx
;
local_posq[get_local_id
(
0
)
]
=
posq[j]
;
local_posq[get_local_id
(
0
)
]
=
posq[j]
;
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
}
...
@@ -113,11 +137,8 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
...
@@ -113,11 +137,8 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
{
{
//
Compute
the
full
set
of
interactions
in
this
tile.
//
Compute
the
full
set
of
interactions
in
this
tile.
unsigned
int
xi
=
x/TILE_SIZE
;
unsigned
int
yi
=
y/TILE_SIZE
;
unsigned
int
tile
=
xi+yi*PADDED_NUM_ATOMS/TILE_SIZE-yi*
(
yi+1
)
/2
;
#
ifdef
USE_EXCLUSIONS
#
ifdef
USE_EXCLUSIONS
unsigned
int
excl
=
(
hasExclusions
?
exclusions[exclusionInd
ices[tile
]+tgx]
:
0xFFFFFFFF
)
;
unsigned
int
excl
=
(
hasExclusions
?
exclusions[exclusionInd
ex[localGroupIndex
]+tgx]
:
0xFFFFFFFF
)
;
excl
=
(
excl
>>
tgx
)
|
(
excl
<<
(
TILE_SIZE
-
tgx
))
;
excl
=
(
excl
>>
tgx
)
|
(
excl
<<
(
TILE_SIZE
-
tgx
))
;
#
endif
#
endif
unsigned
int
tj
=
tgx
;
unsigned
int
tj
=
tgx
;
...
@@ -139,7 +160,7 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
...
@@ -139,7 +160,7 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
#
endif
#
endif
float
r
=
SQRT
(
r2
)
;
float
r
=
SQRT
(
r2
)
;
LOAD_ATOM2_PARAMETERS
LOAD_ATOM2_PARAMETERS
atom2
=
y+tj
;
atom2
=
y
*TILE_SIZE
+tj
;
float
dEdR
=
0.0f
;
float
dEdR
=
0.0f
;
float
tempEnergy
=
0.0f
;
float
tempEnergy
=
0.0f
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
...
@@ -164,11 +185,11 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
...
@@ -164,11 +185,11 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
//
Write
results
//
Write
results
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
unsigned
int
offset1
=
x
+
tgx
+
(
y/TILE_SIZE
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset1
=
x
*TILE_SIZE
+
tgx
+
y
*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
+
tgx
+
(
x/TILE_SIZE
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
*TILE_SIZE
+
tgx
+
x
*PADDED_NUM_ATOMS
;
#
else
#
else
unsigned
int
offset1
=
x
+
tgx
+
warp*PADDED_NUM_ATOMS
;
unsigned
int
offset1
=
x
*TILE_SIZE
+
tgx
+
warp*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
+
tgx
+
warp*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
*TILE_SIZE
+
tgx
+
warp*PADDED_NUM_ATOMS
;
#
endif
#
endif
forceBuffers[offset1].xyz
+=
force.xyz
;
forceBuffers[offset1].xyz
+=
force.xyz
;
forceBuffers[offset2].xyz
+=
local_force[get_local_id
(
0
)
].xyz
;
forceBuffers[offset2].xyz
+=
local_force[get_local_id
(
0
)
].xyz
;
...
...
platforms/opencl/src/kernels/customGBValueN2_default.cl
View file @
9a4e2b02
...
@@ -6,7 +6,7 @@
...
@@ -6,7 +6,7 @@
__kernel
__attribute__
((
reqd_work_group_size
(
WORK_GROUP_SIZE,
1
,
1
)))
__kernel
__attribute__
((
reqd_work_group_size
(
WORK_GROUP_SIZE,
1
,
1
)))
void
computeN2Value
(
__global
float4*
posq,
__local
float4*
local_posq,
__global
unsigned
int*
exclusions,
void
computeN2Value
(
__global
float4*
posq,
__local
float4*
local_posq,
__global
unsigned
int*
exclusions,
__global
unsigned
int*
exclusionIndices,
__global
float*
global_value,
__local
float*
local_value,
__global
unsigned
int*
exclusionIndices,
__global
unsigned
int*
exclusionRowIndices,
__global
float*
global_value,
__local
float*
local_value,
__local
float*
tempBuffer,
__global
unsigned
int*
tiles,
__local
float*
tempBuffer,
__global
unsigned
int*
tiles,
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
__global
unsigned
int*
interactionFlags,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize
__global
unsigned
int*
interactionFlags,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize
...
@@ -21,30 +21,53 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
...
@@ -21,30 +21,53 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
unsigned
int
end
=
(
get_group_id
(
0
)
+1
)
*numTiles/get_num_groups
(
0
)
;
unsigned
int
end
=
(
get_group_id
(
0
)
+1
)
*numTiles/get_num_groups
(
0
)
;
float
energy
=
0.0f
;
float
energy
=
0.0f
;
unsigned
int
lasty
=
0xFFFFFFFF
;
unsigned
int
lasty
=
0xFFFFFFFF
;
__local
unsigned
int
exclusionRange[2]
;
__local
int
exclusionIndex[1]
;
while
(
pos
<
end
)
{
while
(
pos
<
end
)
{
//
Extract
the
coordinates
of
this
tile
//
Extract
the
coordinates
of
this
tile
#
ifdef
USE_CUTOFF
unsigned
int
x
=
tiles[pos]
;
unsigned
int
x
=
tiles[pos]
;
unsigned
int
y
=
((
x
>>
2
)
&
0x7fff
)
*TILE_SIZE
;
unsigned
int
y
=
((
x
>>
2
)
&
0x7fff
)
;
bool
hasExclusions
=
(
x
&
0x1
)
;
x
=
(
x>>17
)
;
x
=
(
x>>17
)
*TILE_SIZE
;
#
else
unsigned
int
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS+0.5f-sqrt
((
NUM_BLOCKS+0.5f
)
*
(
NUM_BLOCKS+0.5f
)
-2*pos
))
;
unsigned
int
x
=
(
pos-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
if
(
x
>=
NUM_BLOCKS
)
{
//
Occasionally
happens
due
to
roundoff
error.
y++
;
x
=
(
pos-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
}
#
endif
unsigned
int
baseLocalAtom
=
(
get_local_id
(
0
)
<
TILE_SIZE
?
0
:
TILE_SIZE/2
)
;
unsigned
int
baseLocalAtom
=
(
get_local_id
(
0
)
<
TILE_SIZE
?
0
:
TILE_SIZE/2
)
;
unsigned
int
tgx
=
get_local_id
(
0
)
&
(
TILE_SIZE-1
)
;
unsigned
int
tgx
=
get_local_id
(
0
)
&
(
TILE_SIZE-1
)
;
unsigned
int
valueBufferOffset
=
(
tgx
<
TILE_SIZE/2
?
0
:
TILE_SIZE
)
;
unsigned
int
valueBufferOffset
=
(
tgx
<
TILE_SIZE/2
?
0
:
TILE_SIZE
)
;
unsigned
int
atom1
=
x
+
tgx
;
unsigned
int
atom1
=
x
*TILE_SIZE
+
tgx
;
float
value
=
0.0f
;
float
value
=
0.0f
;
float4
posq1
=
posq[atom1]
;
float4
posq1
=
posq[atom1]
;
LOAD_ATOM1_PARAMETERS
LOAD_ATOM1_PARAMETERS
//
Locate
the
exclusion
data
for
this
tile.
#
ifdef
USE_EXCLUSIONS
if
(
get_local_id
(
0
)
<
2
)
exclusionRange[get_local_id
(
0
)
]
=
exclusionRowIndices[x+get_local_id
(
0
)
]
;
if
(
tgx
==
0
)
exclusionIndex[0]
=
-1
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
for
(
int
i
=
exclusionRange[0]+tgx
; i < exclusionRange[1]; i += TILE_SIZE)
if
(
exclusionIndices[i]
==
y
)
exclusionIndex[0]
=
i*TILE_SIZE
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
bool
hasExclusions
=
(
exclusionIndex[0]
>
-1
)
;
#
endif
if
(
x
==
y
)
{
if
(
x
==
y
)
{
//
This
tile
is
on
the
diagonal.
//
This
tile
is
on
the
diagonal.
local_posq[get_local_id
(
0
)
]
=
posq1
;
local_posq[get_local_id
(
0
)
]
=
posq1
;
LOAD_LOCAL_PARAMETERS_FROM_1
LOAD_LOCAL_PARAMETERS_FROM_1
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
unsigned
int
xi
=
x/TILE_SIZE
;
unsigned
int
tile
=
xi+xi*PADDED_NUM_ATOMS/TILE_SIZE-xi*
(
xi+1
)
/2
;
#
ifdef
USE_EXCLUSIONS
#
ifdef
USE_EXCLUSIONS
unsigned
int
excl
=
exclusions[exclusionInd
ices[tile
]+tgx]
>>
baseLocalAtom
;
unsigned
int
excl
=
exclusions[exclusionInd
ex[0
]+tgx]
>>
baseLocalAtom
;
#
endif
#
endif
for
(
unsigned
int
j
=
0
; j < TILE_SIZE/2; j++) {
for
(
unsigned
int
j
=
0
; j < TILE_SIZE/2; j++) {
#
ifdef
USE_EXCLUSIONS
#
ifdef
USE_EXCLUSIONS
...
@@ -64,7 +87,7 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
...
@@ -64,7 +87,7 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
#
endif
#
endif
float
r
=
SQRT
(
r2
)
;
float
r
=
SQRT
(
r2
)
;
LOAD_ATOM2_PARAMETERS
LOAD_ATOM2_PARAMETERS
atom2
=
y+baseLocalAtom+j
;
atom2
=
y
*TILE_SIZE
+baseLocalAtom+j
;
float
tempValue1
=
0.0f
;
float
tempValue1
=
0.0f
;
float
tempValue2
=
0.0f
;
float
tempValue2
=
0.0f
;
#
ifdef
USE_EXCLUSIONS
#
ifdef
USE_EXCLUSIONS
...
@@ -90,9 +113,9 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
...
@@ -90,9 +113,9 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
get_local_id
(
0
)
<
TILE_SIZE
)
{
if
(
get_local_id
(
0
)
<
TILE_SIZE
)
{
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
unsigned
int
offset
=
x
+
tgx
+
(
x/TILE_SIZE
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset
=
x
*TILE_SIZE
+
tgx
+
x
*PADDED_NUM_ATOMS
;
#
else
#
else
unsigned
int
offset
=
x
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset
=
x
*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
#
endif
#
endif
global_value[offset]
+=
value+tempBuffer[get_local_id
(
0
)
+TILE_SIZE]
;
global_value[offset]
+=
value+tempBuffer[get_local_id
(
0
)
+TILE_SIZE]
;
}
}
...
@@ -101,7 +124,7 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
...
@@ -101,7 +124,7 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
//
This
is
an
off-diagonal
tile.
//
This
is
an
off-diagonal
tile.
if
(
lasty
!=
y
&&
get_local_id
(
0
)
<
TILE_SIZE
)
{
if
(
lasty
!=
y
&&
get_local_id
(
0
)
<
TILE_SIZE
)
{
unsigned
int
j
=
y
+
tgx
;
unsigned
int
j
=
y
*TILE_SIZE
+
tgx
;
local_posq[get_local_id
(
0
)
]
=
posq[j]
;
local_posq[get_local_id
(
0
)
]
=
posq[j]
;
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
}
...
@@ -110,11 +133,8 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
...
@@ -110,11 +133,8 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
//
Compute
the
full
set
of
interactions
in
this
tile.
//
Compute
the
full
set
of
interactions
in
this
tile.
unsigned
int
xi
=
x/TILE_SIZE
;
unsigned
int
yi
=
y/TILE_SIZE
;
unsigned
int
tile
=
xi+yi*PADDED_NUM_ATOMS/TILE_SIZE-yi*
(
yi+1
)
/2
;
#
ifdef
USE_EXCLUSIONS
#
ifdef
USE_EXCLUSIONS
unsigned
int
excl
=
(
hasExclusions
?
exclusions[exclusionInd
ices[tile
]+tgx]
:
0xFFFFFFFF
)
;
unsigned
int
excl
=
(
hasExclusions
?
exclusions[exclusionInd
ex[0
]+tgx]
:
0xFFFFFFFF
)
;
excl
=
(
excl
>>
baseLocalAtom
)
&
0xFFFF
;
excl
=
(
excl
>>
baseLocalAtom
)
&
0xFFFF
;
excl
+=
excl
<<
16
;
excl
+=
excl
<<
16
;
excl
=
(
excl
>>
tgx
)
|
(
excl
<<
(
TILE_SIZE
-
tgx
))
;
excl
=
(
excl
>>
tgx
)
|
(
excl
<<
(
TILE_SIZE
-
tgx
))
;
...
@@ -138,7 +158,7 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
...
@@ -138,7 +158,7 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
#
endif
#
endif
float
r
=
SQRT
(
r2
)
;
float
r
=
SQRT
(
r2
)
;
LOAD_ATOM2_PARAMETERS
LOAD_ATOM2_PARAMETERS
atom2
=
y+baseLocalAtom+tj
;
atom2
=
y
*TILE_SIZE
+baseLocalAtom+tj
;
float
tempValue1
=
0.0f
;
float
tempValue1
=
0.0f
;
float
tempValue2
=
0.0f
;
float
tempValue2
=
0.0f
;
#
ifdef
USE_EXCLUSIONS
#
ifdef
USE_EXCLUSIONS
...
@@ -167,11 +187,11 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
...
@@ -167,11 +187,11 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
get_local_id
(
0
)
<
TILE_SIZE
)
{
if
(
get_local_id
(
0
)
<
TILE_SIZE
)
{
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
unsigned
int
offset1
=
x
+
tgx
+
(
y/TILE_SIZE
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset1
=
x
*TILE_SIZE
+
tgx
+
y
*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
+
tgx
+
(
x/TILE_SIZE
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
*TILE_SIZE
+
tgx
+
x
*PADDED_NUM_ATOMS
;
#
else
#
else
unsigned
int
offset1
=
x
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset1
=
x
*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
#
endif
#
endif
global_value[offset1]
+=
value+tempBuffer[get_local_id
(
0
)
+TILE_SIZE]
;
global_value[offset1]
+=
value+tempBuffer[get_local_id
(
0
)
+TILE_SIZE]
;
global_value[offset2]
+=
local_value[get_local_id
(
0
)
]+local_value[get_local_id
(
0
)
+TILE_SIZE]
;
global_value[offset2]
+=
local_value[get_local_id
(
0
)
]+local_value[get_local_id
(
0
)
+TILE_SIZE]
;
...
...
platforms/opencl/src/kernels/customGBValueN2_nvidia.cl
View file @
9a4e2b02
...
@@ -6,7 +6,7 @@
...
@@ -6,7 +6,7 @@
__kernel
__attribute__
((
reqd_work_group_size
(
WORK_GROUP_SIZE,
1
,
1
)))
__kernel
__attribute__
((
reqd_work_group_size
(
WORK_GROUP_SIZE,
1
,
1
)))
void
computeN2Value
(
__global
float4*
posq,
__local
float4*
local_posq,
__global
unsigned
int*
exclusions,
void
computeN2Value
(
__global
float4*
posq,
__local
float4*
local_posq,
__global
unsigned
int*
exclusions,
__global
unsigned
int*
exclusionIndices,
__global
float*
global_value,
__local
float*
local_value,
__global
unsigned
int*
exclusionIndices,
__global
unsigned
int*
exclusionRowIndices,
__global
float*
global_value,
__local
float*
local_value,
__local
float*
tempBuffer,
__global
unsigned
int*
tiles,
__local
float*
tempBuffer,
__global
unsigned
int*
tiles,
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
__global
unsigned
int*
interactionFlags,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize
__global
unsigned
int*
interactionFlags,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize
...
@@ -23,28 +23,52 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
...
@@ -23,28 +23,52 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
unsigned
int
end
=
(
warp+1
)
*numTiles/totalWarps
;
unsigned
int
end
=
(
warp+1
)
*numTiles/totalWarps
;
float
energy
=
0.0f
;
float
energy
=
0.0f
;
unsigned
int
lasty
=
0xFFFFFFFF
;
unsigned
int
lasty
=
0xFFFFFFFF
;
__local
unsigned
int
exclusionRange[4]
;
__local
int
exclusionIndex[2]
;
while
(
pos
<
end
)
{
while
(
pos
<
end
)
{
//
Extract
the
coordinates
of
this
tile
//
Extract
the
coordinates
of
this
tile
#
ifdef
USE_CUTOFF
unsigned
int
x
=
tiles[pos]
;
unsigned
int
x
=
tiles[pos]
;
unsigned
int
y
=
((
x
>>
2
)
&
0x7fff
)
*TILE_SIZE
;
unsigned
int
y
=
((
x
>>
2
)
&
0x7fff
)
;
bool
hasExclusions
=
(
x
&
0x1
)
;
x
=
(
x>>17
)
;
x
=
(
x>>17
)
*TILE_SIZE
;
#
else
unsigned
int
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS+0.5f-sqrt
((
NUM_BLOCKS+0.5f
)
*
(
NUM_BLOCKS+0.5f
)
-2*pos
))
;
unsigned
int
x
=
(
pos-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
if
(
x
>=
NUM_BLOCKS
)
{
//
Occasionally
happens
due
to
roundoff
error.
y++
;
x
=
(
pos-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
}
#
endif
unsigned
int
tgx
=
get_local_id
(
0
)
&
(
TILE_SIZE-1
)
;
unsigned
int
tgx
=
get_local_id
(
0
)
&
(
TILE_SIZE-1
)
;
unsigned
int
tbx
=
get_local_id
(
0
)
-
tgx
;
unsigned
int
tbx
=
get_local_id
(
0
)
-
tgx
;
unsigned
int
atom1
=
x
+
tgx
;
unsigned
int
atom1
=
x
*TILE_SIZE
+
tgx
;
float
value
=
0.0f
;
float
value
=
0.0f
;
float4
posq1
=
posq[atom1]
;
float4
posq1
=
posq[atom1]
;
LOAD_ATOM1_PARAMETERS
LOAD_ATOM1_PARAMETERS
//
Locate
the
exclusion
data
for
this
tile.
#
ifdef
USE_EXCLUSIONS
int
localGroupIndex
=
get_local_id
(
0
)
/TILE_SIZE
;
if
(
tgx
<
2
)
exclusionRange[2*localGroupIndex+tgx]
=
exclusionRowIndices[x+tgx]
;
if
(
tgx
==
0
)
exclusionIndex[localGroupIndex]
=
-1
;
for
(
int
i
=
exclusionRange[2*localGroupIndex]+tgx
; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
if
(
exclusionIndices[i]
==
y
)
exclusionIndex[localGroupIndex]
=
i*TILE_SIZE
;
bool
hasExclusions
=
(
exclusionIndex[localGroupIndex]
>
-1
)
;
#
else
bool
hasExclusions
=
false
;
#
endif
if
(
x
==
y
)
{
if
(
x
==
y
)
{
//
This
tile
is
on
the
diagonal.
//
This
tile
is
on
the
diagonal.
local_posq[get_local_id
(
0
)
]
=
posq1
;
local_posq[get_local_id
(
0
)
]
=
posq1
;
LOAD_LOCAL_PARAMETERS_FROM_1
LOAD_LOCAL_PARAMETERS_FROM_1
unsigned
int
xi
=
x/TILE_SIZE
;
unsigned
int
tile
=
xi+xi*PADDED_NUM_ATOMS/TILE_SIZE-xi*
(
xi+1
)
/2
;
#
ifdef
USE_EXCLUSIONS
#
ifdef
USE_EXCLUSIONS
unsigned
int
excl
=
exclusions[exclusionInd
ices[tile
]+tgx]
;
unsigned
int
excl
=
exclusions[exclusionInd
ex[localGroupIndex
]+tgx]
;
#
endif
#
endif
for
(
unsigned
int
j
=
0
; j < TILE_SIZE; j++) {
for
(
unsigned
int
j
=
0
; j < TILE_SIZE; j++) {
#
ifdef
USE_EXCLUSIONS
#
ifdef
USE_EXCLUSIONS
...
@@ -64,7 +88,7 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
...
@@ -64,7 +88,7 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
#
endif
#
endif
float
r
=
SQRT
(
r2
)
;
float
r
=
SQRT
(
r2
)
;
LOAD_ATOM2_PARAMETERS
LOAD_ATOM2_PARAMETERS
atom2
=
y+j
;
atom2
=
y
*TILE_SIZE
+j
;
float
tempValue1
=
0.0f
;
float
tempValue1
=
0.0f
;
float
tempValue2
=
0.0f
;
float
tempValue2
=
0.0f
;
#
ifdef
USE_EXCLUSIONS
#
ifdef
USE_EXCLUSIONS
...
@@ -85,9 +109,9 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
...
@@ -85,9 +109,9 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
//
Write
results
//
Write
results
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
unsigned
int
offset
=
x
+
tgx
+
(
x/TILE_SIZE
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset
=
x
*TILE_SIZE
+
tgx
+
x
*PADDED_NUM_ATOMS
;
#
else
#
else
unsigned
int
offset
=
x
+
tgx
+
warp*PADDED_NUM_ATOMS
;
unsigned
int
offset
=
x
*TILE_SIZE
+
tgx
+
warp*PADDED_NUM_ATOMS
;
#
endif
#
endif
global_value[offset]
+=
value
;
global_value[offset]
+=
value
;
}
}
...
@@ -95,7 +119,7 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
...
@@ -95,7 +119,7 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
//
This
is
an
off-diagonal
tile.
//
This
is
an
off-diagonal
tile.
if
(
lasty
!=
y
)
{
if
(
lasty
!=
y
)
{
unsigned
int
j
=
y
+
tgx
;
unsigned
int
j
=
y
*TILE_SIZE
+
tgx
;
local_posq[get_local_id
(
0
)
]
=
posq[j]
;
local_posq[get_local_id
(
0
)
]
=
posq[j]
;
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
}
...
@@ -125,7 +149,7 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
...
@@ -125,7 +149,7 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
if
(
r2
<
CUTOFF_SQUARED
)
{
if
(
r2
<
CUTOFF_SQUARED
)
{
float
r
=
SQRT
(
r2
)
;
float
r
=
SQRT
(
r2
)
;
LOAD_ATOM2_PARAMETERS
LOAD_ATOM2_PARAMETERS
atom2
=
y+j
;
atom2
=
y
*TILE_SIZE
+j
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
COMPUTE_VALUE
COMPUTE_VALUE
}
}
...
@@ -154,11 +178,8 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
...
@@ -154,11 +178,8 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
{
{
//
Compute
the
full
set
of
interactions
in
this
tile.
//
Compute
the
full
set
of
interactions
in
this
tile.
unsigned
int
xi
=
x/TILE_SIZE
;
unsigned
int
yi
=
y/TILE_SIZE
;
unsigned
int
tile
=
xi+yi*PADDED_NUM_ATOMS/TILE_SIZE-yi*
(
yi+1
)
/2
;
#
ifdef
USE_EXCLUSIONS
#
ifdef
USE_EXCLUSIONS
unsigned
int
excl
=
(
hasExclusions
?
exclusions[exclusionInd
ices[tile
]+tgx]
:
0xFFFFFFFF
)
;
unsigned
int
excl
=
(
hasExclusions
?
exclusions[exclusionInd
ex[localGroupIndex
]+tgx]
:
0xFFFFFFFF
)
;
excl
=
(
excl
>>
tgx
)
|
(
excl
<<
(
TILE_SIZE
-
tgx
))
;
excl
=
(
excl
>>
tgx
)
|
(
excl
<<
(
TILE_SIZE
-
tgx
))
;
#
endif
#
endif
unsigned
int
tj
=
tgx
;
unsigned
int
tj
=
tgx
;
...
@@ -180,7 +201,7 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
...
@@ -180,7 +201,7 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
#
endif
#
endif
float
r
=
SQRT
(
r2
)
;
float
r
=
SQRT
(
r2
)
;
LOAD_ATOM2_PARAMETERS
LOAD_ATOM2_PARAMETERS
atom2
=
y+tj
;
atom2
=
y
*TILE_SIZE
+tj
;
float
tempValue1
=
0.0f
;
float
tempValue1
=
0.0f
;
float
tempValue2
=
0.0f
;
float
tempValue2
=
0.0f
;
#
ifdef
USE_EXCLUSIONS
#
ifdef
USE_EXCLUSIONS
...
@@ -204,11 +225,11 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
...
@@ -204,11 +225,11 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
//
Write
results
//
Write
results
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
unsigned
int
offset1
=
x
+
tgx
+
(
y/TILE_SIZE
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset1
=
x
*TILE_SIZE
+
tgx
+
y
*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
+
tgx
+
(
x/TILE_SIZE
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
*TILE_SIZE
+
tgx
+
x
*PADDED_NUM_ATOMS
;
#
else
#
else
unsigned
int
offset1
=
x
+
tgx
+
warp*PADDED_NUM_ATOMS
;
unsigned
int
offset1
=
x
*TILE_SIZE
+
tgx
+
warp*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
+
tgx
+
warp*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
*TILE_SIZE
+
tgx
+
warp*PADDED_NUM_ATOMS
;
#
endif
#
endif
global_value[offset1]
+=
value
;
global_value[offset1]
+=
value
;
global_value[offset2]
+=
local_value[get_local_id
(
0
)
]
;
global_value[offset2]
+=
local_value[get_local_id
(
0
)
]
;
...
...
platforms/opencl/src/kernels/gbsaObc_default.cl
View file @
9a4e2b02
...
@@ -31,13 +31,22 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
...
@@ -31,13 +31,22 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
while
(
pos
<
end
)
{
while
(
pos
<
end
)
{
//
Extract
the
coordinates
of
this
tile
//
Extract
the
coordinates
of
this
tile
#
ifdef
USE_CUTOFF
unsigned
int
x
=
tiles[pos]
;
unsigned
int
x
=
tiles[pos]
;
unsigned
int
y
=
((
x
>>
2
)
&
0x7fff
)
*TILE_SIZE
;
unsigned
int
y
=
((
x
>>
2
)
&
0x7fff
)
;
x
=
(
x>>17
)
*TILE_SIZE
;
x
=
(
x>>17
)
;
#
else
unsigned
int
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS+0.5f-sqrt
((
NUM_BLOCKS+0.5f
)
*
(
NUM_BLOCKS+0.5f
)
-2*pos
))
;
unsigned
int
x
=
(
pos-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
if
(
x
>=
NUM_BLOCKS
)
{
//
Occasionally
happens
due
to
roundoff
error.
y++
;
x
=
(
pos-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
}
#
endif
unsigned
int
baseLocalAtom
=
(
get_local_id
(
0
)
<
TILE_SIZE
?
0
:
TILE_SIZE/2
)
;
unsigned
int
baseLocalAtom
=
(
get_local_id
(
0
)
<
TILE_SIZE
?
0
:
TILE_SIZE/2
)
;
unsigned
int
tgx
=
get_local_id
(
0
)
&
(
TILE_SIZE-1
)
;
unsigned
int
tgx
=
get_local_id
(
0
)
&
(
TILE_SIZE-1
)
;
unsigned
int
forceBufferOffset
=
(
tgx
<
TILE_SIZE/2
?
0
:
TILE_SIZE
)
;
unsigned
int
forceBufferOffset
=
(
tgx
<
TILE_SIZE/2
?
0
:
TILE_SIZE
)
;
unsigned
int
atom1
=
x
+
tgx
;
unsigned
int
atom1
=
x
*TILE_SIZE
+
tgx
;
float
bornSum
=
0.0f
;
float
bornSum
=
0.0f
;
float4
posq1
=
posq[atom1]
;
float4
posq1
=
posq[atom1]
;
float2
params1
=
global_params[atom1]
;
float2
params1
=
global_params[atom1]
;
...
@@ -51,8 +60,6 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
...
@@ -51,8 +60,6 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
localData[get_local_id
(
0
)
].radius
=
params1.x
;
localData[get_local_id
(
0
)
].radius
=
params1.x
;
localData[get_local_id
(
0
)
].scaledRadius
=
params1.y
;
localData[get_local_id
(
0
)
].scaledRadius
=
params1.y
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
unsigned
int
xi
=
x/TILE_SIZE
;
unsigned
int
tile
=
xi+xi*PADDED_NUM_ATOMS/TILE_SIZE-xi*
(
xi+1
)
/2
;
for
(
unsigned
int
j
=
0
; j < TILE_SIZE/2; j++) {
for
(
unsigned
int
j
=
0
; j < TILE_SIZE/2; j++) {
float4
delta
=
(
float4
)
(
localData[baseLocalAtom+j].x-posq1.x,
localData[baseLocalAtom+j].y-posq1.y,
localData[baseLocalAtom+j].z-posq1.z,
0.0f
)
;
float4
delta
=
(
float4
)
(
localData[baseLocalAtom+j].x-posq1.x,
localData[baseLocalAtom+j].y-posq1.y,
localData[baseLocalAtom+j].z-posq1.z,
0.0f
)
;
#
ifdef
USE_PERIODIC
#
ifdef
USE_PERIODIC
...
@@ -66,9 +73,9 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
...
@@ -66,9 +73,9 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
float2
params2
=
(
float2
)
(
localData[baseLocalAtom+j].radius,
localData[baseLocalAtom+j].scaledRadius
)
;
float2
params2
=
(
float2
)
(
localData[baseLocalAtom+j].radius,
localData[baseLocalAtom+j].scaledRadius
)
;
float
rScaledRadiusJ
=
r+params2.y
;
float
rScaledRadiusJ
=
r+params2.y
;
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
unsigned
int
includeInteraction
=
(
atom1
<
NUM_ATOMS
&&
y+baseLocalAtom+j
<
NUM_ATOMS
&&
r2
<
CUTOFF_SQUARED
&&
(
j+baseLocalAtom
!=
tgx
)
&&
(
params1.x
<
rScaledRadiusJ
))
;
unsigned
int
includeInteraction
=
(
atom1
<
NUM_ATOMS
&&
y
*TILE_SIZE
+baseLocalAtom+j
<
NUM_ATOMS
&&
r2
<
CUTOFF_SQUARED
&&
(
j+baseLocalAtom
!=
tgx
)
&&
(
params1.x
<
rScaledRadiusJ
))
;
#
else
#
else
unsigned
int
includeInteraction
=
(
atom1
<
NUM_ATOMS
&&
y+baseLocalAtom+j
<
NUM_ATOMS
&&
(
j+baseLocalAtom
!=
tgx
)
&&
(
params1.x
<
rScaledRadiusJ
))
;
unsigned
int
includeInteraction
=
(
atom1
<
NUM_ATOMS
&&
y
*TILE_SIZE
+baseLocalAtom+j
<
NUM_ATOMS
&&
(
j+baseLocalAtom
!=
tgx
)
&&
(
params1.x
<
rScaledRadiusJ
))
;
#
endif
#
endif
float
l_ij
=
RECIP
(
max
(
params1.x,
fabs
(
r-params2.y
)))
;
float
l_ij
=
RECIP
(
max
(
params1.x,
fabs
(
r-params2.y
)))
;
float
u_ij
=
RECIP
(
rScaledRadiusJ
)
;
float
u_ij
=
RECIP
(
rScaledRadiusJ
)
;
...
@@ -87,9 +94,9 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
...
@@ -87,9 +94,9 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
get_local_id
(
0
)
<
TILE_SIZE
)
{
if
(
get_local_id
(
0
)
<
TILE_SIZE
)
{
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
unsigned
int
offset
=
x
+
tgx
+
(
x/TILE_SIZE
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset
=
x
*TILE_SIZE
+
tgx
+
x
*PADDED_NUM_ATOMS
;
#
else
#
else
unsigned
int
offset
=
x
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset
=
x
*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
#
endif
#
endif
global_bornSum[offset]
+=
bornSum+tempBuffer[get_local_id
(
0
)
+TILE_SIZE]
;
global_bornSum[offset]
+=
bornSum+tempBuffer[get_local_id
(
0
)
+TILE_SIZE]
;
}
}
...
@@ -98,7 +105,7 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
...
@@ -98,7 +105,7 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
//
This
is
an
off-diagonal
tile.
//
This
is
an
off-diagonal
tile.
if
(
lasty
!=
y
&&
get_local_id
(
0
)
<
TILE_SIZE
)
{
if
(
lasty
!=
y
&&
get_local_id
(
0
)
<
TILE_SIZE
)
{
unsigned
int
j
=
y
+
tgx
;
unsigned
int
j
=
y
*TILE_SIZE
+
tgx
;
float4
tempPosq
=
posq[j]
;
float4
tempPosq
=
posq[j]
;
localData[get_local_id
(
0
)
].x
=
tempPosq.x
;
localData[get_local_id
(
0
)
].x
=
tempPosq.x
;
localData[get_local_id
(
0
)
].y
=
tempPosq.y
;
localData[get_local_id
(
0
)
].y
=
tempPosq.y
;
...
@@ -113,9 +120,6 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
...
@@ -113,9 +120,6 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
//
Compute
the
full
set
of
interactions
in
this
tile.
//
Compute
the
full
set
of
interactions
in
this
tile.
unsigned
int
xi
=
x/TILE_SIZE
;
unsigned
int
yi
=
y/TILE_SIZE
;
unsigned
int
tile
=
xi+yi*PADDED_NUM_ATOMS/TILE_SIZE-yi*
(
yi+1
)
/2
;
unsigned
int
tj
=
tgx%
(
TILE_SIZE/2
)
;
unsigned
int
tj
=
tgx%
(
TILE_SIZE/2
)
;
for
(
unsigned
int
j
=
0
; j < TILE_SIZE/2; j++) {
for
(
unsigned
int
j
=
0
; j < TILE_SIZE/2; j++) {
float4
delta
=
(
float4
)
(
localData[baseLocalAtom+tj].x-posq1.x,
localData[baseLocalAtom+tj].y-posq1.y,
localData[baseLocalAtom+tj].z-posq1.z,
0.0f
)
;
float4
delta
=
(
float4
)
(
localData[baseLocalAtom+tj].x-posq1.x,
localData[baseLocalAtom+tj].y-posq1.y,
localData[baseLocalAtom+tj].z-posq1.z,
0.0f
)
;
...
@@ -126,9 +130,9 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
...
@@ -126,9 +130,9 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
#
endif
#
endif
float
r2
=
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
;
float
r2
=
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
;
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
unsigned
int
includeInteraction
=
(
atom1
<
NUM_ATOMS
&&
y+baseLocalAtom+tj
<
NUM_ATOMS
&&
r2
<
CUTOFF_SQUARED
)
;
unsigned
int
includeInteraction
=
(
atom1
<
NUM_ATOMS
&&
y
*TILE_SIZE
+baseLocalAtom+tj
<
NUM_ATOMS
&&
r2
<
CUTOFF_SQUARED
)
;
#
else
#
else
unsigned
int
includeInteraction
=
(
atom1
<
NUM_ATOMS
&&
y+baseLocalAtom+tj
<
NUM_ATOMS
)
;
unsigned
int
includeInteraction
=
(
atom1
<
NUM_ATOMS
&&
y
*TILE_SIZE
+baseLocalAtom+tj
<
NUM_ATOMS
)
;
#
endif
#
endif
float
invR
=
RSQRT
(
r2
)
;
float
invR
=
RSQRT
(
r2
)
;
float
r
=
RECIP
(
invR
)
;
float
r
=
RECIP
(
invR
)
;
...
@@ -168,11 +172,11 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
...
@@ -168,11 +172,11 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
get_local_id
(
0
)
<
TILE_SIZE
)
{
if
(
get_local_id
(
0
)
<
TILE_SIZE
)
{
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
unsigned
int
offset1
=
x
+
tgx
+
(
y/TILE_SIZE
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset1
=
x
*TILE_SIZE
+
tgx
+
y
*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
+
tgx
+
(
x/TILE_SIZE
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
*TILE_SIZE
+
tgx
+
x
*PADDED_NUM_ATOMS
;
#
else
#
else
unsigned
int
offset1
=
x
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset1
=
x
*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
#
endif
#
endif
global_bornSum[offset1]
+=
bornSum+tempBuffer[get_local_id
(
0
)
+TILE_SIZE]
;
global_bornSum[offset1]
+=
bornSum+tempBuffer[get_local_id
(
0
)
+TILE_SIZE]
;
global_bornSum[offset2]
+=
localData[get_local_id
(
0
)
].bornSum+localData[get_local_id
(
0
)
+TILE_SIZE].bornSum
;
global_bornSum[offset2]
+=
localData[get_local_id
(
0
)
].bornSum+localData[get_local_id
(
0
)
+TILE_SIZE].bornSum
;
...
@@ -206,13 +210,22 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
...
@@ -206,13 +210,22 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
while
(
pos
<
end
)
{
while
(
pos
<
end
)
{
//
Extract
the
coordinates
of
this
tile
//
Extract
the
coordinates
of
this
tile
#
ifdef
USE_CUTOFF
unsigned
int
x
=
tiles[pos]
;
unsigned
int
x
=
tiles[pos]
;
unsigned
int
y
=
((
x
>>
2
)
&
0x7fff
)
*TILE_SIZE
;
unsigned
int
y
=
((
x
>>
2
)
&
0x7fff
)
;
x
=
(
x>>17
)
*TILE_SIZE
;
x
=
(
x>>17
)
;
#
else
unsigned
int
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS+0.5f-sqrt
((
NUM_BLOCKS+0.5f
)
*
(
NUM_BLOCKS+0.5f
)
-2*pos
))
;
unsigned
int
x
=
(
pos-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
if
(
x
>=
NUM_BLOCKS
)
{
//
Occasionally
happens
due
to
roundoff
error.
y++
;
x
=
(
pos-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
}
#
endif
unsigned
int
baseLocalAtom
=
(
get_local_id
(
0
)
<
TILE_SIZE
?
0
:
TILE_SIZE/2
)
;
unsigned
int
baseLocalAtom
=
(
get_local_id
(
0
)
<
TILE_SIZE
?
0
:
TILE_SIZE/2
)
;
unsigned
int
tgx
=
get_local_id
(
0
)
&
(
TILE_SIZE-1
)
;
unsigned
int
tgx
=
get_local_id
(
0
)
&
(
TILE_SIZE-1
)
;
unsigned
int
forceBufferOffset
=
(
tgx
<
TILE_SIZE/2
?
0
:
TILE_SIZE
)
;
unsigned
int
forceBufferOffset
=
(
tgx
<
TILE_SIZE/2
?
0
:
TILE_SIZE
)
;
unsigned
int
atom1
=
x
+
tgx
;
unsigned
int
atom1
=
x
*TILE_SIZE
+
tgx
;
float4
force
=
0.0f
;
float4
force
=
0.0f
;
float4
posq1
=
posq[atom1]
;
float4
posq1
=
posq[atom1]
;
float
bornRadius1
=
global_bornRadii[atom1]
;
float
bornRadius1
=
global_bornRadii[atom1]
;
...
@@ -225,10 +238,8 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
...
@@ -225,10 +238,8 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
localData[get_local_id
(
0
)
].q
=
posq1.w
;
localData[get_local_id
(
0
)
].q
=
posq1.w
;
localData[get_local_id
(
0
)
].bornRadius
=
bornRadius1
;
localData[get_local_id
(
0
)
].bornRadius
=
bornRadius1
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
unsigned
int
xi
=
x/TILE_SIZE
;
unsigned
int
tile
=
xi+xi*PADDED_NUM_ATOMS/TILE_SIZE-xi*
(
xi+1
)
/2
;
for
(
unsigned
int
j
=
0
; j < TILE_SIZE/2; j++) {
for
(
unsigned
int
j
=
0
; j < TILE_SIZE/2; j++) {
unsigned
int
includeInteraction
=
(
atom1
<
NUM_ATOMS
&&
y+baseLocalAtom+j
<
NUM_ATOMS
)
;
unsigned
int
includeInteraction
=
(
atom1
<
NUM_ATOMS
&&
y
*TILE_SIZE
+baseLocalAtom+j
<
NUM_ATOMS
)
;
float4
posq2
=
(
float4
)
(
localData[baseLocalAtom+j].x,
localData[baseLocalAtom+j].y,
localData[baseLocalAtom+j].z,
localData[baseLocalAtom+j].q
)
;
float4
posq2
=
(
float4
)
(
localData[baseLocalAtom+j].x,
localData[baseLocalAtom+j].y,
localData[baseLocalAtom+j].z,
localData[baseLocalAtom+j].q
)
;
float4
delta
=
(
float4
)
(
posq2.xyz
-
posq1.xyz,
0.0f
)
;
float4
delta
=
(
float4
)
(
posq2.xyz
-
posq1.xyz,
0.0f
)
;
#
ifdef
USE_PERIODIC
#
ifdef
USE_PERIODIC
...
@@ -266,9 +277,9 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
...
@@ -266,9 +277,9 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
get_local_id
(
0
)
<
TILE_SIZE
)
{
if
(
get_local_id
(
0
)
<
TILE_SIZE
)
{
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
unsigned
int
offset
=
x
+
tgx
+
(
x/TILE_SIZE
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset
=
x
*TILE_SIZE
+
tgx
+
x
*PADDED_NUM_ATOMS
;
#
else
#
else
unsigned
int
offset
=
x
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset
=
x
*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
#
endif
#
endif
forceBuffers[offset].xyz
=
forceBuffers[offset].xyz+force.xyz+tempBuffer[get_local_id
(
0
)
+TILE_SIZE].xyz
;
forceBuffers[offset].xyz
=
forceBuffers[offset].xyz+force.xyz+tempBuffer[get_local_id
(
0
)
+TILE_SIZE].xyz
;
global_bornForce[offset]
+=
force.w+tempBuffer[get_local_id
(
0
)
+TILE_SIZE].w
;
global_bornForce[offset]
+=
force.w+tempBuffer[get_local_id
(
0
)
+TILE_SIZE].w
;
...
@@ -278,7 +289,7 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
...
@@ -278,7 +289,7 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
//
This
is
an
off-diagonal
tile.
//
This
is
an
off-diagonal
tile.
if
(
lasty
!=
y
&&
get_local_id
(
0
)
<
TILE_SIZE
)
{
if
(
lasty
!=
y
&&
get_local_id
(
0
)
<
TILE_SIZE
)
{
unsigned
int
j
=
y
+
tgx
;
unsigned
int
j
=
y
*TILE_SIZE
+
tgx
;
float4
tempPosq
=
posq[j]
;
float4
tempPosq
=
posq[j]
;
localData[get_local_id
(
0
)
].x
=
tempPosq.x
;
localData[get_local_id
(
0
)
].x
=
tempPosq.x
;
localData[get_local_id
(
0
)
].y
=
tempPosq.y
;
localData[get_local_id
(
0
)
].y
=
tempPosq.y
;
...
@@ -294,12 +305,9 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
...
@@ -294,12 +305,9 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
//
Compute
the
full
set
of
interactions
in
this
tile.
//
Compute
the
full
set
of
interactions
in
this
tile.
unsigned
int
xi
=
x/TILE_SIZE
;
unsigned
int
yi
=
y/TILE_SIZE
;
unsigned
int
tile
=
xi+yi*PADDED_NUM_ATOMS/TILE_SIZE-yi*
(
yi+1
)
/2
;
unsigned
int
tj
=
tgx%
(
TILE_SIZE/2
)
;
unsigned
int
tj
=
tgx%
(
TILE_SIZE/2
)
;
for
(
unsigned
int
j
=
0
; j < TILE_SIZE/2; j++) {
for
(
unsigned
int
j
=
0
; j < TILE_SIZE/2; j++) {
unsigned
int
includeInteraction
=
(
atom1
<
NUM_ATOMS
&&
y+baseLocalAtom+tj
<
NUM_ATOMS
)
;
unsigned
int
includeInteraction
=
(
atom1
<
NUM_ATOMS
&&
y
*TILE_SIZE
+baseLocalAtom+tj
<
NUM_ATOMS
)
;
float4
posq2
=
(
float4
)
(
localData[baseLocalAtom+tj].x,
localData[baseLocalAtom+tj].y,
localData[baseLocalAtom+tj].z,
localData[baseLocalAtom+tj].q
)
;
float4
posq2
=
(
float4
)
(
localData[baseLocalAtom+tj].x,
localData[baseLocalAtom+tj].y,
localData[baseLocalAtom+tj].z,
localData[baseLocalAtom+tj].q
)
;
float4
delta
=
(
float4
)
(
posq2.xyz
-
posq1.xyz,
0.0f
)
;
float4
delta
=
(
float4
)
(
posq2.xyz
-
posq1.xyz,
0.0f
)
;
#
ifdef
USE_PERIODIC
#
ifdef
USE_PERIODIC
...
@@ -343,11 +351,11 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
...
@@ -343,11 +351,11 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
get_local_id
(
0
)
<
TILE_SIZE
)
{
if
(
get_local_id
(
0
)
<
TILE_SIZE
)
{
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
unsigned
int
offset1
=
x
+
tgx
+
(
y/TILE_SIZE
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset1
=
x
*TILE_SIZE
+
tgx
+
y
*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
+
tgx
+
(
x/TILE_SIZE
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
*TILE_SIZE
+
tgx
+
x
*PADDED_NUM_ATOMS
;
#
else
#
else
unsigned
int
offset1
=
x
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset1
=
x
*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
#
endif
#
endif
forceBuffers[offset1].xyz
=
forceBuffers[offset1].xyz+force.xyz+tempBuffer[get_local_id
(
0
)
+TILE_SIZE].xyz
;
forceBuffers[offset1].xyz
=
forceBuffers[offset1].xyz+force.xyz+tempBuffer[get_local_id
(
0
)
+TILE_SIZE].xyz
;
float4
sum
=
(
float4
)
(
localData[get_local_id
(
0
)
].fx+localData[get_local_id
(
0
)
+TILE_SIZE].fx,
float4
sum
=
(
float4
)
(
localData[get_local_id
(
0
)
].fx+localData[get_local_id
(
0
)
+TILE_SIZE].fx,
...
...
platforms/opencl/src/kernels/gbsaObc_nvidia.cl
View file @
9a4e2b02
...
@@ -33,12 +33,21 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
...
@@ -33,12 +33,21 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
while
(
pos
<
end
)
{
while
(
pos
<
end
)
{
//
Extract
the
coordinates
of
this
tile
//
Extract
the
coordinates
of
this
tile
#
ifdef
USE_CUTOFF
unsigned
int
x
=
tiles[pos]
;
unsigned
int
x
=
tiles[pos]
;
unsigned
int
y
=
((
x
>>
2
)
&
0x7fff
)
*TILE_SIZE
;
unsigned
int
y
=
((
x
>>
2
)
&
0x7fff
)
;
x
=
(
x>>17
)
*TILE_SIZE
;
x
=
(
x>>17
)
;
#
else
unsigned
int
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS+0.5f-sqrt
((
NUM_BLOCKS+0.5f
)
*
(
NUM_BLOCKS+0.5f
)
-2*pos
))
;
unsigned
int
x
=
(
pos-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
if
(
x
>=
NUM_BLOCKS
)
{
//
Occasionally
happens
due
to
roundoff
error.
y++
;
x
=
(
pos-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
}
#
endif
unsigned
int
tgx
=
get_local_id
(
0
)
&
(
TILE_SIZE-1
)
;
unsigned
int
tgx
=
get_local_id
(
0
)
&
(
TILE_SIZE-1
)
;
unsigned
int
tbx
=
get_local_id
(
0
)
-
tgx
;
unsigned
int
tbx
=
get_local_id
(
0
)
-
tgx
;
unsigned
int
atom1
=
x
+
tgx
;
unsigned
int
atom1
=
x
*TILE_SIZE
+
tgx
;
float
bornSum
=
0.0f
;
float
bornSum
=
0.0f
;
float4
posq1
=
posq[atom1]
;
float4
posq1
=
posq[atom1]
;
float2
params1
=
global_params[atom1]
;
float2
params1
=
global_params[atom1]
;
...
@@ -51,8 +60,6 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
...
@@ -51,8 +60,6 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
localData[get_local_id
(
0
)
].q
=
posq1.w
;
localData[get_local_id
(
0
)
].q
=
posq1.w
;
localData[get_local_id
(
0
)
].radius
=
params1.x
;
localData[get_local_id
(
0
)
].radius
=
params1.x
;
localData[get_local_id
(
0
)
].scaledRadius
=
params1.y
;
localData[get_local_id
(
0
)
].scaledRadius
=
params1.y
;
unsigned
int
xi
=
x/TILE_SIZE
;
unsigned
int
tile
=
xi+xi*PADDED_NUM_ATOMS/TILE_SIZE-xi*
(
xi+1
)
/2
;
for
(
unsigned
int
j
=
0
; j < TILE_SIZE; j++) {
for
(
unsigned
int
j
=
0
; j < TILE_SIZE; j++) {
float4
delta
=
(
float4
)
(
localData[tbx+j].x-posq1.x,
localData[tbx+j].y-posq1.y,
localData[tbx+j].z-posq1.z,
0.0f
)
;
float4
delta
=
(
float4
)
(
localData[tbx+j].x-posq1.x,
localData[tbx+j].y-posq1.y,
localData[tbx+j].z-posq1.z,
0.0f
)
;
#
ifdef
USE_PERIODIC
#
ifdef
USE_PERIODIC
...
@@ -62,9 +69,9 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
...
@@ -62,9 +69,9 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
#
endif
#
endif
float
r2
=
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
;
float
r2
=
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
;
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
if
(
atom1
<
NUM_ATOMS
&&
y+j
<
NUM_ATOMS
&&
r2
<
CUTOFF_SQUARED
)
{
if
(
atom1
<
NUM_ATOMS
&&
y
*TILE_SIZE
+j
<
NUM_ATOMS
&&
r2
<
CUTOFF_SQUARED
)
{
#
else
#
else
if
(
atom1
<
NUM_ATOMS
&&
y+j
<
NUM_ATOMS
)
{
if
(
atom1
<
NUM_ATOMS
&&
y
*TILE_SIZE
+j
<
NUM_ATOMS
)
{
#
endif
#
endif
float
invR
=
RSQRT
(
r2
)
;
float
invR
=
RSQRT
(
r2
)
;
float
r
=
RECIP
(
invR
)
;
float
r
=
RECIP
(
invR
)
;
...
@@ -86,9 +93,9 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
...
@@ -86,9 +93,9 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
//
Write
results
//
Write
results
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
unsigned
int
offset
=
x
+
tgx
+
(
x/TILE_SIZE
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset
=
x
*TILE_SIZE
+
tgx
+
x
*PADDED_NUM_ATOMS
;
#
else
#
else
unsigned
int
offset
=
x
+
tgx
+
warp*PADDED_NUM_ATOMS
;
unsigned
int
offset
=
x
*TILE_SIZE
+
tgx
+
warp*PADDED_NUM_ATOMS
;
#
endif
#
endif
global_bornSum[offset]
+=
bornSum
;
global_bornSum[offset]
+=
bornSum
;
}
}
...
@@ -96,7 +103,7 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
...
@@ -96,7 +103,7 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
//
This
is
an
off-diagonal
tile.
//
This
is
an
off-diagonal
tile.
if
(
lasty
!=
y
)
{
if
(
lasty
!=
y
)
{
unsigned
int
j
=
y
+
tgx
;
unsigned
int
j
=
y
*TILE_SIZE
+
tgx
;
float4
tempPosq
=
posq[j]
;
float4
tempPosq
=
posq[j]
;
localData[get_local_id
(
0
)
].x
=
tempPosq.x
;
localData[get_local_id
(
0
)
].x
=
tempPosq.x
;
localData[get_local_id
(
0
)
].y
=
tempPosq.y
;
localData[get_local_id
(
0
)
].y
=
tempPosq.y
;
...
@@ -127,9 +134,9 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
...
@@ -127,9 +134,9 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
float
r2
=
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
;
float
r2
=
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
;
tempBuffer[get_local_id
(
0
)
]
=
0.0f
;
tempBuffer[get_local_id
(
0
)
]
=
0.0f
;
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
if
(
atom1
<
NUM_ATOMS
&&
y+j
<
NUM_ATOMS
&&
r2
<
CUTOFF_SQUARED
)
{
if
(
atom1
<
NUM_ATOMS
&&
y
*TILE_SIZE
+j
<
NUM_ATOMS
&&
r2
<
CUTOFF_SQUARED
)
{
#
else
#
else
if
(
atom1
<
NUM_ATOMS
&&
y+j
<
NUM_ATOMS
)
{
if
(
atom1
<
NUM_ATOMS
&&
y
*TILE_SIZE
+j
<
NUM_ATOMS
)
{
#
endif
#
endif
float
invR
=
RSQRT
(
r2
)
;
float
invR
=
RSQRT
(
r2
)
;
float
r
=
RECIP
(
invR
)
;
float
r
=
RECIP
(
invR
)
;
...
@@ -182,9 +189,6 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
...
@@ -182,9 +189,6 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
{
{
//
Compute
the
full
set
of
interactions
in
this
tile.
//
Compute
the
full
set
of
interactions
in
this
tile.
unsigned
int
xi
=
x/TILE_SIZE
;
unsigned
int
yi
=
y/TILE_SIZE
;
unsigned
int
tile
=
xi+yi*PADDED_NUM_ATOMS/TILE_SIZE-yi*
(
yi+1
)
/2
;
unsigned
int
tj
=
tgx
;
unsigned
int
tj
=
tgx
;
for
(
unsigned
int
j
=
0
; j < TILE_SIZE; j++) {
for
(
unsigned
int
j
=
0
; j < TILE_SIZE; j++) {
float4
delta
=
(
float4
)
(
localData[tbx+tj].x-posq1.x,
localData[tbx+tj].y-posq1.y,
localData[tbx+tj].z-posq1.z,
0.0f
)
;
float4
delta
=
(
float4
)
(
localData[tbx+tj].x-posq1.x,
localData[tbx+tj].y-posq1.y,
localData[tbx+tj].z-posq1.z,
0.0f
)
;
...
@@ -195,9 +199,9 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
...
@@ -195,9 +199,9 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
#
endif
#
endif
float
r2
=
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
;
float
r2
=
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
;
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
if
(
atom1
<
NUM_ATOMS
&&
y+tj
<
NUM_ATOMS
&&
r2
<
CUTOFF_SQUARED
)
{
if
(
atom1
<
NUM_ATOMS
&&
y
*TILE_SIZE
+tj
<
NUM_ATOMS
&&
r2
<
CUTOFF_SQUARED
)
{
#
else
#
else
if
(
atom1
<
NUM_ATOMS
&&
y+tj
<
NUM_ATOMS
)
{
if
(
atom1
<
NUM_ATOMS
&&
y
*TILE_SIZE
+tj
<
NUM_ATOMS
)
{
#
endif
#
endif
float
invR
=
RSQRT
(
r2
)
;
float
invR
=
RSQRT
(
r2
)
;
float
r
=
RECIP
(
invR
)
;
float
r
=
RECIP
(
invR
)
;
...
@@ -235,11 +239,11 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
...
@@ -235,11 +239,11 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
//
Write
results
//
Write
results
float4
of
;
float4
of
;
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
unsigned
int
offset1
=
x
+
tgx
+
(
y/TILE_SIZE
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset1
=
x
*TILE_SIZE
+
tgx
+
y
*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
+
tgx
+
(
x/TILE_SIZE
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
*TILE_SIZE
+
tgx
+
x
*PADDED_NUM_ATOMS
;
#
else
#
else
unsigned
int
offset1
=
x
+
tgx
+
warp*PADDED_NUM_ATOMS
;
unsigned
int
offset1
=
x
*TILE_SIZE
+
tgx
+
warp*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
+
tgx
+
warp*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
*TILE_SIZE
+
tgx
+
warp*PADDED_NUM_ATOMS
;
#
endif
#
endif
global_bornSum[offset1]
+=
bornSum
;
global_bornSum[offset1]
+=
bornSum
;
global_bornSum[offset2]
+=
localData[get_local_id
(
0
)
].bornSum
;
global_bornSum[offset2]
+=
localData[get_local_id
(
0
)
].bornSum
;
...
@@ -274,12 +278,21 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
...
@@ -274,12 +278,21 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
while
(
pos
<
end
)
{
while
(
pos
<
end
)
{
//
Extract
the
coordinates
of
this
tile
//
Extract
the
coordinates
of
this
tile
#
ifdef
USE_CUTOFF
unsigned
int
x
=
tiles[pos]
;
unsigned
int
x
=
tiles[pos]
;
unsigned
int
y
=
((
x
>>
2
)
&
0x7fff
)
*TILE_SIZE
;
unsigned
int
y
=
((
x
>>
2
)
&
0x7fff
)
;
x
=
(
x>>17
)
*TILE_SIZE
;
x
=
(
x>>17
)
;
#
else
unsigned
int
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS+0.5f-sqrt
((
NUM_BLOCKS+0.5f
)
*
(
NUM_BLOCKS+0.5f
)
-2*pos
))
;
unsigned
int
x
=
(
pos-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
if
(
x
>=
NUM_BLOCKS
)
{
//
Occasionally
happens
due
to
roundoff
error.
y++
;
x
=
(
pos-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
}
#
endif
unsigned
int
tgx
=
get_local_id
(
0
)
&
(
TILE_SIZE-1
)
;
unsigned
int
tgx
=
get_local_id
(
0
)
&
(
TILE_SIZE-1
)
;
unsigned
int
tbx
=
get_local_id
(
0
)
-
tgx
;
unsigned
int
tbx
=
get_local_id
(
0
)
-
tgx
;
unsigned
int
atom1
=
x
+
tgx
;
unsigned
int
atom1
=
x
*TILE_SIZE
+
tgx
;
float4
force
=
0.0f
;
float4
force
=
0.0f
;
float4
posq1
=
posq[atom1]
;
float4
posq1
=
posq[atom1]
;
float
bornRadius1
=
global_bornRadii[atom1]
;
float
bornRadius1
=
global_bornRadii[atom1]
;
...
@@ -291,10 +304,8 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
...
@@ -291,10 +304,8 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
localData[get_local_id
(
0
)
].z
=
posq1.z
;
localData[get_local_id
(
0
)
].z
=
posq1.z
;
localData[get_local_id
(
0
)
].q
=
posq1.w
;
localData[get_local_id
(
0
)
].q
=
posq1.w
;
localData[get_local_id
(
0
)
].bornRadius
=
bornRadius1
;
localData[get_local_id
(
0
)
].bornRadius
=
bornRadius1
;
unsigned
int
xi
=
x/TILE_SIZE
;
unsigned
int
tile
=
xi+xi*PADDED_NUM_ATOMS/TILE_SIZE-xi*
(
xi+1
)
/2
;
for
(
unsigned
int
j
=
0
; j < TILE_SIZE; j++) {
for
(
unsigned
int
j
=
0
; j < TILE_SIZE; j++) {
if
(
atom1
<
NUM_ATOMS
&&
y+j
<
NUM_ATOMS
)
{
if
(
atom1
<
NUM_ATOMS
&&
y
*TILE_SIZE
+j
<
NUM_ATOMS
)
{
float4
posq2
=
(
float4
)
(
localData[tbx+j].x,
localData[tbx+j].y,
localData[tbx+j].z,
localData[tbx+j].q
)
;
float4
posq2
=
(
float4
)
(
localData[tbx+j].x,
localData[tbx+j].y,
localData[tbx+j].z,
localData[tbx+j].q
)
;
float4
delta
=
(
float4
)
(
posq2.xyz
-
posq1.xyz,
0.0f
)
;
float4
delta
=
(
float4
)
(
posq2.xyz
-
posq1.xyz,
0.0f
)
;
#
ifdef
USE_PERIODIC
#
ifdef
USE_PERIODIC
...
@@ -330,9 +341,9 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
...
@@ -330,9 +341,9 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
//
Write
results
//
Write
results
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
unsigned
int
offset
=
x
+
tgx
+
(
x/TILE_SIZE
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset
=
x
*TILE_SIZE
+
tgx
+
x
*PADDED_NUM_ATOMS
;
#
else
#
else
unsigned
int
offset
=
x
+
tgx
+
warp*PADDED_NUM_ATOMS
;
unsigned
int
offset
=
x
*TILE_SIZE
+
tgx
+
warp*PADDED_NUM_ATOMS
;
#
endif
#
endif
forceBuffers[offset].xyz
+=
force.xyz
;
forceBuffers[offset].xyz
+=
force.xyz
;
global_bornForce[offset]
+=
force.w
;
global_bornForce[offset]
+=
force.w
;
...
@@ -341,7 +352,7 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
...
@@ -341,7 +352,7 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
//
This
is
an
off-diagonal
tile.
//
This
is
an
off-diagonal
tile.
if
(
lasty
!=
y
)
{
if
(
lasty
!=
y
)
{
unsigned
int
j
=
y
+
tgx
;
unsigned
int
j
=
y
*TILE_SIZE
+
tgx
;
float4
tempPosq
=
posq[j]
;
float4
tempPosq
=
posq[j]
;
localData[get_local_id
(
0
)
].x
=
tempPosq.x
;
localData[get_local_id
(
0
)
].x
=
tempPosq.x
;
localData[get_local_id
(
0
)
].y
=
tempPosq.y
;
localData[get_local_id
(
0
)
].y
=
tempPosq.y
;
...
@@ -386,9 +397,9 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
...
@@ -386,9 +397,9 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
force.w
+=
dGpol_dalpha2_ij*bornRadius2
;
force.w
+=
dGpol_dalpha2_ij*bornRadius2
;
float
dEdR
=
Gpol*
(
1.0f
-
0.25f*expTerm
)
;
float
dEdR
=
Gpol*
(
1.0f
-
0.25f*expTerm
)
;
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
if
(
atom1
>=
NUM_ATOMS
|
| y+j >= NUM_ATOMS || r2 > CUTOFF_SQUARED) {
if
(
atom1
>=
NUM_ATOMS
|
| y
*TILE_SIZE
+j >= NUM_ATOMS || r2 > CUTOFF_SQUARED) {
#else
#else
if (atom1 >= NUM_ATOMS |
|
y+j
>=
NUM_ATOMS
)
{
if (atom1 >= NUM_ATOMS |
|
y
*TILE_SIZE
+j
>=
NUM_ATOMS
)
{
#
endif
#
endif
dEdR
=
0.0f
;
dEdR
=
0.0f
;
tempEnergy
=
0.0f
;
tempEnergy
=
0.0f
;
...
@@ -424,12 +435,9 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
...
@@ -424,12 +435,9 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
{
{
//
Compute
the
full
set
of
interactions
in
this
tile.
//
Compute
the
full
set
of
interactions
in
this
tile.
unsigned
int
xi
=
x/TILE_SIZE
;
unsigned
int
yi
=
y/TILE_SIZE
;
unsigned
int
tile
=
xi+yi*PADDED_NUM_ATOMS/TILE_SIZE-yi*
(
yi+1
)
/2
;
unsigned
int
tj
=
tgx
;
unsigned
int
tj
=
tgx
;
for
(
unsigned
int
j
=
0
; j < TILE_SIZE; j++) {
for
(
unsigned
int
j
=
0
; j < TILE_SIZE; j++) {
if
(
atom1
<
NUM_ATOMS
&&
y+tj
<
NUM_ATOMS
)
{
if
(
atom1
<
NUM_ATOMS
&&
y
*TILE_SIZE
+tj
<
NUM_ATOMS
)
{
float4
posq2
=
(
float4
)
(
localData[tbx+tj].x,
localData[tbx+tj].y,
localData[tbx+tj].z,
localData[tbx+tj].q
)
;
float4
posq2
=
(
float4
)
(
localData[tbx+tj].x,
localData[tbx+tj].y,
localData[tbx+tj].z,
localData[tbx+tj].q
)
;
float4
delta
=
(
float4
)
(
posq2.xyz
-
posq1.xyz,
0.0f
)
;
float4
delta
=
(
float4
)
(
posq2.xyz
-
posq1.xyz,
0.0f
)
;
#
ifdef
USE_PERIODIC
#
ifdef
USE_PERIODIC
...
@@ -471,11 +479,11 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
...
@@ -471,11 +479,11 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
//
Write
results
//
Write
results
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
unsigned
int
offset1
=
x
+
tgx
+
(
y/TILE_SIZE
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset1
=
x
*TILE_SIZE
+
tgx
+
y
*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
+
tgx
+
(
x/TILE_SIZE
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
*TILE_SIZE
+
tgx
+
x
*PADDED_NUM_ATOMS
;
#
else
#
else
unsigned
int
offset1
=
x
+
tgx
+
warp*PADDED_NUM_ATOMS
;
unsigned
int
offset1
=
x
*TILE_SIZE
+
tgx
+
warp*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
+
tgx
+
warp*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
*TILE_SIZE
+
tgx
+
warp*PADDED_NUM_ATOMS
;
#
endif
#
endif
forceBuffers[offset1].xyz
+=
force.xyz
;
forceBuffers[offset1].xyz
+=
force.xyz
;
forceBuffers[offset2]
+=
(
float4
)
(
localData[get_local_id
(
0
)
].fx,
localData[get_local_id
(
0
)
].fy,
localData[get_local_id
(
0
)
].fz,
0
)
;
forceBuffers[offset2]
+=
(
float4
)
(
localData[get_local_id
(
0
)
].fx,
localData[get_local_id
(
0
)
].fy,
localData[get_local_id
(
0
)
].fz,
0
)
;
...
...
platforms/opencl/src/kernels/nonbonded_default.cl
View file @
9a4e2b02
...
@@ -13,7 +13,7 @@ typedef struct {
...
@@ -13,7 +13,7 @@ typedef struct {
__kernel
__attribute__
((
reqd_work_group_size
(
WORK_GROUP_SIZE,
1
,
1
)))
__kernel
__attribute__
((
reqd_work_group_size
(
WORK_GROUP_SIZE,
1
,
1
)))
void
computeNonbonded
(
__global
float4*
forceBuffers,
__global
float*
energyBuffer,
__global
float4*
posq,
__global
unsigned
int*
exclusions,
void
computeNonbonded
(
__global
float4*
forceBuffers,
__global
float*
energyBuffer,
__global
float4*
posq,
__global
unsigned
int*
exclusions,
__global
unsigned
int*
exclusionIndices,
__local
AtomData*
localData,
__local
float4*
tempBuffer,
__global
unsigned
int*
tiles,
__global
unsigned
int*
exclusionIndices,
__global
unsigned
int*
exclusionRowIndices,
__local
AtomData*
localData,
__local
float4*
tempBuffer,
__global
unsigned
int*
tiles,
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
__global
unsigned
int*
interactionFlags,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize
__global
unsigned
int*
interactionFlags,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize
#
else
#
else
...
@@ -27,20 +27,45 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
...
@@ -27,20 +27,45 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
unsigned
int
end
=
(
get_group_id
(
0
)
+1
)
*numTiles/get_num_groups
(
0
)
;
unsigned
int
end
=
(
get_group_id
(
0
)
+1
)
*numTiles/get_num_groups
(
0
)
;
float
energy
=
0.0f
;
float
energy
=
0.0f
;
unsigned
int
lasty
=
0xFFFFFFFF
;
unsigned
int
lasty
=
0xFFFFFFFF
;
__local
unsigned
int
exclusionRange[2]
;
__local
int
exclusionIndex[1]
;
while
(
pos
<
end
)
{
while
(
pos
<
end
)
{
//
Extract
the
coordinates
of
this
tile
//
Extract
the
coordinates
of
this
tile
#
ifdef
USE_CUTOFF
unsigned
int
x
=
tiles[pos]
;
unsigned
int
x
=
tiles[pos]
;
unsigned
int
y
=
((
x
>>
2
)
&
0x7fff
)
*TILE_SIZE
;
unsigned
int
y
=
((
x
>>
2
)
&
0x7fff
)
;
bool
hasExclusions
=
(
x
&
0x1
)
;
x
=
(
x>>17
)
;
x
=
(
x>>17
)
*TILE_SIZE
;
#
else
unsigned
int
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS+0.5f-sqrt
((
NUM_BLOCKS+0.5f
)
*
(
NUM_BLOCKS+0.5f
)
-2*pos
))
;
unsigned
int
x
=
(
pos-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
if
(
x
>=
NUM_BLOCKS
)
{
//
Occasionally
happens
due
to
roundoff
error.
y++
;
x
=
(
pos-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
}
#
endif
unsigned
int
baseLocalAtom
=
(
get_local_id
(
0
)
<
TILE_SIZE
?
0
:
TILE_SIZE/2
)
;
unsigned
int
baseLocalAtom
=
(
get_local_id
(
0
)
<
TILE_SIZE
?
0
:
TILE_SIZE/2
)
;
unsigned
int
tgx
=
get_local_id
(
0
)
&
(
TILE_SIZE-1
)
;
unsigned
int
tgx
=
get_local_id
(
0
)
&
(
TILE_SIZE-1
)
;
unsigned
int
forceBufferOffset
=
(
tgx
<
TILE_SIZE/2
?
0
:
TILE_SIZE
)
;
unsigned
int
forceBufferOffset
=
(
tgx
<
TILE_SIZE/2
?
0
:
TILE_SIZE
)
;
unsigned
int
atom1
=
x
+
tgx
;
unsigned
int
atom1
=
x
*TILE_SIZE
+
tgx
;
float4
force
=
0.0f
;
float4
force
=
0.0f
;
float4
posq1
=
posq[atom1]
;
float4
posq1
=
posq[atom1]
;
LOAD_ATOM1_PARAMETERS
LOAD_ATOM1_PARAMETERS
//
Locate
the
exclusion
data
for
this
tile.
#
ifdef
USE_EXCLUSIONS
if
(
get_local_id
(
0
)
<
2
)
exclusionRange[get_local_id
(
0
)
]
=
exclusionRowIndices[x+get_local_id
(
0
)
]
;
if
(
tgx
==
0
)
exclusionIndex[0]
=
-1
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
for
(
int
i
=
exclusionRange[0]+tgx
; i < exclusionRange[1]; i += TILE_SIZE)
if
(
exclusionIndices[i]
==
y
)
exclusionIndex[0]
=
i*TILE_SIZE
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
bool
hasExclusions
=
(
exclusionIndex[0]
>
-1
)
;
#
endif
if
(
x
==
y
)
{
if
(
x
==
y
)
{
//
This
tile
is
on
the
diagonal.
//
This
tile
is
on
the
diagonal.
...
@@ -50,10 +75,8 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
...
@@ -50,10 +75,8 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
localData[get_local_id
(
0
)
].q
=
posq1.w
;
localData[get_local_id
(
0
)
].q
=
posq1.w
;
LOAD_LOCAL_PARAMETERS_FROM_1
LOAD_LOCAL_PARAMETERS_FROM_1
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
unsigned
int
xi
=
x/TILE_SIZE
;
unsigned
int
tile
=
xi+xi*PADDED_NUM_ATOMS/TILE_SIZE-xi*
(
xi+1
)
/2
;
#
ifdef
USE_EXCLUSIONS
#
ifdef
USE_EXCLUSIONS
unsigned
int
excl
=
exclusions[exclusionInd
ices[tile
]+tgx]
>>
baseLocalAtom
;
unsigned
int
excl
=
exclusions[exclusionInd
ex[0
]+tgx]
>>
baseLocalAtom
;
#
endif
#
endif
for
(
unsigned
int
j
=
0
; j < TILE_SIZE/2; j++) {
for
(
unsigned
int
j
=
0
; j < TILE_SIZE/2; j++) {
#
ifdef
USE_EXCLUSIONS
#
ifdef
USE_EXCLUSIONS
...
@@ -71,7 +94,7 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
...
@@ -71,7 +94,7 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
float
invR
=
RSQRT
(
r2
)
;
float
invR
=
RSQRT
(
r2
)
;
float
r
=
RECIP
(
invR
)
;
float
r
=
RECIP
(
invR
)
;
LOAD_ATOM2_PARAMETERS
LOAD_ATOM2_PARAMETERS
atom2
=
y+baseLocalAtom+j
;
atom2
=
y
*TILE_SIZE
+baseLocalAtom+j
;
#
ifdef
USE_SYMMETRIC
#
ifdef
USE_SYMMETRIC
float
dEdR
=
0.0f
;
float
dEdR
=
0.0f
;
#
else
#
else
...
@@ -96,9 +119,9 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
...
@@ -96,9 +119,9 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
get_local_id
(
0
)
<
TILE_SIZE
)
{
if
(
get_local_id
(
0
)
<
TILE_SIZE
)
{
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
unsigned
int
offset
=
x
+
tgx
+
(
x/TILE_SIZE
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset
=
x
*TILE_SIZE
+
tgx
+
x
*PADDED_NUM_ATOMS
;
#
else
#
else
unsigned
int
offset
=
x
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset
=
x
*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
#
endif
#
endif
forceBuffers[offset].xyz
=
forceBuffers[offset].xyz+force.xyz+tempBuffer[get_local_id
(
0
)
+TILE_SIZE].xyz
;
forceBuffers[offset].xyz
=
forceBuffers[offset].xyz+force.xyz+tempBuffer[get_local_id
(
0
)
+TILE_SIZE].xyz
;
}
}
...
@@ -107,7 +130,7 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
...
@@ -107,7 +130,7 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
//
This
is
an
off-diagonal
tile.
//
This
is
an
off-diagonal
tile.
if
(
lasty
!=
y
&&
get_local_id
(
0
)
<
TILE_SIZE
)
{
if
(
lasty
!=
y
&&
get_local_id
(
0
)
<
TILE_SIZE
)
{
unsigned
int
j
=
y
+
tgx
;
unsigned
int
j
=
y
*TILE_SIZE
+
tgx
;
float4
tempPosq
=
posq[j]
;
float4
tempPosq
=
posq[j]
;
localData[get_local_id
(
0
)
].x
=
tempPosq.x
;
localData[get_local_id
(
0
)
].x
=
tempPosq.x
;
localData[get_local_id
(
0
)
].y
=
tempPosq.y
;
localData[get_local_id
(
0
)
].y
=
tempPosq.y
;
...
@@ -122,11 +145,8 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
...
@@ -122,11 +145,8 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
//
Compute
the
full
set
of
interactions
in
this
tile.
//
Compute
the
full
set
of
interactions
in
this
tile.
unsigned
int
xi
=
x/TILE_SIZE
;
unsigned
int
yi
=
y/TILE_SIZE
;
unsigned
int
tile
=
xi+yi*PADDED_NUM_ATOMS/TILE_SIZE-yi*
(
yi+1
)
/2
;
#
ifdef
USE_EXCLUSIONS
#
ifdef
USE_EXCLUSIONS
unsigned
int
excl
=
(
hasExclusions
?
exclusions[exclusionInd
ices[tile
]+tgx]
:
0xFFFFFFFF
)
;
unsigned
int
excl
=
(
hasExclusions
?
exclusions[exclusionInd
ex[0
]+tgx]
:
0xFFFFFFFF
)
;
excl
=
(
excl
>>
baseLocalAtom
)
&
0xFFFF
;
excl
=
(
excl
>>
baseLocalAtom
)
&
0xFFFF
;
excl
+=
excl
<<
16
;
excl
+=
excl
<<
16
;
excl
=
(
excl
>>
tgx
)
|
(
excl
<<
(
TILE_SIZE
-
tgx
))
;
excl
=
(
excl
>>
tgx
)
|
(
excl
<<
(
TILE_SIZE
-
tgx
))
;
...
@@ -148,7 +168,7 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
...
@@ -148,7 +168,7 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
float
invR
=
RSQRT
(
r2
)
;
float
invR
=
RSQRT
(
r2
)
;
float
r
=
RECIP
(
invR
)
;
float
r
=
RECIP
(
invR
)
;
LOAD_ATOM2_PARAMETERS
LOAD_ATOM2_PARAMETERS
atom2
=
y+baseLocalAtom+tj
;
atom2
=
y
*TILE_SIZE
+baseLocalAtom+tj
;
#
ifdef
USE_SYMMETRIC
#
ifdef
USE_SYMMETRIC
float
dEdR
=
0.0f
;
float
dEdR
=
0.0f
;
#
else
#
else
...
@@ -182,11 +202,11 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
...
@@ -182,11 +202,11 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
get_local_id
(
0
)
<
TILE_SIZE
)
{
if
(
get_local_id
(
0
)
<
TILE_SIZE
)
{
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
unsigned
int
offset1
=
x
+
tgx
+
(
y/TILE_SIZE
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset1
=
x
*TILE_SIZE
+
tgx
+
y
*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
+
tgx
+
(
x/TILE_SIZE
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
*TILE_SIZE
+
tgx
+
x
*PADDED_NUM_ATOMS
;
#
else
#
else
unsigned
int
offset1
=
x
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset1
=
x
*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
#
endif
#
endif
forceBuffers[offset1].xyz
=
forceBuffers[offset1].xyz+force.xyz+tempBuffer[get_local_id
(
0
)
+TILE_SIZE].xyz
;
forceBuffers[offset1].xyz
=
forceBuffers[offset1].xyz+force.xyz+tempBuffer[get_local_id
(
0
)
+TILE_SIZE].xyz
;
float4
sum
=
(
float4
)
(
localData[get_local_id
(
0
)
].fx+localData[get_local_id
(
0
)
+TILE_SIZE].fx,
localData[get_local_id
(
0
)
].fy+localData[get_local_id
(
0
)
+TILE_SIZE].fy,
localData[get_local_id
(
0
)
].fz+localData[get_local_id
(
0
)
+TILE_SIZE].fz,
0.0f
)
;
float4
sum
=
(
float4
)
(
localData[get_local_id
(
0
)
].fx+localData[get_local_id
(
0
)
+TILE_SIZE].fx,
localData[get_local_id
(
0
)
].fy+localData[get_local_id
(
0
)
+TILE_SIZE].fy,
localData[get_local_id
(
0
)
].fz+localData[get_local_id
(
0
)
+TILE_SIZE].fz,
0.0f
)
;
...
...
platforms/opencl/src/kernels/nonbonded_nvidia.cl
View file @
9a4e2b02
...
@@ -13,7 +13,7 @@ typedef struct {
...
@@ -13,7 +13,7 @@ typedef struct {
__kernel
__attribute__
((
reqd_work_group_size
(
WORK_GROUP_SIZE,
1
,
1
)))
__kernel
__attribute__
((
reqd_work_group_size
(
WORK_GROUP_SIZE,
1
,
1
)))
void
computeNonbonded
(
__global
float4*
forceBuffers,
__global
float*
energyBuffer,
__global
float4*
posq,
__global
unsigned
int*
exclusions,
void
computeNonbonded
(
__global
float4*
forceBuffers,
__global
float*
energyBuffer,
__global
float4*
posq,
__global
unsigned
int*
exclusions,
__global
unsigned
int*
exclusionIndices,
__local
AtomData*
localData,
__local
float4*
tempBuffer,
__global
unsigned
int*
tiles,
__global
unsigned
int*
exclusionIndices,
__global
unsigned
int*
exclusionRowIndices,
__local
AtomData*
localData,
__local
float4*
tempBuffer,
__global
unsigned
int*
tiles,
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
__global
unsigned
int*
interactionFlags,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize
__global
unsigned
int*
interactionFlags,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize
#
else
#
else
...
@@ -29,19 +29,45 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
...
@@ -29,19 +29,45 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
unsigned
int
end
=
(
warp+1
)
*numTiles/totalWarps
;
unsigned
int
end
=
(
warp+1
)
*numTiles/totalWarps
;
float
energy
=
0.0f
;
float
energy
=
0.0f
;
unsigned
int
lasty
=
0xFFFFFFFF
;
unsigned
int
lasty
=
0xFFFFFFFF
;
__local
unsigned
int
exclusionRange[4]
;
__local
int
exclusionIndex[2]
;
while
(
pos
<
end
)
{
while
(
pos
<
end
)
{
//
Extract
the
coordinates
of
this
tile
//
Extract
the
coordinates
of
this
tile
#
ifdef
USE_CUTOFF
unsigned
int
x
=
tiles[pos]
;
unsigned
int
x
=
tiles[pos]
;
unsigned
int
y
=
((
x
>>
2
)
&
0x7fff
)
*TILE_SIZE
;
unsigned
int
y
=
((
x
>>
2
)
&
0x7fff
)
;
bool
hasExclusions
=
(
x
&
0x1
)
;
x
=
(
x>>17
)
;
x
=
(
x>>17
)
*TILE_SIZE
;
#
else
unsigned
int
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS+0.5f-sqrt
((
NUM_BLOCKS+0.5f
)
*
(
NUM_BLOCKS+0.5f
)
-2*pos
))
;
unsigned
int
x
=
(
pos-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
if
(
x
>=
NUM_BLOCKS
)
{
//
Occasionally
happens
due
to
roundoff
error.
y++
;
x
=
(
pos-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
}
#
endif
unsigned
int
tgx
=
get_local_id
(
0
)
&
(
TILE_SIZE-1
)
;
unsigned
int
tgx
=
get_local_id
(
0
)
&
(
TILE_SIZE-1
)
;
unsigned
int
tbx
=
get_local_id
(
0
)
-
tgx
;
unsigned
int
tbx
=
get_local_id
(
0
)
-
tgx
;
unsigned
int
atom1
=
x
+
tgx
;
unsigned
int
atom1
=
x
*TILE_SIZE
+
tgx
;
float4
force
=
0.0f
;
float4
force
=
0.0f
;
float4
posq1
=
posq[atom1]
;
float4
posq1
=
posq[atom1]
;
LOAD_ATOM1_PARAMETERS
LOAD_ATOM1_PARAMETERS
//
Locate
the
exclusion
data
for
this
tile.
#
ifdef
USE_EXCLUSIONS
int
localGroupIndex
=
get_local_id
(
0
)
/TILE_SIZE
;
if
(
tgx
<
2
)
exclusionRange[2*localGroupIndex+tgx]
=
exclusionRowIndices[x+tgx]
;
if
(
tgx
==
0
)
exclusionIndex[localGroupIndex]
=
-1
;
for
(
int
i
=
exclusionRange[2*localGroupIndex]+tgx
; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
if
(
exclusionIndices[i]
==
y
)
exclusionIndex[localGroupIndex]
=
i*TILE_SIZE
;
bool
hasExclusions
=
(
exclusionIndex[localGroupIndex]
>
-1
)
;
#
else
bool
hasExclusions
=
false
;
#
endif
if
(
x
==
y
)
{
if
(
x
==
y
)
{
//
This
tile
is
on
the
diagonal.
//
This
tile
is
on
the
diagonal.
...
@@ -50,10 +76,8 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
...
@@ -50,10 +76,8 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
localData[get_local_id
(
0
)
].z
=
posq1.z
;
localData[get_local_id
(
0
)
].z
=
posq1.z
;
localData[get_local_id
(
0
)
].q
=
posq1.w
;
localData[get_local_id
(
0
)
].q
=
posq1.w
;
LOAD_LOCAL_PARAMETERS_FROM_1
LOAD_LOCAL_PARAMETERS_FROM_1
unsigned
int
xi
=
x/TILE_SIZE
;
unsigned
int
tile
=
xi+xi*PADDED_NUM_ATOMS/TILE_SIZE-xi*
(
xi+1
)
/2
;
#
ifdef
USE_EXCLUSIONS
#
ifdef
USE_EXCLUSIONS
unsigned
int
excl
=
exclusions[exclusionInd
ices[tile
]+tgx]
;
unsigned
int
excl
=
exclusions[exclusionInd
ex[localGroupIndex
]+tgx]
;
#
endif
#
endif
for
(
unsigned
int
j
=
0
; j < TILE_SIZE; j++) {
for
(
unsigned
int
j
=
0
; j < TILE_SIZE; j++) {
#
ifdef
USE_EXCLUSIONS
#
ifdef
USE_EXCLUSIONS
...
@@ -71,7 +95,7 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
...
@@ -71,7 +95,7 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
float
r
=
sqrt
(
r2
)
;
float
r
=
sqrt
(
r2
)
;
float
invR
=
RECIP
(
r
)
;
float
invR
=
RECIP
(
r
)
;
LOAD_ATOM2_PARAMETERS
LOAD_ATOM2_PARAMETERS
atom2
=
y+j
;
atom2
=
y
*TILE_SIZE
+j
;
#
ifdef
USE_SYMMETRIC
#
ifdef
USE_SYMMETRIC
float
dEdR
=
0.0f
;
float
dEdR
=
0.0f
;
#
else
#
else
...
@@ -91,9 +115,9 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
...
@@ -91,9 +115,9 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
//
Write
results
//
Write
results
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
unsigned
int
offset
=
x
+
tgx
+
(
x/TILE_SIZE
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset
=
x
*TILE_SIZE
+
tgx
+
x
*PADDED_NUM_ATOMS
;
#
else
#
else
unsigned
int
offset
=
x
+
tgx
+
warp*PADDED_NUM_ATOMS
;
unsigned
int
offset
=
x
*TILE_SIZE
+
tgx
+
warp*PADDED_NUM_ATOMS
;
#
endif
#
endif
forceBuffers[offset].xyz
+=
force.xyz
;
forceBuffers[offset].xyz
+=
force.xyz
;
}
}
...
@@ -101,7 +125,7 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
...
@@ -101,7 +125,7 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
//
This
is
an
off-diagonal
tile.
//
This
is
an
off-diagonal
tile.
if
(
lasty
!=
y
)
{
if
(
lasty
!=
y
)
{
unsigned
int
j
=
y
+
tgx
;
unsigned
int
j
=
y
*TILE_SIZE
+
tgx
;
float4
tempPosq
=
posq[j]
;
float4
tempPosq
=
posq[j]
;
localData[get_local_id
(
0
)
].x
=
tempPosq.x
;
localData[get_local_id
(
0
)
].x
=
tempPosq.x
;
localData[get_local_id
(
0
)
].y
=
tempPosq.y
;
localData[get_local_id
(
0
)
].y
=
tempPosq.y
;
...
@@ -136,7 +160,7 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
...
@@ -136,7 +160,7 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
float
invR
=
RSQRT
(
r2
)
;
float
invR
=
RSQRT
(
r2
)
;
float
r
=
RECIP
(
invR
)
;
float
r
=
RECIP
(
invR
)
;
LOAD_ATOM2_PARAMETERS
LOAD_ATOM2_PARAMETERS
atom2
=
y+j
;
atom2
=
y
*TILE_SIZE
+j
;
#
ifdef
USE_SYMMETRIC
#
ifdef
USE_SYMMETRIC
float
dEdR
=
0.0f
;
float
dEdR
=
0.0f
;
#
else
#
else
...
@@ -179,11 +203,8 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
...
@@ -179,11 +203,8 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
{
{
//
Compute
the
full
set
of
interactions
in
this
tile.
//
Compute
the
full
set
of
interactions
in
this
tile.
unsigned
int
xi
=
x/TILE_SIZE
;
unsigned
int
yi
=
y/TILE_SIZE
;
unsigned
int
tile
=
xi+yi*PADDED_NUM_ATOMS/TILE_SIZE-yi*
(
yi+1
)
/2
;
#
ifdef
USE_EXCLUSIONS
#
ifdef
USE_EXCLUSIONS
unsigned
int
excl
=
(
hasExclusions
?
exclusions[exclusionInd
ices[tile
]+tgx]
:
0xFFFFFFFF
)
;
unsigned
int
excl
=
(
hasExclusions
?
exclusions[exclusionInd
ex[localGroupIndex
]+tgx]
:
0xFFFFFFFF
)
;
excl
=
(
excl
>>
tgx
)
|
(
excl
<<
(
TILE_SIZE
-
tgx
))
;
excl
=
(
excl
>>
tgx
)
|
(
excl
<<
(
TILE_SIZE
-
tgx
))
;
#
endif
#
endif
unsigned
int
tj
=
tgx
;
unsigned
int
tj
=
tgx
;
...
@@ -203,7 +224,7 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
...
@@ -203,7 +224,7 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
float
invR
=
RSQRT
(
r2
)
;
float
invR
=
RSQRT
(
r2
)
;
float
r
=
RECIP
(
invR
)
;
float
r
=
RECIP
(
invR
)
;
LOAD_ATOM2_PARAMETERS
LOAD_ATOM2_PARAMETERS
atom2
=
y+tj
;
atom2
=
y
*TILE_SIZE
+tj
;
#
ifdef
USE_SYMMETRIC
#
ifdef
USE_SYMMETRIC
float
dEdR
=
0.0f
;
float
dEdR
=
0.0f
;
#
else
#
else
...
@@ -232,11 +253,11 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
...
@@ -232,11 +253,11 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
//
Write
results
//
Write
results
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
unsigned
int
offset1
=
x
+
tgx
+
(
y/TILE_SIZE
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset1
=
x
*TILE_SIZE
+
tgx
+
y
*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
+
tgx
+
(
x/TILE_SIZE
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
*TILE_SIZE
+
tgx
+
x
*PADDED_NUM_ATOMS
;
#
else
#
else
unsigned
int
offset1
=
x
+
tgx
+
warp*PADDED_NUM_ATOMS
;
unsigned
int
offset1
=
x
*TILE_SIZE
+
tgx
+
warp*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
+
tgx
+
warp*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
*TILE_SIZE
+
tgx
+
warp*PADDED_NUM_ATOMS
;
#
endif
#
endif
forceBuffers[offset1].xyz
+=
force.xyz
;
forceBuffers[offset1].xyz
+=
force.xyz
;
forceBuffers[offset2]
+=
(
float4
)
(
localData[get_local_id
(
0
)
].fx,
localData[get_local_id
(
0
)
].fy,
localData[get_local_id
(
0
)
].fz,
0.0f
)
;
forceBuffers[offset2]
+=
(
float4
)
(
localData[get_local_id
(
0
)
].fx,
localData[get_local_id
(
0
)
].fy,
localData[get_local_id
(
0
)
].fz,
0.0f
)
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment