Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
7943a339
Commit
7943a339
authored
May 09, 2011
by
Peter Eastman
Browse files
Restructured the use of force buffers in a new way that hopefully really works everywhere.
parent
13ef0ee8
Changes
15
Hide whitespace changes
Inline
Side-by-side
Showing
15 changed files
with
1067 additions
and
961 deletions
+1067
-961
platforms/opencl/src/OpenCLKernels.cpp
platforms/opencl/src/OpenCLKernels.cpp
+16
-17
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
+8
-17
platforms/opencl/src/OpenCLNonbondedUtilities.h
platforms/opencl/src/OpenCLNonbondedUtilities.h
+0
-7
platforms/opencl/src/kernels/customGBEnergyN2_cpu.cl
platforms/opencl/src/kernels/customGBEnergyN2_cpu.cl
+1
-1
platforms/opencl/src/kernels/customGBEnergyN2_default.cl
platforms/opencl/src/kernels/customGBEnergyN2_default.cl
+1
-1
platforms/opencl/src/kernels/customGBEnergyN2_nvidia.cl
platforms/opencl/src/kernels/customGBEnergyN2_nvidia.cl
+170
-149
platforms/opencl/src/kernels/customGBValueN2_cpu.cl
platforms/opencl/src/kernels/customGBValueN2_cpu.cl
+1
-1
platforms/opencl/src/kernels/customGBValueN2_default.cl
platforms/opencl/src/kernels/customGBValueN2_default.cl
+1
-1
platforms/opencl/src/kernels/customGBValueN2_nvidia.cl
platforms/opencl/src/kernels/customGBValueN2_nvidia.cl
+196
-175
platforms/opencl/src/kernels/gbsaObc_cpu.cl
platforms/opencl/src/kernels/gbsaObc_cpu.cl
+2
-2
platforms/opencl/src/kernels/gbsaObc_default.cl
platforms/opencl/src/kernels/gbsaObc_default.cl
+2
-2
platforms/opencl/src/kernels/gbsaObc_nvidia.cl
platforms/opencl/src/kernels/gbsaObc_nvidia.cl
+430
-372
platforms/opencl/src/kernels/nonbonded_cpu.cl
platforms/opencl/src/kernels/nonbonded_cpu.cl
+1
-1
platforms/opencl/src/kernels/nonbonded_default.cl
platforms/opencl/src/kernels/nonbonded_default.cl
+1
-1
platforms/opencl/src/kernels/nonbonded_nvidia.cl
platforms/opencl/src/kernels/nonbonded_nvidia.cl
+237
-214
No files found.
platforms/opencl/src/OpenCLKernels.cpp
View file @
7943a339
...
@@ -1738,6 +1738,8 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
...
@@ -1738,6 +1738,8 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
defines
[
"NUM_ATOMS"
]
=
intToString
(
cl
.
getNumAtoms
());
defines
[
"NUM_ATOMS"
]
=
intToString
(
cl
.
getNumAtoms
());
defines
[
"PADDED_NUM_ATOMS"
]
=
intToString
(
cl
.
getPaddedNumAtoms
());
defines
[
"PADDED_NUM_ATOMS"
]
=
intToString
(
cl
.
getPaddedNumAtoms
());
defines
[
"NUM_BLOCKS"
]
=
OpenCLExpressionUtilities
::
intToString
(
cl
.
getNumAtomBlocks
());
defines
[
"NUM_BLOCKS"
]
=
OpenCLExpressionUtilities
::
intToString
(
cl
.
getNumAtomBlocks
());
if
(
cl
.
getSIMDWidth
()
==
32
)
defines
[
"WARPS_PER_GROUP"
]
=
OpenCLExpressionUtilities
::
intToString
(
cl
.
getNonbondedUtilities
().
getForceThreadBlockSize
()
/
OpenCLContext
::
TileSize
);
string
file
;
string
file
;
if
(
deviceIsCpu
)
if
(
deviceIsCpu
)
file
=
OpenCLKernelSources
::
gbsaObc_cpu
;
file
=
OpenCLKernelSources
::
gbsaObc_cpu
;
...
@@ -1753,7 +1755,6 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
...
@@ -1753,7 +1755,6 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
params
->
getDeviceBuffer
());
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
params
->
getDeviceBuffer
());
computeBornSumKernel
.
setArg
(
index
++
,
(
deviceIsCpu
?
OpenCLContext
::
TileSize
:
nb
.
getForceThreadBlockSize
())
*
13
*
sizeof
(
cl_float
),
NULL
);
computeBornSumKernel
.
setArg
(
index
++
,
(
deviceIsCpu
?
OpenCLContext
::
TileSize
:
nb
.
getForceThreadBlockSize
())
*
13
*
sizeof
(
cl_float
),
NULL
);
computeBornSumKernel
.
setArg
(
index
++
,
(
deviceIsCpu
?
1
:
nb
.
getForceThreadBlockSize
())
*
sizeof
(
cl_float
),
NULL
);
computeBornSumKernel
.
setArg
(
index
++
,
(
deviceIsCpu
?
1
:
nb
.
getForceThreadBlockSize
())
*
sizeof
(
cl_float
),
NULL
);
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getForceBufferFlags
().
getDeviceBuffer
());
if
(
nb
.
getUseCutoff
())
{
if
(
nb
.
getUseCutoff
())
{
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractingTiles
().
getDeviceBuffer
());
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractingTiles
().
getDeviceBuffer
());
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractionCount
().
getDeviceBuffer
());
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractionCount
().
getDeviceBuffer
());
...
@@ -1773,7 +1774,6 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
...
@@ -1773,7 +1774,6 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
bornForce
->
getDeviceBuffer
());
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
bornForce
->
getDeviceBuffer
());
force1Kernel
.
setArg
(
index
++
,
(
deviceIsCpu
?
OpenCLContext
::
TileSize
:
nb
.
getForceThreadBlockSize
())
*
13
*
sizeof
(
cl_float
),
NULL
);
force1Kernel
.
setArg
(
index
++
,
(
deviceIsCpu
?
OpenCLContext
::
TileSize
:
nb
.
getForceThreadBlockSize
())
*
13
*
sizeof
(
cl_float
),
NULL
);
force1Kernel
.
setArg
(
index
++
,
(
deviceIsCpu
?
1
:
nb
.
getForceThreadBlockSize
())
*
sizeof
(
mm_float4
),
NULL
);
force1Kernel
.
setArg
(
index
++
,
(
deviceIsCpu
?
1
:
nb
.
getForceThreadBlockSize
())
*
sizeof
(
mm_float4
),
NULL
);
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getForceBufferFlags
().
getDeviceBuffer
());
if
(
nb
.
getUseCutoff
())
{
if
(
nb
.
getUseCutoff
())
{
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractingTiles
().
getDeviceBuffer
());
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractingTiles
().
getDeviceBuffer
());
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractionCount
().
getDeviceBuffer
());
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractionCount
().
getDeviceBuffer
());
...
@@ -1805,14 +1805,14 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
...
@@ -1805,14 +1805,14 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
reduceBornForceKernel
.
setArg
<
cl
::
Buffer
>
(
6
,
obcChain
->
getDeviceBuffer
());
reduceBornForceKernel
.
setArg
<
cl
::
Buffer
>
(
6
,
obcChain
->
getDeviceBuffer
());
}
}
if
(
nb
.
getUseCutoff
())
{
if
(
nb
.
getUseCutoff
())
{
computeBornSumKernel
.
setArg
<
mm_float4
>
(
8
,
cl
.
getPeriodicBoxSize
());
computeBornSumKernel
.
setArg
<
mm_float4
>
(
7
,
cl
.
getPeriodicBoxSize
());
computeBornSumKernel
.
setArg
<
mm_float4
>
(
9
,
cl
.
getInvPeriodicBoxSize
());
computeBornSumKernel
.
setArg
<
mm_float4
>
(
8
,
cl
.
getInvPeriodicBoxSize
());
force1Kernel
.
setArg
<
mm_float4
>
(
10
,
cl
.
getPeriodicBoxSize
());
force1Kernel
.
setArg
<
mm_float4
>
(
9
,
cl
.
getPeriodicBoxSize
());
force1Kernel
.
setArg
<
mm_float4
>
(
1
1
,
cl
.
getInvPeriodicBoxSize
());
force1Kernel
.
setArg
<
mm_float4
>
(
1
0
,
cl
.
getInvPeriodicBoxSize
());
if
(
maxTiles
<
nb
.
getInteractingTiles
().
getSize
())
{
if
(
maxTiles
<
nb
.
getInteractingTiles
().
getSize
())
{
maxTiles
=
nb
.
getInteractingTiles
().
getSize
();
maxTiles
=
nb
.
getInteractingTiles
().
getSize
();
computeBornSumKernel
.
setArg
<
cl_uint
>
(
10
,
maxTiles
);
computeBornSumKernel
.
setArg
<
cl_uint
>
(
10
,
maxTiles
);
force1Kernel
.
setArg
<
cl_uint
>
(
1
2
,
maxTiles
);
force1Kernel
.
setArg
<
cl_uint
>
(
1
1
,
maxTiles
);
}
}
}
}
cl
.
executeKernel
(
computeBornSumKernel
,
nb
.
getNumForceThreadBlocks
()
*
nb
.
getForceThreadBlockSize
(),
nb
.
getForceThreadBlockSize
());
cl
.
executeKernel
(
computeBornSumKernel
,
nb
.
getNumForceThreadBlocks
()
*
nb
.
getForceThreadBlockSize
(),
nb
.
getForceThreadBlockSize
());
...
@@ -2148,7 +2148,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
...
@@ -2148,7 +2148,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
}
}
map
<
string
,
string
>
replacements
;
map
<
string
,
string
>
replacements
;
replacements
[
"COMPUTE_INTERACTION"
]
=
n2EnergySource
.
str
();
replacements
[
"COMPUTE_INTERACTION"
]
=
n2EnergySource
.
str
();
stringstream
extraArgs
,
loadLocal1
,
loadLocal2
,
clearLocal
,
load1
,
load2
,
recordDeriv
,
storeDerivs1
,
storeDerivs2
,
declareTemps
,
setTemps
;
stringstream
extraArgs
,
loadLocal1
,
loadLocal2
,
clearLocal
,
load1
,
load2
,
declare1
,
recordDeriv
,
storeDerivs1
,
storeDerivs2
,
declareTemps
,
setTemps
;
if
(
force
.
getNumGlobalParameters
()
>
0
)
if
(
force
.
getNumGlobalParameters
()
>
0
)
extraArgs
<<
", __constant float* globals"
;
extraArgs
<<
", __constant float* globals"
;
for
(
int
i
=
0
;
i
<
(
int
)
params
->
getBuffers
().
size
();
i
++
)
{
for
(
int
i
=
0
;
i
<
(
int
)
params
->
getBuffers
().
size
();
i
++
)
{
...
@@ -2174,7 +2174,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
...
@@ -2174,7 +2174,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
string
index
=
intToString
(
i
+
1
);
string
index
=
intToString
(
i
+
1
);
extraArgs
<<
", __global "
<<
buffer
.
getType
()
<<
"* derivBuffers"
<<
index
<<
", __local "
<<
buffer
.
getType
()
<<
"* local_deriv"
<<
index
;
extraArgs
<<
", __global "
<<
buffer
.
getType
()
<<
"* derivBuffers"
<<
index
<<
", __local "
<<
buffer
.
getType
()
<<
"* local_deriv"
<<
index
;
clearLocal
<<
"local_deriv"
<<
index
<<
"[localAtomIndex] = 0.0f;
\n
"
;
clearLocal
<<
"local_deriv"
<<
index
<<
"[localAtomIndex] = 0.0f;
\n
"
;
load
1
<<
buffer
.
getType
()
<<
" deriv"
<<
index
<<
"_1 = 0.0f;
\n
"
;
declare
1
<<
buffer
.
getType
()
<<
" deriv"
<<
index
<<
"_1 = 0.0f;
\n
"
;
load2
<<
buffer
.
getType
()
<<
" deriv"
<<
index
<<
"_2 = 0.0f;
\n
"
;
load2
<<
buffer
.
getType
()
<<
" deriv"
<<
index
<<
"_2 = 0.0f;
\n
"
;
recordDeriv
<<
"local_deriv"
<<
index
<<
"[atom2] += deriv"
<<
index
<<
"_2;
\n
"
;
recordDeriv
<<
"local_deriv"
<<
index
<<
"[atom2] += deriv"
<<
index
<<
"_2;
\n
"
;
storeDerivs1
<<
"STORE_DERIVATIVE_1("
<<
index
<<
")"
;
storeDerivs1
<<
"STORE_DERIVATIVE_1("
<<
index
<<
")"
;
...
@@ -2188,6 +2188,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
...
@@ -2188,6 +2188,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
replacements
[
"CLEAR_LOCAL_DERIVATIVES"
]
=
clearLocal
.
str
();
replacements
[
"CLEAR_LOCAL_DERIVATIVES"
]
=
clearLocal
.
str
();
replacements
[
"LOAD_ATOM1_PARAMETERS"
]
=
load1
.
str
();
replacements
[
"LOAD_ATOM1_PARAMETERS"
]
=
load1
.
str
();
replacements
[
"LOAD_ATOM2_PARAMETERS"
]
=
load2
.
str
();
replacements
[
"LOAD_ATOM2_PARAMETERS"
]
=
load2
.
str
();
replacements
[
"DECLARE_ATOM1_DERIVATIVES"
]
=
declare1
.
str
();
replacements
[
"RECORD_DERIVATIVE_2"
]
=
recordDeriv
.
str
();
replacements
[
"RECORD_DERIVATIVE_2"
]
=
recordDeriv
.
str
();
replacements
[
"STORE_DERIVATIVES_1"
]
=
storeDerivs1
.
str
();
replacements
[
"STORE_DERIVATIVES_1"
]
=
storeDerivs1
.
str
();
replacements
[
"STORE_DERIVATIVES_2"
]
=
storeDerivs2
.
str
();
replacements
[
"STORE_DERIVATIVES_2"
]
=
storeDerivs2
.
str
();
...
@@ -2482,7 +2483,6 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
...
@@ -2482,7 +2483,6 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
pairValueKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
valueBuffers
->
getDeviceBuffer
());
pairValueKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
valueBuffers
->
getDeviceBuffer
());
pairValueKernel
.
setArg
(
index
++
,
(
deviceIsCpu
?
OpenCLContext
::
TileSize
:
nb
.
getForceThreadBlockSize
())
*
sizeof
(
cl_float
),
NULL
);
pairValueKernel
.
setArg
(
index
++
,
(
deviceIsCpu
?
OpenCLContext
::
TileSize
:
nb
.
getForceThreadBlockSize
())
*
sizeof
(
cl_float
),
NULL
);
pairValueKernel
.
setArg
(
index
++
,
(
deviceIsCpu
?
OpenCLContext
::
TileSize
:
nb
.
getForceThreadBlockSize
())
*
sizeof
(
cl_float
),
NULL
);
pairValueKernel
.
setArg
(
index
++
,
(
deviceIsCpu
?
OpenCLContext
::
TileSize
:
nb
.
getForceThreadBlockSize
())
*
sizeof
(
cl_float
),
NULL
);
pairValueKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getForceBufferFlags
().
getDeviceBuffer
());
if
(
nb
.
getUseCutoff
())
{
if
(
nb
.
getUseCutoff
())
{
pairValueKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractingTiles
().
getDeviceBuffer
());
pairValueKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractingTiles
().
getDeviceBuffer
());
pairValueKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractionCount
().
getDeviceBuffer
());
pairValueKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractionCount
().
getDeviceBuffer
());
...
@@ -2531,7 +2531,6 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
...
@@ -2531,7 +2531,6 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
pairEnergyKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
cl
.
getNonbondedUtilities
().
getExclusionIndices
().
getDeviceBuffer
());
pairEnergyKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
cl
.
getNonbondedUtilities
().
getExclusionIndices
().
getDeviceBuffer
());
pairEnergyKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
cl
.
getNonbondedUtilities
().
getExclusionRowIndices
().
getDeviceBuffer
());
pairEnergyKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
cl
.
getNonbondedUtilities
().
getExclusionRowIndices
().
getDeviceBuffer
());
pairEnergyKernel
.
setArg
(
index
++
,
(
deviceIsCpu
?
OpenCLContext
::
TileSize
:
nb
.
getForceThreadBlockSize
())
*
sizeof
(
cl_float4
),
NULL
);
pairEnergyKernel
.
setArg
(
index
++
,
(
deviceIsCpu
?
OpenCLContext
::
TileSize
:
nb
.
getForceThreadBlockSize
())
*
sizeof
(
cl_float4
),
NULL
);
pairEnergyKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getForceBufferFlags
().
getDeviceBuffer
());
if
(
nb
.
getUseCutoff
())
{
if
(
nb
.
getUseCutoff
())
{
pairEnergyKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractingTiles
().
getDeviceBuffer
());
pairEnergyKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractingTiles
().
getDeviceBuffer
());
pairEnergyKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractionCount
().
getDeviceBuffer
());
pairEnergyKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractionCount
().
getDeviceBuffer
());
...
@@ -2609,14 +2608,14 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
...
@@ -2609,14 +2608,14 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
globals
->
upload
(
globalParamValues
);
globals
->
upload
(
globalParamValues
);
}
}
if
(
nb
.
getUseCutoff
())
{
if
(
nb
.
getUseCutoff
())
{
pairValueKernel
.
setArg
<
mm_float4
>
(
1
1
,
cl
.
getPeriodicBoxSize
());
pairValueKernel
.
setArg
<
mm_float4
>
(
1
0
,
cl
.
getPeriodicBoxSize
());
pairValueKernel
.
setArg
<
mm_float4
>
(
1
2
,
cl
.
getInvPeriodicBoxSize
());
pairValueKernel
.
setArg
<
mm_float4
>
(
1
1
,
cl
.
getInvPeriodicBoxSize
());
pairEnergyKernel
.
setArg
<
mm_float4
>
(
1
2
,
cl
.
getPeriodicBoxSize
());
pairEnergyKernel
.
setArg
<
mm_float4
>
(
1
1
,
cl
.
getPeriodicBoxSize
());
pairEnergyKernel
.
setArg
<
mm_float4
>
(
1
3
,
cl
.
getInvPeriodicBoxSize
());
pairEnergyKernel
.
setArg
<
mm_float4
>
(
1
2
,
cl
.
getInvPeriodicBoxSize
());
if
(
maxTiles
<
nb
.
getInteractingTiles
().
getSize
())
{
if
(
maxTiles
<
nb
.
getInteractingTiles
().
getSize
())
{
maxTiles
=
nb
.
getInteractingTiles
().
getSize
();
maxTiles
=
nb
.
getInteractingTiles
().
getSize
();
pairValueKernel
.
setArg
<
cl_uint
>
(
1
3
,
maxTiles
);
pairValueKernel
.
setArg
<
cl_uint
>
(
1
2
,
maxTiles
);
pairEnergyKernel
.
setArg
<
cl_uint
>
(
1
4
,
maxTiles
);
pairEnergyKernel
.
setArg
<
cl_uint
>
(
1
3
,
maxTiles
);
}
}
}
}
cl
.
executeKernel
(
pairValueKernel
,
nb
.
getNumForceThreadBlocks
()
*
nb
.
getForceThreadBlockSize
(),
nb
.
getForceThreadBlockSize
());
cl
.
executeKernel
(
pairValueKernel
,
nb
.
getNumForceThreadBlocks
()
*
nb
.
getForceThreadBlockSize
(),
nb
.
getForceThreadBlockSize
());
...
...
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
View file @
7943a339
...
@@ -37,7 +37,7 @@ using namespace std;
...
@@ -37,7 +37,7 @@ using namespace std;
OpenCLNonbondedUtilities
::
OpenCLNonbondedUtilities
(
OpenCLContext
&
context
)
:
context
(
context
),
cutoff
(
-
1.0
),
useCutoff
(
false
),
OpenCLNonbondedUtilities
::
OpenCLNonbondedUtilities
(
OpenCLContext
&
context
)
:
context
(
context
),
cutoff
(
-
1.0
),
useCutoff
(
false
),
numForceBuffers
(
0
),
exclusionIndices
(
NULL
),
exclusionRowIndices
(
NULL
),
exclusions
(
NULL
),
interactingTiles
(
NULL
),
interactionFlags
(
NULL
),
numForceBuffers
(
0
),
exclusionIndices
(
NULL
),
exclusionRowIndices
(
NULL
),
exclusions
(
NULL
),
interactingTiles
(
NULL
),
interactionFlags
(
NULL
),
interactionCount
(
NULL
),
blockCenter
(
NULL
),
blockBoundingBox
(
NULL
)
,
forceBufferFlags
(
NULL
)
{
interactionCount
(
NULL
),
blockCenter
(
NULL
),
blockBoundingBox
(
NULL
)
{
// Decide how many thread blocks and force buffers to use.
// Decide how many thread blocks and force buffers to use.
deviceIsCpu
=
(
context
.
getDevice
().
getInfo
<
CL_DEVICE_TYPE
>
()
==
CL_DEVICE_TYPE_CPU
);
deviceIsCpu
=
(
context
.
getDevice
().
getInfo
<
CL_DEVICE_TYPE
>
()
==
CL_DEVICE_TYPE_CPU
);
...
@@ -48,8 +48,8 @@ OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : con
...
@@ -48,8 +48,8 @@ OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : con
numForceBuffers
=
numForceThreadBlocks
;
numForceBuffers
=
numForceThreadBlocks
;
}
}
else
if
(
context
.
getSIMDWidth
()
==
32
)
{
else
if
(
context
.
getSIMDWidth
()
==
32
)
{
numForceThreadBlocks
=
2
*
context
.
getDevice
().
getInfo
<
CL_DEVICE_MAX_COMPUTE_UNITS
>
();
numForceThreadBlocks
=
4
*
context
.
getDevice
().
getInfo
<
CL_DEVICE_MAX_COMPUTE_UNITS
>
();
forceThreadBlockSize
=
256
;
forceThreadBlockSize
=
128
;
numForceBuffers
=
numForceThreadBlocks
;
numForceBuffers
=
numForceThreadBlocks
;
}
}
else
{
else
{
...
@@ -82,8 +82,6 @@ OpenCLNonbondedUtilities::~OpenCLNonbondedUtilities() {
...
@@ -82,8 +82,6 @@ OpenCLNonbondedUtilities::~OpenCLNonbondedUtilities() {
delete
blockCenter
;
delete
blockCenter
;
if
(
blockBoundingBox
!=
NULL
)
if
(
blockBoundingBox
!=
NULL
)
delete
blockBoundingBox
;
delete
blockBoundingBox
;
if
(
forceBufferFlags
!=
NULL
)
delete
forceBufferFlags
;
}
}
void
OpenCLNonbondedUtilities
::
addInteraction
(
bool
usesCutoff
,
bool
usesPeriodic
,
bool
usesExclusions
,
double
cutoffDistance
,
const
vector
<
vector
<
int
>
>&
exclusionList
,
const
string
&
kernel
)
{
void
OpenCLNonbondedUtilities
::
addInteraction
(
bool
usesCutoff
,
bool
usesPeriodic
,
bool
usesExclusions
,
double
cutoffDistance
,
const
vector
<
vector
<
int
>
>&
exclusionList
,
const
string
&
kernel
)
{
...
@@ -239,12 +237,6 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
...
@@ -239,12 +237,6 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
interactionCount
->
upload
();
interactionCount
->
upload
();
}
}
// Create the flags for reserving force buffers.
forceBufferFlags
=
new
OpenCLArray
<
cl_uint
>
(
context
,
numAtomBlocks
*
numForceThreadBlocks
,
"forceBufferFlags"
,
false
);
vector
<
cl_uint
>
forceBufferFlagsVec
(
forceBufferFlags
->
getSize
(),
0
);
forceBufferFlags
->
upload
(
forceBufferFlagsVec
);
// Create kernels.
// Create kernels.
forceKernel
=
createInteractionKernel
(
kernelSource
,
parameters
,
arguments
,
true
,
true
);
forceKernel
=
createInteractionKernel
(
kernelSource
,
parameters
,
arguments
,
true
,
true
);
...
@@ -320,8 +312,8 @@ void OpenCLNonbondedUtilities::prepareInteractions() {
...
@@ -320,8 +312,8 @@ void OpenCLNonbondedUtilities::prepareInteractions() {
void
OpenCLNonbondedUtilities
::
computeInteractions
()
{
void
OpenCLNonbondedUtilities
::
computeInteractions
()
{
if
(
cutoff
!=
-
1.0
)
{
if
(
cutoff
!=
-
1.0
)
{
if
(
useCutoff
)
{
if
(
useCutoff
)
{
forceKernel
.
setArg
<
mm_float4
>
(
1
3
,
context
.
getPeriodicBoxSize
());
forceKernel
.
setArg
<
mm_float4
>
(
1
2
,
context
.
getPeriodicBoxSize
());
forceKernel
.
setArg
<
mm_float4
>
(
1
4
,
context
.
getInvPeriodicBoxSize
());
forceKernel
.
setArg
<
mm_float4
>
(
1
3
,
context
.
getInvPeriodicBoxSize
());
}
}
context
.
executeKernel
(
forceKernel
,
numForceThreadBlocks
*
forceThreadBlockSize
,
forceThreadBlockSize
);
context
.
executeKernel
(
forceKernel
,
numForceThreadBlocks
*
forceThreadBlockSize
,
forceThreadBlockSize
);
}
}
...
@@ -343,14 +335,14 @@ void OpenCLNonbondedUtilities::updateNeighborListSize() {
...
@@ -343,14 +335,14 @@ void OpenCLNonbondedUtilities::updateNeighborListSize() {
newSize
=
numTiles
;
newSize
=
numTiles
;
delete
interactingTiles
;
delete
interactingTiles
;
interactingTiles
=
new
OpenCLArray
<
mm_ushort2
>
(
context
,
newSize
,
"interactingTiles"
);
interactingTiles
=
new
OpenCLArray
<
mm_ushort2
>
(
context
,
newSize
,
"interactingTiles"
);
forceKernel
.
setArg
<
cl
::
Buffer
>
(
1
1
,
interactingTiles
->
getDeviceBuffer
());
forceKernel
.
setArg
<
cl
::
Buffer
>
(
1
0
,
interactingTiles
->
getDeviceBuffer
());
forceKernel
.
setArg
<
cl_uint
>
(
1
5
,
newSize
);
forceKernel
.
setArg
<
cl_uint
>
(
1
4
,
newSize
);
findInteractingBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
6
,
interactingTiles
->
getDeviceBuffer
());
findInteractingBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
6
,
interactingTiles
->
getDeviceBuffer
());
findInteractingBlocksKernel
.
setArg
<
cl_uint
>
(
9
,
newSize
);
findInteractingBlocksKernel
.
setArg
<
cl_uint
>
(
9
,
newSize
);
if
(
context
.
getSIMDWidth
()
==
32
||
deviceIsCpu
)
{
if
(
context
.
getSIMDWidth
()
==
32
||
deviceIsCpu
)
{
delete
interactionFlags
;
delete
interactionFlags
;
interactionFlags
=
new
OpenCLArray
<
cl_uint
>
(
context
,
deviceIsCpu
?
2
*
newSize
:
newSize
,
"interactionFlags"
);
interactionFlags
=
new
OpenCLArray
<
cl_uint
>
(
context
,
deviceIsCpu
?
2
*
newSize
:
newSize
,
"interactionFlags"
);
forceKernel
.
setArg
<
cl
::
Buffer
>
(
1
6
,
interactionFlags
->
getDeviceBuffer
());
forceKernel
.
setArg
<
cl
::
Buffer
>
(
1
5
,
interactionFlags
->
getDeviceBuffer
());
findInteractingBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
7
,
interactionFlags
->
getDeviceBuffer
());
findInteractingBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
7
,
interactionFlags
->
getDeviceBuffer
());
findInteractionsWithinBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
4
,
interactingTiles
->
getDeviceBuffer
());
findInteractionsWithinBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
4
,
interactingTiles
->
getDeviceBuffer
());
findInteractionsWithinBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
7
,
interactionFlags
->
getDeviceBuffer
());
findInteractionsWithinBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
7
,
interactionFlags
->
getDeviceBuffer
());
...
@@ -503,7 +495,6 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
...
@@ -503,7 +495,6 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
kernel
.
setArg
(
index
++
,
4
*
forceThreadBlockSize
*
sizeof
(
cl_float
),
NULL
);
kernel
.
setArg
(
index
++
,
4
*
forceThreadBlockSize
*
sizeof
(
cl_float
),
NULL
);
kernel
.
setArg
<
cl_uint
>
(
index
++
,
startTileIndex
);
kernel
.
setArg
<
cl_uint
>
(
index
++
,
startTileIndex
);
kernel
.
setArg
<
cl_uint
>
(
index
++
,
startTileIndex
+
numTiles
);
kernel
.
setArg
<
cl_uint
>
(
index
++
,
startTileIndex
+
numTiles
);
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
forceBufferFlags
->
getDeviceBuffer
());
if
(
useCutoff
)
{
if
(
useCutoff
)
{
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
interactingTiles
->
getDeviceBuffer
());
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
interactingTiles
->
getDeviceBuffer
());
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
interactionCount
->
getDeviceBuffer
());
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
interactionCount
->
getDeviceBuffer
());
...
...
platforms/opencl/src/OpenCLNonbondedUtilities.h
View file @
7943a339
...
@@ -196,12 +196,6 @@ public:
...
@@ -196,12 +196,6 @@ public:
OpenCLArray
<
cl_uint
>&
getExclusionRowIndices
()
{
OpenCLArray
<
cl_uint
>&
getExclusionRowIndices
()
{
return
*
exclusionRowIndices
;
return
*
exclusionRowIndices
;
}
}
/**
* Get the array which contains flags for reserving force buffers.
*/
OpenCLArray
<
cl_uint
>&
getForceBufferFlags
()
{
return
*
forceBufferFlags
;
}
/**
/**
* Get the index of the first tile this context is responsible for processing.
* Get the index of the first tile this context is responsible for processing.
*/
*/
...
@@ -245,7 +239,6 @@ private:
...
@@ -245,7 +239,6 @@ private:
OpenCLArray
<
cl_uint
>*
interactionCount
;
OpenCLArray
<
cl_uint
>*
interactionCount
;
OpenCLArray
<
mm_float4
>*
blockCenter
;
OpenCLArray
<
mm_float4
>*
blockCenter
;
OpenCLArray
<
mm_float4
>*
blockBoundingBox
;
OpenCLArray
<
mm_float4
>*
blockBoundingBox
;
OpenCLArray
<
cl_uint
>*
forceBufferFlags
;
std
::
vector
<
std
::
vector
<
int
>
>
atomExclusions
;
std
::
vector
<
std
::
vector
<
int
>
>
atomExclusions
;
std
::
vector
<
ParameterInfo
>
parameters
;
std
::
vector
<
ParameterInfo
>
parameters
;
std
::
vector
<
ParameterInfo
>
arguments
;
std
::
vector
<
ParameterInfo
>
arguments
;
...
...
platforms/opencl/src/kernels/customGBEnergyN2_cpu.cl
View file @
7943a339
...
@@ -8,7 +8,7 @@
...
@@ -8,7 +8,7 @@
__kernel
void
computeN2Energy
(
__global
float4*
forceBuffers,
__global
float*
energyBuffer,
__local
float4*
local_force,
__kernel
void
computeN2Energy
(
__global
float4*
forceBuffers,
__global
float*
energyBuffer,
__local
float4*
local_force,
__global
float4*
posq,
__local
float4*
local_posq,
__global
unsigned
int*
exclusions,
__global
unsigned
int*
exclusionIndices,
__global
float4*
posq,
__local
float4*
local_posq,
__global
unsigned
int*
exclusions,
__global
unsigned
int*
exclusionIndices,
__global
unsigned
int*
exclusionRowIndices,
__local
float4*
tempBuffer,
__global
unsigned
int*
forceBufferFlags,
__global
unsigned
int*
exclusionRowIndices,
__local
float4*
tempBuffer,
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
unsigned
int*
interactionFlags
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
unsigned
int*
interactionFlags
#
else
#
else
...
...
platforms/opencl/src/kernels/customGBEnergyN2_default.cl
View file @
7943a339
...
@@ -9,7 +9,7 @@
...
@@ -9,7 +9,7 @@
__kernel
__attribute__
((
reqd_work_group_size
(
WORK_GROUP_SIZE,
1
,
1
)))
__kernel
__attribute__
((
reqd_work_group_size
(
WORK_GROUP_SIZE,
1
,
1
)))
void
computeN2Energy
(
__global
float4*
forceBuffers,
__global
float*
energyBuffer,
__local
float4*
local_force,
void
computeN2Energy
(
__global
float4*
forceBuffers,
__global
float*
energyBuffer,
__local
float4*
local_force,
__global
float4*
posq,
__local
float4*
local_posq,
__global
unsigned
int*
exclusions,
__global
unsigned
int*
exclusionIndices,
__global
float4*
posq,
__local
float4*
local_posq,
__global
unsigned
int*
exclusions,
__global
unsigned
int*
exclusionIndices,
__global
unsigned
int*
exclusionRowIndices,
__local
float4*
tempForceBuffer,
__global
unsigned
int*
forceBufferFlags,
__global
unsigned
int*
exclusionRowIndices,
__local
float4*
tempForceBuffer,
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles
#
else
#
else
...
...
platforms/opencl/src/kernels/customGBEnergyN2_nvidia.cl
View file @
7943a339
#
pragma
OPENCL
EXTENSION
cl_khr_global_int32_base_atomics
:
enable
#
pragma
OPENCL
EXTENSION
cl_khr_global_int32_base_atomics
:
enable
#
define
TILE_SIZE
32
#
define
TILE_SIZE
32
#
define
STORE_DERIVATIVE_1
(
INDEX
)
derivBuffers##INDEX[offset1]
+=
deriv##INDEX##_1
;
#
define
STORE_DERIVATIVE_1
(
INDEX
)
derivBuffers##INDEX[offset]
+=
deriv##INDEX##_1
;
#
define
STORE_DERIVATIVE_2
(
INDEX
)
derivBuffers##INDEX[offset2]
+=
local_deriv##INDEX[get_local_id
(
0
)
]
;
#
define
STORE_DERIVATIVE_2
(
INDEX
)
derivBuffers##INDEX[offset]
+=
local_deriv##INDEX[get_local_id
(
0
)
]
;
/**
*
Mark
that
a
block
in
the
force
buffer
is
in
use.
*/
void
reserveBuffer
(
unsigned
int
block,
__global
unsigned
int*
forceBufferFlags
)
{
if
((
get_local_id
(
0
)
&
(
TILE_SIZE-1
))
==
0
)
while
(
atom_cmpxchg
(
&forceBufferFlags[block+NUM_BLOCKS*get_group_id
(
0
)
],
0
,
1
)
!=
0
)
;
mem_fence
(
CLK_GLOBAL_MEM_FENCE
)
;
}
/**
*
Mark
that
a
block
in
the
force
buffer
is
no
longer
in
use.
*/
void
releaseBuffer
(
unsigned
int
block,
__global
unsigned
int*
forceBufferFlags
)
{
mem_fence
(
CLK_GLOBAL_MEM_FENCE
)
;
if
((
get_local_id
(
0
)
&
(
TILE_SIZE-1
))
==
0
)
forceBufferFlags[block+NUM_BLOCKS*get_group_id
(
0
)
]
=
0
;
}
/**
/**
*
Compute
a
force
based
on
pair
interactions.
*
Compute
a
force
based
on
pair
interactions.
*/
*/
__kernel
void
computeN2Energy
(
__global
float4*
forceBuffers,
__global
float*
energyBuffer,
__local
float4*
local_force,
__kernel
void
computeN2Energy
(
__global
float4*
forceBuffers,
__global
float*
energyBuffer,
__local
float4*
local_force,
__global
float4*
posq,
__local
float4*
local_posq,
__global
unsigned
int*
exclusions,
__global
unsigned
int*
exclusionIndices,
__global
float4*
posq,
__local
float4*
local_posq,
__global
unsigned
int*
exclusions,
__global
unsigned
int*
exclusionIndices,
__global
unsigned
int*
exclusionRowIndices,
__local
float4*
tempBuffer,
__global
unsigned
int*
forceBufferFlags,
__global
unsigned
int*
exclusionRowIndices,
__local
float4*
tempBuffer,
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
unsigned
int*
interactionFlags
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
unsigned
int*
interactionFlags
#
else
#
else
...
@@ -48,132 +29,67 @@ __kernel void computeN2Energy(__global float4* forceBuffers, __global float* ene
...
@@ -48,132 +29,67 @@ __kernel void computeN2Energy(__global float4* forceBuffers, __global float* ene
unsigned
int
lasty
=
0xFFFFFFFF
;
unsigned
int
lasty
=
0xFFFFFFFF
;
__local
unsigned
int
exclusionRange[2*WARPS_PER_GROUP]
;
__local
unsigned
int
exclusionRange[2*WARPS_PER_GROUP]
;
__local
int
exclusionIndex[WARPS_PER_GROUP]
;
__local
int
exclusionIndex[WARPS_PER_GROUP]
;
__local
int2*
reservedBlocks
=
(
__local
int2*
)
exclusionRange
;
while
(
pos
<
end
)
{
do
{
//
Extract
the
coordinates
of
this
tile
//
Extract
the
coordinates
of
this
tile
unsigned
int
x,
y
;
#
ifdef
USE_CUTOFF
if
(
numTiles
<=
maxTiles
)
{
ushort2
tileIndices
=
tiles[pos]
;
x
=
tileIndices.x
;
y
=
tileIndices.y
;
}
else
#
endif
{
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS+0.5f-SQRT
((
NUM_BLOCKS+0.5f
)
*
(
NUM_BLOCKS+0.5f
)
-2*pos
))
;
x
=
(
pos-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
if
(
x
<
y
|
| x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
const
unsigned
int
tgx
=
get_local_id
(
0
)
&
(
TILE_SIZE-1
)
;
const
unsigned
int
tgx
=
get_local_id
(
0
)
&
(
TILE_SIZE-1
)
;
const
unsigned
int
tbx
=
get_local_id
(
0
)
-
tgx
;
const
unsigned
int
tbx
=
get_local_id
(
0
)
-
tgx
;
const
unsigned
int
localGroupIndex
=
get_local_id
(
0
)
/TILE_SIZE
;
const
unsigned
int
localGroupIndex
=
get_local_id
(
0
)
/TILE_SIZE
;
unsigned int
atom1 = x*TILE_SIZE + tgx
;
unsigned
int
x,
y
;
float4
force
=
0.0f
;
float4
force
=
0.0f
;
float4 posq1 = posq[atom1];
DECLARE_ATOM1_DERIVATIVES
LOAD_ATOM1_PARAMETERS
if
(
pos
<
end
)
{
// Locate the exclusion data for this tile.
#ifdef USE_EXCLUSIONS
if (tgx < 2)
exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
if (tgx == 0)
exclusionIndex[localGroupIndex] = -1;
for (int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
if (exclusionIndices[i] == y)
exclusionIndex[localGroupIndex] = i*TILE_SIZE;
bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
#else
bool hasExclusions = false;
#endif
if (x == y) {
// This tile is on the diagonal.
const unsigned int localAtomIndex = get_local_id(0);
local_posq[localAtomIndex] = posq1;
LOAD_LOCAL_PARAMETERS_FROM_1
#ifdef USE_EXCLUSIONS
unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx];
#endif
for (unsigned int j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
#endif
int atom2 = tbx+j;
float4 posq2 = local_posq[atom2];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
if
(
numTiles
<=
maxTiles
)
{
#endif
ushort2
tileIndices
=
tiles[pos]
;
float r = SQRT(r2);
x
=
tileIndices.x
;
LOAD_ATOM2_PARAMETERS
y
=
tileIndices.y
;
atom2 = y*TILE_SIZE+j;
}
float dEdR = 0.0f;
else
float tempEnergy = 0.0f;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
COMPUTE_INTERACTION
dEdR /= -r;
}
energy += 0.5f*tempEnergy;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
#ifdef USE_CUTOFF
}
#endif
#ifdef USE_EXCLUSIONS
excl >>= 1;
#
endif
#
endif
{
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS+0.5f-SQRT
((
NUM_BLOCKS+0.5f
)
*
(
NUM_BLOCKS+0.5f
)
-2*pos
))
;
x
=
(
pos-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
if
(
x
<
y
|
| x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
}
unsigned int atom1 = x*TILE_SIZE + tgx;
float4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
// Write results
// Locate the exclusion data for this tile.
reserveBuffer(x, forceBufferFlags);
unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset1].xyz += force.xyz;
STORE_DERIVATIVES_1
releaseBuffer(x, forceBufferFlags);
}
else {
// This is an off-diagonal tile.
const unsigned int localAtomIndex = get_local_id(0);
#ifdef USE_EXCLUSIONS
if (lasty != y) {
if (tgx < 2)
unsigned int j = y*TILE_SIZE + tgx;
exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
local_posq[localAtomIndex] = posq[j];
if (tgx == 0)
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
exclusionIndex[localGroupIndex] = -1;
}
for (int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
local_force[localAtomIndex] = 0.0f;
if (exclusionIndices[i] == y)
CLEAR_LOCAL_DERIVATIVES
exclusionIndex[localGroupIndex] = i*TILE_SIZE;
#ifdef USE_CUTOFF
bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
#else
if (!hasExclusions && flags == 0) {
bool hasExclusions = false;
// No interactions in this tile.
}
else
#endif
#endif
{
if (pos >= end)
// Compute the full set of interactions in this tile.
; // This warp is done.
else if (x == y) {
// This tile is on the diagonal.
const unsigned int localAtomIndex = get_local_id(0);
local_posq[localAtomIndex] = posq1;
LOAD_LOCAL_PARAMETERS_FROM_1
#ifdef USE_EXCLUSIONS
#ifdef USE_EXCLUSIONS
unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF);
unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx];
excl = (excl >> tgx) |
(
excl
<<
(
TILE_SIZE
-
tgx
))
;
#endif
#endif
unsigned
int
tj
=
tgx
;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
for (unsigned int j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
bool isExcluded = !(excl & 0x1);
#endif
#endif
int
atom2
=
tbx+
t
j
;
int atom2 = tbx+j;
float4 posq2 = local_posq[atom2];
float4 posq2 = local_posq[atom2];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
#ifdef USE_PERIODIC
...
@@ -187,44 +103,149 @@ __kernel void computeN2Energy(__global float4* forceBuffers, __global float* ene
...
@@ -187,44 +103,149 @@ __kernel void computeN2Energy(__global float4* forceBuffers, __global float* ene
#endif
#endif
float r = SQRT(r2);
float r = SQRT(r2);
LOAD_ATOM2_PARAMETERS
LOAD_ATOM2_PARAMETERS
atom2
=
y*TILE_SIZE+
t
j
;
atom2 = y*TILE_SIZE+j;
float dEdR = 0.0f;
float dEdR = 0.0f;
float tempEnergy = 0.0f;
float tempEnergy = 0.0f;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS
&& atom1 != atom2
) {
COMPUTE_INTERACTION
COMPUTE_INTERACTION
dEdR /= -r;
dEdR /= -r;
}
}
energy
+=
tempEnergy
;
energy +=
0.5f*
tempEnergy;
delta.xyz *= dEdR;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
force.xyz -= delta.xyz;
atom2
=
tbx+tj
;
local_force[atom2].xyz
+=
delta.xyz
;
RECORD_DERIVATIVE_2
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
}
}
#endif
#endif
#ifdef USE_EXCLUSIONS
#ifdef USE_EXCLUSIONS
excl >>= 1;
excl >>= 1;
#endif
#endif
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
)
;
}
}
}
}
else {
// This is an off-diagonal tile.
//
Write
results
const unsigned int localAtomIndex = get_local_id(0);
if (lasty != y) {
unsigned int j = y*TILE_SIZE + tgx;
local_posq[localAtomIndex] = posq[j];
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
local_force[localAtomIndex] = 0.0f;
CLEAR_LOCAL_DERIVATIVES
#ifdef USE_CUTOFF
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
if (!hasExclusions && flags == 0) {
// No interactions in this tile.
}
else
#endif
{
// Compute the full set of interactions in this tile.
reserveBuffer
(
x,
forceBufferFlags
)
;
#ifdef USE_EXCLUSIONS
unsigned
int
offset1
=
x*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF);
forceBuffers[offset1].xyz
+=
force.xyz
;
excl = (excl >> tgx) |
(
excl
<<
(
TILE_SIZE
-
tgx
))
;
STORE_DERIVATIVES_1
#
endif
releaseBuffer
(
x,
forceBufferFlags
)
;
unsigned
int
tj
=
tgx
;
reserveBuffer
(
y,
forceBufferFlags
)
;
for
(
unsigned
int
j
=
0
; j < TILE_SIZE; j++) {
unsigned
int
offset2
=
y*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
#
ifdef
USE_EXCLUSIONS
forceBuffers[offset2].xyz
+=
local_force[get_local_id
(
0
)
].xyz
;
bool
isExcluded
=
!
(
excl
&
0x1
)
;
STORE_DERIVATIVES_2
#
endif
releaseBuffer
(
y,
forceBufferFlags
)
;
int
atom2
=
tbx+tj
;
float4
posq2
=
local_posq[atom2]
;
float4
delta
=
(
float4
)
(
posq2.xyz
-
posq1.xyz,
0.0f
)
;
#
ifdef
USE_PERIODIC
delta.x
-=
floor
(
delta.x*invPeriodicBoxSize.x+0.5f
)
*periodicBoxSize.x
;
delta.y
-=
floor
(
delta.y*invPeriodicBoxSize.y+0.5f
)
*periodicBoxSize.y
;
delta.z
-=
floor
(
delta.z*invPeriodicBoxSize.z+0.5f
)
*periodicBoxSize.z
;
#
endif
float
r2
=
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
;
#
ifdef
USE_CUTOFF
if
(
r2
<
CUTOFF_SQUARED
)
{
#
endif
float
r
=
SQRT
(
r2
)
;
LOAD_ATOM2_PARAMETERS
atom2
=
y*TILE_SIZE+tj
;
float
dEdR
=
0.0f
;
float
tempEnergy
=
0.0f
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
COMPUTE_INTERACTION
dEdR
/=
-r
;
}
energy
+=
tempEnergy
;
delta.xyz
*=
dEdR
;
force.xyz
-=
delta.xyz
;
atom2
=
tbx+tj
;
local_force[atom2].xyz
+=
delta.xyz
;
RECORD_DERIVATIVE_2
#
ifdef
USE_CUTOFF
}
#
endif
#
ifdef
USE_EXCLUSIONS
excl
>>=
1
;
#
endif
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
)
;
}
}
}
}
}
lasty
=
y
;
lasty
=
y
;
//
Write
results.
We
need
to
coordinate
between
warps
to
make
sure
no
two
of
them
//
ever
try
to
write
to
the
same
piece
of
memory
at
the
same
time.
int
writeX
=
(
pos
<
end
?
x
:
-1
)
;
int
writeY
=
(
pos
<
end
&&
x
!=
y
?
y
:
-1
)
;
if
(
tgx
==
0
)
reservedBlocks[localGroupIndex]
=
(
int2
)(
writeX,
writeY
)
;
bool
done
=
false
;
int
doneIndex
=
0
;
int
checkIndex
=
0
;
while
(
true
)
{
//
See
if
any
warp
still
needs
to
write
its
data.
bool
allDone
=
true
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
while
(
doneIndex
<
WARPS_PER_GROUP
&&
allDone
)
{
if
(
reservedBlocks[doneIndex].x
!=
-1
)
allDone
=
false
;
else
doneIndex++
;
}
if
(
allDone
)
break
;
if
(
!done
)
{
//
See
whether
this
warp
can
write
its
data.
This
requires
that
no
previous
warp
//
is
trying
to
write
to
the
same
block
of
the
buffer.
bool
canWrite
=
(
writeX
!=
-1
)
;
while
(
checkIndex
<
localGroupIndex
&&
canWrite
)
{
if
((
reservedBlocks[checkIndex].x
==
x
|
| reservedBlocks[checkIndex].y == x) ||
(writeY != -1 && (reservedBlocks[checkIndex].x == y |
|
reservedBlocks[checkIndex].y
==
y
)))
canWrite
=
false
;
else
checkIndex++
;
}
if
(
canWrite
)
{
//
Write
the
data
to
global
memory,
then
mark
this
warp
as
done.
if
(
writeX
>
-1
)
{
const
unsigned
int
offset
=
x*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
forceBuffers[offset].xyz
+=
force.xyz
;
STORE_DERIVATIVES_1
}
if
(
writeY
>
-1
)
{
const
unsigned
int
offset
=
y*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
forceBuffers[offset].xyz
+=
local_force[get_local_id
(
0
)
].xyz
;
STORE_DERIVATIVES_2
}
done
=
true
;
if
(
tgx
==
0
)
reservedBlocks[localGroupIndex]
=
(
int2
)(
-1
,
-1
)
;
}
}
}
pos++
;
pos++
;
}
}
while
(
pos
<
end
)
;
energyBuffer[get_global_id
(
0
)
]
+=
energy
;
energyBuffer[get_global_id
(
0
)
]
+=
energy
;
}
}
platforms/opencl/src/kernels/customGBValueN2_cpu.cl
View file @
7943a339
...
@@ -6,7 +6,7 @@
...
@@ -6,7 +6,7 @@
__kernel
void
computeN2Value
(
__global
float4*
posq,
__local
float4*
local_posq,
__global
unsigned
int*
exclusions,
__kernel
void
computeN2Value
(
__global
float4*
posq,
__local
float4*
local_posq,
__global
unsigned
int*
exclusions,
__global
unsigned
int*
exclusionIndices,
__global
unsigned
int*
exclusionRowIndices,
__global
float*
global_value,
__local
float*
local_value,
__global
unsigned
int*
exclusionIndices,
__global
unsigned
int*
exclusionRowIndices,
__global
float*
global_value,
__local
float*
local_value,
__local
float*
tempBuffer,
__global
unsigned
int*
forceBufferFlags,
__local
float*
tempBuffer,
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
unsigned
int*
interactionFlags
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
unsigned
int*
interactionFlags
#
else
#
else
...
...
platforms/opencl/src/kernels/customGBValueN2_default.cl
View file @
7943a339
...
@@ -7,7 +7,7 @@
...
@@ -7,7 +7,7 @@
__kernel
__attribute__
((
reqd_work_group_size
(
WORK_GROUP_SIZE,
1
,
1
)))
__kernel
__attribute__
((
reqd_work_group_size
(
WORK_GROUP_SIZE,
1
,
1
)))
void
computeN2Value
(
__global
float4*
posq,
__local
float4*
local_posq,
__global
unsigned
int*
exclusions,
void
computeN2Value
(
__global
float4*
posq,
__local
float4*
local_posq,
__global
unsigned
int*
exclusions,
__global
unsigned
int*
exclusionIndices,
__global
unsigned
int*
exclusionRowIndices,
__global
float*
global_value,
__local
float*
local_value,
__global
unsigned
int*
exclusionIndices,
__global
unsigned
int*
exclusionRowIndices,
__global
float*
global_value,
__local
float*
local_value,
__local
float*
tempBuffer,
__global
unsigned
int*
forceBufferFlags,
__local
float*
tempBuffer,
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles
#
else
#
else
...
...
platforms/opencl/src/kernels/customGBValueN2_nvidia.cl
View file @
7943a339
#
pragma
OPENCL
EXTENSION
cl_khr_global_int32_base_atomics
:
enable
#
pragma
OPENCL
EXTENSION
cl_khr_global_int32_base_atomics
:
enable
#
define
TILE_SIZE
32
#
define
TILE_SIZE
32
/**
*
Mark
that
a
block
in
the
value
buffer
is
in
use.
*/
void
reserveBuffer
(
unsigned
int
block,
__global
unsigned
int*
forceBufferFlags
)
{
if
((
get_local_id
(
0
)
&
(
TILE_SIZE-1
))
==
0
)
while
(
atom_cmpxchg
(
&forceBufferFlags[block+NUM_BLOCKS*get_group_id
(
0
)
],
0
,
1
)
!=
0
)
;
mem_fence
(
CLK_GLOBAL_MEM_FENCE
)
;
}
/**
*
Mark
that
a
block
in
the
value
buffer
is
no
longer
in
use.
*/
void
releaseBuffer
(
unsigned
int
block,
__global
unsigned
int*
forceBufferFlags
)
{
mem_fence
(
CLK_GLOBAL_MEM_FENCE
)
;
if
((
get_local_id
(
0
)
&
(
TILE_SIZE-1
))
==
0
)
forceBufferFlags[block+NUM_BLOCKS*get_group_id
(
0
)
]
=
0
;
}
/**
/**
*
Compute
a
value
based
on
pair
interactions.
*
Compute
a
value
based
on
pair
interactions.
*/
*/
__kernel
void
computeN2Value
(
__global
float4*
posq,
__local
float4*
local_posq,
__global
unsigned
int*
exclusions,
__kernel
void
computeN2Value
(
__global
float4*
posq,
__local
float4*
local_posq,
__global
unsigned
int*
exclusions,
__global
unsigned
int*
exclusionIndices,
__global
unsigned
int*
exclusionRowIndices,
__global
float*
global_value,
__local
float*
local_value,
__global
unsigned
int*
exclusionIndices,
__global
unsigned
int*
exclusionRowIndices,
__global
float*
global_value,
__local
float*
local_value,
__local
float*
tempBuffer,
__global
unsigned
int*
forceBufferFlags,
__local
float*
tempBuffer,
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
unsigned
int*
interactionFlags
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
unsigned
int*
interactionFlags
#
else
#
else
...
@@ -46,222 +27,262 @@ __kernel void computeN2Value(__global float4* posq, __local float4* local_posq,
...
@@ -46,222 +27,262 @@ __kernel void computeN2Value(__global float4* posq, __local float4* local_posq,
unsigned
int
lasty
=
0xFFFFFFFF
;
unsigned
int
lasty
=
0xFFFFFFFF
;
__local
unsigned
int
exclusionRange[2*WARPS_PER_GROUP]
;
__local
unsigned
int
exclusionRange[2*WARPS_PER_GROUP]
;
__local
int
exclusionIndex[WARPS_PER_GROUP]
;
__local
int
exclusionIndex[WARPS_PER_GROUP]
;
__local
int2*
reservedBlocks
=
(
__local
int2*
)
exclusionRange
;
while
(
pos
<
end
)
{
do
{
//
Extract
the
coordinates
of
this
tile
//
Extract
the
coordinates
of
this
tile
const
unsigned
int
tgx
=
get_local_id
(
0
)
&
(
TILE_SIZE-1
)
;
const
unsigned
int
tbx
=
get_local_id
(
0
)
-
tgx
;
const
unsigned
int
localGroupIndex
=
get_local_id
(
0
)
/TILE_SIZE
;
unsigned
int
x,
y
;
unsigned
int
x,
y
;
float
value
=
0.0f
;
if
(
pos
<
end
)
{
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
if
(
numTiles
<=
maxTiles
)
{
if
(
numTiles
<=
maxTiles
)
{
ushort2
tileIndices
=
tiles[pos]
;
ushort2
tileIndices
=
tiles[pos]
;
x
=
tileIndices.x
;
x
=
tileIndices.x
;
y
=
tileIndices.y
;
y
=
tileIndices.y
;
}
}
else
else
#
endif
#
endif
{
{
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS+0.5f-SQRT
((
NUM_BLOCKS+0.5f
)
*
(
NUM_BLOCKS+0.5f
)
-2*pos
))
;
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS+0.5f-SQRT
((
NUM_BLOCKS+0.5f
)
*
(
NUM_BLOCKS+0.5f
)
-2*pos
))
;
x
=
(
pos-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
if
(
x
<
y
|
| x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x
=
(
pos-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
x
=
(
pos-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
if
(
x
<
y
|
| x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
}
}
unsigned int atom1 = x*TILE_SIZE + tgx;
const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
float4 posq1 = posq[atom1];
const unsigned int tbx = get_local_id(0) - tgx;
LOAD_ATOM1_PARAMETERS
const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
unsigned int atom1 = x*TILE_SIZE + tgx;
float value = 0.0f;
float4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
// Locate the exclusion data for this tile.
// Locate the exclusion data for this tile.
#ifdef USE_EXCLUSIONS
#ifdef USE_EXCLUSIONS
if (tgx < 2)
if (tgx < 2)
exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
if (tgx == 0)
if (tgx == 0)
exclusionIndex[localGroupIndex] = -1;
exclusionIndex[localGroupIndex] = -1;
for (int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
for (int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
if (exclusionIndices[i] == y)
if (exclusionIndices[i] == y)
exclusionIndex[localGroupIndex] = i*TILE_SIZE;
exclusionIndex[localGroupIndex] = i*TILE_SIZE;
bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
#else
#else
bool hasExclusions = false;
bool hasExclusions = false;
#endif
#endif
if (x == y) {
if (pos >= end)
// This tile is on the diagonal.
; // This warp is done.
else if (x == y) {
// This tile is on the diagonal.
const unsigned int localAtomIndex = get_local_id(0);
const unsigned int localAtomIndex = get_local_id(0);
local_posq[localAtomIndex] = posq1;
local_posq[localAtomIndex] = posq1;
LOAD_LOCAL_PARAMETERS_FROM_1
LOAD_LOCAL_PARAMETERS_FROM_1
#ifdef USE_EXCLUSIONS
#ifdef USE_EXCLUSIONS
unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx];
unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx];
#endif
#endif
for (unsigned int j = 0; j < TILE_SIZE; j++) {
for (unsigned int j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
bool isExcluded = !(excl & 0x1);
#endif
#endif
int atom2 = tbx+j;
int atom2 = tbx+j;
float4 posq2 = local_posq[atom2];
float4 posq2 = local_posq[atom2];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
if (r2 < CUTOFF_SQUARED) {
#endif
#endif
float r = SQRT(r2);
float r = SQRT(r2);
LOAD_ATOM2_PARAMETERS
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j;
atom2 = y*TILE_SIZE+j;
float tempValue1 = 0.0f;
float tempValue1 = 0.0f;
float tempValue2 = 0.0f;
float tempValue2 = 0.0f;
#ifdef USE_EXCLUSIONS
#ifdef USE_EXCLUSIONS
if (!isExcluded && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
if (!isExcluded && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
#else
#else
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
#endif
#endif
COMPUTE_VALUE
COMPUTE_VALUE
}
}
value += tempValue1;
value += tempValue1;
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
}
}
#endif
#endif
#ifdef USE_EXCLUSIONS
#ifdef USE_EXCLUSIONS
excl >>= 1;
excl >>= 1;
#endif
#endif
}
}
}
else {
// This is an off-diagonal tile.
// Write results
if (lasty != y) {
unsigned int j = y*TILE_SIZE + tgx;
reserveBuffer(x, forceBufferFlags);
local_posq[get_local_id(0)] = posq[j];
unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
const unsigned int localAtomIndex = get_local_id(0);
global_value[offset] += value;
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
releaseBuffer(x, forceBufferFlags);
}
else {
// This is an off-diagonal tile.
if (lasty != y) {
unsigned int j = y*TILE_SIZE + tgx;
local_posq[get_local_id(0)] = posq[j];
const unsigned int localAtomIndex = get_local_id(0);
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
local_value[get_local_id(0)] = 0.0f;
#ifdef USE_CUTOFF
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
if (!hasExclusions && flags != 0xFFFFFFFF) {
if (flags == 0) {
// No interactions in this tile.
}
}
else {
local_value[get_local_id(0)] = 0.0f;
// Compute only a subset of the interactions in this tile.
#ifdef USE_CUTOFF
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
if (!hasExclusions && flags != 0xFFFFFFFF) {
if (flags == 0) {
// No interactions in this tile.
}
else {
// Compute only a subset of the interactions in this tile.
for (unsigned int j = 0; j < TILE_SIZE; j++) {
for (unsigned int j = 0; j < TILE_SIZE; j++) {
if ((flags&(1<<j)) != 0) {
if ((flags&(1<<j)) != 0) {
int atom2 = tbx+j;
int atom2 = tbx+j;
float4 posq2 = local_posq[atom2];
float4 posq2 = local_posq[atom2];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float tempValue1 = 0.0f;
float tempValue1 = 0.0f;
float tempValue2 = 0.0f;
float tempValue2 = 0.0f;
if (r2 < CUTOFF_SQUARED) {
if (r2 < CUTOFF_SQUARED) {
float r = SQRT(r2);
float r = SQRT(r2);
LOAD_ATOM2_PARAMETERS
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j;
atom2 = y*TILE_SIZE+j;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
COMPUTE_VALUE
COMPUTE_VALUE
}
value += tempValue1;
}
}
value += tempValue1;
tempBuffer[get_local_id(0)] = tempValue2;
}
tempBuffer[get_local_id(0)] = tempValue2;
// Sum the forces on atom2.
// Sum the forces on atom2.
if (tgx % 2 == 0)
if (tgx % 2 == 0)
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+1];
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+1];
if (tgx % 4 == 0)
if (tgx % 4 == 0)
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+2];
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+2];
if (tgx % 8 == 0)
if (tgx % 8 == 0)
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+4];
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+4];
if (tgx % 16 == 0)
if (tgx % 16 == 0)
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+8];
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+8];
if (tgx == 0)
if (tgx == 0)
local_value[tbx+j] += tempBuffer[get_local_id(0)] + tempBuffer[get_local_id(0)+16];
local_value[tbx+j] += tempBuffer[get_local_id(0)] + tempBuffer[get_local_id(0)+16];
}
}
}
}
}
}
}
}
else
else
#endif
#endif
{
{
// Compute the full set of interactions in this tile.
// Compute the full set of interactions in this tile.
#ifdef USE_EXCLUSIONS
#ifdef USE_EXCLUSIONS
unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF);
unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF);
excl = (excl >> tgx) |
(
excl
<<
(
TILE_SIZE
-
tgx
))
;
excl = (excl >> tgx) |
(
excl
<<
(
TILE_SIZE
-
tgx
))
;
#
endif
#
endif
unsigned
int
tj
=
tgx
;
unsigned
int
tj
=
tgx
;
for
(
unsigned
int
j
=
0
; j < TILE_SIZE; j++) {
for
(
unsigned
int
j
=
0
; j < TILE_SIZE; j++) {
#
ifdef
USE_EXCLUSIONS
#
ifdef
USE_EXCLUSIONS
bool
isExcluded
=
!
(
excl
&
0x1
)
;
bool
isExcluded
=
!
(
excl
&
0x1
)
;
#
endif
#
endif
int
atom2
=
tbx+tj
;
int
atom2
=
tbx+tj
;
float4
posq2
=
local_posq[atom2]
;
float4
posq2
=
local_posq[atom2]
;
float4
delta
=
(
float4
)
(
posq2.xyz
-
posq1.xyz,
0.0f
)
;
float4
delta
=
(
float4
)
(
posq2.xyz
-
posq1.xyz,
0.0f
)
;
#
ifdef
USE_PERIODIC
#
ifdef
USE_PERIODIC
delta.x
-=
floor
(
delta.x*invPeriodicBoxSize.x+0.5f
)
*periodicBoxSize.x
;
delta.x
-=
floor
(
delta.x*invPeriodicBoxSize.x+0.5f
)
*periodicBoxSize.x
;
delta.y
-=
floor
(
delta.y*invPeriodicBoxSize.y+0.5f
)
*periodicBoxSize.y
;
delta.y
-=
floor
(
delta.y*invPeriodicBoxSize.y+0.5f
)
*periodicBoxSize.y
;
delta.z
-=
floor
(
delta.z*invPeriodicBoxSize.z+0.5f
)
*periodicBoxSize.z
;
delta.z
-=
floor
(
delta.z*invPeriodicBoxSize.z+0.5f
)
*periodicBoxSize.z
;
#
endif
#
endif
float
r2
=
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
;
float
r2
=
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
;
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
if
(
r2
<
CUTOFF_SQUARED
)
{
if
(
r2
<
CUTOFF_SQUARED
)
{
#
endif
#
endif
float
r
=
SQRT
(
r2
)
;
float
r
=
SQRT
(
r2
)
;
LOAD_ATOM2_PARAMETERS
LOAD_ATOM2_PARAMETERS
atom2
=
y*TILE_SIZE+tj
;
atom2
=
y*TILE_SIZE+tj
;
float
tempValue1
=
0.0f
;
float
tempValue1
=
0.0f
;
float
tempValue2
=
0.0f
;
float
tempValue2
=
0.0f
;
#
ifdef
USE_EXCLUSIONS
#
ifdef
USE_EXCLUSIONS
if
(
!isExcluded
&&
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
if
(
!isExcluded
&&
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
#
else
#
else
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
#
endif
#
endif
COMPUTE_VALUE
COMPUTE_VALUE
}
}
value
+=
tempValue1
;
value
+=
tempValue1
;
local_value[tbx+tj]
+=
tempValue2
;
local_value[tbx+tj]
+=
tempValue2
;
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
}
}
#
endif
#
endif
#
ifdef
USE_EXCLUSIONS
#
ifdef
USE_EXCLUSIONS
excl
>>=
1
;
excl
>>=
1
;
#
endif
#
endif
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
)
;
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
)
;
}
}
}
}
}
}
//
Write
results.
We
need
to
coordinate
between
warps
to
make
sure
no
two
of
them
//
ever
try
to
write
to
the
same
piece
of
memory
at
the
same
time.
int
writeX
=
(
pos
<
end
?
x
:
-1
)
;
int
writeY
=
(
pos
<
end
&&
x
!=
y
?
y
:
-1
)
;
if
(
tgx
==
0
)
reservedBlocks[localGroupIndex]
=
(
int2
)(
writeX,
writeY
)
;
bool
done
=
false
;
int
doneIndex
=
0
;
int
checkIndex
=
0
;
while
(
true
)
{
//
See
if
any
warp
still
needs
to
write
its
data.
//
Write
results
bool
allDone
=
true
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
while
(
doneIndex
<
WARPS_PER_GROUP
&&
allDone
)
{
if
(
reservedBlocks[doneIndex].x
!=
-1
)
allDone
=
false
;
else
doneIndex++
;
}
if
(
allDone
)
break
;
if
(
!done
)
{
//
See
whether
this
warp
can
write
its
data.
This
requires
that
no
previous
warp
//
is
trying
to
write
to
the
same
block
of
the
buffer.
reserveBuffer
(
x,
forceBufferFlags
)
;
bool
canWrite
=
(
writeX
!=
-1
)
;
unsigned
int
offset1
=
x*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
while
(
checkIndex
<
localGroupIndex
&&
canWrite
)
{
global_value[offset1]
+=
value
;
if
((
reservedBlocks[checkIndex].x
==
x
|
| reservedBlocks[checkIndex].y == x) ||
releaseBuffer
(
x,
forceBufferFlags
)
;
(writeY != -1 && (reservedBlocks[checkIndex].x == y |
|
reservedBlocks[checkIndex].y
==
y
)))
reserveBuffer
(
y,
forceBufferFlags
)
;
canWrite
=
false
;
unsigned
int
offset2
=
y*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
else
global_value[offset2]
+=
local_value[get_local_id
(
0
)
]
;
checkIndex++
;
releaseBuffer
(
y,
forceBufferFlags
)
;
}
if
(
canWrite
)
{
//
Write
the
data
to
global
memory,
then
mark
this
warp
as
done.
if
(
writeX
>
-1
)
{
const
unsigned
int
offset
=
x*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
global_value[offset]
+=
value
;
}
if
(
writeY
>
-1
)
{
const
unsigned
int
offset
=
y*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
global_value[offset]
+=
local_value[get_local_id
(
0
)
]
;
}
done
=
true
;
if
(
tgx
==
0
)
reservedBlocks[localGroupIndex]
=
(
int2
)(
-1
,
-1
)
;
}
}
}
}
lasty
=
y
;
lasty
=
y
;
pos++
;
pos++
;
}
}
while
(
pos
<
end
)
;
}
}
platforms/opencl/src/kernels/gbsaObc_cpu.cl
View file @
7943a339
...
@@ -15,7 +15,7 @@ typedef struct {
...
@@ -15,7 +15,7 @@ typedef struct {
*/
*/
__kernel
void
computeBornSum
(
__global
float*
global_bornSum,
__global
float4*
posq,
__global
float2*
global_params,
__kernel
void
computeBornSum
(
__global
float*
global_bornSum,
__global
float4*
posq,
__global
float2*
global_params,
__local
AtomData*
localData,
__local
float*
tempBuffer,
__global
unsigned
int*
forceBufferFlags,
__local
AtomData*
localData,
__local
float*
tempBuffer,
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
unsigned
int*
interactionFlags
)
{
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
unsigned
int*
interactionFlags
)
{
#
else
#
else
...
@@ -192,7 +192,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __global float4* po
...
@@ -192,7 +192,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __global float4* po
__kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuffer,
__kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuffer,
__global float4* posq, __global float* global_bornRadii, __global float* global_bornForce,
__global float4* posq, __global float* global_bornRadii, __global float* global_bornForce,
__local AtomData* localData, __local float4* tempBuffer,
__global unsigned int* forceBufferFlags,
__local AtomData* localData, __local float4* tempBuffer,
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags) {
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags) {
#else
#else
...
...
platforms/opencl/src/kernels/gbsaObc_default.cl
View file @
7943a339
...
@@ -16,7 +16,7 @@ typedef struct {
...
@@ -16,7 +16,7 @@ typedef struct {
__kernel
__attribute__
((
reqd_work_group_size
(
WORK_GROUP_SIZE,
1
,
1
)))
__kernel
__attribute__
((
reqd_work_group_size
(
WORK_GROUP_SIZE,
1
,
1
)))
void
computeBornSum
(
__global
float*
global_bornSum,
__global
float4*
posq,
__global
float2*
global_params,
void
computeBornSum
(
__global
float*
global_bornSum,
__global
float4*
posq,
__global
float2*
global_params,
__local
AtomData*
localData,
__local
float*
tempBuffer,
__global
unsigned
int*
forceBufferFlags,
__local
AtomData*
localData,
__local
float*
tempBuffer,
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles
)
{
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles
)
{
#
else
#
else
...
@@ -203,7 +203,7 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
...
@@ -203,7 +203,7 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuffer,
void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuffer,
__global float4* posq, __global float* global_bornRadii, __global float* global_bornForce,
__global float4* posq, __global float* global_bornRadii, __global float* global_bornForce,
__local AtomData* localData, __local float4* tempBuffer,
__global unsigned int* forceBufferFlags,
__local AtomData* localData, __local float4* tempBuffer,
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles) {
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles) {
#else
#else
...
...
platforms/opencl/src/kernels/gbsaObc_nvidia.cl
View file @
7943a339
...
@@ -11,30 +11,11 @@ typedef struct {
...
@@ -11,30 +11,11 @@ typedef struct {
float
bornForce
;
float
bornForce
;
}
AtomData
;
}
AtomData
;
/**
*
Mark
that
a
block
in
the
force
buffer
is
in
use.
*/
void
reserveBuffer
(
unsigned
int
block,
__global
unsigned
int*
forceBufferFlags
)
{
if
((
get_local_id
(
0
)
&
(
TILE_SIZE-1
))
==
0
)
while
(
atom_cmpxchg
(
&forceBufferFlags[block+NUM_BLOCKS*get_group_id
(
0
)
],
0
,
1
)
!=
0
)
;
mem_fence
(
CLK_GLOBAL_MEM_FENCE
)
;
}
/**
*
Mark
that
a
block
in
the
force
buffer
is
no
longer
in
use.
*/
void
releaseBuffer
(
unsigned
int
block,
__global
unsigned
int*
forceBufferFlags
)
{
mem_fence
(
CLK_GLOBAL_MEM_FENCE
)
;
if
((
get_local_id
(
0
)
&
(
TILE_SIZE-1
))
==
0
)
forceBufferFlags[block+NUM_BLOCKS*get_group_id
(
0
)
]
=
0
;
}
/**
/**
*
Compute
the
Born
sum.
*
Compute
the
Born
sum.
*/
*/
__kernel
void
computeBornSum
(
__global
float*
global_bornSum,
__global
float4*
posq,
__global
float2*
global_params,
__kernel
void
computeBornSum
(
__global
float*
global_bornSum,
__global
float4*
posq,
__global
float2*
global_params,
__local
AtomData*
localData,
__local
float*
tempBuffer,
__global
unsigned
int*
forceBufferFlags,
__local
AtomData*
localData,
__local
float*
tempBuffer,
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
unsigned
int*
interactionFlags
)
{
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
unsigned
int*
interactionFlags
)
{
#
else
#
else
...
@@ -51,231 +32,271 @@ __kernel void computeBornSum(__global float* global_bornSum, __global float4* po
...
@@ -51,231 +32,271 @@ __kernel void computeBornSum(__global float* global_bornSum, __global float4* po
unsigned
int
end
=
(
warp+1
)
*numTiles/totalWarps
;
unsigned
int
end
=
(
warp+1
)
*numTiles/totalWarps
;
#
endif
#
endif
unsigned
int
lasty
=
0xFFFFFFFF
;
unsigned
int
lasty
=
0xFFFFFFFF
;
__local
int2
reservedBlocks[WARPS_PER_GROUP]
;
while
(
pos
<
end
)
{
do
{
//
Extract
the
coordinates
of
this
tile
//
Extract
the
coordinates
of
this
tile
const
unsigned
int
tgx
=
get_local_id
(
0
)
&
(
TILE_SIZE-1
)
;
const
unsigned
int
tbx
=
get_local_id
(
0
)
-
tgx
;
const
unsigned
int
localGroupIndex
=
get_local_id
(
0
)
/TILE_SIZE
;
unsigned
int
x,
y
;
unsigned
int
x,
y
;
float
bornSum
=
0.0f
;
if
(
pos
<
end
)
{
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
if
(
numTiles
<=
maxTiles
)
{
if
(
numTiles
<=
maxTiles
)
{
ushort2
tileIndices
=
tiles[pos]
;
ushort2
tileIndices
=
tiles[pos]
;
x
=
tileIndices.x
;
x
=
tileIndices.x
;
y
=
tileIndices.y
;
y
=
tileIndices.y
;
}
}
else
else
#
endif
#
endif
{
{
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS+0.5f-sqrt
((
NUM_BLOCKS+0.5f
)
*
(
NUM_BLOCKS+0.5f
)
-2*pos
))
;
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS+0.5f-sqrt
((
NUM_BLOCKS+0.5f
)
*
(
NUM_BLOCKS+0.5f
)
-2*pos
))
;
x
=
(
pos-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
if
(
x
<
y
|
| x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x
=
(
pos-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
x
=
(
pos-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
if
(
x
<
y
|
| x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
}
}
unsigned int atom1 = x*TILE_SIZE + tgx;
const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
float4 posq1 = posq[atom1];
const unsigned int tbx = get_local_id(0) - tgx;
float2 params1 = global_params[atom1];
const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
if (pos >= end)
unsigned int atom1 = x*TILE_SIZE + tgx;
; // This warp is done.
float bornSum = 0.0f;
else if (x == y) {
float4 posq1 = posq[atom1];
// This tile is on the diagonal.
float2 params1 = global_params[atom1];
if (x == y) {
// This tile is on the diagonal.
localData[get_local_id(0)].x = posq1.x;
localData[get_local_id(0)].x = posq1.x;
localData[get_local_id(0)].y = posq1.y;
localData[get_local_id(0)].y = posq1.y;
localData[get_local_id(0)].z = posq1.z;
localData[get_local_id(0)].z = posq1.z;
localData[get_local_id(0)].q = posq1.w;
localData[get_local_id(0)].q = posq1.w;
localData[get_local_id(0)].radius = params1.x;
localData[get_local_id(0)].radius = params1.x;
localData[get_local_id(0)].scaledRadius = params1.y;
localData[get_local_id(0)].scaledRadius = params1.y;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
for (unsigned int j = 0; j < TILE_SIZE; j++) {
float4 delta = (float4) (localData[tbx+j].x-posq1.x, localData[tbx+j].y-posq1.y, localData[tbx+j].z-posq1.z, 0.0f);
float4 delta = (float4) (localData[tbx+j].x-posq1.x, localData[tbx+j].y-posq1.y, localData[tbx+j].z-posq1.z, 0.0f);
#ifdef USE_PERIODIC
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else
#else
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
#endif
#endif
float invR = RSQRT(r2);
float invR = RSQRT(r2);
float r = RECIP(invR);
float r = RECIP(invR);
float2 params2 = (float2) (localData[tbx+j].radius, localData[tbx+j].scaledRadius);
float2 params2 = (float2) (localData[tbx+j].radius, localData[tbx+j].scaledRadius);
float rScaledRadiusJ = r+params2.y;
float rScaledRadiusJ = r+params2.y;
if ((j != tgx) && (params1.x < rScaledRadiusJ)) {
if ((j != tgx) && (params1.x < rScaledRadiusJ)) {
float l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
float l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
float u_ij = RECIP(rScaledRadiusJ);
float u_ij = RECIP(rScaledRadiusJ);
float l_ij2 = l_ij*l_ij;
float l_ij2 = l_ij*l_ij;
float u_ij2 = u_ij*u_ij;
float u_ij2 = u_ij*u_ij;
float ratio = LOG(u_ij * RECIP(l_ij));
float ratio = LOG(u_ij * RECIP(l_ij));
bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2);
(0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2);
if (params1.x < params2.x-r)
if (params1.x < params2.x-r)
bornSum += 2.0f*(RECIP(params1.x)-l_ij);
bornSum += 2.0f*(RECIP(params1.x)-l_ij);
}
}
}
}
}
}
}
else {
// This is an off-diagonal tile.
// Write results
if (lasty != y) {
unsigned int j = y*TILE_SIZE + tgx;
reserveBuffer(x, forceBufferFlags);
float4 tempPosq = posq[j];
unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
localData[get_local_id(0)].x = tempPosq.x;
global_bornSum[offset] += bornSum;
localData[get_local_id(0)].y = tempPosq.y;
releaseBuffer(x, forceBufferFlags);
localData[get_local_id(0)].z = tempPosq.z;
}
localData[get_local_id(0)].q = tempPosq.w;
else {
float2 tempParams = global_params[j];
// This is an off-diagonal tile.
localData[get_local_id(0)].radius = tempParams.x;
localData[get_local_id(0)].scaledRadius = tempParams.y;
if (lasty != y) {
unsigned int j = y*TILE_SIZE + tgx;
float4 tempPosq = posq[j];
localData[get_local_id(0)].x = tempPosq.x;
localData[get_local_id(0)].y = tempPosq.y;
localData[get_local_id(0)].z = tempPosq.z;
localData[get_local_id(0)].q = tempPosq.w;
float2 tempParams = global_params[j];
localData[get_local_id(0)].radius = tempParams.x;
localData[get_local_id(0)].scaledRadius = tempParams.y;
}
localData[get_local_id(0)].bornSum = 0.0f;
#ifdef USE_CUTOFF
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
if (flags != 0xFFFFFFFF && false) { // TODO: Fix this: should be checking for exclusions
if (flags == 0) {
// No interactions in this tile.
}
}
else {
localData[get_local_id(0)].bornSum = 0.0f;
// Compute only a subset of the interactions in this tile.
#ifdef USE_CUTOFF
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
if (flags != 0xFFFFFFFF && false) { // TODO: Fix this: should be checking for exclusions
if (flags == 0) {
// No interactions in this tile.
}
else {
// Compute only a subset of the interactions in this tile.
for (unsigned int j = 0; j < TILE_SIZE; j++) {
for (unsigned int j = 0; j < TILE_SIZE; j++) {
if ((flags&(1<<j)) != 0) {
if ((flags&(1<<j)) != 0) {
float4 delta = (float4) (localData[tbx+j].x-posq1.x, localData[tbx+j].y-posq1.y, localData[tbx+j].z-posq1.z, 0.0f);
float4 delta = (float4) (localData[tbx+j].x-posq1.x, localData[tbx+j].y-posq1.y, localData[tbx+j].z-posq1.z, 0.0f);
#ifdef USE_PERIODIC
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
tempBuffer[get_local_id(0)] = 0.0f;
tempBuffer[get_local_id(0)] = 0.0f;
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else
#else
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
#endif
#endif
float invR = RSQRT(r2);
float invR = RSQRT(r2);
float r = RECIP(invR);
float r = RECIP(invR);
float2 params2 = (float2) (localData[tbx+j].radius, localData[tbx+j].scaledRadius);
float2 params2 = (float2) (localData[tbx+j].radius, localData[tbx+j].scaledRadius);
float rScaledRadiusJ = r+params2.y;
float rScaledRadiusJ = r+params2.y;
if (params1.x < rScaledRadiusJ) {
if (params1.x < rScaledRadiusJ) {
float l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
float l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
float u_ij = RECIP(rScaledRadiusJ);
float u_ij = RECIP(rScaledRadiusJ);
float l_ij2 = l_ij*l_ij;
float l_ij2 = l_ij*l_ij;
float u_ij2 = u_ij*u_ij;
float u_ij2 = u_ij*u_ij;
float ratio = LOG(u_ij * RECIP(l_ij));
float ratio = LOG(u_ij * RECIP(l_ij));
bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2);
(0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2);
if (params1.x < params2.x-r)
if (params1.x < params2.x-r)
bornSum += 2.0f*(RECIP(params1.x)-l_ij);
bornSum += 2.0f*(RECIP(params1.x)-l_ij);
}
float rScaledRadiusI = r+params1.y;
if (params2.x < rScaledRadiusI) {
float l_ij = RECIP(max(params2.x, fabs(r-params1.y)));
float u_ij = RECIP(rScaledRadiusI);
float l_ij2 = l_ij*l_ij;
float u_ij2 = u_ij*u_ij;
float ratio = LOG(u_ij * RECIP(l_ij));
float term = l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params1.y*params1.y*invR)*(l_ij2-u_ij2);
if (params2.x < params1.x-r)
term += 2.0f*(RECIP(params2.x)-l_ij);
tempBuffer[get_local_id(0)] = term;
}
}
}
float rScaledRadiusI = r+params1.y;
if (params2.x < rScaledRadiusI) {
float l_ij = RECIP(max(params2.x, fabs(r-params1.y)));
float u_ij = RECIP(rScaledRadiusI);
float l_ij2 = l_ij*l_ij;
float u_ij2 = u_ij*u_ij;
float ratio = LOG(u_ij * RECIP(l_ij));
float term = l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params1.y*params1.y*invR)*(l_ij2-u_ij2);
if (params2.x < params1.x-r)
term += 2.0f*(RECIP(params2.x)-l_ij);
tempBuffer[get_local_id(0)] = term;
}
}
// Sum the forces on atom j.
// Sum the forces on atom j.
if (tgx % 2 == 0)
if (tgx % 2 == 0)
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+1];
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+1];
if (tgx % 4 == 0)
if (tgx % 4 == 0)
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+2];
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+2];
if (tgx % 8 == 0)
if (tgx % 8 == 0)
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+4];
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+4];
if (tgx % 16 == 0)
if (tgx % 16 == 0)
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+8];
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+8];
if (tgx == 0)
if (tgx == 0)
localData[tbx+j].bornSum += tempBuffer[get_local_id(0)] + tempBuffer[get_local_id(0)+16];
localData[tbx+j].bornSum += tempBuffer[get_local_id(0)] + tempBuffer[get_local_id(0)+16];
}
}
}
}
}
}
}
}
else
else
#endif
#endif
{
{
// Compute the full set of interactions in this tile.
// Compute the full set of interactions in this tile.
unsigned int tj = tgx;
unsigned int tj = tgx;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
for (unsigned int j = 0; j < TILE_SIZE; j++) {
float4 delta = (float4) (localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z, 0.0f);
float4 delta = (float4) (localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z, 0.0f);
#ifdef USE_PERIODIC
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else
#else
if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS) {
if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS) {
#endif
#endif
float invR = RSQRT(r2);
float invR = RSQRT(r2);
float r = RECIP(invR);
float r = RECIP(invR);
float2 params2 = (float2) (localData[tbx+tj].radius, localData[tbx+tj].scaledRadius);
float2 params2 = (float2) (localData[tbx+tj].radius, localData[tbx+tj].scaledRadius);
float rScaledRadiusJ = r+params2.y;
float rScaledRadiusJ = r+params2.y;
if (params1.x < rScaledRadiusJ) {
if (params1.x < rScaledRadiusJ) {
float l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
float l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
float u_ij = RECIP(rScaledRadiusJ);
float u_ij = RECIP(rScaledRadiusJ);
float l_ij2 = l_ij*l_ij;
float l_ij2 = l_ij*l_ij;
float u_ij2 = u_ij*u_ij;
float u_ij2 = u_ij*u_ij;
float ratio = LOG(u_ij * RECIP(l_ij));
float ratio = LOG(u_ij * RECIP(l_ij));
bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2);
(0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2);
if (params1.x < params2.x-r)
if (params1.x < params2.x-r)
bornSum += 2.0f*(RECIP(params1.x)-l_ij);
bornSum += 2.0f*(RECIP(params1.x)-l_ij);
}
}
float rScaledRadiusI = r+params1.y;
float rScaledRadiusI = r+params1.y;
if (params2.x < rScaledRadiusI) {
if (params2.x < rScaledRadiusI) {
float l_ij = RECIP(max(params2.x, fabs(r-params1.y)));
float l_ij = RECIP(max(params2.x, fabs(r-params1.y)));
float u_ij = RECIP(rScaledRadiusI);
float u_ij = RECIP(rScaledRadiusI);
float l_ij2 = l_ij*l_ij;
float l_ij2 = l_ij*l_ij;
float u_ij2 = u_ij*u_ij;
float u_ij2 = u_ij*u_ij;
float ratio = LOG(u_ij * RECIP(l_ij));
float ratio = LOG(u_ij * RECIP(l_ij));
float term = l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
float term = l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params1.y*params1.y*invR)*(l_ij2-u_ij2);
(0.25f*params1.y*params1.y*invR)*(l_ij2-u_ij2);
if (params2.x < params1.x-r)
if (params2.x < params1.x-r)
term += 2.0f*(RECIP(params2.x)-l_ij);
term += 2.0f*(RECIP(params2.x)-l_ij);
localData[tbx+tj].bornSum += term;
localData[tbx+tj].bornSum += term;
}
}
}
tj = (tj + 1) & (TILE_SIZE - 1);
}
}
tj = (tj + 1) & (TILE_SIZE - 1);
}
}
}
}
}
// Write results. We need to coordinate between warps to make sure no two of them
// ever try to write to the same piece of memory at the same time.
int writeX = (pos < end ? x : -1);
int writeY = (pos < end && x != y ? y : -1);
if (tgx == 0)
reservedBlocks[localGroupIndex] = (int2)(writeX, writeY);
bool done = false;
int doneIndex = 0;
int checkIndex = 0;
while (true) {
// See if any warp still needs to write its data.
bool allDone = true;
barrier(CLK_LOCAL_MEM_FENCE);
while (doneIndex < WARPS_PER_GROUP && allDone) {
if (reservedBlocks[doneIndex].x != -1)
allDone = false;
else
doneIndex++;
}
if (allDone)
break;
if (!done) {
// See whether this warp can write its data. This requires that no previous warp
// is trying to write to the same block of the buffer.
// Write results
bool canWrite = (writeX != -1);
while (checkIndex < localGroupIndex && canWrite) {
if ((reservedBlocks[checkIndex].x == x || reservedBlocks[checkIndex].y == x) ||
(writeY != -1 && (reservedBlocks[checkIndex].x == y || reservedBlocks[checkIndex].y == y)))
canWrite = false;
else
checkIndex++;
}
if (canWrite) {
// Write the data to global memory, then mark this warp as done.
reserveBuffer(x, forceBufferFlags);
if (writeX > -1) {
unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
const unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
global_bornSum[offset1] += bornSum;
global_bornSum[offset] += bornSum;
releaseBuffer(x, forceBufferFlags);
}
reserveBuffer(y, forceBufferFlags);
if (writeY > -1) {
unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
const unsigned int offset = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
global_bornSum[offset2] += localData[get_local_id(0)].bornSum;
global_bornSum[offset] += localData[get_local_id(0)].bornSum;
releaseBuffer(y, forceBufferFlags);
}
done = true;
if (tgx == 0)
reservedBlocks[localGroupIndex] = (int2)(-1, -1);
}
}
}
}
lasty = y;
lasty = y;
pos++;
pos++;
}
}
while (pos < end);
}
}
/**
/**
...
@@ -284,7 +305,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __global float4* po
...
@@ -284,7 +305,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __global float4* po
__kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuffer,
__kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuffer,
__global float4* posq, __global float* global_bornRadii, __global float* global_bornForce,
__global float4* posq, __global float* global_bornRadii, __global float* global_bornForce,
__local AtomData* localData, __local float4* tempBuffer,
__global unsigned int* forceBufferFlags,
__local AtomData* localData, __local float4* tempBuffer,
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags) {
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags) {
#else
#else
...
@@ -302,113 +323,171 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
...
@@ -302,113 +323,171 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
#endif
#endif
float energy = 0.0f;
float energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF;
unsigned int lasty = 0xFFFFFFFF;
__local int2 reservedBlocks[WARPS_PER_GROUP];
while (pos < end) {
do {
// Extract the coordinates of this tile
// Extract the coordinates of this tile
const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
const unsigned int tbx = get_local_id(0) - tgx;
const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
unsigned int x, y;
unsigned int x, y;
float4 force = 0.0f;
if (pos < end) {
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
if (numTiles <= maxTiles) {
if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos];
ushort2 tileIndices = tiles[pos];
x = tileIndices.x;
x = tileIndices.x;
y = tileIndices.y;
y = tileIndices.y;
}
}
else
else
#endif
#endif
{
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
}
}
unsigned int atom1 = x*TILE_SIZE + tgx;
const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
float4 posq1 = posq[atom1];
const unsigned int tbx = get_local_id(0) - tgx;
float bornRadius1 = global_bornRadii[atom1];
const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
if (x == y) {
unsigned int atom1 = x*TILE_SIZE + tgx;
// This tile is on the diagonal.
float4 force = 0.0f;
float4 posq1 = posq[atom1];
float bornRadius1 = global_bornRadii[atom1];
if (x == y) {
// This tile is on the diagonal.
localData[get_local_id(0)].x = posq1.x;
localData[get_local_id(0)].x = posq1.x;
localData[get_local_id(0)].y = posq1.y;
localData[get_local_id(0)].y = posq1.y;
localData[get_local_id(0)].z = posq1.z;
localData[get_local_id(0)].z = posq1.z;
localData[get_local_id(0)].q = posq1.w;
localData[get_local_id(0)].q = posq1.w;
localData[get_local_id(0)].bornRadius = bornRadius1;
localData[get_local_id(0)].bornRadius = bornRadius1;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
for (unsigned int j = 0; j < TILE_SIZE; j++) {
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
float4 posq2 = (float4) (localData[tbx+j].x, localData[tbx+j].y, localData[tbx+j].z, localData[tbx+j].q);
float4 posq2 = (float4) (localData[tbx+j].x, localData[tbx+j].y, localData[tbx+j].z, localData[tbx+j].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float invR = RSQRT(r2);
float invR = RSQRT(r2);
float r = RECIP(invR);
float r = RECIP(invR);
float bornRadius2 = localData[tbx+j].bornRadius;
float bornRadius2 = localData[tbx+j].bornRadius;
float alpha2_ij = bornRadius1*bornRadius2;
float alpha2_ij = bornRadius1*bornRadius2;
float D_ij = r2*RECIP(4.0f*alpha2_ij);
float D_ij = r2*RECIP(4.0f*alpha2_ij);
float expTerm = EXP(-D_ij);
float expTerm = EXP(-D_ij);
float denominator2 = r2 + alpha2_ij*expTerm;
float denominator2 = r2 + alpha2_ij*expTerm;
float denominator = SQRT(denominator2);
float denominator = SQRT(denominator2);
float tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
float tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
float Gpol = tempEnergy*RECIP(denominator2);
float Gpol = tempEnergy*RECIP(denominator2);
float dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
float dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
float dEdR = Gpol*(1.0f - 0.25f*expTerm);
float dEdR = Gpol*(1.0f - 0.25f*expTerm);
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
if (r2 > CUTOFF_SQUARED) {
if (r2 > CUTOFF_SQUARED) {
dEdR = 0.0f;
dEdR = 0.0f;
tempEnergy = 0.0f;
tempEnergy = 0.0f;
dGpol_dalpha2_ij = 0.0f;
dGpol_dalpha2_ij = 0.0f;
}
}
#endif
#endif
force.w += dGpol_dalpha2_ij*bornRadius2;
force.w += dGpol_dalpha2_ij*bornRadius2;
energy += 0.5f*tempEnergy;
energy += 0.5f*tempEnergy;
delta.xyz *= dEdR;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
force.xyz -= delta.xyz;
}
}
}
}
}
else {
// This is an off-diagonal tile.
// Write results
if (lasty != y) {
unsigned int j = y*TILE_SIZE + tgx;
reserveBuffer(x, forceBufferFlags);
float4 tempPosq = posq[j];
unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
localData[get_local_id(0)].x = tempPosq.x;
forceBuffers[offset].xyz += force.xyz;
localData[get_local_id(0)].y = tempPosq.y;
global_bornForce[offset] += force.w;
localData[get_local_id(0)].z = tempPosq.z;
releaseBuffer(x, forceBufferFlags);
localData[get_local_id(0)].q = tempPosq.w;
}
localData[get_local_id(0)].bornRadius = global_bornRadii[j];
else {
}
// This is an off-diagonal tile.
localData[get_local_id(0)].fx = 0.0f;
localData[get_local_id(0)].fy = 0.0f;
localData[get_local_id(0)].fz = 0.0f;
localData[get_local_id(0)].fw = 0.0f;
#ifdef USE_CUTOFF
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
if (flags != 0xFFFFFFFF && false) { // TODO: Fix this: should be checking for exclusions
if (flags == 0) {
// No interactions in this tile.
}
else {
// Compute only a subset of the interactions in this tile.
if (lasty != y) {
for (unsigned int j = 0; j < TILE_SIZE; j++) {
unsigned int j = y*TILE_SIZE + tgx;
if ((flags&(1<<j)) != 0) {
float4 tempPosq = posq[j];
float4 posq2 = (float4) (localData[tbx+j].x, localData[tbx+j].y, localData[tbx+j].z, localData[tbx+j].q);
localData[get_local_id(0)].x = tempPosq.x;
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
localData[get_local_id(0)].y = tempPosq.y;
#ifdef USE_PERIODIC
localData[get_local_id(0)].z = tempPosq.z;
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
localData[get_local_id(0)].q = tempPosq.w;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
localData[get_local_id(0)].bornRadius = global_bornRadii[j];
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
}
#endif
localData[get_local_id(0)].fx = 0.0f;
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
localData[get_local_id(0)].fy = 0.0f;
float invR = RSQRT(r2);
localData[get_local_id(0)].fz = 0.0f;
float r = RECIP(invR);
localData[get_local_id(0)].fw = 0.0f;
float bornRadius2 = localData[tbx+j].bornRadius;
float alpha2_ij = bornRadius1*bornRadius2;
float D_ij = r2*RECIP(4.0f*alpha2_ij);
float expTerm = EXP(-D_ij);
float denominator2 = r2 + alpha2_ij*expTerm;
float denominator = SQRT(denominator2);
float tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
float Gpol = tempEnergy*RECIP(denominator2);
float dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
float dEdR = Gpol*(1.0f - 0.25f*expTerm);
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
if (atom1 >= NUM_ATOMS || y*TILE_SIZE+j >= NUM_ATOMS || r2 > CUTOFF_SQUARED) {
if (flags != 0xFFFFFFFF && false) { // TODO: Fix this: should be checking for exclusions
#else
if (flags == 0) {
if (atom1 >= NUM_ATOMS || y*TILE_SIZE+j >= NUM_ATOMS) {
// No interactions in this tile.
#endif
dEdR = 0.0f;
dGpol_dalpha2_ij = 0.0f;
tempEnergy = 0.0f;
}
energy += tempEnergy;
force.w += dGpol_dalpha2_ij*bornRadius2;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
tempBuffer[get_local_id(0)] = (float4) (delta.xyz, dGpol_dalpha2_ij*bornRadius1);
// Sum the forces on atom j.
if (tgx % 2 == 0)
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+1];
if (tgx % 4 == 0)
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+2];
if (tgx % 8 == 0)
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+4];
if (tgx % 16 == 0)
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+8];
if (tgx == 0) {
float4 sum = tempBuffer[get_local_id(0)] + tempBuffer[get_local_id(0)+16];
localData[tbx+j].fx += sum.x;
localData[tbx+j].fy += sum.y;
localData[tbx+j].fz += sum.z;
localData[tbx+j].fw += sum.w;
}
}
}
}
}
}
else {
else
// Compute only a subset of the interactions in this tile.
#endif
{
// Compute the full set of interactions in this tile.
unsigned int tj = tgx;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
for (unsigned int j = 0; j < TILE_SIZE; j++) {
if (
(flags&(1<<j)) != 0
) {
if (
atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS
) {
float4 posq2 = (float4) (localData[tbx+j].x, localData[tbx+j].y, localData[tbx+j].z, localData[tbx+j].q);
float4 posq2 = (float4) (localData[tbx+
t
j].x, localData[tbx+
t
j].y, localData[tbx+
t
j].z, localData[tbx+
t
j].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
...
@@ -418,7 +497,7 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
...
@@ -418,7 +497,7 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float invR = RSQRT(r2);
float invR = RSQRT(r2);
float r = RECIP(invR);
float r = RECIP(invR);
float bornRadius2 = localData[tbx+j].bornRadius;
float bornRadius2 = localData[tbx+
t
j].bornRadius;
float alpha2_ij = bornRadius1*bornRadius2;
float alpha2_ij = bornRadius1*bornRadius2;
float D_ij = r2*RECIP(4.0f*alpha2_ij);
float D_ij = r2*RECIP(4.0f*alpha2_ij);
float expTerm = EXP(-D_ij);
float expTerm = EXP(-D_ij);
...
@@ -429,104 +508,83 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
...
@@ -429,104 +508,83 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
float dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
float dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
float dEdR = Gpol*(1.0f - 0.25f*expTerm);
float dEdR = Gpol*(1.0f - 0.25f*expTerm);
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
if (atom1 >= NUM_ATOMS || y*TILE_SIZE+j >= NUM_ATOMS || r2 > CUTOFF_SQUARED) {
if (r2 > CUTOFF_SQUARED) {
#else
if (atom1 >= NUM_ATOMS |
|
y*TILE_SIZE+j
>=
NUM_ATOMS
)
{
#
endif
dEdR = 0.0f;
dEdR = 0.0f;
tempEnergy = 0.0f;
dGpol_dalpha2_ij = 0.0f;
dGpol_dalpha2_ij = 0.0f;
tempEnergy
=
0.0f
;
}
}
energy
+=
tempEnergy
;
#endif
force.w += dGpol_dalpha2_ij*bornRadius2;
force.w += dGpol_dalpha2_ij*bornRadius2;
energy += tempEnergy;
delta.xyz *= dEdR;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
force.xyz -= delta.xyz;
tempBuffer[get_local_id
(
0
)
]
=
(
float4
)
(
delta.xyz,
dGpol_dalpha2_ij*bornRadius1
)
;
localData[tbx+tj].fx += delta.x;
localData[tbx+tj].fy += delta.y;
//
Sum
the
forces
on
atom
j.
localData[tbx+tj].fz += delta.z;
localData[tbx+tj].fw += dGpol_dalpha2_ij*bornRadius1;
if
(
tgx
%
2
==
0
)
tempBuffer[get_local_id
(
0
)
]
+=
tempBuffer[get_local_id
(
0
)
+1]
;
if
(
tgx
%
4
==
0
)
tempBuffer[get_local_id
(
0
)
]
+=
tempBuffer[get_local_id
(
0
)
+2]
;
if
(
tgx
%
8
==
0
)
tempBuffer[get_local_id
(
0
)
]
+=
tempBuffer[get_local_id
(
0
)
+4]
;
if
(
tgx
%
16
==
0
)
tempBuffer[get_local_id
(
0
)
]
+=
tempBuffer[get_local_id
(
0
)
+8]
;
if
(
tgx
==
0
)
{
float4
sum
=
tempBuffer[get_local_id
(
0
)
]
+
tempBuffer[get_local_id
(
0
)
+16]
;
localData[tbx+j].fx
+=
sum.x
;
localData[tbx+j].fy
+=
sum.y
;
localData[tbx+j].fz
+=
sum.z
;
localData[tbx+j].fw
+=
sum.w
;
}
}
}
tj = (tj + 1) & (TILE_SIZE - 1);
}
}
}
}
}
}
else
}
#
endif
{
// Write results. We need to coordinate between warps to make sure no two of them
//
Compute
the
full
set
of
interactions
in
this
tile.
// ever try to write to the same piece of memory at the same time.
int writeX = (pos < end ? x : -1);
int writeY = (pos < end && x != y ? y : -1);
if (tgx == 0)
reservedBlocks[localGroupIndex] = (int2)(writeX, writeY);
bool done = false;
int doneIndex = 0;
int checkIndex = 0;
while (true) {
// See if any warp still needs to write its data.
unsigned
int
tj
=
tgx
;
bool allDone = true;
for
(
unsigned
int
j
=
0
; j < TILE_SIZE; j++) {
barrier(CLK_LOCAL_MEM_FENCE);
if
(
atom1
<
NUM_ATOMS
&&
y*TILE_SIZE+tj
<
NUM_ATOMS
)
{
while (doneIndex < WARPS_PER_GROUP && allDone) {
float4
posq2
=
(
float4
)
(
localData[tbx+tj].x,
localData[tbx+tj].y,
localData[tbx+tj].z,
localData[tbx+tj].q
)
;
if (reservedBlocks[doneIndex].x != -1)
float4
delta
=
(
float4
)
(
posq2.xyz
-
posq1.xyz,
0.0f
)
;
allDone = false;
#
ifdef
USE_PERIODIC
else
delta.x
-=
floor
(
delta.x*invPeriodicBoxSize.x+0.5f
)
*periodicBoxSize.x
;
doneIndex++;
delta.y
-=
floor
(
delta.y*invPeriodicBoxSize.y+0.5f
)
*periodicBoxSize.y
;
delta.z
-=
floor
(
delta.z*invPeriodicBoxSize.z+0.5f
)
*periodicBoxSize.z
;
#
endif
float
r2
=
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
;
float
invR
=
RSQRT
(
r2
)
;
float
r
=
RECIP
(
invR
)
;
float
bornRadius2
=
localData[tbx+tj].bornRadius
;
float
alpha2_ij
=
bornRadius1*bornRadius2
;
float
D_ij
=
r2*RECIP
(
4.0f*alpha2_ij
)
;
float
expTerm
=
EXP
(
-D_ij
)
;
float
denominator2
=
r2
+
alpha2_ij*expTerm
;
float
denominator
=
SQRT
(
denominator2
)
;
float
tempEnergy
=
(
PREFACTOR*posq1.w*posq2.w
)
*RECIP
(
denominator
)
;
float
Gpol
=
tempEnergy*RECIP
(
denominator2
)
;
float
dGpol_dalpha2_ij
=
-0.5f*Gpol*expTerm*
(
1.0f+D_ij
)
;
float
dEdR
=
Gpol*
(
1.0f
-
0.25f*expTerm
)
;
#
ifdef
USE_CUTOFF
if
(
r2
>
CUTOFF_SQUARED
)
{
dEdR
=
0.0f
;
tempEnergy
=
0.0f
;
dGpol_dalpha2_ij
=
0.0f
;
}
#
endif
force.w
+=
dGpol_dalpha2_ij*bornRadius2
;
energy
+=
tempEnergy
;
delta.xyz
*=
dEdR
;
force.xyz
-=
delta.xyz
;
localData[tbx+tj].fx
+=
delta.x
;
localData[tbx+tj].fy
+=
delta.y
;
localData[tbx+tj].fz
+=
delta.z
;
localData[tbx+tj].fw
+=
dGpol_dalpha2_ij*bornRadius1
;
}
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
)
;
}
}
}
if (allDone)
break;
if (!done) {
// See whether this warp can write its data. This requires that no previous warp
// is trying to write to the same block of the buffer.
//
Write
results
bool canWrite = (writeX != -1);
while (checkIndex < localGroupIndex && canWrite) {
if ((reservedBlocks[checkIndex].x == x || reservedBlocks[checkIndex].y == x) ||
(writeY != -1 && (reservedBlocks[checkIndex].x == y |
|
reservedBlocks[checkIndex].y
==
y
)
))
canWrite
=
false
;
else
checkIndex++
;
}
if
(
canWrite
)
{
//
Write
the
data
to
global
memory,
then
mark
this
warp
as
done.
reserveBuffer
(
x,
forceBufferFlags
)
;
if
(
writeX
>
-1
)
{
unsigned
int
offset1
=
x*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
const
unsigned
int
offset
=
x*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
forceBuffers[offset1].xyz
+=
force.xyz
;
forceBuffers[offset].xyz
+=
force.xyz
;
global_bornForce[offset1]
+=
force.w
;
global_bornForce[offset]
+=
force.w
;
releaseBuffer
(
x,
forceBufferFlags
)
;
}
reserveBuffer
(
y,
forceBufferFlags
)
;
if
(
writeY
>
-1
)
{
unsigned
int
offset2
=
y*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
const
unsigned
int
offset
=
y*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
forceBuffers[offset2]
+=
(
float4
)
(
localData[get_local_id
(
0
)
].fx,
localData[get_local_id
(
0
)
].fy,
localData[get_local_id
(
0
)
].fz,
0
)
;
forceBuffers[offset]
+=
(
float4
)
(
localData[get_local_id
(
0
)
].fx,
localData[get_local_id
(
0
)
].fy,
localData[get_local_id
(
0
)
].fz,
0.0f
)
;
global_bornForce[offset2]
+=
localData[get_local_id
(
0
)
].fw
;
global_bornForce[offset]
+=
localData[get_local_id
(
0
)
].fw
;
releaseBuffer
(
y,
forceBufferFlags
)
;
}
done
=
true
;
if
(
tgx
==
0
)
reservedBlocks[localGroupIndex]
=
(
int2
)(
-1
,
-1
)
;
}
}
}
}
lasty
=
y
;
lasty
=
y
;
pos++
;
pos++
;
}
}
while
(
pos
<
end
)
;
energyBuffer[get_global_id
(
0
)
]
+=
energy
;
energyBuffer[get_global_id
(
0
)
]
+=
energy
;
}
}
platforms/opencl/src/kernels/nonbonded_cpu.cl
View file @
7943a339
...
@@ -13,7 +13,7 @@ typedef struct {
...
@@ -13,7 +13,7 @@ typedef struct {
__kernel
void
computeNonbonded
(
__global
float4*
forceBuffers,
__global
float*
energyBuffer,
__global
float4*
posq,
__global
unsigned
int*
exclusions,
__kernel
void
computeNonbonded
(
__global
float4*
forceBuffers,
__global
float*
energyBuffer,
__global
float4*
posq,
__global
unsigned
int*
exclusions,
__global
unsigned
int*
exclusionIndices,
__global
unsigned
int*
exclusionRowIndices,
__local
AtomData*
localData,
__local
float4*
tempBuffer,
__global
unsigned
int*
exclusionIndices,
__global
unsigned
int*
exclusionRowIndices,
__local
AtomData*
localData,
__local
float4*
tempBuffer,
unsigned
int
startTileIndex,
unsigned
int
endTileIndex,
__global
unsigned
int*
forceBufferFlags,
unsigned
int
startTileIndex,
unsigned
int
endTileIndex,
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
unsigned
int*
interactionFlags
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
unsigned
int*
interactionFlags
#
else
#
else
...
...
platforms/opencl/src/kernels/nonbonded_default.cl
View file @
7943a339
...
@@ -14,7 +14,7 @@ typedef struct {
...
@@ -14,7 +14,7 @@ typedef struct {
__kernel
__attribute__
((
reqd_work_group_size
(
WORK_GROUP_SIZE,
1
,
1
)))
__kernel
__attribute__
((
reqd_work_group_size
(
WORK_GROUP_SIZE,
1
,
1
)))
void
computeNonbonded
(
__global
float4*
forceBuffers,
__global
float*
energyBuffer,
__global
float4*
posq,
__global
unsigned
int*
exclusions,
void
computeNonbonded
(
__global
float4*
forceBuffers,
__global
float*
energyBuffer,
__global
float4*
posq,
__global
unsigned
int*
exclusions,
__global
unsigned
int*
exclusionIndices,
__global
unsigned
int*
exclusionRowIndices,
__local
AtomData*
localData,
__local
float4*
tempBuffer,
__global
unsigned
int*
exclusionIndices,
__global
unsigned
int*
exclusionRowIndices,
__local
AtomData*
localData,
__local
float4*
tempBuffer,
unsigned
int
startTileIndex,
unsigned
int
endTileIndex,
__global
unsigned
int*
forceBufferFlags,
unsigned
int
startTileIndex,
unsigned
int
endTileIndex,
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
unsigned
int*
interactionFlags
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
unsigned
int*
interactionFlags
#
else
#
else
...
...
platforms/opencl/src/kernels/nonbonded_nvidia.cl
View file @
7943a339
...
@@ -8,31 +8,12 @@ typedef struct {
...
@@ -8,31 +8,12 @@ typedef struct {
ATOM_PARAMETER_DATA
ATOM_PARAMETER_DATA
}
AtomData
;
}
AtomData
;
/**
*
Mark
that
a
block
in
the
force
buffer
is
in
use.
*/
void
reserveBuffer
(
unsigned
int
block,
__global
unsigned
int*
forceBufferFlags
)
{
if
((
get_local_id
(
0
)
&
(
TILE_SIZE-1
))
==
0
)
while
(
atom_cmpxchg
(
&forceBufferFlags[block+NUM_BLOCKS*get_group_id
(
0
)
],
0
,
1
)
!=
0
)
;
mem_fence
(
CLK_GLOBAL_MEM_FENCE
)
;
}
/**
*
Mark
that
a
block
in
the
force
buffer
is
no
longer
in
use.
*/
void
releaseBuffer
(
unsigned
int
block,
__global
unsigned
int*
forceBufferFlags
)
{
mem_fence
(
CLK_GLOBAL_MEM_FENCE
)
;
if
((
get_local_id
(
0
)
&
(
TILE_SIZE-1
))
==
0
)
forceBufferFlags[block+NUM_BLOCKS*get_group_id
(
0
)
]
=
0
;
}
/**
/**
*
Compute
nonbonded
interactions.
*
Compute
nonbonded
interactions.
*/
*/
__kernel
void
computeNonbonded
(
__global
float4*
forceBuffers,
__global
float*
energyBuffer,
__global
float4*
posq,
__global
unsigned
int*
exclusions,
__kernel
void
computeNonbonded
(
__global
float4*
forceBuffers,
__global
float*
energyBuffer,
__global
float4*
posq,
__global
unsigned
int*
exclusions,
__global
unsigned
int*
exclusionIndices,
__global
unsigned
int*
exclusionRowIndices,
__local
AtomData*
localData,
__local
float*
tempBuffer,
__global
unsigned
int*
exclusionIndices,
__global
unsigned
int*
exclusionRowIndices,
__local
AtomData*
localData,
__local
float*
tempBuffer,
unsigned
int
startTileIndex,
unsigned
int
endTileIndex,
__global
unsigned
int*
forceBufferFlags,
unsigned
int
startTileIndex,
unsigned
int
endTileIndex,
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
unsigned
int*
interactionFlags
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
unsigned
int*
interactionFlags
#
else
#
else
...
@@ -53,264 +34,306 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
...
@@ -53,264 +34,306 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
unsigned
int
lasty
=
0xFFFFFFFF
;
unsigned
int
lasty
=
0xFFFFFFFF
;
__local
unsigned
int
exclusionRange[2*WARPS_PER_GROUP]
;
__local
unsigned
int
exclusionRange[2*WARPS_PER_GROUP]
;
__local
int
exclusionIndex[WARPS_PER_GROUP]
;
__local
int
exclusionIndex[WARPS_PER_GROUP]
;
__local
int2*
reservedBlocks
=
(
__local
int2*
)
exclusionRange
;
while
(
pos
<
end
)
{
do
{
//
Extract
the
coordinates
of
this
tile
//
Extract
the
coordinates
of
this
tile
const
unsigned
int
tgx
=
get_local_id
(
0
)
&
(
TILE_SIZE-1
)
;
const
unsigned
int
tbx
=
get_local_id
(
0
)
-
tgx
;
const
unsigned
int
localGroupIndex
=
get_local_id
(
0
)
/TILE_SIZE
;
unsigned
int
x,
y
;
unsigned
int
x,
y
;
float4
force
=
0.0f
;
if
(
pos
<
end
)
{
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
if
(
numTiles
<=
maxTiles
)
{
if
(
numTiles
<=
maxTiles
)
{
ushort2
tileIndices
=
tiles[pos]
;
ushort2
tileIndices
=
tiles[pos]
;
x
=
tileIndices.x
;
x
=
tileIndices.x
;
y
=
tileIndices.y
;
y
=
tileIndices.y
;
}
}
else
else
#
endif
#
endif
{
{
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS+0.5f-SQRT
((
NUM_BLOCKS+0.5f
)
*
(
NUM_BLOCKS+0.5f
)
-2*pos
))
;
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS+0.5f-SQRT
((
NUM_BLOCKS+0.5f
)
*
(
NUM_BLOCKS+0.5f
)
-2*pos
))
;
x
=
(
pos-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
if
(
x
<
y
|
| x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x
=
(
pos-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
x
=
(
pos-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
if
(
x
<
y
|
| x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
}
}
unsigned int atom1 = x*TILE_SIZE + tgx;
const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
float4 posq1 = posq[atom1];
const unsigned int tbx = get_local_id(0) - tgx;
LOAD_ATOM1_PARAMETERS
const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
unsigned int atom1 = x*TILE_SIZE + tgx;
float4 force = 0.0f;
float4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
// Locate the exclusion data for this tile.
// Locate the exclusion data for this tile.
#ifdef USE_EXCLUSIONS
#ifdef USE_EXCLUSIONS
if (tgx < 2)
if (tgx < 2)
exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
if (tgx == 0)
if (tgx == 0)
exclusionIndex[localGroupIndex] = -1;
exclusionIndex[localGroupIndex] = -1;
for (int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
for (int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
if (exclusionIndices[i] == y)
if (exclusionIndices[i] == y)
exclusionIndex[localGroupIndex] = i*TILE_SIZE;
exclusionIndex[localGroupIndex] = i*TILE_SIZE;
bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
#else
#else
bool hasExclusions = false;
bool hasExclusions = false;
#endif
#endif
if (x == y) {
if (pos >= end)
// This tile is on the diagonal.
; // This warp is done.
else if (x == y) {
// This tile is on the diagonal.
const unsigned int localAtomIndex = get_local_id(0);
const unsigned int localAtomIndex = get_local_id(0);
localData[localAtomIndex].x = posq1.x;
localData[localAtomIndex].x = posq1.x;
localData[localAtomIndex].y = posq1.y;
localData[localAtomIndex].y = posq1.y;
localData[localAtomIndex].z = posq1.z;
localData[localAtomIndex].z = posq1.z;
localData[localAtomIndex].q = posq1.w;
localData[localAtomIndex].q = posq1.w;
LOAD_LOCAL_PARAMETERS_FROM_1
LOAD_LOCAL_PARAMETERS_FROM_1
#ifdef USE_EXCLUSIONS
#ifdef USE_EXCLUSIONS
unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx];
unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx];
#endif
#endif
for (unsigned int j = 0; j < TILE_SIZE; j++) {
for (unsigned int j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
bool isExcluded = !(excl & 0x1);
#endif
#endif
int atom2 = tbx+j;
int atom2 = tbx+j;
float4 posq2 = (float4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
float4 posq2 = (float4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float invR = RSQRT(r2);
float invR = RSQRT(r2);
float r = RECIP(invR);
float r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j;
atom2 = y*TILE_SIZE+j;
#ifdef USE_SYMMETRIC
#ifdef USE_SYMMETRIC
float dEdR = 0.0f;
float dEdR = 0.0f;
#else
#else
float4 dEdR1 = (float4) 0.0f;
float4 dEdR1 = (float4) 0.0f;
float4 dEdR2 = (float4) 0.0f;
float4 dEdR2 = (float4) 0.0f;
#endif
#endif
float tempEnergy = 0.0f;
float tempEnergy = 0.0f;
COMPUTE_INTERACTION
COMPUTE_INTERACTION
energy += 0.5f*tempEnergy;
energy += 0.5f*tempEnergy;
#ifdef USE_SYMMETRIC
#ifdef USE_SYMMETRIC
force.xyz -= delta.xyz*dEdR;
force.xyz -= delta.xyz*dEdR;
#else
#else
force.xyz -= dEdR1.xyz;
force.xyz -= dEdR1.xyz;
#endif
#endif
excl >>= 1;
#ifdef USE_EXCLUSIONS
excl >>= 1;
#endif
}
}
}
else {
// This is an off-diagonal tile.
// Write results
const unsigned int localAtomIndex = get_local_id(0);
if (lasty != y) {
reserveBuffer(x, forceBufferFlags);
unsigned int j = y*TILE_SIZE + tgx;
unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
float4 tempPosq = posq[j];
forceBuffers[offset].xyz += force.xyz;
localData[localAtomIndex].x = tempPosq.x;
releaseBuffer(x, forceBufferFlags);
localData[localAtomIndex].y = tempPosq.y;
}
localData[localAtomIndex].z = tempPosq.z;
else {
localData[localAtomIndex].q = tempPosq.w;
// This is an off-diagonal tile.
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
const unsigned int localAtomIndex = get_local_id(0);
if (lasty != y) {
unsigned int j = y*TILE_SIZE + tgx;
float4 tempPosq = posq[j];
localData[localAtomIndex].x = tempPosq.x;
localData[localAtomIndex].y = tempPosq.y;
localData[localAtomIndex].z = tempPosq.z;
localData[localAtomIndex].q = tempPosq.w;
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
localData[localAtomIndex].fx = 0.0f;
localData[localAtomIndex].fy = 0.0f;
localData[localAtomIndex].fz = 0.0f;
#ifdef USE_CUTOFF
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
if (!hasExclusions && flags != 0xFFFFFFFF) {
if (flags == 0) {
// No interactions in this tile.
}
}
else {
localData[localAtomIndex].fx = 0.0f;
// Compute only a subset of the interactions in this tile.
localData[localAtomIndex].fy = 0.0f;
localData[localAtomIndex].fz = 0.0f;
#ifdef USE_CUTOFF
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
if (!hasExclusions && flags != 0xFFFFFFFF) {
if (flags == 0) {
// No interactions in this tile.
}
else {
// Compute only a subset of the interactions in this tile.
for (unsigned int j = 0; j < TILE_SIZE; j++) {
for (unsigned int j = 0; j < TILE_SIZE; j++) {
if ((flags&(1<<j)) != 0) {
if ((flags&(1<<j)) != 0) {
bool isExcluded = false;
bool isExcluded = false;
int atom2 = tbx+j;
int atom2 = tbx+j;
float4 posq2 = (float4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
float4 posq2 = (float4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float invR = RSQRT(r2);
float invR = RSQRT(r2);
float r = RECIP(invR);
float r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j;
atom2 = y*TILE_SIZE+j;
#ifdef USE_SYMMETRIC
#ifdef USE_SYMMETRIC
float dEdR = 0.0f;
float dEdR = 0.0f;
#else
#else
float4 dEdR1 = (float4) 0.0f;
float4 dEdR1 = (float4) 0.0f;
float4 dEdR2 = (float4) 0.0f;
float4 dEdR2 = (float4) 0.0f;
#endif
#endif
float tempEnergy = 0.0f;
float tempEnergy = 0.0f;
COMPUTE_INTERACTION
COMPUTE_INTERACTION
energy += tempEnergy;
energy += tempEnergy;
int bufferIndex = 3*get_local_id(0);
int bufferIndex = 3*get_local_id(0);
#ifdef USE_SYMMETRIC
#ifdef USE_SYMMETRIC
delta.xyz *= dEdR;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
force.xyz -= delta.xyz;
tempBuffer[bufferIndex] = delta.x;
tempBuffer[bufferIndex] = delta.x;
tempBuffer[bufferIndex+1] = delta.y;
tempBuffer[bufferIndex+1] = delta.y;
tempBuffer[bufferIndex+2] = delta.z;
tempBuffer[bufferIndex+2] = delta.z;
#else
#else
force.xyz -= dEdR1.xyz;
force.xyz -= dEdR1.xyz;
tempBuffer[bufferIndex] = dEdR2.x;
tempBuffer[bufferIndex] = dEdR2.x;
tempBuffer[bufferIndex+1] = dEdR2.y;
tempBuffer[bufferIndex+1] = dEdR2.y;
tempBuffer[bufferIndex+2] = dEdR2.z;
tempBuffer[bufferIndex+2] = dEdR2.z;
#endif
#endif
// Sum the forces on atom2.
// Sum the forces on atom2.
if (tgx % 2 == 0) {
if (tgx % 2 == 0) {
tempBuffer[bufferIndex] += tempBuffer[bufferIndex+3];
tempBuffer[bufferIndex] += tempBuffer[bufferIndex+3];
tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+4];
tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+4];
tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+5];
tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+5];
}
}
if (tgx % 4 == 0) {
if (tgx % 4 == 0) {
tempBuffer[bufferIndex] += tempBuffer[bufferIndex+6];
tempBuffer[bufferIndex] += tempBuffer[bufferIndex+6];
tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+7];
tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+7];
tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+8];
tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+8];
}
}
if (tgx % 8 == 0) {
if (tgx % 8 == 0) {
tempBuffer[bufferIndex] += tempBuffer[bufferIndex+12];
tempBuffer[bufferIndex] += tempBuffer[bufferIndex+12];
tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+13];
tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+13];
tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+14];
tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+14];
}
}
if (tgx % 16 == 0) {
if (tgx % 16 == 0) {
tempBuffer[bufferIndex] += tempBuffer[bufferIndex+24];
tempBuffer[bufferIndex] += tempBuffer[bufferIndex+24];
tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+25];
tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+25];
tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+26];
tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+26];
}
}
if (tgx == 0) {
if (tgx == 0) {
localData[tbx+j].fx += tempBuffer[bufferIndex] + tempBuffer[bufferIndex+48];
localData[tbx+j].fx += tempBuffer[bufferIndex] + tempBuffer[bufferIndex+48];
localData[tbx+j].fy += tempBuffer[bufferIndex+1] + tempBuffer[bufferIndex+49];
localData[tbx+j].fy += tempBuffer[bufferIndex+1] + tempBuffer[bufferIndex+49];
localData[tbx+j].fz += tempBuffer[bufferIndex+2] + tempBuffer[bufferIndex+50];
localData[tbx+j].fz += tempBuffer[bufferIndex+2] + tempBuffer[bufferIndex+50];
}
}
}
}
}
}
}
}
}
}
else
else
#endif
#endif
{
{
// Compute the full set of interactions in this tile.
// Compute the full set of interactions in this tile.
#ifdef USE_EXCLUSIONS
#ifdef USE_EXCLUSIONS
unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF);
unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF);
excl = (excl >> tgx) |
(
excl
<<
(
TILE_SIZE
-
tgx
))
;
excl = (excl >> tgx) |
(
excl
<<
(
TILE_SIZE
-
tgx
))
;
#
endif
#
endif
unsigned
int
tj
=
tgx
;
unsigned
int
tj
=
tgx
;
for
(
unsigned
int
j
=
0
; j < TILE_SIZE; j++) {
for
(
unsigned
int
j
=
0
; j < TILE_SIZE; j++) {
#
ifdef
USE_EXCLUSIONS
#
ifdef
USE_EXCLUSIONS
bool
isExcluded
=
!
(
excl
&
0x1
)
;
bool
isExcluded
=
!
(
excl
&
0x1
)
;
#
endif
#
endif
int
atom2
=
tbx+tj
;
int
atom2
=
tbx+tj
;
float4
posq2
=
(
float4
)
(
localData[atom2].x,
localData[atom2].y,
localData[atom2].z,
localData[atom2].q
)
;
float4
posq2
=
(
float4
)
(
localData[atom2].x,
localData[atom2].y,
localData[atom2].z,
localData[atom2].q
)
;
float4
delta
=
(
float4
)
(
posq2.xyz
-
posq1.xyz,
0.0f
)
;
float4
delta
=
(
float4
)
(
posq2.xyz
-
posq1.xyz,
0.0f
)
;
#
ifdef
USE_PERIODIC
#
ifdef
USE_PERIODIC
delta.x
-=
floor
(
delta.x*invPeriodicBoxSize.x+0.5f
)
*periodicBoxSize.x
;
delta.x
-=
floor
(
delta.x*invPeriodicBoxSize.x+0.5f
)
*periodicBoxSize.x
;
delta.y
-=
floor
(
delta.y*invPeriodicBoxSize.y+0.5f
)
*periodicBoxSize.y
;
delta.y
-=
floor
(
delta.y*invPeriodicBoxSize.y+0.5f
)
*periodicBoxSize.y
;
delta.z
-=
floor
(
delta.z*invPeriodicBoxSize.z+0.5f
)
*periodicBoxSize.z
;
delta.z
-=
floor
(
delta.z*invPeriodicBoxSize.z+0.5f
)
*periodicBoxSize.z
;
#
endif
#
endif
float
r2
=
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
;
float
r2
=
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
;
float
invR
=
RSQRT
(
r2
)
;
float
invR
=
RSQRT
(
r2
)
;
float
r
=
RECIP
(
invR
)
;
float
r
=
RECIP
(
invR
)
;
LOAD_ATOM2_PARAMETERS
LOAD_ATOM2_PARAMETERS
atom2
=
y*TILE_SIZE+tj
;
atom2
=
y*TILE_SIZE+tj
;
#
ifdef
USE_SYMMETRIC
#
ifdef
USE_SYMMETRIC
float
dEdR
=
0.0f
;
float
dEdR
=
0.0f
;
#
else
#
else
float4
dEdR1
=
(
float4
)
0.0f
;
float4
dEdR1
=
(
float4
)
0.0f
;
float4
dEdR2
=
(
float4
)
0.0f
;
float4
dEdR2
=
(
float4
)
0.0f
;
#
endif
#
endif
float
tempEnergy
=
0.0f
;
float
tempEnergy
=
0.0f
;
COMPUTE_INTERACTION
COMPUTE_INTERACTION
energy
+=
tempEnergy
;
energy
+=
tempEnergy
;
#
ifdef
USE_SYMMETRIC
#
ifdef
USE_SYMMETRIC
delta.xyz
*=
dEdR
;
delta.xyz
*=
dEdR
;
force.xyz
-=
delta.xyz
;
force.xyz
-=
delta.xyz
;
localData[tbx+tj].fx
+=
delta.x
;
localData[tbx+tj].fx
+=
delta.x
;
localData[tbx+tj].fy
+=
delta.y
;
localData[tbx+tj].fy
+=
delta.y
;
localData[tbx+tj].fz
+=
delta.z
;
localData[tbx+tj].fz
+=
delta.z
;
#
else
#
else
force.xyz
-=
dEdR1.xyz
;
force.xyz
-=
dEdR1.xyz
;
localData[tbx+tj].fx
+=
dEdR2.x
;
localData[tbx+tj].fx
+=
dEdR2.x
;
localData[tbx+tj].fy
+=
dEdR2.y
;
localData[tbx+tj].fy
+=
dEdR2.y
;
localData[tbx+tj].fz
+=
dEdR2.z
;
localData[tbx+tj].fz
+=
dEdR2.z
;
#
endif
#
endif
#
ifdef
USE_EXCLUSIONS
#
ifdef
USE_EXCLUSIONS
excl
>>=
1
;
excl
>>=
1
;
#
endif
#
endif
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
)
;
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
)
;
}
}
}
}
}
}
//
Write
results.
We
need
to
coordinate
between
warps
to
make
sure
no
two
of
them
//
ever
try
to
write
to
the
same
piece
of
memory
at
the
same
time.
int
writeX
=
(
pos
<
end
?
x
:
-1
)
;
int
writeY
=
(
pos
<
end
&&
x
!=
y
?
y
:
-1
)
;
if
(
tgx
==
0
)
reservedBlocks[localGroupIndex]
=
(
int2
)(
writeX,
writeY
)
;
bool
done
=
false
;
int
doneIndex
=
0
;
int
checkIndex
=
0
;
while
(
true
)
{
//
See
if
any
warp
still
needs
to
write
its
data.
//
Write
results
bool
allDone
=
true
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
while
(
doneIndex
<
WARPS_PER_GROUP
&&
allDone
)
{
if
(
reservedBlocks[doneIndex].x
!=
-1
)
allDone
=
false
;
else
doneIndex++
;
}
if
(
allDone
)
break
;
if
(
!done
)
{
//
See
whether
this
warp
can
write
its
data.
This
requires
that
no
previous
warp
//
is
trying
to
write
to
the
same
block
of
the
buffer.
reserveBuffer
(
x,
forceBufferFlags
)
;
bool
canWrite
=
(
writeX
!=
-1
)
;
unsigned
int
offset1
=
x*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
while
(
checkIndex
<
localGroupIndex
&&
canWrite
)
{
forceBuffers[offset1].xyz
+=
force.xyz
;
if
((
reservedBlocks[checkIndex].x
==
x
|
| reservedBlocks[checkIndex].y == x) ||
releaseBuffer
(
x,
forceBufferFlags
)
;
(writeY != -1 && (reservedBlocks[checkIndex].x == y |
|
reservedBlocks[checkIndex].y
==
y
)))
reserveBuffer
(
y,
forceBufferFlags
)
;
canWrite
=
false
;
unsigned
int
offset2
=
y*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
else
forceBuffers[offset2]
+=
(
float4
)
(
localData[get_local_id
(
0
)
].fx,
localData[get_local_id
(
0
)
].fy,
localData[get_local_id
(
0
)
].fz,
0.0f
)
;
checkIndex++
;
releaseBuffer
(
y,
forceBufferFlags
)
;
}
if
(
canWrite
)
{
//
Write
the
data
to
global
memory,
then
mark
this
warp
as
done.
if
(
writeX
>
-1
)
{
const
unsigned
int
offset
=
x*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
forceBuffers[offset].xyz
+=
force.xyz
;
}
if
(
writeY
>
-1
)
{
const
unsigned
int
offset
=
y*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
forceBuffers[offset]
+=
(
float4
)
(
localData[get_local_id
(
0
)
].fx,
localData[get_local_id
(
0
)
].fy,
localData[get_local_id
(
0
)
].fz,
0.0f
)
;
}
done
=
true
;
if
(
tgx
==
0
)
reservedBlocks[localGroupIndex]
=
(
int2
)(
-1
,
-1
)
;
}
}
}
}
lasty
=
y
;
lasty
=
y
;
pos++
;
pos++
;
}
}
while
(
pos
<
end
)
;
energyBuffer[get_global_id
(
0
)
]
+=
energy
;
energyBuffer[get_global_id
(
0
)
]
+=
energy
;
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment