Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
7943a339
Commit
7943a339
authored
May 09, 2011
by
Peter Eastman
Browse files
Restructured the use of force buffers in a new way that hopefully really works everywhere.
parent
13ef0ee8
Changes
15
Show whitespace changes
Inline
Side-by-side
Showing
15 changed files
with
1067 additions
and
961 deletions
+1067
-961
platforms/opencl/src/OpenCLKernels.cpp
platforms/opencl/src/OpenCLKernels.cpp
+16
-17
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
+8
-17
platforms/opencl/src/OpenCLNonbondedUtilities.h
platforms/opencl/src/OpenCLNonbondedUtilities.h
+0
-7
platforms/opencl/src/kernels/customGBEnergyN2_cpu.cl
platforms/opencl/src/kernels/customGBEnergyN2_cpu.cl
+1
-1
platforms/opencl/src/kernels/customGBEnergyN2_default.cl
platforms/opencl/src/kernels/customGBEnergyN2_default.cl
+1
-1
platforms/opencl/src/kernels/customGBEnergyN2_nvidia.cl
platforms/opencl/src/kernels/customGBEnergyN2_nvidia.cl
+170
-149
platforms/opencl/src/kernels/customGBValueN2_cpu.cl
platforms/opencl/src/kernels/customGBValueN2_cpu.cl
+1
-1
platforms/opencl/src/kernels/customGBValueN2_default.cl
platforms/opencl/src/kernels/customGBValueN2_default.cl
+1
-1
platforms/opencl/src/kernels/customGBValueN2_nvidia.cl
platforms/opencl/src/kernels/customGBValueN2_nvidia.cl
+196
-175
platforms/opencl/src/kernels/gbsaObc_cpu.cl
platforms/opencl/src/kernels/gbsaObc_cpu.cl
+2
-2
platforms/opencl/src/kernels/gbsaObc_default.cl
platforms/opencl/src/kernels/gbsaObc_default.cl
+2
-2
platforms/opencl/src/kernels/gbsaObc_nvidia.cl
platforms/opencl/src/kernels/gbsaObc_nvidia.cl
+430
-372
platforms/opencl/src/kernels/nonbonded_cpu.cl
platforms/opencl/src/kernels/nonbonded_cpu.cl
+1
-1
platforms/opencl/src/kernels/nonbonded_default.cl
platforms/opencl/src/kernels/nonbonded_default.cl
+1
-1
platforms/opencl/src/kernels/nonbonded_nvidia.cl
platforms/opencl/src/kernels/nonbonded_nvidia.cl
+237
-214
No files found.
platforms/opencl/src/OpenCLKernels.cpp
View file @
7943a339
...
@@ -1738,6 +1738,8 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
...
@@ -1738,6 +1738,8 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
defines
[
"NUM_ATOMS"
]
=
intToString
(
cl
.
getNumAtoms
());
defines
[
"NUM_ATOMS"
]
=
intToString
(
cl
.
getNumAtoms
());
defines
[
"PADDED_NUM_ATOMS"
]
=
intToString
(
cl
.
getPaddedNumAtoms
());
defines
[
"PADDED_NUM_ATOMS"
]
=
intToString
(
cl
.
getPaddedNumAtoms
());
defines
[
"NUM_BLOCKS"
]
=
OpenCLExpressionUtilities
::
intToString
(
cl
.
getNumAtomBlocks
());
defines
[
"NUM_BLOCKS"
]
=
OpenCLExpressionUtilities
::
intToString
(
cl
.
getNumAtomBlocks
());
if
(
cl
.
getSIMDWidth
()
==
32
)
defines
[
"WARPS_PER_GROUP"
]
=
OpenCLExpressionUtilities
::
intToString
(
cl
.
getNonbondedUtilities
().
getForceThreadBlockSize
()
/
OpenCLContext
::
TileSize
);
string
file
;
string
file
;
if
(
deviceIsCpu
)
if
(
deviceIsCpu
)
file
=
OpenCLKernelSources
::
gbsaObc_cpu
;
file
=
OpenCLKernelSources
::
gbsaObc_cpu
;
...
@@ -1753,7 +1755,6 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
...
@@ -1753,7 +1755,6 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
params
->
getDeviceBuffer
());
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
params
->
getDeviceBuffer
());
computeBornSumKernel
.
setArg
(
index
++
,
(
deviceIsCpu
?
OpenCLContext
::
TileSize
:
nb
.
getForceThreadBlockSize
())
*
13
*
sizeof
(
cl_float
),
NULL
);
computeBornSumKernel
.
setArg
(
index
++
,
(
deviceIsCpu
?
OpenCLContext
::
TileSize
:
nb
.
getForceThreadBlockSize
())
*
13
*
sizeof
(
cl_float
),
NULL
);
computeBornSumKernel
.
setArg
(
index
++
,
(
deviceIsCpu
?
1
:
nb
.
getForceThreadBlockSize
())
*
sizeof
(
cl_float
),
NULL
);
computeBornSumKernel
.
setArg
(
index
++
,
(
deviceIsCpu
?
1
:
nb
.
getForceThreadBlockSize
())
*
sizeof
(
cl_float
),
NULL
);
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getForceBufferFlags
().
getDeviceBuffer
());
if
(
nb
.
getUseCutoff
())
{
if
(
nb
.
getUseCutoff
())
{
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractingTiles
().
getDeviceBuffer
());
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractingTiles
().
getDeviceBuffer
());
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractionCount
().
getDeviceBuffer
());
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractionCount
().
getDeviceBuffer
());
...
@@ -1773,7 +1774,6 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
...
@@ -1773,7 +1774,6 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
bornForce
->
getDeviceBuffer
());
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
bornForce
->
getDeviceBuffer
());
force1Kernel
.
setArg
(
index
++
,
(
deviceIsCpu
?
OpenCLContext
::
TileSize
:
nb
.
getForceThreadBlockSize
())
*
13
*
sizeof
(
cl_float
),
NULL
);
force1Kernel
.
setArg
(
index
++
,
(
deviceIsCpu
?
OpenCLContext
::
TileSize
:
nb
.
getForceThreadBlockSize
())
*
13
*
sizeof
(
cl_float
),
NULL
);
force1Kernel
.
setArg
(
index
++
,
(
deviceIsCpu
?
1
:
nb
.
getForceThreadBlockSize
())
*
sizeof
(
mm_float4
),
NULL
);
force1Kernel
.
setArg
(
index
++
,
(
deviceIsCpu
?
1
:
nb
.
getForceThreadBlockSize
())
*
sizeof
(
mm_float4
),
NULL
);
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getForceBufferFlags
().
getDeviceBuffer
());
if
(
nb
.
getUseCutoff
())
{
if
(
nb
.
getUseCutoff
())
{
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractingTiles
().
getDeviceBuffer
());
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractingTiles
().
getDeviceBuffer
());
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractionCount
().
getDeviceBuffer
());
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractionCount
().
getDeviceBuffer
());
...
@@ -1805,14 +1805,14 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
...
@@ -1805,14 +1805,14 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
reduceBornForceKernel
.
setArg
<
cl
::
Buffer
>
(
6
,
obcChain
->
getDeviceBuffer
());
reduceBornForceKernel
.
setArg
<
cl
::
Buffer
>
(
6
,
obcChain
->
getDeviceBuffer
());
}
}
if
(
nb
.
getUseCutoff
())
{
if
(
nb
.
getUseCutoff
())
{
computeBornSumKernel
.
setArg
<
mm_float4
>
(
8
,
cl
.
getPeriodicBoxSize
());
computeBornSumKernel
.
setArg
<
mm_float4
>
(
7
,
cl
.
getPeriodicBoxSize
());
computeBornSumKernel
.
setArg
<
mm_float4
>
(
9
,
cl
.
getInvPeriodicBoxSize
());
computeBornSumKernel
.
setArg
<
mm_float4
>
(
8
,
cl
.
getInvPeriodicBoxSize
());
force1Kernel
.
setArg
<
mm_float4
>
(
10
,
cl
.
getPeriodicBoxSize
());
force1Kernel
.
setArg
<
mm_float4
>
(
9
,
cl
.
getPeriodicBoxSize
());
force1Kernel
.
setArg
<
mm_float4
>
(
1
1
,
cl
.
getInvPeriodicBoxSize
());
force1Kernel
.
setArg
<
mm_float4
>
(
1
0
,
cl
.
getInvPeriodicBoxSize
());
if
(
maxTiles
<
nb
.
getInteractingTiles
().
getSize
())
{
if
(
maxTiles
<
nb
.
getInteractingTiles
().
getSize
())
{
maxTiles
=
nb
.
getInteractingTiles
().
getSize
();
maxTiles
=
nb
.
getInteractingTiles
().
getSize
();
computeBornSumKernel
.
setArg
<
cl_uint
>
(
10
,
maxTiles
);
computeBornSumKernel
.
setArg
<
cl_uint
>
(
10
,
maxTiles
);
force1Kernel
.
setArg
<
cl_uint
>
(
1
2
,
maxTiles
);
force1Kernel
.
setArg
<
cl_uint
>
(
1
1
,
maxTiles
);
}
}
}
}
cl
.
executeKernel
(
computeBornSumKernel
,
nb
.
getNumForceThreadBlocks
()
*
nb
.
getForceThreadBlockSize
(),
nb
.
getForceThreadBlockSize
());
cl
.
executeKernel
(
computeBornSumKernel
,
nb
.
getNumForceThreadBlocks
()
*
nb
.
getForceThreadBlockSize
(),
nb
.
getForceThreadBlockSize
());
...
@@ -2148,7 +2148,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
...
@@ -2148,7 +2148,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
}
}
map
<
string
,
string
>
replacements
;
map
<
string
,
string
>
replacements
;
replacements
[
"COMPUTE_INTERACTION"
]
=
n2EnergySource
.
str
();
replacements
[
"COMPUTE_INTERACTION"
]
=
n2EnergySource
.
str
();
stringstream
extraArgs
,
loadLocal1
,
loadLocal2
,
clearLocal
,
load1
,
load2
,
recordDeriv
,
storeDerivs1
,
storeDerivs2
,
declareTemps
,
setTemps
;
stringstream
extraArgs
,
loadLocal1
,
loadLocal2
,
clearLocal
,
load1
,
load2
,
declare1
,
recordDeriv
,
storeDerivs1
,
storeDerivs2
,
declareTemps
,
setTemps
;
if
(
force
.
getNumGlobalParameters
()
>
0
)
if
(
force
.
getNumGlobalParameters
()
>
0
)
extraArgs
<<
", __constant float* globals"
;
extraArgs
<<
", __constant float* globals"
;
for
(
int
i
=
0
;
i
<
(
int
)
params
->
getBuffers
().
size
();
i
++
)
{
for
(
int
i
=
0
;
i
<
(
int
)
params
->
getBuffers
().
size
();
i
++
)
{
...
@@ -2174,7 +2174,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
...
@@ -2174,7 +2174,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
string
index
=
intToString
(
i
+
1
);
string
index
=
intToString
(
i
+
1
);
extraArgs
<<
", __global "
<<
buffer
.
getType
()
<<
"* derivBuffers"
<<
index
<<
", __local "
<<
buffer
.
getType
()
<<
"* local_deriv"
<<
index
;
extraArgs
<<
", __global "
<<
buffer
.
getType
()
<<
"* derivBuffers"
<<
index
<<
", __local "
<<
buffer
.
getType
()
<<
"* local_deriv"
<<
index
;
clearLocal
<<
"local_deriv"
<<
index
<<
"[localAtomIndex] = 0.0f;
\n
"
;
clearLocal
<<
"local_deriv"
<<
index
<<
"[localAtomIndex] = 0.0f;
\n
"
;
load
1
<<
buffer
.
getType
()
<<
" deriv"
<<
index
<<
"_1 = 0.0f;
\n
"
;
declare
1
<<
buffer
.
getType
()
<<
" deriv"
<<
index
<<
"_1 = 0.0f;
\n
"
;
load2
<<
buffer
.
getType
()
<<
" deriv"
<<
index
<<
"_2 = 0.0f;
\n
"
;
load2
<<
buffer
.
getType
()
<<
" deriv"
<<
index
<<
"_2 = 0.0f;
\n
"
;
recordDeriv
<<
"local_deriv"
<<
index
<<
"[atom2] += deriv"
<<
index
<<
"_2;
\n
"
;
recordDeriv
<<
"local_deriv"
<<
index
<<
"[atom2] += deriv"
<<
index
<<
"_2;
\n
"
;
storeDerivs1
<<
"STORE_DERIVATIVE_1("
<<
index
<<
")"
;
storeDerivs1
<<
"STORE_DERIVATIVE_1("
<<
index
<<
")"
;
...
@@ -2188,6 +2188,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
...
@@ -2188,6 +2188,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
replacements
[
"CLEAR_LOCAL_DERIVATIVES"
]
=
clearLocal
.
str
();
replacements
[
"CLEAR_LOCAL_DERIVATIVES"
]
=
clearLocal
.
str
();
replacements
[
"LOAD_ATOM1_PARAMETERS"
]
=
load1
.
str
();
replacements
[
"LOAD_ATOM1_PARAMETERS"
]
=
load1
.
str
();
replacements
[
"LOAD_ATOM2_PARAMETERS"
]
=
load2
.
str
();
replacements
[
"LOAD_ATOM2_PARAMETERS"
]
=
load2
.
str
();
replacements
[
"DECLARE_ATOM1_DERIVATIVES"
]
=
declare1
.
str
();
replacements
[
"RECORD_DERIVATIVE_2"
]
=
recordDeriv
.
str
();
replacements
[
"RECORD_DERIVATIVE_2"
]
=
recordDeriv
.
str
();
replacements
[
"STORE_DERIVATIVES_1"
]
=
storeDerivs1
.
str
();
replacements
[
"STORE_DERIVATIVES_1"
]
=
storeDerivs1
.
str
();
replacements
[
"STORE_DERIVATIVES_2"
]
=
storeDerivs2
.
str
();
replacements
[
"STORE_DERIVATIVES_2"
]
=
storeDerivs2
.
str
();
...
@@ -2482,7 +2483,6 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
...
@@ -2482,7 +2483,6 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
pairValueKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
valueBuffers
->
getDeviceBuffer
());
pairValueKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
valueBuffers
->
getDeviceBuffer
());
pairValueKernel
.
setArg
(
index
++
,
(
deviceIsCpu
?
OpenCLContext
::
TileSize
:
nb
.
getForceThreadBlockSize
())
*
sizeof
(
cl_float
),
NULL
);
pairValueKernel
.
setArg
(
index
++
,
(
deviceIsCpu
?
OpenCLContext
::
TileSize
:
nb
.
getForceThreadBlockSize
())
*
sizeof
(
cl_float
),
NULL
);
pairValueKernel
.
setArg
(
index
++
,
(
deviceIsCpu
?
OpenCLContext
::
TileSize
:
nb
.
getForceThreadBlockSize
())
*
sizeof
(
cl_float
),
NULL
);
pairValueKernel
.
setArg
(
index
++
,
(
deviceIsCpu
?
OpenCLContext
::
TileSize
:
nb
.
getForceThreadBlockSize
())
*
sizeof
(
cl_float
),
NULL
);
pairValueKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getForceBufferFlags
().
getDeviceBuffer
());
if
(
nb
.
getUseCutoff
())
{
if
(
nb
.
getUseCutoff
())
{
pairValueKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractingTiles
().
getDeviceBuffer
());
pairValueKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractingTiles
().
getDeviceBuffer
());
pairValueKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractionCount
().
getDeviceBuffer
());
pairValueKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractionCount
().
getDeviceBuffer
());
...
@@ -2531,7 +2531,6 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
...
@@ -2531,7 +2531,6 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
pairEnergyKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
cl
.
getNonbondedUtilities
().
getExclusionIndices
().
getDeviceBuffer
());
pairEnergyKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
cl
.
getNonbondedUtilities
().
getExclusionIndices
().
getDeviceBuffer
());
pairEnergyKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
cl
.
getNonbondedUtilities
().
getExclusionRowIndices
().
getDeviceBuffer
());
pairEnergyKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
cl
.
getNonbondedUtilities
().
getExclusionRowIndices
().
getDeviceBuffer
());
pairEnergyKernel
.
setArg
(
index
++
,
(
deviceIsCpu
?
OpenCLContext
::
TileSize
:
nb
.
getForceThreadBlockSize
())
*
sizeof
(
cl_float4
),
NULL
);
pairEnergyKernel
.
setArg
(
index
++
,
(
deviceIsCpu
?
OpenCLContext
::
TileSize
:
nb
.
getForceThreadBlockSize
())
*
sizeof
(
cl_float4
),
NULL
);
pairEnergyKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getForceBufferFlags
().
getDeviceBuffer
());
if
(
nb
.
getUseCutoff
())
{
if
(
nb
.
getUseCutoff
())
{
pairEnergyKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractingTiles
().
getDeviceBuffer
());
pairEnergyKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractingTiles
().
getDeviceBuffer
());
pairEnergyKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractionCount
().
getDeviceBuffer
());
pairEnergyKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractionCount
().
getDeviceBuffer
());
...
@@ -2609,14 +2608,14 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
...
@@ -2609,14 +2608,14 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
globals
->
upload
(
globalParamValues
);
globals
->
upload
(
globalParamValues
);
}
}
if
(
nb
.
getUseCutoff
())
{
if
(
nb
.
getUseCutoff
())
{
pairValueKernel
.
setArg
<
mm_float4
>
(
1
1
,
cl
.
getPeriodicBoxSize
());
pairValueKernel
.
setArg
<
mm_float4
>
(
1
0
,
cl
.
getPeriodicBoxSize
());
pairValueKernel
.
setArg
<
mm_float4
>
(
1
2
,
cl
.
getInvPeriodicBoxSize
());
pairValueKernel
.
setArg
<
mm_float4
>
(
1
1
,
cl
.
getInvPeriodicBoxSize
());
pairEnergyKernel
.
setArg
<
mm_float4
>
(
1
2
,
cl
.
getPeriodicBoxSize
());
pairEnergyKernel
.
setArg
<
mm_float4
>
(
1
1
,
cl
.
getPeriodicBoxSize
());
pairEnergyKernel
.
setArg
<
mm_float4
>
(
1
3
,
cl
.
getInvPeriodicBoxSize
());
pairEnergyKernel
.
setArg
<
mm_float4
>
(
1
2
,
cl
.
getInvPeriodicBoxSize
());
if
(
maxTiles
<
nb
.
getInteractingTiles
().
getSize
())
{
if
(
maxTiles
<
nb
.
getInteractingTiles
().
getSize
())
{
maxTiles
=
nb
.
getInteractingTiles
().
getSize
();
maxTiles
=
nb
.
getInteractingTiles
().
getSize
();
pairValueKernel
.
setArg
<
cl_uint
>
(
1
3
,
maxTiles
);
pairValueKernel
.
setArg
<
cl_uint
>
(
1
2
,
maxTiles
);
pairEnergyKernel
.
setArg
<
cl_uint
>
(
1
4
,
maxTiles
);
pairEnergyKernel
.
setArg
<
cl_uint
>
(
1
3
,
maxTiles
);
}
}
}
}
cl
.
executeKernel
(
pairValueKernel
,
nb
.
getNumForceThreadBlocks
()
*
nb
.
getForceThreadBlockSize
(),
nb
.
getForceThreadBlockSize
());
cl
.
executeKernel
(
pairValueKernel
,
nb
.
getNumForceThreadBlocks
()
*
nb
.
getForceThreadBlockSize
(),
nb
.
getForceThreadBlockSize
());
...
...
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
View file @
7943a339
...
@@ -37,7 +37,7 @@ using namespace std;
...
@@ -37,7 +37,7 @@ using namespace std;
OpenCLNonbondedUtilities
::
OpenCLNonbondedUtilities
(
OpenCLContext
&
context
)
:
context
(
context
),
cutoff
(
-
1.0
),
useCutoff
(
false
),
OpenCLNonbondedUtilities
::
OpenCLNonbondedUtilities
(
OpenCLContext
&
context
)
:
context
(
context
),
cutoff
(
-
1.0
),
useCutoff
(
false
),
numForceBuffers
(
0
),
exclusionIndices
(
NULL
),
exclusionRowIndices
(
NULL
),
exclusions
(
NULL
),
interactingTiles
(
NULL
),
interactionFlags
(
NULL
),
numForceBuffers
(
0
),
exclusionIndices
(
NULL
),
exclusionRowIndices
(
NULL
),
exclusions
(
NULL
),
interactingTiles
(
NULL
),
interactionFlags
(
NULL
),
interactionCount
(
NULL
),
blockCenter
(
NULL
),
blockBoundingBox
(
NULL
)
,
forceBufferFlags
(
NULL
)
{
interactionCount
(
NULL
),
blockCenter
(
NULL
),
blockBoundingBox
(
NULL
)
{
// Decide how many thread blocks and force buffers to use.
// Decide how many thread blocks and force buffers to use.
deviceIsCpu
=
(
context
.
getDevice
().
getInfo
<
CL_DEVICE_TYPE
>
()
==
CL_DEVICE_TYPE_CPU
);
deviceIsCpu
=
(
context
.
getDevice
().
getInfo
<
CL_DEVICE_TYPE
>
()
==
CL_DEVICE_TYPE_CPU
);
...
@@ -48,8 +48,8 @@ OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : con
...
@@ -48,8 +48,8 @@ OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : con
numForceBuffers
=
numForceThreadBlocks
;
numForceBuffers
=
numForceThreadBlocks
;
}
}
else
if
(
context
.
getSIMDWidth
()
==
32
)
{
else
if
(
context
.
getSIMDWidth
()
==
32
)
{
numForceThreadBlocks
=
2
*
context
.
getDevice
().
getInfo
<
CL_DEVICE_MAX_COMPUTE_UNITS
>
();
numForceThreadBlocks
=
4
*
context
.
getDevice
().
getInfo
<
CL_DEVICE_MAX_COMPUTE_UNITS
>
();
forceThreadBlockSize
=
256
;
forceThreadBlockSize
=
128
;
numForceBuffers
=
numForceThreadBlocks
;
numForceBuffers
=
numForceThreadBlocks
;
}
}
else
{
else
{
...
@@ -82,8 +82,6 @@ OpenCLNonbondedUtilities::~OpenCLNonbondedUtilities() {
...
@@ -82,8 +82,6 @@ OpenCLNonbondedUtilities::~OpenCLNonbondedUtilities() {
delete
blockCenter
;
delete
blockCenter
;
if
(
blockBoundingBox
!=
NULL
)
if
(
blockBoundingBox
!=
NULL
)
delete
blockBoundingBox
;
delete
blockBoundingBox
;
if
(
forceBufferFlags
!=
NULL
)
delete
forceBufferFlags
;
}
}
void
OpenCLNonbondedUtilities
::
addInteraction
(
bool
usesCutoff
,
bool
usesPeriodic
,
bool
usesExclusions
,
double
cutoffDistance
,
const
vector
<
vector
<
int
>
>&
exclusionList
,
const
string
&
kernel
)
{
void
OpenCLNonbondedUtilities
::
addInteraction
(
bool
usesCutoff
,
bool
usesPeriodic
,
bool
usesExclusions
,
double
cutoffDistance
,
const
vector
<
vector
<
int
>
>&
exclusionList
,
const
string
&
kernel
)
{
...
@@ -239,12 +237,6 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
...
@@ -239,12 +237,6 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
interactionCount
->
upload
();
interactionCount
->
upload
();
}
}
// Create the flags for reserving force buffers.
forceBufferFlags
=
new
OpenCLArray
<
cl_uint
>
(
context
,
numAtomBlocks
*
numForceThreadBlocks
,
"forceBufferFlags"
,
false
);
vector
<
cl_uint
>
forceBufferFlagsVec
(
forceBufferFlags
->
getSize
(),
0
);
forceBufferFlags
->
upload
(
forceBufferFlagsVec
);
// Create kernels.
// Create kernels.
forceKernel
=
createInteractionKernel
(
kernelSource
,
parameters
,
arguments
,
true
,
true
);
forceKernel
=
createInteractionKernel
(
kernelSource
,
parameters
,
arguments
,
true
,
true
);
...
@@ -320,8 +312,8 @@ void OpenCLNonbondedUtilities::prepareInteractions() {
...
@@ -320,8 +312,8 @@ void OpenCLNonbondedUtilities::prepareInteractions() {
void
OpenCLNonbondedUtilities
::
computeInteractions
()
{
void
OpenCLNonbondedUtilities
::
computeInteractions
()
{
if
(
cutoff
!=
-
1.0
)
{
if
(
cutoff
!=
-
1.0
)
{
if
(
useCutoff
)
{
if
(
useCutoff
)
{
forceKernel
.
setArg
<
mm_float4
>
(
1
3
,
context
.
getPeriodicBoxSize
());
forceKernel
.
setArg
<
mm_float4
>
(
1
2
,
context
.
getPeriodicBoxSize
());
forceKernel
.
setArg
<
mm_float4
>
(
1
4
,
context
.
getInvPeriodicBoxSize
());
forceKernel
.
setArg
<
mm_float4
>
(
1
3
,
context
.
getInvPeriodicBoxSize
());
}
}
context
.
executeKernel
(
forceKernel
,
numForceThreadBlocks
*
forceThreadBlockSize
,
forceThreadBlockSize
);
context
.
executeKernel
(
forceKernel
,
numForceThreadBlocks
*
forceThreadBlockSize
,
forceThreadBlockSize
);
}
}
...
@@ -343,14 +335,14 @@ void OpenCLNonbondedUtilities::updateNeighborListSize() {
...
@@ -343,14 +335,14 @@ void OpenCLNonbondedUtilities::updateNeighborListSize() {
newSize
=
numTiles
;
newSize
=
numTiles
;
delete
interactingTiles
;
delete
interactingTiles
;
interactingTiles
=
new
OpenCLArray
<
mm_ushort2
>
(
context
,
newSize
,
"interactingTiles"
);
interactingTiles
=
new
OpenCLArray
<
mm_ushort2
>
(
context
,
newSize
,
"interactingTiles"
);
forceKernel
.
setArg
<
cl
::
Buffer
>
(
1
1
,
interactingTiles
->
getDeviceBuffer
());
forceKernel
.
setArg
<
cl
::
Buffer
>
(
1
0
,
interactingTiles
->
getDeviceBuffer
());
forceKernel
.
setArg
<
cl_uint
>
(
1
5
,
newSize
);
forceKernel
.
setArg
<
cl_uint
>
(
1
4
,
newSize
);
findInteractingBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
6
,
interactingTiles
->
getDeviceBuffer
());
findInteractingBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
6
,
interactingTiles
->
getDeviceBuffer
());
findInteractingBlocksKernel
.
setArg
<
cl_uint
>
(
9
,
newSize
);
findInteractingBlocksKernel
.
setArg
<
cl_uint
>
(
9
,
newSize
);
if
(
context
.
getSIMDWidth
()
==
32
||
deviceIsCpu
)
{
if
(
context
.
getSIMDWidth
()
==
32
||
deviceIsCpu
)
{
delete
interactionFlags
;
delete
interactionFlags
;
interactionFlags
=
new
OpenCLArray
<
cl_uint
>
(
context
,
deviceIsCpu
?
2
*
newSize
:
newSize
,
"interactionFlags"
);
interactionFlags
=
new
OpenCLArray
<
cl_uint
>
(
context
,
deviceIsCpu
?
2
*
newSize
:
newSize
,
"interactionFlags"
);
forceKernel
.
setArg
<
cl
::
Buffer
>
(
1
6
,
interactionFlags
->
getDeviceBuffer
());
forceKernel
.
setArg
<
cl
::
Buffer
>
(
1
5
,
interactionFlags
->
getDeviceBuffer
());
findInteractingBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
7
,
interactionFlags
->
getDeviceBuffer
());
findInteractingBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
7
,
interactionFlags
->
getDeviceBuffer
());
findInteractionsWithinBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
4
,
interactingTiles
->
getDeviceBuffer
());
findInteractionsWithinBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
4
,
interactingTiles
->
getDeviceBuffer
());
findInteractionsWithinBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
7
,
interactionFlags
->
getDeviceBuffer
());
findInteractionsWithinBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
7
,
interactionFlags
->
getDeviceBuffer
());
...
@@ -503,7 +495,6 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
...
@@ -503,7 +495,6 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
kernel
.
setArg
(
index
++
,
4
*
forceThreadBlockSize
*
sizeof
(
cl_float
),
NULL
);
kernel
.
setArg
(
index
++
,
4
*
forceThreadBlockSize
*
sizeof
(
cl_float
),
NULL
);
kernel
.
setArg
<
cl_uint
>
(
index
++
,
startTileIndex
);
kernel
.
setArg
<
cl_uint
>
(
index
++
,
startTileIndex
);
kernel
.
setArg
<
cl_uint
>
(
index
++
,
startTileIndex
+
numTiles
);
kernel
.
setArg
<
cl_uint
>
(
index
++
,
startTileIndex
+
numTiles
);
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
forceBufferFlags
->
getDeviceBuffer
());
if
(
useCutoff
)
{
if
(
useCutoff
)
{
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
interactingTiles
->
getDeviceBuffer
());
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
interactingTiles
->
getDeviceBuffer
());
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
interactionCount
->
getDeviceBuffer
());
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
interactionCount
->
getDeviceBuffer
());
...
...
platforms/opencl/src/OpenCLNonbondedUtilities.h
View file @
7943a339
...
@@ -196,12 +196,6 @@ public:
...
@@ -196,12 +196,6 @@ public:
OpenCLArray
<
cl_uint
>&
getExclusionRowIndices
()
{
OpenCLArray
<
cl_uint
>&
getExclusionRowIndices
()
{
return
*
exclusionRowIndices
;
return
*
exclusionRowIndices
;
}
}
/**
* Get the array which contains flags for reserving force buffers.
*/
OpenCLArray
<
cl_uint
>&
getForceBufferFlags
()
{
return
*
forceBufferFlags
;
}
/**
/**
* Get the index of the first tile this context is responsible for processing.
* Get the index of the first tile this context is responsible for processing.
*/
*/
...
@@ -245,7 +239,6 @@ private:
...
@@ -245,7 +239,6 @@ private:
OpenCLArray
<
cl_uint
>*
interactionCount
;
OpenCLArray
<
cl_uint
>*
interactionCount
;
OpenCLArray
<
mm_float4
>*
blockCenter
;
OpenCLArray
<
mm_float4
>*
blockCenter
;
OpenCLArray
<
mm_float4
>*
blockBoundingBox
;
OpenCLArray
<
mm_float4
>*
blockBoundingBox
;
OpenCLArray
<
cl_uint
>*
forceBufferFlags
;
std
::
vector
<
std
::
vector
<
int
>
>
atomExclusions
;
std
::
vector
<
std
::
vector
<
int
>
>
atomExclusions
;
std
::
vector
<
ParameterInfo
>
parameters
;
std
::
vector
<
ParameterInfo
>
parameters
;
std
::
vector
<
ParameterInfo
>
arguments
;
std
::
vector
<
ParameterInfo
>
arguments
;
...
...
platforms/opencl/src/kernels/customGBEnergyN2_cpu.cl
View file @
7943a339
...
@@ -8,7 +8,7 @@
...
@@ -8,7 +8,7 @@
__kernel
void
computeN2Energy
(
__global
float4*
forceBuffers,
__global
float*
energyBuffer,
__local
float4*
local_force,
__kernel
void
computeN2Energy
(
__global
float4*
forceBuffers,
__global
float*
energyBuffer,
__local
float4*
local_force,
__global
float4*
posq,
__local
float4*
local_posq,
__global
unsigned
int*
exclusions,
__global
unsigned
int*
exclusionIndices,
__global
float4*
posq,
__local
float4*
local_posq,
__global
unsigned
int*
exclusions,
__global
unsigned
int*
exclusionIndices,
__global
unsigned
int*
exclusionRowIndices,
__local
float4*
tempBuffer,
__global
unsigned
int*
forceBufferFlags,
__global
unsigned
int*
exclusionRowIndices,
__local
float4*
tempBuffer,
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
unsigned
int*
interactionFlags
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
unsigned
int*
interactionFlags
#
else
#
else
...
...
platforms/opencl/src/kernels/customGBEnergyN2_default.cl
View file @
7943a339
...
@@ -9,7 +9,7 @@
...
@@ -9,7 +9,7 @@
__kernel
__attribute__
((
reqd_work_group_size
(
WORK_GROUP_SIZE,
1
,
1
)))
__kernel
__attribute__
((
reqd_work_group_size
(
WORK_GROUP_SIZE,
1
,
1
)))
void
computeN2Energy
(
__global
float4*
forceBuffers,
__global
float*
energyBuffer,
__local
float4*
local_force,
void
computeN2Energy
(
__global
float4*
forceBuffers,
__global
float*
energyBuffer,
__local
float4*
local_force,
__global
float4*
posq,
__local
float4*
local_posq,
__global
unsigned
int*
exclusions,
__global
unsigned
int*
exclusionIndices,
__global
float4*
posq,
__local
float4*
local_posq,
__global
unsigned
int*
exclusions,
__global
unsigned
int*
exclusionIndices,
__global
unsigned
int*
exclusionRowIndices,
__local
float4*
tempForceBuffer,
__global
unsigned
int*
forceBufferFlags,
__global
unsigned
int*
exclusionRowIndices,
__local
float4*
tempForceBuffer,
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles
#
else
#
else
...
...
platforms/opencl/src/kernels/customGBEnergyN2_nvidia.cl
View file @
7943a339
#
pragma
OPENCL
EXTENSION
cl_khr_global_int32_base_atomics
:
enable
#
pragma
OPENCL
EXTENSION
cl_khr_global_int32_base_atomics
:
enable
#
define
TILE_SIZE
32
#
define
TILE_SIZE
32
#
define
STORE_DERIVATIVE_1
(
INDEX
)
derivBuffers##INDEX[offset1]
+=
deriv##INDEX##_1
;
#
define
STORE_DERIVATIVE_1
(
INDEX
)
derivBuffers##INDEX[offset]
+=
deriv##INDEX##_1
;
#
define
STORE_DERIVATIVE_2
(
INDEX
)
derivBuffers##INDEX[offset2]
+=
local_deriv##INDEX[get_local_id
(
0
)
]
;
#
define
STORE_DERIVATIVE_2
(
INDEX
)
derivBuffers##INDEX[offset]
+=
local_deriv##INDEX[get_local_id
(
0
)
]
;
/**
*
Mark
that
a
block
in
the
force
buffer
is
in
use.
*/
void
reserveBuffer
(
unsigned
int
block,
__global
unsigned
int*
forceBufferFlags
)
{
if
((
get_local_id
(
0
)
&
(
TILE_SIZE-1
))
==
0
)
while
(
atom_cmpxchg
(
&forceBufferFlags[block+NUM_BLOCKS*get_group_id
(
0
)
],
0
,
1
)
!=
0
)
;
mem_fence
(
CLK_GLOBAL_MEM_FENCE
)
;
}
/**
*
Mark
that
a
block
in
the
force
buffer
is
no
longer
in
use.
*/
void
releaseBuffer
(
unsigned
int
block,
__global
unsigned
int*
forceBufferFlags
)
{
mem_fence
(
CLK_GLOBAL_MEM_FENCE
)
;
if
((
get_local_id
(
0
)
&
(
TILE_SIZE-1
))
==
0
)
forceBufferFlags[block+NUM_BLOCKS*get_group_id
(
0
)
]
=
0
;
}
/**
/**
*
Compute
a
force
based
on
pair
interactions.
*
Compute
a
force
based
on
pair
interactions.
*/
*/
__kernel
void
computeN2Energy
(
__global
float4*
forceBuffers,
__global
float*
energyBuffer,
__local
float4*
local_force,
__kernel
void
computeN2Energy
(
__global
float4*
forceBuffers,
__global
float*
energyBuffer,
__local
float4*
local_force,
__global
float4*
posq,
__local
float4*
local_posq,
__global
unsigned
int*
exclusions,
__global
unsigned
int*
exclusionIndices,
__global
float4*
posq,
__local
float4*
local_posq,
__global
unsigned
int*
exclusions,
__global
unsigned
int*
exclusionIndices,
__global
unsigned
int*
exclusionRowIndices,
__local
float4*
tempBuffer,
__global
unsigned
int*
forceBufferFlags,
__global
unsigned
int*
exclusionRowIndices,
__local
float4*
tempBuffer,
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
unsigned
int*
interactionFlags
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
unsigned
int*
interactionFlags
#
else
#
else
...
@@ -48,10 +29,17 @@ __kernel void computeN2Energy(__global float4* forceBuffers, __global float* ene
...
@@ -48,10 +29,17 @@ __kernel void computeN2Energy(__global float4* forceBuffers, __global float* ene
unsigned
int
lasty
=
0xFFFFFFFF
;
unsigned
int
lasty
=
0xFFFFFFFF
;
__local
unsigned
int
exclusionRange[2*WARPS_PER_GROUP]
;
__local
unsigned
int
exclusionRange[2*WARPS_PER_GROUP]
;
__local
int
exclusionIndex[WARPS_PER_GROUP]
;
__local
int
exclusionIndex[WARPS_PER_GROUP]
;
__local
int2*
reservedBlocks
=
(
__local
int2*
)
exclusionRange
;
while
(
pos
<
end
)
{
do
{
//
Extract
the
coordinates
of
this
tile
//
Extract
the
coordinates
of
this
tile
const
unsigned
int
tgx
=
get_local_id
(
0
)
&
(
TILE_SIZE-1
)
;
const
unsigned
int
tbx
=
get_local_id
(
0
)
-
tgx
;
const
unsigned
int
localGroupIndex
=
get_local_id
(
0
)
/TILE_SIZE
;
unsigned
int
x,
y
;
unsigned
int
x,
y
;
float4
force
=
0.0f
;
DECLARE_ATOM1_DERIVATIVES
if
(
pos
<
end
)
{
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
if
(
numTiles
<=
maxTiles
)
{
if
(
numTiles
<=
maxTiles
)
{
ushort2
tileIndices
=
tiles[pos]
;
ushort2
tileIndices
=
tiles[pos]
;
...
@@ -68,11 +56,7 @@ __kernel void computeN2Energy(__global float4* forceBuffers, __global float* ene
...
@@ -68,11 +56,7 @@ __kernel void computeN2Energy(__global float4* forceBuffers, __global float* ene
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
}
}
const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
const unsigned int tbx = get_local_id(0) - tgx;
const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
unsigned int atom1 = x*TILE_SIZE + tgx;
unsigned int atom1 = x*TILE_SIZE + tgx;
float4 force = 0.0f;
float4 posq1 = posq[atom1];
float4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
LOAD_ATOM1_PARAMETERS
...
@@ -90,7 +74,9 @@ __kernel void computeN2Energy(__global float4* forceBuffers, __global float* ene
...
@@ -90,7 +74,9 @@ __kernel void computeN2Energy(__global float4* forceBuffers, __global float* ene
#else
#else
bool hasExclusions = false;
bool hasExclusions = false;
#endif
#endif
if (x == y) {
if (pos >= end)
; // This warp is done.
else if (x == y) {
// This tile is on the diagonal.
// This tile is on the diagonal.
const unsigned int localAtomIndex = get_local_id(0);
const unsigned int localAtomIndex = get_local_id(0);
...
@@ -134,14 +120,6 @@ __kernel void computeN2Energy(__global float4* forceBuffers, __global float* ene
...
@@ -134,14 +120,6 @@ __kernel void computeN2Energy(__global float4* forceBuffers, __global float* ene
excl >>= 1;
excl >>= 1;
#endif
#endif
}
}
// Write results
reserveBuffer(x, forceBufferFlags);
unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset1].xyz += force.xyz;
STORE_DERIVATIVES_1
releaseBuffer(x, forceBufferFlags);
}
}
else {
else {
// This is an off-diagonal tile.
// This is an off-diagonal tile.
...
@@ -209,22 +187,65 @@ __kernel void computeN2Energy(__global float4* forceBuffers, __global float* ene
...
@@ -209,22 +187,65 @@ __kernel void computeN2Energy(__global float4* forceBuffers, __global float* ene
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
)
;
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
)
;
}
}
}
}
}
}
lasty
=
y
;
//
Write
results.
We
need
to
coordinate
between
warps
to
make
sure
no
two
of
them
//
ever
try
to
write
to
the
same
piece
of
memory
at
the
same
time.
int
writeX
=
(
pos
<
end
?
x
:
-1
)
;
int
writeY
=
(
pos
<
end
&&
x
!=
y
?
y
:
-1
)
;
if
(
tgx
==
0
)
reservedBlocks[localGroupIndex]
=
(
int2
)(
writeX,
writeY
)
;
bool
done
=
false
;
int
doneIndex
=
0
;
int
checkIndex
=
0
;
while
(
true
)
{
//
See
if
any
warp
still
needs
to
write
its
data.
bool
allDone
=
true
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
while
(
doneIndex
<
WARPS_PER_GROUP
&&
allDone
)
{
if
(
reservedBlocks[doneIndex].x
!=
-1
)
allDone
=
false
;
else
doneIndex++
;
}
if
(
allDone
)
break
;
if
(
!done
)
{
//
See
whether
this
warp
can
write
its
data.
This
requires
that
no
previous
warp
//
is
trying
to
write
to
the
same
block
of
the
buffer.
//
Write
results
bool
canWrite
=
(
writeX
!=
-1
)
;
while
(
checkIndex
<
localGroupIndex
&&
canWrite
)
{
if
((
reservedBlocks[checkIndex].x
==
x
|
| reservedBlocks[checkIndex].y == x) ||
(writeY != -1 && (reservedBlocks[checkIndex].x == y |
|
reservedBlocks[checkIndex].y
==
y
)))
canWrite
=
false
;
else
checkIndex++
;
}
if
(
canWrite
)
{
//
Write
the
data
to
global
memory,
then
mark
this
warp
as
done.
reserveBuffer
(
x,
forceBufferFlags
)
;
if
(
writeX
>
-1
)
{
unsigned
int
offset
1
=
x*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
const
unsigned
int
offset
=
x*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
forceBuffers[offset
1
].xyz
+=
force.xyz
;
forceBuffers[offset].xyz
+=
force.xyz
;
STORE_DERIVATIVES_1
STORE_DERIVATIVES_1
releaseBuffer
(
x,
forceBufferFlags
)
;
}
reserveBuffer
(
y,
forceBufferFlags
)
;
if
(
writeY
>
-1
)
{
unsigned
int
offset
2
=
y*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
const
unsigned
int
offset
=
y*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
forceBuffers[offset
2
].xyz
+=
local_force[get_local_id
(
0
)
].xyz
;
forceBuffers[offset].xyz
+=
local_force[get_local_id
(
0
)
].xyz
;
STORE_DERIVATIVES_2
STORE_DERIVATIVES_2
releaseBuffer
(
y,
forceBufferFlags
)
;
}
}
lasty
=
y
;
done
=
true
;
pos++
;
if
(
tgx
==
0
)
reservedBlocks[localGroupIndex]
=
(
int2
)(
-1
,
-1
)
;
}
}
}
}
pos++
;
}
while
(
pos
<
end
)
;
energyBuffer[get_global_id
(
0
)
]
+=
energy
;
energyBuffer[get_global_id
(
0
)
]
+=
energy
;
}
}
platforms/opencl/src/kernels/customGBValueN2_cpu.cl
View file @
7943a339
...
@@ -6,7 +6,7 @@
...
@@ -6,7 +6,7 @@
__kernel
void
computeN2Value
(
__global
float4*
posq,
__local
float4*
local_posq,
__global
unsigned
int*
exclusions,
__kernel
void
computeN2Value
(
__global
float4*
posq,
__local
float4*
local_posq,
__global
unsigned
int*
exclusions,
__global
unsigned
int*
exclusionIndices,
__global
unsigned
int*
exclusionRowIndices,
__global
float*
global_value,
__local
float*
local_value,
__global
unsigned
int*
exclusionIndices,
__global
unsigned
int*
exclusionRowIndices,
__global
float*
global_value,
__local
float*
local_value,
__local
float*
tempBuffer,
__global
unsigned
int*
forceBufferFlags,
__local
float*
tempBuffer,
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
unsigned
int*
interactionFlags
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
unsigned
int*
interactionFlags
#
else
#
else
...
...
platforms/opencl/src/kernels/customGBValueN2_default.cl
View file @
7943a339
...
@@ -7,7 +7,7 @@
...
@@ -7,7 +7,7 @@
__kernel
__attribute__
((
reqd_work_group_size
(
WORK_GROUP_SIZE,
1
,
1
)))
__kernel
__attribute__
((
reqd_work_group_size
(
WORK_GROUP_SIZE,
1
,
1
)))
void
computeN2Value
(
__global
float4*
posq,
__local
float4*
local_posq,
__global
unsigned
int*
exclusions,
void
computeN2Value
(
__global
float4*
posq,
__local
float4*
local_posq,
__global
unsigned
int*
exclusions,
__global
unsigned
int*
exclusionIndices,
__global
unsigned
int*
exclusionRowIndices,
__global
float*
global_value,
__local
float*
local_value,
__global
unsigned
int*
exclusionIndices,
__global
unsigned
int*
exclusionRowIndices,
__global
float*
global_value,
__local
float*
local_value,
__local
float*
tempBuffer,
__global
unsigned
int*
forceBufferFlags,
__local
float*
tempBuffer,
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles
#
else
#
else
...
...
platforms/opencl/src/kernels/customGBValueN2_nvidia.cl
View file @
7943a339
#
pragma
OPENCL
EXTENSION
cl_khr_global_int32_base_atomics
:
enable
#
pragma
OPENCL
EXTENSION
cl_khr_global_int32_base_atomics
:
enable
#
define
TILE_SIZE
32
#
define
TILE_SIZE
32
/**
*
Mark
that
a
block
in
the
value
buffer
is
in
use.
*/
void
reserveBuffer
(
unsigned
int
block,
__global
unsigned
int*
forceBufferFlags
)
{
if
((
get_local_id
(
0
)
&
(
TILE_SIZE-1
))
==
0
)
while
(
atom_cmpxchg
(
&forceBufferFlags[block+NUM_BLOCKS*get_group_id
(
0
)
],
0
,
1
)
!=
0
)
;
mem_fence
(
CLK_GLOBAL_MEM_FENCE
)
;
}
/**
*
Mark
that
a
block
in
the
value
buffer
is
no
longer
in
use.
*/
void
releaseBuffer
(
unsigned
int
block,
__global
unsigned
int*
forceBufferFlags
)
{
mem_fence
(
CLK_GLOBAL_MEM_FENCE
)
;
if
((
get_local_id
(
0
)
&
(
TILE_SIZE-1
))
==
0
)
forceBufferFlags[block+NUM_BLOCKS*get_group_id
(
0
)
]
=
0
;
}
/**
/**
*
Compute
a
value
based
on
pair
interactions.
*
Compute
a
value
based
on
pair
interactions.
*/
*/
__kernel
void
computeN2Value
(
__global
float4*
posq,
__local
float4*
local_posq,
__global
unsigned
int*
exclusions,
__kernel
void
computeN2Value
(
__global
float4*
posq,
__local
float4*
local_posq,
__global
unsigned
int*
exclusions,
__global
unsigned
int*
exclusionIndices,
__global
unsigned
int*
exclusionRowIndices,
__global
float*
global_value,
__local
float*
local_value,
__global
unsigned
int*
exclusionIndices,
__global
unsigned
int*
exclusionRowIndices,
__global
float*
global_value,
__local
float*
local_value,
__local
float*
tempBuffer,
__global
unsigned
int*
forceBufferFlags,
__local
float*
tempBuffer,
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
unsigned
int*
interactionFlags
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
unsigned
int*
interactionFlags
#
else
#
else
...
@@ -46,10 +27,16 @@ __kernel void computeN2Value(__global float4* posq, __local float4* local_posq,
...
@@ -46,10 +27,16 @@ __kernel void computeN2Value(__global float4* posq, __local float4* local_posq,
unsigned
int
lasty
=
0xFFFFFFFF
;
unsigned
int
lasty
=
0xFFFFFFFF
;
__local
unsigned
int
exclusionRange[2*WARPS_PER_GROUP]
;
__local
unsigned
int
exclusionRange[2*WARPS_PER_GROUP]
;
__local
int
exclusionIndex[WARPS_PER_GROUP]
;
__local
int
exclusionIndex[WARPS_PER_GROUP]
;
__local
int2*
reservedBlocks
=
(
__local
int2*
)
exclusionRange
;
while
(
pos
<
end
)
{
do
{
//
Extract
the
coordinates
of
this
tile
//
Extract
the
coordinates
of
this
tile
const
unsigned
int
tgx
=
get_local_id
(
0
)
&
(
TILE_SIZE-1
)
;
const
unsigned
int
tbx
=
get_local_id
(
0
)
-
tgx
;
const
unsigned
int
localGroupIndex
=
get_local_id
(
0
)
/TILE_SIZE
;
unsigned
int
x,
y
;
unsigned
int
x,
y
;
float
value
=
0.0f
;
if
(
pos
<
end
)
{
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
if
(
numTiles
<=
maxTiles
)
{
if
(
numTiles
<=
maxTiles
)
{
ushort2
tileIndices
=
tiles[pos]
;
ushort2
tileIndices
=
tiles[pos]
;
...
@@ -66,11 +53,7 @@ __kernel void computeN2Value(__global float4* posq, __local float4* local_posq,
...
@@ -66,11 +53,7 @@ __kernel void computeN2Value(__global float4* posq, __local float4* local_posq,
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
}
}
const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
const unsigned int tbx = get_local_id(0) - tgx;
const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
unsigned int atom1 = x*TILE_SIZE + tgx;
unsigned int atom1 = x*TILE_SIZE + tgx;
float value = 0.0f;
float4 posq1 = posq[atom1];
float4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
LOAD_ATOM1_PARAMETERS
...
@@ -88,7 +71,9 @@ __kernel void computeN2Value(__global float4* posq, __local float4* local_posq,
...
@@ -88,7 +71,9 @@ __kernel void computeN2Value(__global float4* posq, __local float4* local_posq,
#else
#else
bool hasExclusions = false;
bool hasExclusions = false;
#endif
#endif
if (x == y) {
if (pos >= end)
; // This warp is done.
else if (x == y) {
// This tile is on the diagonal.
// This tile is on the diagonal.
const unsigned int localAtomIndex = get_local_id(0);
const unsigned int localAtomIndex = get_local_id(0);
...
@@ -133,13 +118,6 @@ __kernel void computeN2Value(__global float4* posq, __local float4* local_posq,
...
@@ -133,13 +118,6 @@ __kernel void computeN2Value(__global float4* posq, __local float4* local_posq,
excl >>= 1;
excl >>= 1;
#endif
#endif
}
}
// Write results
reserveBuffer(x, forceBufferFlags);
unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
global_value[offset] += value;
releaseBuffer(x, forceBufferFlags);
}
}
else {
else {
// This is an off-diagonal tile.
// This is an off-diagonal tile.
...
@@ -249,19 +227,62 @@ __kernel void computeN2Value(__global float4* posq, __local float4* local_posq,
...
@@ -249,19 +227,62 @@ __kernel void computeN2Value(__global float4* posq, __local float4* local_posq,
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
)
;
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
)
;
}
}
}
}
}
}
//
Write
results.
We
need
to
coordinate
between
warps
to
make
sure
no
two
of
them
//
ever
try
to
write
to
the
same
piece
of
memory
at
the
same
time.
int
writeX
=
(
pos
<
end
?
x
:
-1
)
;
int
writeY
=
(
pos
<
end
&&
x
!=
y
?
y
:
-1
)
;
if
(
tgx
==
0
)
reservedBlocks[localGroupIndex]
=
(
int2
)(
writeX,
writeY
)
;
bool
done
=
false
;
int
doneIndex
=
0
;
int
checkIndex
=
0
;
while
(
true
)
{
//
See
if
any
warp
still
needs
to
write
its
data.
bool
allDone
=
true
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
while
(
doneIndex
<
WARPS_PER_GROUP
&&
allDone
)
{
if
(
reservedBlocks[doneIndex].x
!=
-1
)
allDone
=
false
;
else
doneIndex++
;
}
if
(
allDone
)
break
;
if
(
!done
)
{
//
See
whether
this
warp
can
write
its
data.
This
requires
that
no
previous
warp
//
is
trying
to
write
to
the
same
block
of
the
buffer.
//
Write
results
bool
canWrite
=
(
writeX
!=
-1
)
;
while
(
checkIndex
<
localGroupIndex
&&
canWrite
)
{
if
((
reservedBlocks[checkIndex].x
==
x
|
| reservedBlocks[checkIndex].y == x) ||
(writeY != -1 && (reservedBlocks[checkIndex].x == y |
|
reservedBlocks[checkIndex].y
==
y
)))
canWrite
=
false
;
else
checkIndex++
;
}
if
(
canWrite
)
{
//
Write
the
data
to
global
memory,
then
mark
this
warp
as
done.
reserveBuffer
(
x,
forceBufferFlags
)
;
if
(
writeX
>
-1
)
{
unsigned
int
offset1
=
x*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
const
unsigned
int
offset
=
x*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
global_value[offset1]
+=
value
;
global_value[offset]
+=
value
;
releaseBuffer
(
x,
forceBufferFlags
)
;
}
reserveBuffer
(
y,
forceBufferFlags
)
;
if
(
writeY
>
-1
)
{
unsigned
int
offset2
=
y*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
const
unsigned
int
offset
=
y*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
global_value[offset2]
+=
local_value[get_local_id
(
0
)
]
;
global_value[offset]
+=
local_value[get_local_id
(
0
)
]
;
releaseBuffer
(
y,
forceBufferFlags
)
;
}
done
=
true
;
if
(
tgx
==
0
)
reservedBlocks[localGroupIndex]
=
(
int2
)(
-1
,
-1
)
;
}
}
}
}
lasty
=
y
;
lasty
=
y
;
pos++
;
pos++
;
}
}
while
(
pos
<
end
)
;
}
}
platforms/opencl/src/kernels/gbsaObc_cpu.cl
View file @
7943a339
...
@@ -15,7 +15,7 @@ typedef struct {
...
@@ -15,7 +15,7 @@ typedef struct {
*/
*/
__kernel
void
computeBornSum
(
__global
float*
global_bornSum,
__global
float4*
posq,
__global
float2*
global_params,
__kernel
void
computeBornSum
(
__global
float*
global_bornSum,
__global
float4*
posq,
__global
float2*
global_params,
__local
AtomData*
localData,
__local
float*
tempBuffer,
__global
unsigned
int*
forceBufferFlags,
__local
AtomData*
localData,
__local
float*
tempBuffer,
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
unsigned
int*
interactionFlags
)
{
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
unsigned
int*
interactionFlags
)
{
#
else
#
else
...
@@ -192,7 +192,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __global float4* po
...
@@ -192,7 +192,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __global float4* po
__kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuffer,
__kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuffer,
__global float4* posq, __global float* global_bornRadii, __global float* global_bornForce,
__global float4* posq, __global float* global_bornRadii, __global float* global_bornForce,
__local AtomData* localData, __local float4* tempBuffer,
__global unsigned int* forceBufferFlags,
__local AtomData* localData, __local float4* tempBuffer,
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags) {
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags) {
#else
#else
...
...
platforms/opencl/src/kernels/gbsaObc_default.cl
View file @
7943a339
...
@@ -16,7 +16,7 @@ typedef struct {
...
@@ -16,7 +16,7 @@ typedef struct {
__kernel
__attribute__
((
reqd_work_group_size
(
WORK_GROUP_SIZE,
1
,
1
)))
__kernel
__attribute__
((
reqd_work_group_size
(
WORK_GROUP_SIZE,
1
,
1
)))
void
computeBornSum
(
__global
float*
global_bornSum,
__global
float4*
posq,
__global
float2*
global_params,
void
computeBornSum
(
__global
float*
global_bornSum,
__global
float4*
posq,
__global
float2*
global_params,
__local
AtomData*
localData,
__local
float*
tempBuffer,
__global
unsigned
int*
forceBufferFlags,
__local
AtomData*
localData,
__local
float*
tempBuffer,
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles
)
{
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles
)
{
#
else
#
else
...
@@ -203,7 +203,7 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
...
@@ -203,7 +203,7 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuffer,
void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuffer,
__global float4* posq, __global float* global_bornRadii, __global float* global_bornForce,
__global float4* posq, __global float* global_bornRadii, __global float* global_bornForce,
__local AtomData* localData, __local float4* tempBuffer,
__global unsigned int* forceBufferFlags,
__local AtomData* localData, __local float4* tempBuffer,
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles) {
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles) {
#else
#else
...
...
platforms/opencl/src/kernels/gbsaObc_nvidia.cl
View file @
7943a339
...
@@ -11,30 +11,11 @@ typedef struct {
...
@@ -11,30 +11,11 @@ typedef struct {
float
bornForce
;
float
bornForce
;
}
AtomData
;
}
AtomData
;
/**
*
Mark
that
a
block
in
the
force
buffer
is
in
use.
*/
void
reserveBuffer
(
unsigned
int
block,
__global
unsigned
int*
forceBufferFlags
)
{
if
((
get_local_id
(
0
)
&
(
TILE_SIZE-1
))
==
0
)
while
(
atom_cmpxchg
(
&forceBufferFlags[block+NUM_BLOCKS*get_group_id
(
0
)
],
0
,
1
)
!=
0
)
;
mem_fence
(
CLK_GLOBAL_MEM_FENCE
)
;
}
/**
*
Mark
that
a
block
in
the
force
buffer
is
no
longer
in
use.
*/
void
releaseBuffer
(
unsigned
int
block,
__global
unsigned
int*
forceBufferFlags
)
{
mem_fence
(
CLK_GLOBAL_MEM_FENCE
)
;
if
((
get_local_id
(
0
)
&
(
TILE_SIZE-1
))
==
0
)
forceBufferFlags[block+NUM_BLOCKS*get_group_id
(
0
)
]
=
0
;
}
/**
/**
*
Compute
the
Born
sum.
*
Compute
the
Born
sum.
*/
*/
__kernel
void
computeBornSum
(
__global
float*
global_bornSum,
__global
float4*
posq,
__global
float2*
global_params,
__kernel
void
computeBornSum
(
__global
float*
global_bornSum,
__global
float4*
posq,
__global
float2*
global_params,
__local
AtomData*
localData,
__local
float*
tempBuffer,
__global
unsigned
int*
forceBufferFlags,
__local
AtomData*
localData,
__local
float*
tempBuffer,
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
unsigned
int*
interactionFlags
)
{
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
unsigned
int*
interactionFlags
)
{
#
else
#
else
...
@@ -51,10 +32,16 @@ __kernel void computeBornSum(__global float* global_bornSum, __global float4* po
...
@@ -51,10 +32,16 @@ __kernel void computeBornSum(__global float* global_bornSum, __global float4* po
unsigned
int
end
=
(
warp+1
)
*numTiles/totalWarps
;
unsigned
int
end
=
(
warp+1
)
*numTiles/totalWarps
;
#
endif
#
endif
unsigned
int
lasty
=
0xFFFFFFFF
;
unsigned
int
lasty
=
0xFFFFFFFF
;
__local
int2
reservedBlocks[WARPS_PER_GROUP]
;
while
(
pos
<
end
)
{
do
{
//
Extract
the
coordinates
of
this
tile
//
Extract
the
coordinates
of
this
tile
const
unsigned
int
tgx
=
get_local_id
(
0
)
&
(
TILE_SIZE-1
)
;
const
unsigned
int
tbx
=
get_local_id
(
0
)
-
tgx
;
const
unsigned
int
localGroupIndex
=
get_local_id
(
0
)
/TILE_SIZE
;
unsigned
int
x,
y
;
unsigned
int
x,
y
;
float
bornSum
=
0.0f
;
if
(
pos
<
end
)
{
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
if
(
numTiles
<=
maxTiles
)
{
if
(
numTiles
<=
maxTiles
)
{
ushort2
tileIndices
=
tiles[pos]
;
ushort2
tileIndices
=
tiles[pos]
;
...
@@ -71,14 +58,12 @@ __kernel void computeBornSum(__global float* global_bornSum, __global float4* po
...
@@ -71,14 +58,12 @@ __kernel void computeBornSum(__global float* global_bornSum, __global float4* po
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
}
}
const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
const unsigned int tbx = get_local_id(0) - tgx;
const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
unsigned int atom1 = x*TILE_SIZE + tgx;
unsigned int atom1 = x*TILE_SIZE + tgx;
float bornSum = 0.0f;
float4 posq1 = posq[atom1];
float4 posq1 = posq[atom1];
float2 params1 = global_params[atom1];
float2 params1 = global_params[atom1];
if (x == y) {
if (pos >= end)
; // This warp is done.
else if (x == y) {
// This tile is on the diagonal.
// This tile is on the diagonal.
localData[get_local_id(0)].x = posq1.x;
localData[get_local_id(0)].x = posq1.x;
...
@@ -117,13 +102,6 @@ __kernel void computeBornSum(__global float* global_bornSum, __global float4* po
...
@@ -117,13 +102,6 @@ __kernel void computeBornSum(__global float* global_bornSum, __global float4* po
}
}
}
}
}
}
// Write results
reserveBuffer(x, forceBufferFlags);
unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
global_bornSum[offset] += bornSum;
releaseBuffer(x, forceBufferFlags);
}
}
else {
else {
// This is an off-diagonal tile.
// This is an off-diagonal tile.
...
@@ -261,21 +239,64 @@ __kernel void computeBornSum(__global float* global_bornSum, __global float4* po
...
@@ -261,21 +239,64 @@ __kernel void computeBornSum(__global float* global_bornSum, __global float4* po
tj = (tj + 1) & (TILE_SIZE - 1);
tj = (tj + 1) & (TILE_SIZE - 1);
}
}
}
}
}
}
// Write results
// Write results. We need to coordinate between warps to make sure no two of them
// ever try to write to the same piece of memory at the same time.
reserveBuffer(x, forceBufferFlags);
int writeX = (pos < end ? x : -1);
unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
int writeY = (pos < end && x != y ? y : -1);
global_bornSum[offset1] += bornSum;
if (tgx == 0)
releaseBuffer(x, forceBufferFlags);
reservedBlocks[localGroupIndex] = (int2)(writeX, writeY);
reserveBuffer(y, forceBufferFlags);
bool done = false;
unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
int doneIndex = 0;
global_bornSum[offset2] += localData[get_local_id(0)].bornSum;
int checkIndex = 0;
releaseBuffer(y, forceBufferFlags);
while (true) {
// See if any warp still needs to write its data.
bool allDone = true;
barrier(CLK_LOCAL_MEM_FENCE);
while (doneIndex < WARPS_PER_GROUP && allDone) {
if (reservedBlocks[doneIndex].x != -1)
allDone = false;
else
doneIndex++;
}
if (allDone)
break;
if (!done) {
// See whether this warp can write its data. This requires that no previous warp
// is trying to write to the same block of the buffer.
bool canWrite = (writeX != -1);
while (checkIndex < localGroupIndex && canWrite) {
if ((reservedBlocks[checkIndex].x == x || reservedBlocks[checkIndex].y == x) ||
(writeY != -1 && (reservedBlocks[checkIndex].x == y || reservedBlocks[checkIndex].y == y)))
canWrite = false;
else
checkIndex++;
}
if (canWrite) {
// Write the data to global memory, then mark this warp as done.
if (writeX > -1) {
const unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
global_bornSum[offset] += bornSum;
}
if (writeY > -1) {
const unsigned int offset = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
global_bornSum[offset] += localData[get_local_id(0)].bornSum;
}
done = true;
if (tgx == 0)
reservedBlocks[localGroupIndex] = (int2)(-1, -1);
}
}
}
}
lasty = y;
lasty = y;
pos++;
pos++;
}
}
while (pos < end);
}
}
/**
/**
...
@@ -284,7 +305,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __global float4* po
...
@@ -284,7 +305,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __global float4* po
__kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuffer,
__kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuffer,
__global float4* posq, __global float* global_bornRadii, __global float* global_bornForce,
__global float4* posq, __global float* global_bornRadii, __global float* global_bornForce,
__local AtomData* localData, __local float4* tempBuffer,
__global unsigned int* forceBufferFlags,
__local AtomData* localData, __local float4* tempBuffer,
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags) {
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags) {
#else
#else
...
@@ -302,10 +323,16 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
...
@@ -302,10 +323,16 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
#endif
#endif
float energy = 0.0f;
float energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF;
unsigned int lasty = 0xFFFFFFFF;
__local int2 reservedBlocks[WARPS_PER_GROUP];
while (pos < end)
{
do
{
// Extract the coordinates of this tile
// Extract the coordinates of this tile
const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
const unsigned int tbx = get_local_id(0) - tgx;
const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
unsigned int x, y;
unsigned int x, y;
float4 force = 0.0f;
if (pos < end) {
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
if (numTiles <= maxTiles) {
if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos];
ushort2 tileIndices = tiles[pos];
...
@@ -322,11 +349,7 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
...
@@ -322,11 +349,7 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
}
}
const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
const unsigned int tbx = get_local_id(0) - tgx;
const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
unsigned int atom1 = x*TILE_SIZE + tgx;
unsigned int atom1 = x*TILE_SIZE + tgx;
float4 force = 0.0f;
float4 posq1 = posq[atom1];
float4 posq1 = posq[atom1];
float bornRadius1 = global_bornRadii[atom1];
float bornRadius1 = global_bornRadii[atom1];
if (x == y) {
if (x == y) {
...
@@ -372,14 +395,6 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
...
@@ -372,14 +395,6 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
force.xyz -= delta.xyz;
force.xyz -= delta.xyz;
}
}
}
}
// Write results
reserveBuffer(x, forceBufferFlags);
unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz += force.xyz;
global_bornForce[offset] += force.w;
releaseBuffer(x, forceBufferFlags);
}
}
else {
else {
// This is an off-diagonal tile.
// This is an off-diagonal tile.
...
@@ -511,22 +526,65 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
...
@@ -511,22 +526,65 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
tj = (tj + 1) & (TILE_SIZE - 1);
tj = (tj + 1) & (TILE_SIZE - 1);
}
}
}
}
}
}
//
Write
results
// Write results. We need to coordinate between warps to make sure no two of them
// ever try to write to the same piece of memory at the same time.
reserveBuffer
(
x,
forceBufferFlags
)
;
int writeX = (pos < end ? x : -1);
unsigned
int
offset1
=
x*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
int writeY = (pos < end && x != y ? y : -1);
forceBuffers[offset1].xyz
+=
force.xyz
;
if (tgx == 0)
global_bornForce[offset1]
+=
force.w
;
reservedBlocks[localGroupIndex] = (int2)(writeX, writeY);
releaseBuffer
(
x,
forceBufferFlags
)
;
bool done = false;
reserveBuffer
(
y,
forceBufferFlags
)
;
int doneIndex = 0;
unsigned
int
offset2
=
y*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
int checkIndex = 0;
forceBuffers[offset2]
+=
(
float4
)
(
localData[get_local_id
(
0
)
].fx,
localData[get_local_id
(
0
)
].fy,
localData[get_local_id
(
0
)
].fz,
0
)
;
while (true) {
global_bornForce[offset2]
+=
localData[get_local_id
(
0
)
].fw
;
// See if any warp still needs to write its data.
releaseBuffer
(
y,
forceBufferFlags
)
;
bool allDone = true;
barrier(CLK_LOCAL_MEM_FENCE);
while (doneIndex < WARPS_PER_GROUP && allDone) {
if (reservedBlocks[doneIndex].x != -1)
allDone = false;
else
doneIndex++;
}
if (allDone)
break;
if (!done) {
// See whether this warp can write its data. This requires that no previous warp
// is trying to write to the same block of the buffer.
bool canWrite = (writeX != -1);
while (checkIndex < localGroupIndex && canWrite) {
if ((reservedBlocks[checkIndex].x == x || reservedBlocks[checkIndex].y == x) ||
(writeY != -1 && (reservedBlocks[checkIndex].x == y |
|
reservedBlocks[checkIndex].y
==
y
)
))
canWrite
=
false
;
else
checkIndex++
;
}
if
(
canWrite
)
{
//
Write
the
data
to
global
memory,
then
mark
this
warp
as
done.
if
(
writeX
>
-1
)
{
const
unsigned
int
offset
=
x*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
forceBuffers[offset].xyz
+=
force.xyz
;
global_bornForce[offset]
+=
force.w
;
}
if
(
writeY
>
-1
)
{
const
unsigned
int
offset
=
y*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
forceBuffers[offset]
+=
(
float4
)
(
localData[get_local_id
(
0
)
].fx,
localData[get_local_id
(
0
)
].fy,
localData[get_local_id
(
0
)
].fz,
0.0f
)
;
global_bornForce[offset]
+=
localData[get_local_id
(
0
)
].fw
;
}
done
=
true
;
if
(
tgx
==
0
)
reservedBlocks[localGroupIndex]
=
(
int2
)(
-1
,
-1
)
;
}
}
}
}
lasty
=
y
;
lasty
=
y
;
pos++
;
pos++
;
}
}
while
(
pos
<
end
)
;
energyBuffer[get_global_id
(
0
)
]
+=
energy
;
energyBuffer[get_global_id
(
0
)
]
+=
energy
;
}
}
platforms/opencl/src/kernels/nonbonded_cpu.cl
View file @
7943a339
...
@@ -13,7 +13,7 @@ typedef struct {
...
@@ -13,7 +13,7 @@ typedef struct {
__kernel
void
computeNonbonded
(
__global
float4*
forceBuffers,
__global
float*
energyBuffer,
__global
float4*
posq,
__global
unsigned
int*
exclusions,
__kernel
void
computeNonbonded
(
__global
float4*
forceBuffers,
__global
float*
energyBuffer,
__global
float4*
posq,
__global
unsigned
int*
exclusions,
__global
unsigned
int*
exclusionIndices,
__global
unsigned
int*
exclusionRowIndices,
__local
AtomData*
localData,
__local
float4*
tempBuffer,
__global
unsigned
int*
exclusionIndices,
__global
unsigned
int*
exclusionRowIndices,
__local
AtomData*
localData,
__local
float4*
tempBuffer,
unsigned
int
startTileIndex,
unsigned
int
endTileIndex,
__global
unsigned
int*
forceBufferFlags,
unsigned
int
startTileIndex,
unsigned
int
endTileIndex,
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
unsigned
int*
interactionFlags
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
unsigned
int*
interactionFlags
#
else
#
else
...
...
platforms/opencl/src/kernels/nonbonded_default.cl
View file @
7943a339
...
@@ -14,7 +14,7 @@ typedef struct {
...
@@ -14,7 +14,7 @@ typedef struct {
__kernel
__attribute__
((
reqd_work_group_size
(
WORK_GROUP_SIZE,
1
,
1
)))
__kernel
__attribute__
((
reqd_work_group_size
(
WORK_GROUP_SIZE,
1
,
1
)))
void
computeNonbonded
(
__global
float4*
forceBuffers,
__global
float*
energyBuffer,
__global
float4*
posq,
__global
unsigned
int*
exclusions,
void
computeNonbonded
(
__global
float4*
forceBuffers,
__global
float*
energyBuffer,
__global
float4*
posq,
__global
unsigned
int*
exclusions,
__global
unsigned
int*
exclusionIndices,
__global
unsigned
int*
exclusionRowIndices,
__local
AtomData*
localData,
__local
float4*
tempBuffer,
__global
unsigned
int*
exclusionIndices,
__global
unsigned
int*
exclusionRowIndices,
__local
AtomData*
localData,
__local
float4*
tempBuffer,
unsigned
int
startTileIndex,
unsigned
int
endTileIndex,
__global
unsigned
int*
forceBufferFlags,
unsigned
int
startTileIndex,
unsigned
int
endTileIndex,
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
unsigned
int*
interactionFlags
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
unsigned
int*
interactionFlags
#
else
#
else
...
...
platforms/opencl/src/kernels/nonbonded_nvidia.cl
View file @
7943a339
...
@@ -8,31 +8,12 @@ typedef struct {
...
@@ -8,31 +8,12 @@ typedef struct {
ATOM_PARAMETER_DATA
ATOM_PARAMETER_DATA
}
AtomData
;
}
AtomData
;
/**
 * Mark that a block in the force buffer is in use.
 *
 * The flag for a block lives at forceBufferFlags[block + NUM_BLOCKS*get_group_id(0)].
 * Only the work-item whose local id has all of the (TILE_SIZE-1) bits clear
 * performs the acquisition: it repeats an atomic compare-and-exchange until it
 * succeeds in changing the flag from 0 to 1.  Every work-item that calls this
 * function then issues a global memory fence.
 *
 * NOTE(review): the mask test presumably selects the first work-item of each
 * TILE_SIZE-wide group, which requires TILE_SIZE to be a power of two - confirm.
 *
 * @param block             index of the force buffer block to reserve
 * @param forceBufferFlags  array of flags, one per (work-group, block) pair
 */
void reserveBuffer(unsigned int block, __global unsigned int* forceBufferFlags) {
    const bool acquiresFlag = ((get_local_id(0) & (TILE_SIZE-1)) == 0);
    if (acquiresFlag) {
        // Spin until the flag is observed as 0 and atomically replaced by 1.
        for (;;) {
            if (atom_cmpxchg(&forceBufferFlags[block+NUM_BLOCKS*get_group_id(0)], 0, 1) == 0)
                break;
        }
    }
    mem_fence(CLK_GLOBAL_MEM_FENCE);
}
/**
 * Mark that a block in the force buffer is no longer in use.
 *
 * Every calling work-item first issues a global memory fence.  Afterwards the
 * work-item whose local id has all of the (TILE_SIZE-1) bits clear stores 0 to
 * forceBufferFlags[block + NUM_BLOCKS*get_group_id(0)], the same flag that
 * reserveBuffer() sets to 1.
 *
 * @param block             index of the force buffer block to release
 * @param forceBufferFlags  array of flags, one per (work-group, block) pair
 */
void releaseBuffer(unsigned int block, __global unsigned int* forceBufferFlags) {
    // Fence before clearing the flag, matching the original statement order.
    mem_fence(CLK_GLOBAL_MEM_FENCE);
    const bool clearsFlag = ((get_local_id(0) & (TILE_SIZE-1)) == 0);
    if (clearsFlag) {
        forceBufferFlags[block+NUM_BLOCKS*get_group_id(0)] = 0;
    }
}
/**
/**
*
Compute
nonbonded
interactions.
*
Compute
nonbonded
interactions.
*/
*/
__kernel
void
computeNonbonded
(
__global
float4*
forceBuffers,
__global
float*
energyBuffer,
__global
float4*
posq,
__global
unsigned
int*
exclusions,
__kernel
void
computeNonbonded
(
__global
float4*
forceBuffers,
__global
float*
energyBuffer,
__global
float4*
posq,
__global
unsigned
int*
exclusions,
__global
unsigned
int*
exclusionIndices,
__global
unsigned
int*
exclusionRowIndices,
__local
AtomData*
localData,
__local
float*
tempBuffer,
__global
unsigned
int*
exclusionIndices,
__global
unsigned
int*
exclusionRowIndices,
__local
AtomData*
localData,
__local
float*
tempBuffer,
unsigned
int
startTileIndex,
unsigned
int
endTileIndex,
__global
unsigned
int*
forceBufferFlags,
unsigned
int
startTileIndex,
unsigned
int
endTileIndex,
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
unsigned
int*
interactionFlags
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
unsigned
int*
interactionFlags
#
else
#
else
...
@@ -53,10 +34,16 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
...
@@ -53,10 +34,16 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
unsigned
int
lasty
=
0xFFFFFFFF
;
unsigned
int
lasty
=
0xFFFFFFFF
;
__local
unsigned
int
exclusionRange[2*WARPS_PER_GROUP]
;
__local
unsigned
int
exclusionRange[2*WARPS_PER_GROUP]
;
__local
int
exclusionIndex[WARPS_PER_GROUP]
;
__local
int
exclusionIndex[WARPS_PER_GROUP]
;
__local
int2*
reservedBlocks
=
(
__local
int2*
)
exclusionRange
;
while
(
pos
<
end
)
{
do
{
//
Extract
the
coordinates
of
this
tile
//
Extract
the
coordinates
of
this
tile
const
unsigned
int
tgx
=
get_local_id
(
0
)
&
(
TILE_SIZE-1
)
;
const
unsigned
int
tbx
=
get_local_id
(
0
)
-
tgx
;
const
unsigned
int
localGroupIndex
=
get_local_id
(
0
)
/TILE_SIZE
;
unsigned
int
x,
y
;
unsigned
int
x,
y
;
float4
force
=
0.0f
;
if
(
pos
<
end
)
{
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
if
(
numTiles
<=
maxTiles
)
{
if
(
numTiles
<=
maxTiles
)
{
ushort2
tileIndices
=
tiles[pos]
;
ushort2
tileIndices
=
tiles[pos]
;
...
@@ -73,11 +60,7 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
...
@@ -73,11 +60,7 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
}
}
const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
const unsigned int tbx = get_local_id(0) - tgx;
const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
unsigned int atom1 = x*TILE_SIZE + tgx;
unsigned int atom1 = x*TILE_SIZE + tgx;
float4 force = 0.0f;
float4 posq1 = posq[atom1];
float4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
LOAD_ATOM1_PARAMETERS
...
@@ -95,7 +78,9 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
...
@@ -95,7 +78,9 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
#else
#else
bool hasExclusions = false;
bool hasExclusions = false;
#endif
#endif
if (x == y) {
if (pos >= end)
; // This warp is done.
else if (x == y) {
// This tile is on the diagonal.
// This tile is on the diagonal.
const unsigned int localAtomIndex = get_local_id(0);
const unsigned int localAtomIndex = get_local_id(0);
...
@@ -138,15 +123,10 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
...
@@ -138,15 +123,10 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
#else
#else
force.xyz -= dEdR1.xyz;
force.xyz -= dEdR1.xyz;
#endif
#endif
#ifdef USE_EXCLUSIONS
excl >>= 1;
excl >>= 1;
#endif
}
}
// Write results
reserveBuffer(x, forceBufferFlags);
unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz += force.xyz;
releaseBuffer(x, forceBufferFlags);
}
}
else {
else {
// This is an off-diagonal tile.
// This is an off-diagonal tile.
...
@@ -297,20 +277,63 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
...
@@ -297,20 +277,63 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
)
;
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
)
;
}
}
}
}
}
}
//
Write
results.
We
need
to
coordinate
between
warps
to
make
sure
no
two
of
them
//
ever
try
to
write
to
the
same
piece
of
memory
at
the
same
time.
//
Write
results
int
writeX
=
(
pos
<
end
?
x
:
-1
)
;
int
writeY
=
(
pos
<
end
&&
x
!=
y
?
y
:
-1
)
;
if
(
tgx
==
0
)
reservedBlocks[localGroupIndex]
=
(
int2
)(
writeX,
writeY
)
;
bool
done
=
false
;
int
doneIndex
=
0
;
int
checkIndex
=
0
;
while
(
true
)
{
//
See
if
any
warp
still
needs
to
write
its
data.
reserveBuffer
(
x,
forceBufferFlags
)
;
bool
allDone
=
true
;
unsigned
int
offset1
=
x*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
forceBuffers[offset1].xyz
+=
force.xyz
;
while
(
doneIndex
<
WARPS_PER_GROUP
&&
allDone
)
{
releaseBuffer
(
x,
forceBufferFlags
)
;
if
(
reservedBlocks[doneIndex].x
!=
-1
)
reserveBuffer
(
y,
forceBufferFlags
)
;
allDone
=
false
;
unsigned
int
offset2
=
y*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
else
forceBuffers[offset2]
+=
(
float4
)
(
localData[get_local_id
(
0
)
].fx,
localData[get_local_id
(
0
)
].fy,
localData[get_local_id
(
0
)
].fz,
0.0f
)
;
doneIndex++
;
releaseBuffer
(
y,
forceBufferFlags
)
;
}
if
(
allDone
)
break
;
if
(
!done
)
{
//
See
whether
this
warp
can
write
its
data.
This
requires
that
no
previous
warp
//
is
trying
to
write
to
the
same
block
of
the
buffer.
bool
canWrite
=
(
writeX
!=
-1
)
;
while
(
checkIndex
<
localGroupIndex
&&
canWrite
)
{
if
((
reservedBlocks[checkIndex].x
==
x
|
| reservedBlocks[checkIndex].y == x) ||
(writeY != -1 && (reservedBlocks[checkIndex].x == y |
|
reservedBlocks[checkIndex].y
==
y
)))
canWrite
=
false
;
else
checkIndex++
;
}
if
(
canWrite
)
{
//
Write
the
data
to
global
memory,
then
mark
this
warp
as
done.
if
(
writeX
>
-1
)
{
const
unsigned
int
offset
=
x*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
forceBuffers[offset].xyz
+=
force.xyz
;
}
if
(
writeY
>
-1
)
{
const
unsigned
int
offset
=
y*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
forceBuffers[offset]
+=
(
float4
)
(
localData[get_local_id
(
0
)
].fx,
localData[get_local_id
(
0
)
].fy,
localData[get_local_id
(
0
)
].fz,
0.0f
)
;
}
done
=
true
;
if
(
tgx
==
0
)
reservedBlocks[localGroupIndex]
=
(
int2
)(
-1
,
-1
)
;
}
}
}
}
lasty
=
y
;
lasty
=
y
;
pos++
;
pos++
;
}
}
while
(
pos
<
end
)
;
energyBuffer[get_global_id
(
0
)
]
+=
energy
;
energyBuffer[get_global_id
(
0
)
]
+=
energy
;
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment