Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
7943a339
Commit
7943a339
authored
May 09, 2011
by
Peter Eastman
Browse files
Restructured the use of force buffers in a new way that hopefully really works everywhere.
parent
13ef0ee8
Changes
15
Hide whitespace changes
Inline
Side-by-side
Showing
15 changed files
with
1067 additions
and
961 deletions
+1067
-961
platforms/opencl/src/OpenCLKernels.cpp
platforms/opencl/src/OpenCLKernels.cpp
+16
-17
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
+8
-17
platforms/opencl/src/OpenCLNonbondedUtilities.h
platforms/opencl/src/OpenCLNonbondedUtilities.h
+0
-7
platforms/opencl/src/kernels/customGBEnergyN2_cpu.cl
platforms/opencl/src/kernels/customGBEnergyN2_cpu.cl
+1
-1
platforms/opencl/src/kernels/customGBEnergyN2_default.cl
platforms/opencl/src/kernels/customGBEnergyN2_default.cl
+1
-1
platforms/opencl/src/kernels/customGBEnergyN2_nvidia.cl
platforms/opencl/src/kernels/customGBEnergyN2_nvidia.cl
+170
-149
platforms/opencl/src/kernels/customGBValueN2_cpu.cl
platforms/opencl/src/kernels/customGBValueN2_cpu.cl
+1
-1
platforms/opencl/src/kernels/customGBValueN2_default.cl
platforms/opencl/src/kernels/customGBValueN2_default.cl
+1
-1
platforms/opencl/src/kernels/customGBValueN2_nvidia.cl
platforms/opencl/src/kernels/customGBValueN2_nvidia.cl
+196
-175
platforms/opencl/src/kernels/gbsaObc_cpu.cl
platforms/opencl/src/kernels/gbsaObc_cpu.cl
+2
-2
platforms/opencl/src/kernels/gbsaObc_default.cl
platforms/opencl/src/kernels/gbsaObc_default.cl
+2
-2
platforms/opencl/src/kernels/gbsaObc_nvidia.cl
platforms/opencl/src/kernels/gbsaObc_nvidia.cl
+430
-372
platforms/opencl/src/kernels/nonbonded_cpu.cl
platforms/opencl/src/kernels/nonbonded_cpu.cl
+1
-1
platforms/opencl/src/kernels/nonbonded_default.cl
platforms/opencl/src/kernels/nonbonded_default.cl
+1
-1
platforms/opencl/src/kernels/nonbonded_nvidia.cl
platforms/opencl/src/kernels/nonbonded_nvidia.cl
+237
-214
No files found.
platforms/opencl/src/OpenCLKernels.cpp
View file @
7943a339
...
...
@@ -1738,6 +1738,8 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
defines
[
"NUM_ATOMS"
]
=
intToString
(
cl
.
getNumAtoms
());
defines
[
"PADDED_NUM_ATOMS"
]
=
intToString
(
cl
.
getPaddedNumAtoms
());
defines
[
"NUM_BLOCKS"
]
=
OpenCLExpressionUtilities
::
intToString
(
cl
.
getNumAtomBlocks
());
if
(
cl
.
getSIMDWidth
()
==
32
)
defines
[
"WARPS_PER_GROUP"
]
=
OpenCLExpressionUtilities
::
intToString
(
cl
.
getNonbondedUtilities
().
getForceThreadBlockSize
()
/
OpenCLContext
::
TileSize
);
string
file
;
if
(
deviceIsCpu
)
file
=
OpenCLKernelSources
::
gbsaObc_cpu
;
...
...
@@ -1753,7 +1755,6 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
params
->
getDeviceBuffer
());
computeBornSumKernel
.
setArg
(
index
++
,
(
deviceIsCpu
?
OpenCLContext
::
TileSize
:
nb
.
getForceThreadBlockSize
())
*
13
*
sizeof
(
cl_float
),
NULL
);
computeBornSumKernel
.
setArg
(
index
++
,
(
deviceIsCpu
?
1
:
nb
.
getForceThreadBlockSize
())
*
sizeof
(
cl_float
),
NULL
);
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getForceBufferFlags
().
getDeviceBuffer
());
if
(
nb
.
getUseCutoff
())
{
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractingTiles
().
getDeviceBuffer
());
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractionCount
().
getDeviceBuffer
());
...
...
@@ -1773,7 +1774,6 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
bornForce
->
getDeviceBuffer
());
force1Kernel
.
setArg
(
index
++
,
(
deviceIsCpu
?
OpenCLContext
::
TileSize
:
nb
.
getForceThreadBlockSize
())
*
13
*
sizeof
(
cl_float
),
NULL
);
force1Kernel
.
setArg
(
index
++
,
(
deviceIsCpu
?
1
:
nb
.
getForceThreadBlockSize
())
*
sizeof
(
mm_float4
),
NULL
);
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getForceBufferFlags
().
getDeviceBuffer
());
if
(
nb
.
getUseCutoff
())
{
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractingTiles
().
getDeviceBuffer
());
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractionCount
().
getDeviceBuffer
());
...
...
@@ -1805,14 +1805,14 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
reduceBornForceKernel
.
setArg
<
cl
::
Buffer
>
(
6
,
obcChain
->
getDeviceBuffer
());
}
if
(
nb
.
getUseCutoff
())
{
computeBornSumKernel
.
setArg
<
mm_float4
>
(
8
,
cl
.
getPeriodicBoxSize
());
computeBornSumKernel
.
setArg
<
mm_float4
>
(
9
,
cl
.
getInvPeriodicBoxSize
());
force1Kernel
.
setArg
<
mm_float4
>
(
10
,
cl
.
getPeriodicBoxSize
());
force1Kernel
.
setArg
<
mm_float4
>
(
1
1
,
cl
.
getInvPeriodicBoxSize
());
computeBornSumKernel
.
setArg
<
mm_float4
>
(
7
,
cl
.
getPeriodicBoxSize
());
computeBornSumKernel
.
setArg
<
mm_float4
>
(
8
,
cl
.
getInvPeriodicBoxSize
());
force1Kernel
.
setArg
<
mm_float4
>
(
9
,
cl
.
getPeriodicBoxSize
());
force1Kernel
.
setArg
<
mm_float4
>
(
1
0
,
cl
.
getInvPeriodicBoxSize
());
if
(
maxTiles
<
nb
.
getInteractingTiles
().
getSize
())
{
maxTiles
=
nb
.
getInteractingTiles
().
getSize
();
computeBornSumKernel
.
setArg
<
cl_uint
>
(
10
,
maxTiles
);
force1Kernel
.
setArg
<
cl_uint
>
(
1
2
,
maxTiles
);
force1Kernel
.
setArg
<
cl_uint
>
(
1
1
,
maxTiles
);
}
}
cl
.
executeKernel
(
computeBornSumKernel
,
nb
.
getNumForceThreadBlocks
()
*
nb
.
getForceThreadBlockSize
(),
nb
.
getForceThreadBlockSize
());
...
...
@@ -2148,7 +2148,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
}
map
<
string
,
string
>
replacements
;
replacements
[
"COMPUTE_INTERACTION"
]
=
n2EnergySource
.
str
();
stringstream
extraArgs
,
loadLocal1
,
loadLocal2
,
clearLocal
,
load1
,
load2
,
recordDeriv
,
storeDerivs1
,
storeDerivs2
,
declareTemps
,
setTemps
;
stringstream
extraArgs
,
loadLocal1
,
loadLocal2
,
clearLocal
,
load1
,
load2
,
declare1
,
recordDeriv
,
storeDerivs1
,
storeDerivs2
,
declareTemps
,
setTemps
;
if
(
force
.
getNumGlobalParameters
()
>
0
)
extraArgs
<<
", __constant float* globals"
;
for
(
int
i
=
0
;
i
<
(
int
)
params
->
getBuffers
().
size
();
i
++
)
{
...
...
@@ -2174,7 +2174,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
string
index
=
intToString
(
i
+
1
);
extraArgs
<<
", __global "
<<
buffer
.
getType
()
<<
"* derivBuffers"
<<
index
<<
", __local "
<<
buffer
.
getType
()
<<
"* local_deriv"
<<
index
;
clearLocal
<<
"local_deriv"
<<
index
<<
"[localAtomIndex] = 0.0f;
\n
"
;
load
1
<<
buffer
.
getType
()
<<
" deriv"
<<
index
<<
"_1 = 0.0f;
\n
"
;
declare
1
<<
buffer
.
getType
()
<<
" deriv"
<<
index
<<
"_1 = 0.0f;
\n
"
;
load2
<<
buffer
.
getType
()
<<
" deriv"
<<
index
<<
"_2 = 0.0f;
\n
"
;
recordDeriv
<<
"local_deriv"
<<
index
<<
"[atom2] += deriv"
<<
index
<<
"_2;
\n
"
;
storeDerivs1
<<
"STORE_DERIVATIVE_1("
<<
index
<<
")"
;
...
...
@@ -2188,6 +2188,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
replacements
[
"CLEAR_LOCAL_DERIVATIVES"
]
=
clearLocal
.
str
();
replacements
[
"LOAD_ATOM1_PARAMETERS"
]
=
load1
.
str
();
replacements
[
"LOAD_ATOM2_PARAMETERS"
]
=
load2
.
str
();
replacements
[
"DECLARE_ATOM1_DERIVATIVES"
]
=
declare1
.
str
();
replacements
[
"RECORD_DERIVATIVE_2"
]
=
recordDeriv
.
str
();
replacements
[
"STORE_DERIVATIVES_1"
]
=
storeDerivs1
.
str
();
replacements
[
"STORE_DERIVATIVES_2"
]
=
storeDerivs2
.
str
();
...
...
@@ -2482,7 +2483,6 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
pairValueKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
valueBuffers
->
getDeviceBuffer
());
pairValueKernel
.
setArg
(
index
++
,
(
deviceIsCpu
?
OpenCLContext
::
TileSize
:
nb
.
getForceThreadBlockSize
())
*
sizeof
(
cl_float
),
NULL
);
pairValueKernel
.
setArg
(
index
++
,
(
deviceIsCpu
?
OpenCLContext
::
TileSize
:
nb
.
getForceThreadBlockSize
())
*
sizeof
(
cl_float
),
NULL
);
pairValueKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getForceBufferFlags
().
getDeviceBuffer
());
if
(
nb
.
getUseCutoff
())
{
pairValueKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractingTiles
().
getDeviceBuffer
());
pairValueKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractionCount
().
getDeviceBuffer
());
...
...
@@ -2531,7 +2531,6 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
pairEnergyKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
cl
.
getNonbondedUtilities
().
getExclusionIndices
().
getDeviceBuffer
());
pairEnergyKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
cl
.
getNonbondedUtilities
().
getExclusionRowIndices
().
getDeviceBuffer
());
pairEnergyKernel
.
setArg
(
index
++
,
(
deviceIsCpu
?
OpenCLContext
::
TileSize
:
nb
.
getForceThreadBlockSize
())
*
sizeof
(
cl_float4
),
NULL
);
pairEnergyKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getForceBufferFlags
().
getDeviceBuffer
());
if
(
nb
.
getUseCutoff
())
{
pairEnergyKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractingTiles
().
getDeviceBuffer
());
pairEnergyKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractionCount
().
getDeviceBuffer
());
...
...
@@ -2609,14 +2608,14 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
globals
->
upload
(
globalParamValues
);
}
if
(
nb
.
getUseCutoff
())
{
pairValueKernel
.
setArg
<
mm_float4
>
(
1
1
,
cl
.
getPeriodicBoxSize
());
pairValueKernel
.
setArg
<
mm_float4
>
(
1
2
,
cl
.
getInvPeriodicBoxSize
());
pairEnergyKernel
.
setArg
<
mm_float4
>
(
1
2
,
cl
.
getPeriodicBoxSize
());
pairEnergyKernel
.
setArg
<
mm_float4
>
(
1
3
,
cl
.
getInvPeriodicBoxSize
());
pairValueKernel
.
setArg
<
mm_float4
>
(
1
0
,
cl
.
getPeriodicBoxSize
());
pairValueKernel
.
setArg
<
mm_float4
>
(
1
1
,
cl
.
getInvPeriodicBoxSize
());
pairEnergyKernel
.
setArg
<
mm_float4
>
(
1
1
,
cl
.
getPeriodicBoxSize
());
pairEnergyKernel
.
setArg
<
mm_float4
>
(
1
2
,
cl
.
getInvPeriodicBoxSize
());
if
(
maxTiles
<
nb
.
getInteractingTiles
().
getSize
())
{
maxTiles
=
nb
.
getInteractingTiles
().
getSize
();
pairValueKernel
.
setArg
<
cl_uint
>
(
1
3
,
maxTiles
);
pairEnergyKernel
.
setArg
<
cl_uint
>
(
1
4
,
maxTiles
);
pairValueKernel
.
setArg
<
cl_uint
>
(
1
2
,
maxTiles
);
pairEnergyKernel
.
setArg
<
cl_uint
>
(
1
3
,
maxTiles
);
}
}
cl
.
executeKernel
(
pairValueKernel
,
nb
.
getNumForceThreadBlocks
()
*
nb
.
getForceThreadBlockSize
(),
nb
.
getForceThreadBlockSize
());
...
...
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
View file @
7943a339
...
...
@@ -37,7 +37,7 @@ using namespace std;
OpenCLNonbondedUtilities
::
OpenCLNonbondedUtilities
(
OpenCLContext
&
context
)
:
context
(
context
),
cutoff
(
-
1.0
),
useCutoff
(
false
),
numForceBuffers
(
0
),
exclusionIndices
(
NULL
),
exclusionRowIndices
(
NULL
),
exclusions
(
NULL
),
interactingTiles
(
NULL
),
interactionFlags
(
NULL
),
interactionCount
(
NULL
),
blockCenter
(
NULL
),
blockBoundingBox
(
NULL
)
,
forceBufferFlags
(
NULL
)
{
interactionCount
(
NULL
),
blockCenter
(
NULL
),
blockBoundingBox
(
NULL
)
{
// Decide how many thread blocks and force buffers to use.
deviceIsCpu
=
(
context
.
getDevice
().
getInfo
<
CL_DEVICE_TYPE
>
()
==
CL_DEVICE_TYPE_CPU
);
...
...
@@ -48,8 +48,8 @@ OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : con
numForceBuffers
=
numForceThreadBlocks
;
}
else
if
(
context
.
getSIMDWidth
()
==
32
)
{
numForceThreadBlocks
=
2
*
context
.
getDevice
().
getInfo
<
CL_DEVICE_MAX_COMPUTE_UNITS
>
();
forceThreadBlockSize
=
256
;
numForceThreadBlocks
=
4
*
context
.
getDevice
().
getInfo
<
CL_DEVICE_MAX_COMPUTE_UNITS
>
();
forceThreadBlockSize
=
128
;
numForceBuffers
=
numForceThreadBlocks
;
}
else
{
...
...
@@ -82,8 +82,6 @@ OpenCLNonbondedUtilities::~OpenCLNonbondedUtilities() {
delete
blockCenter
;
if
(
blockBoundingBox
!=
NULL
)
delete
blockBoundingBox
;
if
(
forceBufferFlags
!=
NULL
)
delete
forceBufferFlags
;
}
void
OpenCLNonbondedUtilities
::
addInteraction
(
bool
usesCutoff
,
bool
usesPeriodic
,
bool
usesExclusions
,
double
cutoffDistance
,
const
vector
<
vector
<
int
>
>&
exclusionList
,
const
string
&
kernel
)
{
...
...
@@ -239,12 +237,6 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
interactionCount
->
upload
();
}
// Create the flags for reserving force buffers.
forceBufferFlags
=
new
OpenCLArray
<
cl_uint
>
(
context
,
numAtomBlocks
*
numForceThreadBlocks
,
"forceBufferFlags"
,
false
);
vector
<
cl_uint
>
forceBufferFlagsVec
(
forceBufferFlags
->
getSize
(),
0
);
forceBufferFlags
->
upload
(
forceBufferFlagsVec
);
// Create kernels.
forceKernel
=
createInteractionKernel
(
kernelSource
,
parameters
,
arguments
,
true
,
true
);
...
...
@@ -320,8 +312,8 @@ void OpenCLNonbondedUtilities::prepareInteractions() {
void
OpenCLNonbondedUtilities
::
computeInteractions
()
{
if
(
cutoff
!=
-
1.0
)
{
if
(
useCutoff
)
{
forceKernel
.
setArg
<
mm_float4
>
(
1
3
,
context
.
getPeriodicBoxSize
());
forceKernel
.
setArg
<
mm_float4
>
(
1
4
,
context
.
getInvPeriodicBoxSize
());
forceKernel
.
setArg
<
mm_float4
>
(
1
2
,
context
.
getPeriodicBoxSize
());
forceKernel
.
setArg
<
mm_float4
>
(
1
3
,
context
.
getInvPeriodicBoxSize
());
}
context
.
executeKernel
(
forceKernel
,
numForceThreadBlocks
*
forceThreadBlockSize
,
forceThreadBlockSize
);
}
...
...
@@ -343,14 +335,14 @@ void OpenCLNonbondedUtilities::updateNeighborListSize() {
newSize
=
numTiles
;
delete
interactingTiles
;
interactingTiles
=
new
OpenCLArray
<
mm_ushort2
>
(
context
,
newSize
,
"interactingTiles"
);
forceKernel
.
setArg
<
cl
::
Buffer
>
(
1
1
,
interactingTiles
->
getDeviceBuffer
());
forceKernel
.
setArg
<
cl_uint
>
(
1
5
,
newSize
);
forceKernel
.
setArg
<
cl
::
Buffer
>
(
1
0
,
interactingTiles
->
getDeviceBuffer
());
forceKernel
.
setArg
<
cl_uint
>
(
1
4
,
newSize
);
findInteractingBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
6
,
interactingTiles
->
getDeviceBuffer
());
findInteractingBlocksKernel
.
setArg
<
cl_uint
>
(
9
,
newSize
);
if
(
context
.
getSIMDWidth
()
==
32
||
deviceIsCpu
)
{
delete
interactionFlags
;
interactionFlags
=
new
OpenCLArray
<
cl_uint
>
(
context
,
deviceIsCpu
?
2
*
newSize
:
newSize
,
"interactionFlags"
);
forceKernel
.
setArg
<
cl
::
Buffer
>
(
1
6
,
interactionFlags
->
getDeviceBuffer
());
forceKernel
.
setArg
<
cl
::
Buffer
>
(
1
5
,
interactionFlags
->
getDeviceBuffer
());
findInteractingBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
7
,
interactionFlags
->
getDeviceBuffer
());
findInteractionsWithinBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
4
,
interactingTiles
->
getDeviceBuffer
());
findInteractionsWithinBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
7
,
interactionFlags
->
getDeviceBuffer
());
...
...
@@ -503,7 +495,6 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
kernel
.
setArg
(
index
++
,
4
*
forceThreadBlockSize
*
sizeof
(
cl_float
),
NULL
);
kernel
.
setArg
<
cl_uint
>
(
index
++
,
startTileIndex
);
kernel
.
setArg
<
cl_uint
>
(
index
++
,
startTileIndex
+
numTiles
);
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
forceBufferFlags
->
getDeviceBuffer
());
if
(
useCutoff
)
{
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
interactingTiles
->
getDeviceBuffer
());
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
interactionCount
->
getDeviceBuffer
());
...
...
platforms/opencl/src/OpenCLNonbondedUtilities.h
View file @
7943a339
...
...
@@ -196,12 +196,6 @@ public:
OpenCLArray
<
cl_uint
>&
getExclusionRowIndices
()
{
return
*
exclusionRowIndices
;
}
/**
* Get the array which contains flags for reserving force buffers.
*/
OpenCLArray
<
cl_uint
>&
getForceBufferFlags
()
{
return
*
forceBufferFlags
;
}
/**
* Get the index of the first tile this context is responsible for processing.
*/
...
...
@@ -245,7 +239,6 @@ private:
OpenCLArray
<
cl_uint
>*
interactionCount
;
OpenCLArray
<
mm_float4
>*
blockCenter
;
OpenCLArray
<
mm_float4
>*
blockBoundingBox
;
OpenCLArray
<
cl_uint
>*
forceBufferFlags
;
std
::
vector
<
std
::
vector
<
int
>
>
atomExclusions
;
std
::
vector
<
ParameterInfo
>
parameters
;
std
::
vector
<
ParameterInfo
>
arguments
;
...
...
platforms/opencl/src/kernels/customGBEnergyN2_cpu.cl
View file @
7943a339
...
...
@@ -8,7 +8,7 @@
__kernel
void
computeN2Energy
(
__global
float4*
forceBuffers,
__global
float*
energyBuffer,
__local
float4*
local_force,
__global
float4*
posq,
__local
float4*
local_posq,
__global
unsigned
int*
exclusions,
__global
unsigned
int*
exclusionIndices,
__global
unsigned
int*
exclusionRowIndices,
__local
float4*
tempBuffer,
__global
unsigned
int*
forceBufferFlags,
__global
unsigned
int*
exclusionRowIndices,
__local
float4*
tempBuffer,
#
ifdef
USE_CUTOFF
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
unsigned
int*
interactionFlags
#
else
...
...
platforms/opencl/src/kernels/customGBEnergyN2_default.cl
View file @
7943a339
...
...
@@ -9,7 +9,7 @@
__kernel
__attribute__
((
reqd_work_group_size
(
WORK_GROUP_SIZE,
1
,
1
)))
void
computeN2Energy
(
__global
float4*
forceBuffers,
__global
float*
energyBuffer,
__local
float4*
local_force,
__global
float4*
posq,
__local
float4*
local_posq,
__global
unsigned
int*
exclusions,
__global
unsigned
int*
exclusionIndices,
__global
unsigned
int*
exclusionRowIndices,
__local
float4*
tempForceBuffer,
__global
unsigned
int*
forceBufferFlags,
__global
unsigned
int*
exclusionRowIndices,
__local
float4*
tempForceBuffer,
#
ifdef
USE_CUTOFF
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles
#
else
...
...
platforms/opencl/src/kernels/customGBEnergyN2_nvidia.cl
View file @
7943a339
#
pragma
OPENCL
EXTENSION
cl_khr_global_int32_base_atomics
:
enable
#
define
TILE_SIZE
32
#
define
STORE_DERIVATIVE_1
(
INDEX
)
derivBuffers##INDEX[offset1]
+=
deriv##INDEX##_1
;
#
define
STORE_DERIVATIVE_2
(
INDEX
)
derivBuffers##INDEX[offset2]
+=
local_deriv##INDEX[get_local_id
(
0
)
]
;
/**
*
Mark
that
a
block
in
the
force
buffer
is
in
use.
*/
void
reserveBuffer
(
unsigned
int
block,
__global
unsigned
int*
forceBufferFlags
)
{
if
((
get_local_id
(
0
)
&
(
TILE_SIZE-1
))
==
0
)
while
(
atom_cmpxchg
(
&forceBufferFlags[block+NUM_BLOCKS*get_group_id
(
0
)
],
0
,
1
)
!=
0
)
;
mem_fence
(
CLK_GLOBAL_MEM_FENCE
)
;
}
/**
*
Mark
that
a
block
in
the
force
buffer
is
no
longer
in
use.
*/
void
releaseBuffer
(
unsigned
int
block,
__global
unsigned
int*
forceBufferFlags
)
{
mem_fence
(
CLK_GLOBAL_MEM_FENCE
)
;
if
((
get_local_id
(
0
)
&
(
TILE_SIZE-1
))
==
0
)
forceBufferFlags[block+NUM_BLOCKS*get_group_id
(
0
)
]
=
0
;
}
#
define
STORE_DERIVATIVE_1
(
INDEX
)
derivBuffers##INDEX[offset]
+=
deriv##INDEX##_1
;
#
define
STORE_DERIVATIVE_2
(
INDEX
)
derivBuffers##INDEX[offset]
+=
local_deriv##INDEX[get_local_id
(
0
)
]
;
/**
*
Compute
a
force
based
on
pair
interactions.
*/
__kernel
void
computeN2Energy
(
__global
float4*
forceBuffers,
__global
float*
energyBuffer,
__local
float4*
local_force,
__global
float4*
posq,
__local
float4*
local_posq,
__global
unsigned
int*
exclusions,
__global
unsigned
int*
exclusionIndices,
__global
unsigned
int*
exclusionRowIndices,
__local
float4*
tempBuffer,
__global
unsigned
int*
forceBufferFlags,
__global
unsigned
int*
exclusionRowIndices,
__local
float4*
tempBuffer,
#
ifdef
USE_CUTOFF
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
unsigned
int*
interactionFlags
#
else
...
...
@@ -48,132 +29,67 @@ __kernel void computeN2Energy(__global float4* forceBuffers, __global float* ene
unsigned
int
lasty
=
0xFFFFFFFF
;
__local
unsigned
int
exclusionRange[2*WARPS_PER_GROUP]
;
__local
int
exclusionIndex[WARPS_PER_GROUP]
;
while
(
pos
<
end
)
{
__local
int2*
reservedBlocks
=
(
__local
int2*
)
exclusionRange
;
do
{
//
Extract
the
coordinates
of
this
tile
unsigned
int
x,
y
;
#
ifdef
USE_CUTOFF
if
(
numTiles
<=
maxTiles
)
{
ushort2
tileIndices
=
tiles[pos]
;
x
=
tileIndices.x
;
y
=
tileIndices.y
;
}
else
#
endif
{
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS+0.5f-SQRT
((
NUM_BLOCKS+0.5f
)
*
(
NUM_BLOCKS+0.5f
)
-2*pos
))
;
x
=
(
pos-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
if
(
x
<
y
|
| x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
const
unsigned
int
tgx
=
get_local_id
(
0
)
&
(
TILE_SIZE-1
)
;
const
unsigned
int
tbx
=
get_local_id
(
0
)
-
tgx
;
const
unsigned
int
localGroupIndex
=
get_local_id
(
0
)
/TILE_SIZE
;
unsigned int
atom1 = x*TILE_SIZE + tgx
;
unsigned
int
x,
y
;
float4
force
=
0.0f
;
float4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
// Locate the exclusion data for this tile.
#ifdef USE_EXCLUSIONS
if (tgx < 2)
exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
if (tgx == 0)
exclusionIndex[localGroupIndex] = -1;
for (int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
if (exclusionIndices[i] == y)
exclusionIndex[localGroupIndex] = i*TILE_SIZE;
bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
#else
bool hasExclusions = false;
#endif
if (x == y) {
// This tile is on the diagonal.
const unsigned int localAtomIndex = get_local_id(0);
local_posq[localAtomIndex] = posq1;
LOAD_LOCAL_PARAMETERS_FROM_1
#ifdef USE_EXCLUSIONS
unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx];
#endif
for (unsigned int j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
#endif
int atom2 = tbx+j;
float4 posq2 = local_posq[atom2];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
DECLARE_ATOM1_DERIVATIVES
if
(
pos
<
end
)
{
#
ifdef
USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
float r = SQRT(r2);
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j;
float dEdR = 0.0f;
float tempEnergy = 0.0f;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
COMPUTE_INTERACTION
dEdR /= -r;
}
energy += 0.5f*tempEnergy;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
#ifdef USE_CUTOFF
}
#endif
#ifdef USE_EXCLUSIONS
excl >>= 1;
if
(
numTiles
<=
maxTiles
)
{
ushort2
tileIndices
=
tiles[pos]
;
x
=
tileIndices.x
;
y
=
tileIndices.y
;
}
else
#
endif
{
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS+0.5f-SQRT
((
NUM_BLOCKS+0.5f
)
*
(
NUM_BLOCKS+0.5f
)
-2*pos
))
;
x
=
(
pos-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
if
(
x
<
y
|
| x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
unsigned int atom1 = x*TILE_SIZE + tgx;
float4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
// Write results
reserveBuffer(x, forceBufferFlags);
unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset1].xyz += force.xyz;
STORE_DERIVATIVES_1
releaseBuffer(x, forceBufferFlags);
}
else {
// This is an off-diagonal tile.
// Locate the exclusion data for this tile.
const unsigned int localAtomIndex = get_local_id(0);
if (lasty != y) {
unsigned int j = y*TILE_SIZE + tgx;
local_posq[localAtomIndex] = posq[j];
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
local_force[localAtomIndex] = 0.0f;
CLEAR_LOCAL_DERIVATIVES
#ifdef USE_CUTOFF
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
if (!hasExclusions && flags == 0) {
// No interactions in this tile.
}
else
#ifdef USE_EXCLUSIONS
if (tgx < 2)
exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
if (tgx == 0)
exclusionIndex[localGroupIndex] = -1;
for (int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
if (exclusionIndices[i] == y)
exclusionIndex[localGroupIndex] = i*TILE_SIZE;
bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
#else
bool hasExclusions = false;
#endif
{
// Compute the full set of interactions in this tile.
if (pos >= end)
; // This warp is done.
else if (x == y) {
// This tile is on the diagonal.
const unsigned int localAtomIndex = get_local_id(0);
local_posq[localAtomIndex] = posq1;
LOAD_LOCAL_PARAMETERS_FROM_1
#ifdef USE_EXCLUSIONS
unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF);
excl = (excl >> tgx) |
(
excl
<<
(
TILE_SIZE
-
tgx
))
;
unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx];
#endif
unsigned
int
tj
=
tgx
;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
#endif
int
atom2
=
tbx+
t
j
;
int atom2 = tbx+j;
float4 posq2 = local_posq[atom2];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
...
...
@@ -187,44 +103,149 @@ __kernel void computeN2Energy(__global float4* forceBuffers, __global float* ene
#endif
float r = SQRT(r2);
LOAD_ATOM2_PARAMETERS
atom2
=
y*TILE_SIZE+
t
j
;
atom2 = y*TILE_SIZE+j;
float dEdR = 0.0f;
float tempEnergy = 0.0f;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS
&& atom1 != atom2
) {
COMPUTE_INTERACTION
dEdR /= -r;
}
energy
+=
tempEnergy
;
energy +=
0.5f*
tempEnergy;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
atom2
=
tbx+tj
;
local_force[atom2].xyz
+=
delta.xyz
;
RECORD_DERIVATIVE_2
#ifdef USE_CUTOFF
}
#endif
#ifdef USE_EXCLUSIONS
excl >>= 1;
#endif
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
)
;
}
}
else {
// This is an off-diagonal tile.
//
Write
results
const unsigned int localAtomIndex = get_local_id(0);
if (lasty != y) {
unsigned int j = y*TILE_SIZE + tgx;
local_posq[localAtomIndex] = posq[j];
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
local_force[localAtomIndex] = 0.0f;
CLEAR_LOCAL_DERIVATIVES
#ifdef USE_CUTOFF
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
if (!hasExclusions && flags == 0) {
// No interactions in this tile.
}
else
#endif
{
// Compute the full set of interactions in this tile.
reserveBuffer
(
x,
forceBufferFlags
)
;
unsigned
int
offset1
=
x*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
forceBuffers[offset1].xyz
+=
force.xyz
;
STORE_DERIVATIVES_1
releaseBuffer
(
x,
forceBufferFlags
)
;
reserveBuffer
(
y,
forceBufferFlags
)
;
unsigned
int
offset2
=
y*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
forceBuffers[offset2].xyz
+=
local_force[get_local_id
(
0
)
].xyz
;
STORE_DERIVATIVES_2
releaseBuffer
(
y,
forceBufferFlags
)
;
#ifdef USE_EXCLUSIONS
unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF);
excl = (excl >> tgx) |
(
excl
<<
(
TILE_SIZE
-
tgx
))
;
#
endif
unsigned
int
tj
=
tgx
;
for
(
unsigned
int
j
=
0
; j < TILE_SIZE; j++) {
#
ifdef
USE_EXCLUSIONS
bool
isExcluded
=
!
(
excl
&
0x1
)
;
#
endif
int
atom2
=
tbx+tj
;
float4
posq2
=
local_posq[atom2]
;
float4
delta
=
(
float4
)
(
posq2.xyz
-
posq1.xyz,
0.0f
)
;
#
ifdef
USE_PERIODIC
delta.x
-=
floor
(
delta.x*invPeriodicBoxSize.x+0.5f
)
*periodicBoxSize.x
;
delta.y
-=
floor
(
delta.y*invPeriodicBoxSize.y+0.5f
)
*periodicBoxSize.y
;
delta.z
-=
floor
(
delta.z*invPeriodicBoxSize.z+0.5f
)
*periodicBoxSize.z
;
#
endif
float
r2
=
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
;
#
ifdef
USE_CUTOFF
if
(
r2
<
CUTOFF_SQUARED
)
{
#
endif
float
r
=
SQRT
(
r2
)
;
LOAD_ATOM2_PARAMETERS
atom2
=
y*TILE_SIZE+tj
;
float
dEdR
=
0.0f
;
float
tempEnergy
=
0.0f
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
COMPUTE_INTERACTION
dEdR
/=
-r
;
}
energy
+=
tempEnergy
;
delta.xyz
*=
dEdR
;
force.xyz
-=
delta.xyz
;
atom2
=
tbx+tj
;
local_force[atom2].xyz
+=
delta.xyz
;
RECORD_DERIVATIVE_2
#
ifdef
USE_CUTOFF
}
#
endif
#
ifdef
USE_EXCLUSIONS
excl
>>=
1
;
#
endif
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
)
;
}
}
}
}
lasty
=
y
;
//
Write
results.
We
need
to
coordinate
between
warps
to
make
sure
no
two
of
them
//
ever
try
to
write
to
the
same
piece
of
memory
at
the
same
time.
int
writeX
=
(
pos
<
end
?
x
:
-1
)
;
int
writeY
=
(
pos
<
end
&&
x
!=
y
?
y
:
-1
)
;
if
(
tgx
==
0
)
reservedBlocks[localGroupIndex]
=
(
int2
)(
writeX,
writeY
)
;
bool
done
=
false
;
int
doneIndex
=
0
;
int
checkIndex
=
0
;
while
(
true
)
{
//
See
if
any
warp
still
needs
to
write
its
data.
bool
allDone
=
true
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
while
(
doneIndex
<
WARPS_PER_GROUP
&&
allDone
)
{
if
(
reservedBlocks[doneIndex].x
!=
-1
)
allDone
=
false
;
else
doneIndex++
;
}
if
(
allDone
)
break
;
if
(
!done
)
{
//
See
whether
this
warp
can
write
its
data.
This
requires
that
no
previous
warp
//
is
trying
to
write
to
the
same
block
of
the
buffer.
bool
canWrite
=
(
writeX
!=
-1
)
;
while
(
checkIndex
<
localGroupIndex
&&
canWrite
)
{
if
((
reservedBlocks[checkIndex].x
==
x
|
| reservedBlocks[checkIndex].y == x) ||
(writeY != -1 && (reservedBlocks[checkIndex].x == y |
|
reservedBlocks[checkIndex].y
==
y
)))
canWrite
=
false
;
else
checkIndex++
;
}
if
(
canWrite
)
{
//
Write
the
data
to
global
memory,
then
mark
this
warp
as
done.
if
(
writeX
>
-1
)
{
const
unsigned
int
offset
=
x*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
forceBuffers[offset].xyz
+=
force.xyz
;
STORE_DERIVATIVES_1
}
if
(
writeY
>
-1
)
{
const
unsigned
int
offset
=
y*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
forceBuffers[offset].xyz
+=
local_force[get_local_id
(
0
)
].xyz
;
STORE_DERIVATIVES_2
}
done
=
true
;
if
(
tgx
==
0
)
reservedBlocks[localGroupIndex]
=
(
int2
)(
-1
,
-1
)
;
}
}
}
pos++
;
}
}
while
(
pos
<
end
)
;
energyBuffer[get_global_id
(
0
)
]
+=
energy
;
}
platforms/opencl/src/kernels/customGBValueN2_cpu.cl
View file @
7943a339
...
...
@@ -6,7 +6,7 @@
__kernel
void
computeN2Value
(
__global
float4*
posq,
__local
float4*
local_posq,
__global
unsigned
int*
exclusions,
__global
unsigned
int*
exclusionIndices,
__global
unsigned
int*
exclusionRowIndices,
__global
float*
global_value,
__local
float*
local_value,
__local
float*
tempBuffer,
__global
unsigned
int*
forceBufferFlags,
__local
float*
tempBuffer,
#
ifdef
USE_CUTOFF
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
unsigned
int*
interactionFlags
#
else
...
...
platforms/opencl/src/kernels/customGBValueN2_default.cl
View file @
7943a339
...
...
@@ -7,7 +7,7 @@
__kernel
__attribute__
((
reqd_work_group_size
(
WORK_GROUP_SIZE,
1
,
1
)))
void
computeN2Value
(
__global
float4*
posq,
__local
float4*
local_posq,
__global
unsigned
int*
exclusions,
__global
unsigned
int*
exclusionIndices,
__global
unsigned
int*
exclusionRowIndices,
__global
float*
global_value,
__local
float*
local_value,
__local
float*
tempBuffer,
__global
unsigned
int*
forceBufferFlags,
__local
float*
tempBuffer,
#
ifdef
USE_CUTOFF
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles
#
else
...
...
platforms/opencl/src/kernels/customGBValueN2_nvidia.cl
View file @
7943a339
#
pragma
OPENCL
EXTENSION
cl_khr_global_int32_base_atomics
:
enable
#
define
TILE_SIZE
32
/**
*
Mark
that
a
block
in
the
value
buffer
is
in
use.
*/
void
reserveBuffer
(
unsigned
int
block,
__global
unsigned
int*
forceBufferFlags
)
{
if
((
get_local_id
(
0
)
&
(
TILE_SIZE-1
))
==
0
)
while
(
atom_cmpxchg
(
&forceBufferFlags[block+NUM_BLOCKS*get_group_id
(
0
)
],
0
,
1
)
!=
0
)
;
mem_fence
(
CLK_GLOBAL_MEM_FENCE
)
;
}
/**
*
Mark
that
a
block
in
the
value
buffer
is
no
longer
in
use.
*/
void
releaseBuffer
(
unsigned
int
block,
__global
unsigned
int*
forceBufferFlags
)
{
mem_fence
(
CLK_GLOBAL_MEM_FENCE
)
;
if
((
get_local_id
(
0
)
&
(
TILE_SIZE-1
))
==
0
)
forceBufferFlags[block+NUM_BLOCKS*get_group_id
(
0
)
]
=
0
;
}
/**
*
Compute
a
value
based
on
pair
interactions.
*/
__kernel
void
computeN2Value
(
__global
float4*
posq,
__local
float4*
local_posq,
__global
unsigned
int*
exclusions,
__global
unsigned
int*
exclusionIndices,
__global
unsigned
int*
exclusionRowIndices,
__global
float*
global_value,
__local
float*
local_value,
__local
float*
tempBuffer,
__global
unsigned
int*
forceBufferFlags,
__local
float*
tempBuffer,
#
ifdef
USE_CUTOFF
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
unsigned
int*
interactionFlags
#
else
...
...
@@ -46,222 +27,262 @@ __kernel void computeN2Value(__global float4* posq, __local float4* local_posq,
unsigned
int
lasty
=
0xFFFFFFFF
;
__local
unsigned
int
exclusionRange[2*WARPS_PER_GROUP]
;
__local
int
exclusionIndex[WARPS_PER_GROUP]
;
while
(
pos
<
end
)
{
__local
int2*
reservedBlocks
=
(
__local
int2*
)
exclusionRange
;
do
{
//
Extract
the
coordinates
of
this
tile
const
unsigned
int
tgx
=
get_local_id
(
0
)
&
(
TILE_SIZE-1
)
;
const
unsigned
int
tbx
=
get_local_id
(
0
)
-
tgx
;
const
unsigned
int
localGroupIndex
=
get_local_id
(
0
)
/TILE_SIZE
;
unsigned
int
x,
y
;
float
value
=
0.0f
;
if
(
pos
<
end
)
{
#
ifdef
USE_CUTOFF
if
(
numTiles
<=
maxTiles
)
{
ushort2
tileIndices
=
tiles[pos]
;
x
=
tileIndices.x
;
y
=
tileIndices.y
;
}
else
if
(
numTiles
<=
maxTiles
)
{
ushort2
tileIndices
=
tiles[pos]
;
x
=
tileIndices.x
;
y
=
tileIndices.y
;
}
else
#
endif
{
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS+0.5f-SQRT
((
NUM_BLOCKS+0.5f
)
*
(
NUM_BLOCKS+0.5f
)
-2*pos
))
;
x
=
(
pos-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
if
(
x
<
y
|
| x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
{
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS+0.5f-SQRT
((
NUM_BLOCKS+0.5f
)
*
(
NUM_BLOCKS+0.5f
)
-2*pos
))
;
x
=
(
pos-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
if
(
x
<
y
|
| x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
}
const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
const unsigned int tbx = get_local_id(0) - tgx;
const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
unsigned int atom1 = x*TILE_SIZE + tgx;
float value = 0.0f;
float4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
unsigned int atom1 = x*TILE_SIZE + tgx;
float4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
// Locate the exclusion data for this tile.
// Locate the exclusion data for this tile.
#ifdef USE_EXCLUSIONS
if (tgx < 2)
exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
if (tgx == 0)
exclusionIndex[localGroupIndex] = -1;
for (int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
if (exclusionIndices[i] == y)
exclusionIndex[localGroupIndex] = i*TILE_SIZE;
bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
if (tgx < 2)
exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
if (tgx == 0)
exclusionIndex[localGroupIndex] = -1;
for (int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
if (exclusionIndices[i] == y)
exclusionIndex[localGroupIndex] = i*TILE_SIZE;
bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
#else
bool hasExclusions = false;
bool hasExclusions = false;
#endif
if (x == y) {
// This tile is on the diagonal.
if (pos >= end)
; // This warp is done.
else if (x == y) {
// This tile is on the diagonal.
const unsigned int localAtomIndex = get_local_id(0);
local_posq[localAtomIndex] = posq1;
LOAD_LOCAL_PARAMETERS_FROM_1
const unsigned int localAtomIndex = get_local_id(0);
local_posq[localAtomIndex] = posq1;
LOAD_LOCAL_PARAMETERS_FROM_1
#ifdef USE_EXCLUSIONS
unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx];
unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx];
#endif
for (unsigned int j = 0; j < TILE_SIZE; j++) {
for (unsigned int j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
bool isExcluded = !(excl & 0x1);
#endif
int atom2 = tbx+j;
float4 posq2 = local_posq[atom2];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
int atom2 = tbx+j;
float4 posq2 = local_posq[atom2];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
if (r2 < CUTOFF_SQUARED) {
#endif
float r = SQRT(r2);
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j;
float tempValue1 = 0.0f;
float tempValue2 = 0.0f;
float r = SQRT(r2);
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j;
float tempValue1 = 0.0f;
float tempValue2 = 0.0f;
#ifdef USE_EXCLUSIONS
if (!isExcluded && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
if (!isExcluded && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
#else
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
#endif
COMPUTE_VALUE
}
value += tempValue1;
COMPUTE_VALUE
}
value += tempValue1;
#ifdef USE_CUTOFF
}
}
#endif
#ifdef USE_EXCLUSIONS
excl >>= 1;
excl >>= 1;
#endif
}
}
else {
// This is an off-diagonal tile.
// Write results
reserveBuffer(x, forceBufferFlags);
unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
global_value[offset] += value;
releaseBuffer(x, forceBufferFlags);
}
else {
// This is an off-diagonal tile.
if (lasty != y) {
unsigned int j = y*TILE_SIZE + tgx;
local_posq[get_local_id(0)] = posq[j];
const unsigned int localAtomIndex = get_local_id(0);
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
local_value[get_local_id(0)] = 0.0f;
#ifdef USE_CUTOFF
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
if (!hasExclusions && flags != 0xFFFFFFFF) {
if (flags == 0) {
// No interactions in this tile.
if (lasty != y) {
unsigned int j = y*TILE_SIZE + tgx;
local_posq[get_local_id(0)] = posq[j];
const unsigned int localAtomIndex = get_local_id(0);
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
else {
// Compute only a subset of the interactions in this tile.
local_value[get_local_id(0)] = 0.0f;
#ifdef USE_CUTOFF
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
if (!hasExclusions && flags != 0xFFFFFFFF) {
if (flags == 0) {
// No interactions in this tile.
}
else {
// Compute only a subset of the interactions in this tile.
for (unsigned int j = 0; j < TILE_SIZE; j++) {
if ((flags&(1<<j)) != 0) {
int atom2 = tbx+j;
float4 posq2 = local_posq[atom2];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
for (unsigned int j = 0; j < TILE_SIZE; j++) {
if ((flags&(1<<j)) != 0) {
int atom2 = tbx+j;
float4 posq2 = local_posq[atom2];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float tempValue1 = 0.0f;
float tempValue2 = 0.0f;
if (r2 < CUTOFF_SQUARED) {
float r = SQRT(r2);
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
COMPUTE_VALUE
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float tempValue1 = 0.0f;
float tempValue2 = 0.0f;
if (r2 < CUTOFF_SQUARED) {
float r = SQRT(r2);
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
COMPUTE_VALUE
}
value += tempValue1;
}
value += tempValue1;
}
tempBuffer[get_local_id(0)] = tempValue2;
tempBuffer[get_local_id(0)] = tempValue2;
// Sum the forces on atom2.
// Sum the forces on atom2.
if (tgx % 2 == 0)
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+1];
if (tgx % 4 == 0)
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+2];
if (tgx % 8 == 0)
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+4];
if (tgx % 16 == 0)
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+8];
if (tgx == 0)
local_value[tbx+j] += tempBuffer[get_local_id(0)] + tempBuffer[get_local_id(0)+16];
if (tgx % 2 == 0)
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+1];
if (tgx % 4 == 0)
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+2];
if (tgx % 8 == 0)
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+4];
if (tgx % 16 == 0)
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+8];
if (tgx == 0)
local_value[tbx+j] += tempBuffer[get_local_id(0)] + tempBuffer[get_local_id(0)+16];
}
}
}
}
}
else
else
#endif
{
// Compute the full set of interactions in this tile.
{
// Compute the full set of interactions in this tile.
#ifdef USE_EXCLUSIONS
unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF);
excl = (excl >> tgx) |
(
excl
<<
(
TILE_SIZE
-
tgx
))
;
unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF);
excl = (excl >> tgx) |
(
excl
<<
(
TILE_SIZE
-
tgx
))
;
#
endif
unsigned
int
tj
=
tgx
;
for
(
unsigned
int
j
=
0
; j < TILE_SIZE; j++) {
unsigned
int
tj
=
tgx
;
for
(
unsigned
int
j
=
0
; j < TILE_SIZE; j++) {
#
ifdef
USE_EXCLUSIONS
bool
isExcluded
=
!
(
excl
&
0x1
)
;
bool
isExcluded
=
!
(
excl
&
0x1
)
;
#
endif
int
atom2
=
tbx+tj
;
float4
posq2
=
local_posq[atom2]
;
float4
delta
=
(
float4
)
(
posq2.xyz
-
posq1.xyz,
0.0f
)
;
int
atom2
=
tbx+tj
;
float4
posq2
=
local_posq[atom2]
;
float4
delta
=
(
float4
)
(
posq2.xyz
-
posq1.xyz,
0.0f
)
;
#
ifdef
USE_PERIODIC
delta.x
-=
floor
(
delta.x*invPeriodicBoxSize.x+0.5f
)
*periodicBoxSize.x
;
delta.y
-=
floor
(
delta.y*invPeriodicBoxSize.y+0.5f
)
*periodicBoxSize.y
;
delta.z
-=
floor
(
delta.z*invPeriodicBoxSize.z+0.5f
)
*periodicBoxSize.z
;
delta.x
-=
floor
(
delta.x*invPeriodicBoxSize.x+0.5f
)
*periodicBoxSize.x
;
delta.y
-=
floor
(
delta.y*invPeriodicBoxSize.y+0.5f
)
*periodicBoxSize.y
;
delta.z
-=
floor
(
delta.z*invPeriodicBoxSize.z+0.5f
)
*periodicBoxSize.z
;
#
endif
float
r2
=
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
;
float
r2
=
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
;
#
ifdef
USE_CUTOFF
if
(
r2
<
CUTOFF_SQUARED
)
{
if
(
r2
<
CUTOFF_SQUARED
)
{
#
endif
float
r
=
SQRT
(
r2
)
;
LOAD_ATOM2_PARAMETERS
atom2
=
y*TILE_SIZE+tj
;
float
tempValue1
=
0.0f
;
float
tempValue2
=
0.0f
;
float
r
=
SQRT
(
r2
)
;
LOAD_ATOM2_PARAMETERS
atom2
=
y*TILE_SIZE+tj
;
float
tempValue1
=
0.0f
;
float
tempValue2
=
0.0f
;
#
ifdef
USE_EXCLUSIONS
if
(
!isExcluded
&&
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
if
(
!isExcluded
&&
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
#
else
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
#
endif
COMPUTE_VALUE
}
value
+=
tempValue1
;
local_value[tbx+tj]
+=
tempValue2
;
COMPUTE_VALUE
}
value
+=
tempValue1
;
local_value[tbx+tj]
+=
tempValue2
;
#
ifdef
USE_CUTOFF
}
}
#
endif
#
ifdef
USE_EXCLUSIONS
excl
>>=
1
;
excl
>>=
1
;
#
endif
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
)
;
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
)
;
}
}
}
}
//
Write
results.
We
need
to
coordinate
between
warps
to
make
sure
no
two
of
them
//
ever
try
to
write
to
the
same
piece
of
memory
at
the
same
time.
int
writeX
=
(
pos
<
end
?
x
:
-1
)
;
int
writeY
=
(
pos
<
end
&&
x
!=
y
?
y
:
-1
)
;
if
(
tgx
==
0
)
reservedBlocks[localGroupIndex]
=
(
int2
)(
writeX,
writeY
)
;
bool
done
=
false
;
int
doneIndex
=
0
;
int
checkIndex
=
0
;
while
(
true
)
{
//
See
if
any
warp
still
needs
to
write
its
data.
//
Write
results
bool
allDone
=
true
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
while
(
doneIndex
<
WARPS_PER_GROUP
&&
allDone
)
{
if
(
reservedBlocks[doneIndex].x
!=
-1
)
allDone
=
false
;
else
doneIndex++
;
}
if
(
allDone
)
break
;
if
(
!done
)
{
//
See
whether
this
warp
can
write
its
data.
This
requires
that
no
previous
warp
//
is
trying
to
write
to
the
same
block
of
the
buffer.
reserveBuffer
(
x,
forceBufferFlags
)
;
unsigned
int
offset1
=
x*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
global_value[offset1]
+=
value
;
releaseBuffer
(
x,
forceBufferFlags
)
;
reserveBuffer
(
y,
forceBufferFlags
)
;
unsigned
int
offset2
=
y*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
global_value[offset2]
+=
local_value[get_local_id
(
0
)
]
;
releaseBuffer
(
y,
forceBufferFlags
)
;
bool
canWrite
=
(
writeX
!=
-1
)
;
while
(
checkIndex
<
localGroupIndex
&&
canWrite
)
{
if
((
reservedBlocks[checkIndex].x
==
x
|
| reservedBlocks[checkIndex].y == x) ||
(writeY != -1 && (reservedBlocks[checkIndex].x == y |
|
reservedBlocks[checkIndex].y
==
y
)))
canWrite
=
false
;
else
checkIndex++
;
}
if
(
canWrite
)
{
//
Write
the
data
to
global
memory,
then
mark
this
warp
as
done.
if
(
writeX
>
-1
)
{
const
unsigned
int
offset
=
x*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
global_value[offset]
+=
value
;
}
if
(
writeY
>
-1
)
{
const
unsigned
int
offset
=
y*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
global_value[offset]
+=
local_value[get_local_id
(
0
)
]
;
}
done
=
true
;
if
(
tgx
==
0
)
reservedBlocks[localGroupIndex]
=
(
int2
)(
-1
,
-1
)
;
}
}
}
lasty
=
y
;
pos++
;
}
}
while
(
pos
<
end
)
;
}
platforms/opencl/src/kernels/gbsaObc_cpu.cl
View file @
7943a339
...
...
@@ -15,7 +15,7 @@ typedef struct {
*/
__kernel
void
computeBornSum
(
__global
float*
global_bornSum,
__global
float4*
posq,
__global
float2*
global_params,
__local
AtomData*
localData,
__local
float*
tempBuffer,
__global
unsigned
int*
forceBufferFlags,
__local
AtomData*
localData,
__local
float*
tempBuffer,
#
ifdef
USE_CUTOFF
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
unsigned
int*
interactionFlags
)
{
#
else
...
...
@@ -192,7 +192,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __global float4* po
__kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuffer,
__global float4* posq, __global float* global_bornRadii, __global float* global_bornForce,
__local AtomData* localData, __local float4* tempBuffer,
__global unsigned int* forceBufferFlags,
__local AtomData* localData, __local float4* tempBuffer,
#ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags) {
#else
...
...
platforms/opencl/src/kernels/gbsaObc_default.cl
View file @
7943a339
...
...
@@ -16,7 +16,7 @@ typedef struct {
__kernel
__attribute__
((
reqd_work_group_size
(
WORK_GROUP_SIZE,
1
,
1
)))
void
computeBornSum
(
__global
float*
global_bornSum,
__global
float4*
posq,
__global
float2*
global_params,
__local
AtomData*
localData,
__local
float*
tempBuffer,
__global
unsigned
int*
forceBufferFlags,
__local
AtomData*
localData,
__local
float*
tempBuffer,
#
ifdef
USE_CUTOFF
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles
)
{
#
else
...
...
@@ -203,7 +203,7 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuffer,
__global float4* posq, __global float* global_bornRadii, __global float* global_bornForce,
__local AtomData* localData, __local float4* tempBuffer,
__global unsigned int* forceBufferFlags,
__local AtomData* localData, __local float4* tempBuffer,
#ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles) {
#else
...
...
platforms/opencl/src/kernels/gbsaObc_nvidia.cl
View file @
7943a339
...
...
@@ -11,30 +11,11 @@ typedef struct {
float
bornForce
;
}
AtomData
;
/**
 * Acquire the in-use flag for one block of the force buffer.
 *
 * The flag is stored at forceBufferFlags[block + NUM_BLOCKS*get_group_id(0)],
 * so the work-group id selects which run of flags is addressed.  A flag value
 * of 0 means the block is free; 1 means it is taken.
 *
 * @param block             index of the block to reserve
 * @param forceBufferFlags  global array holding the per-block flags
 */
void reserveBuffer(unsigned int block, __global unsigned int* forceBufferFlags) {
    // Only work-items whose local id has no bits set under the TILE_SIZE-1 mask touch the flag.
    const size_t lane = get_local_id(0) & (TILE_SIZE-1);
    if (lane == 0) {
        const size_t flagIndex = block + NUM_BLOCKS*get_group_id(0);
        unsigned int previous;
        // Spin until this work-item atomically switches the flag from 0 to 1.
        do {
            previous = atom_cmpxchg(&forceBufferFlags[flagIndex], 0, 1);
        } while (previous != 0);
    }
    // Every work-item issues the fence, not just the one that took the flag.
    mem_fence(CLK_GLOBAL_MEM_FENCE);
}
/**
 * Clear the in-use flag for one block of the force buffer.
 *
 * Resets forceBufferFlags[block + NUM_BLOCKS*get_group_id(0)] to 0, the value
 * that reserveBuffer() waits for before it takes the flag.
 *
 * @param block             index of the block to release
 * @param forceBufferFlags  global array holding the per-block flags
 */
void releaseBuffer(unsigned int block, __global unsigned int* forceBufferFlags) {
    // The fence comes first, ahead of the store that clears the flag.
    mem_fence(CLK_GLOBAL_MEM_FENCE);

    // Work-items with any bit set under the TILE_SIZE-1 mask have nothing more to do.
    const size_t lane = get_local_id(0) & (TILE_SIZE-1);
    if (lane != 0)
        return;

    const size_t flagIndex = block + NUM_BLOCKS*get_group_id(0);
    forceBufferFlags[flagIndex] = 0;
}
/**
*
Compute
the
Born
sum.
*/
__kernel
void
computeBornSum
(
__global
float*
global_bornSum,
__global
float4*
posq,
__global
float2*
global_params,
__local
AtomData*
localData,
__local
float*
tempBuffer,
__global
unsigned
int*
forceBufferFlags,
__local
AtomData*
localData,
__local
float*
tempBuffer,
#
ifdef
USE_CUTOFF
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
unsigned
int*
interactionFlags
)
{
#
else
...
...
@@ -51,231 +32,271 @@ __kernel void computeBornSum(__global float* global_bornSum, __global float4* po
unsigned
int
end
=
(
warp+1
)
*numTiles/totalWarps
;
#
endif
unsigned
int
lasty
=
0xFFFFFFFF
;
while
(
pos
<
end
)
{
__local
int2
reservedBlocks[WARPS_PER_GROUP]
;
do
{
//
Extract
the
coordinates
of
this
tile
const
unsigned
int
tgx
=
get_local_id
(
0
)
&
(
TILE_SIZE-1
)
;
const
unsigned
int
tbx
=
get_local_id
(
0
)
-
tgx
;
const
unsigned
int
localGroupIndex
=
get_local_id
(
0
)
/TILE_SIZE
;
unsigned
int
x,
y
;
float
bornSum
=
0.0f
;
if
(
pos
<
end
)
{
#
ifdef
USE_CUTOFF
if
(
numTiles
<=
maxTiles
)
{
ushort2
tileIndices
=
tiles[pos]
;
x
=
tileIndices.x
;
y
=
tileIndices.y
;
}
else
if
(
numTiles
<=
maxTiles
)
{
ushort2
tileIndices
=
tiles[pos]
;
x
=
tileIndices.x
;
y
=
tileIndices.y
;
}
else
#
endif
{
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS+0.5f-sqrt
((
NUM_BLOCKS+0.5f
)
*
(
NUM_BLOCKS+0.5f
)
-2*pos
))
;
x
=
(
pos-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
if
(
x
<
y
|
| x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
{
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS+0.5f-sqrt
((
NUM_BLOCKS+0.5f
)
*
(
NUM_BLOCKS+0.5f
)
-2*pos
))
;
x
=
(
pos-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
if
(
x
<
y
|
| x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
}
const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
const unsigned int tbx = get_local_id(0) - tgx;
const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
unsigned int atom1 = x*TILE_SIZE + tgx;
float bornSum = 0.0f;
float4 posq1 = posq[atom1];
float2 params1 = global_params[atom1];
if (x == y) {
// This tile is on the diagonal.
unsigned int atom1 = x*TILE_SIZE + tgx;
float4 posq1 = posq[atom1];
float2 params1 = global_params[atom1];
if (pos >= end)
; // This warp is done.
else if (x == y) {
// This tile is on the diagonal.
localData[get_local_id(0)].x = posq1.x;
localData[get_local_id(0)].y = posq1.y;
localData[get_local_id(0)].z = posq1.z;
localData[get_local_id(0)].q = posq1.w;
localData[get_local_id(0)].radius = params1.x;
localData[get_local_id(0)].scaledRadius = params1.y;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
float4 delta = (float4) (localData[tbx+j].x-posq1.x, localData[tbx+j].y-posq1.y, localData[tbx+j].z-posq1.z, 0.0f);
localData[get_local_id(0)].x = posq1.x;
localData[get_local_id(0)].y = posq1.y;
localData[get_local_id(0)].z = posq1.z;
localData[get_local_id(0)].q = posq1.w;
localData[get_local_id(0)].radius = params1.x;
localData[get_local_id(0)].scaledRadius = params1.y;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
float4 delta = (float4) (localData[tbx+j].x-posq1.x, localData[tbx+j].y-posq1.y, localData[tbx+j].z-posq1.z, 0.0f);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
#endif
float invR = RSQRT(r2);
float r = RECIP(invR);
float2 params2 = (float2) (localData[tbx+j].radius, localData[tbx+j].scaledRadius);
float rScaledRadiusJ = r+params2.y;
if ((j != tgx) && (params1.x < rScaledRadiusJ)) {
float l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
float u_ij = RECIP(rScaledRadiusJ);
float l_ij2 = l_ij*l_ij;
float u_ij2 = u_ij*u_ij;
float ratio = LOG(u_ij * RECIP(l_ij));
bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2);
if (params1.x < params2.x-r)
bornSum += 2.0f*(RECIP(params1.x)-l_ij);
float invR = RSQRT(r2);
float r = RECIP(invR);
float2 params2 = (float2) (localData[tbx+j].radius, localData[tbx+j].scaledRadius);
float rScaledRadiusJ = r+params2.y;
if ((j != tgx) && (params1.x < rScaledRadiusJ)) {
float l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
float u_ij = RECIP(rScaledRadiusJ);
float l_ij2 = l_ij*l_ij;
float u_ij2 = u_ij*u_ij;
float ratio = LOG(u_ij * RECIP(l_ij));
bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2);
if (params1.x < params2.x-r)
bornSum += 2.0f*(RECIP(params1.x)-l_ij);
}
}
}
}
else {
// This is an off-diagonal tile.
// Write results
reserveBuffer(x, forceBufferFlags);
unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
global_bornSum[offset] += bornSum;
releaseBuffer(x, forceBufferFlags);
}
else {
// This is an off-diagonal tile.
if (lasty != y) {
unsigned int j = y*TILE_SIZE + tgx;
float4 tempPosq = posq[j];
localData[get_local_id(0)].x = tempPosq.x;
localData[get_local_id(0)].y = tempPosq.y;
localData[get_local_id(0)].z = tempPosq.z;
localData[get_local_id(0)].q = tempPosq.w;
float2 tempParams = global_params[j];
localData[get_local_id(0)].radius = tempParams.x;
localData[get_local_id(0)].scaledRadius = tempParams.y;
}
localData[get_local_id(0)].bornSum = 0.0f;
#ifdef USE_CUTOFF
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
if (flags != 0xFFFFFFFF && false) { // TODO: Fix this: should be checking for exclusions
if (flags == 0) {
// No interactions in this tile.
if (lasty != y) {
unsigned int j = y*TILE_SIZE + tgx;
float4 tempPosq = posq[j];
localData[get_local_id(0)].x = tempPosq.x;
localData[get_local_id(0)].y = tempPosq.y;
localData[get_local_id(0)].z = tempPosq.z;
localData[get_local_id(0)].q = tempPosq.w;
float2 tempParams = global_params[j];
localData[get_local_id(0)].radius = tempParams.x;
localData[get_local_id(0)].scaledRadius = tempParams.y;
}
else {
// Compute only a subset of the interactions in this tile.
localData[get_local_id(0)].bornSum = 0.0f;
#ifdef USE_CUTOFF
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
if (flags != 0xFFFFFFFF && false) { // TODO: Fix this: should be checking for exclusions
if (flags == 0) {
// No interactions in this tile.
}
else {
// Compute only a subset of the interactions in this tile.
for (unsigned int j = 0; j < TILE_SIZE; j++) {
if ((flags&(1<<j)) != 0) {
float4 delta = (float4) (localData[tbx+j].x-posq1.x, localData[tbx+j].y-posq1.y, localData[tbx+j].z-posq1.z, 0.0f);
for (unsigned int j = 0; j < TILE_SIZE; j++) {
if ((flags&(1<<j)) != 0) {
float4 delta = (float4) (localData[tbx+j].x-posq1.x, localData[tbx+j].y-posq1.y, localData[tbx+j].z-posq1.z, 0.0f);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
tempBuffer[get_local_id(0)] = 0.0f;
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
tempBuffer[get_local_id(0)] = 0.0f;
#ifdef USE_CUTOFF
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
#endif
float invR = RSQRT(r2);
float r = RECIP(invR);
float2 params2 = (float2) (localData[tbx+j].radius, localData[tbx+j].scaledRadius);
float rScaledRadiusJ = r+params2.y;
if (params1.x < rScaledRadiusJ) {
float l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
float u_ij = RECIP(rScaledRadiusJ);
float l_ij2 = l_ij*l_ij;
float u_ij2 = u_ij*u_ij;
float ratio = LOG(u_ij * RECIP(l_ij));
bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2);
if (params1.x < params2.x-r)
bornSum += 2.0f*(RECIP(params1.x)-l_ij);
float invR = RSQRT(r2);
float r = RECIP(invR);
float2 params2 = (float2) (localData[tbx+j].radius, localData[tbx+j].scaledRadius);
float rScaledRadiusJ = r+params2.y;
if (params1.x < rScaledRadiusJ) {
float l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
float u_ij = RECIP(rScaledRadiusJ);
float l_ij2 = l_ij*l_ij;
float u_ij2 = u_ij*u_ij;
float ratio = LOG(u_ij * RECIP(l_ij));
bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2);
if (params1.x < params2.x-r)
bornSum += 2.0f*(RECIP(params1.x)-l_ij);
}
float rScaledRadiusI = r+params1.y;
if (params2.x < rScaledRadiusI) {
float l_ij = RECIP(max(params2.x, fabs(r-params1.y)));
float u_ij = RECIP(rScaledRadiusI);
float l_ij2 = l_ij*l_ij;
float u_ij2 = u_ij*u_ij;
float ratio = LOG(u_ij * RECIP(l_ij));
float term = l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params1.y*params1.y*invR)*(l_ij2-u_ij2);
if (params2.x < params1.x-r)
term += 2.0f*(RECIP(params2.x)-l_ij);
tempBuffer[get_local_id(0)] = term;
}
}
float rScaledRadiusI = r+params1.y;
if (params2.x < rScaledRadiusI) {
float l_ij = RECIP(max(params2.x, fabs(r-params1.y)));
float u_ij = RECIP(rScaledRadiusI);
float l_ij2 = l_ij*l_ij;
float u_ij2 = u_ij*u_ij;
float ratio = LOG(u_ij * RECIP(l_ij));
float term = l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params1.y*params1.y*invR)*(l_ij2-u_ij2);
if (params2.x < params1.x-r)
term += 2.0f*(RECIP(params2.x)-l_ij);
tempBuffer[get_local_id(0)] = term;
}
}
// Sum the forces on atom j.
// Sum the forces on atom j.
if (tgx % 2 == 0)
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+1];
if (tgx % 4 == 0)
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+2];
if (tgx % 8 == 0)
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+4];
if (tgx % 16 == 0)
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+8];
if (tgx == 0)
localData[tbx+j].bornSum += tempBuffer[get_local_id(0)] + tempBuffer[get_local_id(0)+16];
if (tgx % 2 == 0)
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+1];
if (tgx % 4 == 0)
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+2];
if (tgx % 8 == 0)
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+4];
if (tgx % 16 == 0)
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+8];
if (tgx == 0)
localData[tbx+j].bornSum += tempBuffer[get_local_id(0)] + tempBuffer[get_local_id(0)+16];
}
}
}
}
}
else
else
#endif
{
// Compute the full set of interactions in this tile.
{
// Compute the full set of interactions in this tile.
unsigned int tj = tgx;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
float4 delta = (float4) (localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z, 0.0f);
unsigned int tj = tgx;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
float4 delta = (float4) (localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z, 0.0f);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else
if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS) {
if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS) {
#endif
float invR = RSQRT(r2);
float r = RECIP(invR);
float2 params2 = (float2) (localData[tbx+tj].radius, localData[tbx+tj].scaledRadius);
float rScaledRadiusJ = r+params2.y;
if (params1.x < rScaledRadiusJ) {
float l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
float u_ij = RECIP(rScaledRadiusJ);
float l_ij2 = l_ij*l_ij;
float u_ij2 = u_ij*u_ij;
float ratio = LOG(u_ij * RECIP(l_ij));
bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2);
if (params1.x < params2.x-r)
bornSum += 2.0f*(RECIP(params1.x)-l_ij);
}
float rScaledRadiusI = r+params1.y;
if (params2.x < rScaledRadiusI) {
float l_ij = RECIP(max(params2.x, fabs(r-params1.y)));
float u_ij = RECIP(rScaledRadiusI);
float l_ij2 = l_ij*l_ij;
float u_ij2 = u_ij*u_ij;
float ratio = LOG(u_ij * RECIP(l_ij));
float term = l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params1.y*params1.y*invR)*(l_ij2-u_ij2);
if (params2.x < params1.x-r)
term += 2.0f*(RECIP(params2.x)-l_ij);
localData[tbx+tj].bornSum += term;
float invR = RSQRT(r2);
float r = RECIP(invR);
float2 params2 = (float2) (localData[tbx+tj].radius, localData[tbx+tj].scaledRadius);
float rScaledRadiusJ = r+params2.y;
if (params1.x < rScaledRadiusJ) {
float l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
float u_ij = RECIP(rScaledRadiusJ);
float l_ij2 = l_ij*l_ij;
float u_ij2 = u_ij*u_ij;
float ratio = LOG(u_ij * RECIP(l_ij));
bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2);
if (params1.x < params2.x-r)
bornSum += 2.0f*(RECIP(params1.x)-l_ij);
}
float rScaledRadiusI = r+params1.y;
if (params2.x < rScaledRadiusI) {
float l_ij = RECIP(max(params2.x, fabs(r-params1.y)));
float u_ij = RECIP(rScaledRadiusI);
float l_ij2 = l_ij*l_ij;
float u_ij2 = u_ij*u_ij;
float ratio = LOG(u_ij * RECIP(l_ij));
float term = l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params1.y*params1.y*invR)*(l_ij2-u_ij2);
if (params2.x < params1.x-r)
term += 2.0f*(RECIP(params2.x)-l_ij);
localData[tbx+tj].bornSum += term;
}
}
tj = (tj + 1) & (TILE_SIZE - 1);
}
tj = (tj + 1) & (TILE_SIZE - 1);
}
}
}
// Write results. We need to coordinate between warps to make sure no two of them
// ever try to write to the same piece of memory at the same time.
int writeX = (pos < end ? x : -1);
int writeY = (pos < end && x != y ? y : -1);
if (tgx == 0)
reservedBlocks[localGroupIndex] = (int2)(writeX, writeY);
bool done = false;
int doneIndex = 0;
int checkIndex = 0;
while (true) {
// See if any warp still needs to write its data.
bool allDone = true;
barrier(CLK_LOCAL_MEM_FENCE);
while (doneIndex < WARPS_PER_GROUP && allDone) {
if (reservedBlocks[doneIndex].x != -1)
allDone = false;
else
doneIndex++;
}
if (allDone)
break;
if (!done) {
// See whether this warp can write its data. This requires that no previous warp
// is trying to write to the same block of the buffer.
// Write results
bool canWrite = (writeX != -1);
while (checkIndex < localGroupIndex && canWrite) {
if ((reservedBlocks[checkIndex].x == x || reservedBlocks[checkIndex].y == x) ||
(writeY != -1 && (reservedBlocks[checkIndex].x == y || reservedBlocks[checkIndex].y == y)))
canWrite = false;
else
checkIndex++;
}
if (canWrite) {
// Write the data to global memory, then mark this warp as done.
reserveBuffer(x, forceBufferFlags);
unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
global_bornSum[offset1] += bornSum;
releaseBuffer(x, forceBufferFlags);
reserveBuffer(y, forceBufferFlags);
unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
global_bornSum[offset2] += localData[get_local_id(0)].bornSum;
releaseBuffer(y, forceBufferFlags);
if (writeX > -1) {
const unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
global_bornSum[offset] += bornSum;
}
if (writeY > -1) {
const unsigned int offset = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
global_bornSum[offset] += localData[get_local_id(0)].bornSum;
}
done = true;
if (tgx == 0)
reservedBlocks[localGroupIndex] = (int2)(-1, -1);
}
}
}
lasty = y;
pos++;
}
}
while (pos < end);
}
/**
...
...
@@ -284,7 +305,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __global float4* po
__kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuffer,
__global float4* posq, __global float* global_bornRadii, __global float* global_bornForce,
__local AtomData* localData, __local float4* tempBuffer,
__global unsigned int* forceBufferFlags,
__local AtomData* localData, __local float4* tempBuffer,
#ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags) {
#else
...
...
@@ -302,113 +323,171 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
#endif
float energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF;
while (pos < end) {
__local int2 reservedBlocks[WARPS_PER_GROUP];
do {
// Extract the coordinates of this tile
const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
const unsigned int tbx = get_local_id(0) - tgx;
const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
unsigned int x, y;
float4 force = 0.0f;
if (pos < end) {
#ifdef USE_CUTOFF
if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos];
x = tileIndices.x;
y = tileIndices.y;
}
else
if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos];
x = tileIndices.x;
y = tileIndices.y;
}
else
#endif
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
}
const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
const unsigned int tbx = get_local_id(0) - tgx;
const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
unsigned int atom1 = x*TILE_SIZE + tgx;
float4 force = 0.0f;
float4 posq1 = posq[atom1];
float bornRadius1 = global_bornRadii[atom1];
if (x == y) {
// This tile is on the diagonal.
unsigned int atom1 = x*TILE_SIZE + tgx;
float4 posq1 = posq[atom1];
float bornRadius1 = global_bornRadii[atom1];
if (x == y) {
// This tile is on the diagonal.
localData[get_local_id(0)].x = posq1.x;
localData[get_local_id(0)].y = posq1.y;
localData[get_local_id(0)].z = posq1.z;
localData[get_local_id(0)].q = posq1.w;
localData[get_local_id(0)].bornRadius = bornRadius1;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
float4 posq2 = (float4) (localData[tbx+j].x, localData[tbx+j].y, localData[tbx+j].z, localData[tbx+j].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
localData[get_local_id(0)].x = posq1.x;
localData[get_local_id(0)].y = posq1.y;
localData[get_local_id(0)].z = posq1.z;
localData[get_local_id(0)].q = posq1.w;
localData[get_local_id(0)].bornRadius = bornRadius1;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
float4 posq2 = (float4) (localData[tbx+j].x, localData[tbx+j].y, localData[tbx+j].z, localData[tbx+j].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float invR = RSQRT(r2);
float r = RECIP(invR);
float bornRadius2 = localData[tbx+j].bornRadius;
float alpha2_ij = bornRadius1*bornRadius2;
float D_ij = r2*RECIP(4.0f*alpha2_ij);
float expTerm = EXP(-D_ij);
float denominator2 = r2 + alpha2_ij*expTerm;
float denominator = SQRT(denominator2);
float tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
float Gpol = tempEnergy*RECIP(denominator2);
float dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
float dEdR = Gpol*(1.0f - 0.25f*expTerm);
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float invR = RSQRT(r2);
float r = RECIP(invR);
float bornRadius2 = localData[tbx+j].bornRadius;
float alpha2_ij = bornRadius1*bornRadius2;
float D_ij = r2*RECIP(4.0f*alpha2_ij);
float expTerm = EXP(-D_ij);
float denominator2 = r2 + alpha2_ij*expTerm;
float denominator = SQRT(denominator2);
float tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
float Gpol = tempEnergy*RECIP(denominator2);
float dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
float dEdR = Gpol*(1.0f - 0.25f*expTerm);
#ifdef USE_CUTOFF
if (r2 > CUTOFF_SQUARED) {
dEdR = 0.0f;
tempEnergy = 0.0f;
dGpol_dalpha2_ij = 0.0f;
}
if (r2 > CUTOFF_SQUARED) {
dEdR = 0.0f;
tempEnergy = 0.0f;
dGpol_dalpha2_ij = 0.0f;
}
#endif
force.w += dGpol_dalpha2_ij*bornRadius2;
energy += 0.5f*tempEnergy;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
force.w += dGpol_dalpha2_ij*bornRadius2;
energy += 0.5f*tempEnergy;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
}
}
}
else {
// This is an off-diagonal tile.
// Write results
reserveBuffer(x, forceBufferFlags);
unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz += force.xyz;
global_bornForce[offset] += force.w;
releaseBuffer(x, forceBufferFlags);
}
else {
// This is an off-diagonal tile.
if (lasty != y) {
unsigned int j = y*TILE_SIZE + tgx;
float4 tempPosq = posq[j];
localData[get_local_id(0)].x = tempPosq.x;
localData[get_local_id(0)].y = tempPosq.y;
localData[get_local_id(0)].z = tempPosq.z;
localData[get_local_id(0)].q = tempPosq.w;
localData[get_local_id(0)].bornRadius = global_bornRadii[j];
}
localData[get_local_id(0)].fx = 0.0f;
localData[get_local_id(0)].fy = 0.0f;
localData[get_local_id(0)].fz = 0.0f;
localData[get_local_id(0)].fw = 0.0f;
#ifdef USE_CUTOFF
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
if (flags != 0xFFFFFFFF && false) { // TODO: Fix this: should be checking for exclusions
if (flags == 0) {
// No interactions in this tile.
}
else {
// Compute only a subset of the interactions in this tile.
if (lasty != y) {
unsigned int j = y*TILE_SIZE + tgx;
float4 tempPosq = posq[j];
localData[get_local_id(0)].x = tempPosq.x;
localData[get_local_id(0)].y = tempPosq.y;
localData[get_local_id(0)].z = tempPosq.z;
localData[get_local_id(0)].q = tempPosq.w;
localData[get_local_id(0)].bornRadius = global_bornRadii[j];
}
localData[get_local_id(0)].fx = 0.0f;
localData[get_local_id(0)].fy = 0.0f;
localData[get_local_id(0)].fz = 0.0f;
localData[get_local_id(0)].fw = 0.0f;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
if ((flags&(1<<j)) != 0) {
float4 posq2 = (float4) (localData[tbx+j].x, localData[tbx+j].y, localData[tbx+j].z, localData[tbx+j].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float invR = RSQRT(r2);
float r = RECIP(invR);
float bornRadius2 = localData[tbx+j].bornRadius;
float alpha2_ij = bornRadius1*bornRadius2;
float D_ij = r2*RECIP(4.0f*alpha2_ij);
float expTerm = EXP(-D_ij);
float denominator2 = r2 + alpha2_ij*expTerm;
float denominator = SQRT(denominator2);
float tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
float Gpol = tempEnergy*RECIP(denominator2);
float dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
float dEdR = Gpol*(1.0f - 0.25f*expTerm);
#ifdef USE_CUTOFF
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
if (flags != 0xFFFFFFFF && false) { // TODO: Fix this: should be checking for exclusions
if (flags == 0) {
// No interactions in this tile.
if (atom1 >= NUM_ATOMS || y*TILE_SIZE+j >= NUM_ATOMS || r2 > CUTOFF_SQUARED) {
#else
if (atom1 >= NUM_ATOMS || y*TILE_SIZE+j >= NUM_ATOMS) {
#endif
dEdR = 0.0f;
dGpol_dalpha2_ij = 0.0f;
tempEnergy = 0.0f;
}
energy += tempEnergy;
force.w += dGpol_dalpha2_ij*bornRadius2;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
tempBuffer[get_local_id(0)] = (float4) (delta.xyz, dGpol_dalpha2_ij*bornRadius1);
// Sum the forces on atom j.
if (tgx % 2 == 0)
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+1];
if (tgx % 4 == 0)
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+2];
if (tgx % 8 == 0)
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+4];
if (tgx % 16 == 0)
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+8];
if (tgx == 0) {
float4 sum = tempBuffer[get_local_id(0)] + tempBuffer[get_local_id(0)+16];
localData[tbx+j].fx += sum.x;
localData[tbx+j].fy += sum.y;
localData[tbx+j].fz += sum.z;
localData[tbx+j].fw += sum.w;
}
}
}
}
}
else {
// Compute only a subset of the interactions in this tile.
else
#endif
{
// Compute the full set of interactions in this tile.
unsigned int tj = tgx;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
if (
(flags&(1<<j)) != 0
) {
float4 posq2 = (float4) (localData[tbx+j].x, localData[tbx+j].y, localData[tbx+j].z, localData[tbx+j].q);
if (
atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS
) {
float4 posq2 = (float4) (localData[tbx+
t
j].x, localData[tbx+
t
j].y, localData[tbx+
t
j].z, localData[tbx+
t
j].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
...
...
@@ -418,7 +497,7 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float invR = RSQRT(r2);
float r = RECIP(invR);
float bornRadius2 = localData[tbx+j].bornRadius;
float bornRadius2 = localData[tbx+
t
j].bornRadius;
float alpha2_ij = bornRadius1*bornRadius2;
float D_ij = r2*RECIP(4.0f*alpha2_ij);
float expTerm = EXP(-D_ij);
...
...
@@ -429,104 +508,83 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
float dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
float dEdR = Gpol*(1.0f - 0.25f*expTerm);
#ifdef USE_CUTOFF
if (atom1 >= NUM_ATOMS || y*TILE_SIZE+j >= NUM_ATOMS || r2 > CUTOFF_SQUARED) {
#else
if (atom1 >= NUM_ATOMS |
|
y*TILE_SIZE+j
>=
NUM_ATOMS
)
{
#
endif
if (r2 > CUTOFF_SQUARED) {
dEdR = 0.0f;
tempEnergy = 0.0f;
dGpol_dalpha2_ij = 0.0f;
tempEnergy
=
0.0f
;
}
energy
+=
tempEnergy
;
#endif
force.w += dGpol_dalpha2_ij*bornRadius2;
energy += tempEnergy;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
tempBuffer[get_local_id
(
0
)
]
=
(
float4
)
(
delta.xyz,
dGpol_dalpha2_ij*bornRadius1
)
;
//
Sum
the
forces
on
atom
j.
if
(
tgx
%
2
==
0
)
tempBuffer[get_local_id
(
0
)
]
+=
tempBuffer[get_local_id
(
0
)
+1]
;
if
(
tgx
%
4
==
0
)
tempBuffer[get_local_id
(
0
)
]
+=
tempBuffer[get_local_id
(
0
)
+2]
;
if
(
tgx
%
8
==
0
)
tempBuffer[get_local_id
(
0
)
]
+=
tempBuffer[get_local_id
(
0
)
+4]
;
if
(
tgx
%
16
==
0
)
tempBuffer[get_local_id
(
0
)
]
+=
tempBuffer[get_local_id
(
0
)
+8]
;
if
(
tgx
==
0
)
{
float4
sum
=
tempBuffer[get_local_id
(
0
)
]
+
tempBuffer[get_local_id
(
0
)
+16]
;
localData[tbx+j].fx
+=
sum.x
;
localData[tbx+j].fy
+=
sum.y
;
localData[tbx+j].fz
+=
sum.z
;
localData[tbx+j].fw
+=
sum.w
;
}
localData[tbx+tj].fx += delta.x;
localData[tbx+tj].fy += delta.y;
localData[tbx+tj].fz += delta.z;
localData[tbx+tj].fw += dGpol_dalpha2_ij*bornRadius1;
}
tj = (tj + 1) & (TILE_SIZE - 1);
}
}
}
else
#
endif
{
//
Compute
the
full
set
of
interactions
in
this
tile.
}
// Write results. We need to coordinate between warps to make sure no two of them
// ever try to write to the same piece of memory at the same time.
int writeX = (pos < end ? x : -1);
int writeY = (pos < end && x != y ? y : -1);
if (tgx == 0)
reservedBlocks[localGroupIndex] = (int2)(writeX, writeY);
bool done = false;
int doneIndex = 0;
int checkIndex = 0;
while (true) {
// See if any warp still needs to write its data.
unsigned
int
tj
=
tgx
;
for
(
unsigned
int
j
=
0
; j < TILE_SIZE; j++) {
if
(
atom1
<
NUM_ATOMS
&&
y*TILE_SIZE+tj
<
NUM_ATOMS
)
{
float4
posq2
=
(
float4
)
(
localData[tbx+tj].x,
localData[tbx+tj].y,
localData[tbx+tj].z,
localData[tbx+tj].q
)
;
float4
delta
=
(
float4
)
(
posq2.xyz
-
posq1.xyz,
0.0f
)
;
#
ifdef
USE_PERIODIC
delta.x
-=
floor
(
delta.x*invPeriodicBoxSize.x+0.5f
)
*periodicBoxSize.x
;
delta.y
-=
floor
(
delta.y*invPeriodicBoxSize.y+0.5f
)
*periodicBoxSize.y
;
delta.z
-=
floor
(
delta.z*invPeriodicBoxSize.z+0.5f
)
*periodicBoxSize.z
;
#
endif
float
r2
=
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
;
float
invR
=
RSQRT
(
r2
)
;
float
r
=
RECIP
(
invR
)
;
float
bornRadius2
=
localData[tbx+tj].bornRadius
;
float
alpha2_ij
=
bornRadius1*bornRadius2
;
float
D_ij
=
r2*RECIP
(
4.0f*alpha2_ij
)
;
float
expTerm
=
EXP
(
-D_ij
)
;
float
denominator2
=
r2
+
alpha2_ij*expTerm
;
float
denominator
=
SQRT
(
denominator2
)
;
float
tempEnergy
=
(
PREFACTOR*posq1.w*posq2.w
)
*RECIP
(
denominator
)
;
float
Gpol
=
tempEnergy*RECIP
(
denominator2
)
;
float
dGpol_dalpha2_ij
=
-0.5f*Gpol*expTerm*
(
1.0f+D_ij
)
;
float
dEdR
=
Gpol*
(
1.0f
-
0.25f*expTerm
)
;
#
ifdef
USE_CUTOFF
if
(
r2
>
CUTOFF_SQUARED
)
{
dEdR
=
0.0f
;
tempEnergy
=
0.0f
;
dGpol_dalpha2_ij
=
0.0f
;
}
#
endif
force.w
+=
dGpol_dalpha2_ij*bornRadius2
;
energy
+=
tempEnergy
;
delta.xyz
*=
dEdR
;
force.xyz
-=
delta.xyz
;
localData[tbx+tj].fx
+=
delta.x
;
localData[tbx+tj].fy
+=
delta.y
;
localData[tbx+tj].fz
+=
delta.z
;
localData[tbx+tj].fw
+=
dGpol_dalpha2_ij*bornRadius1
;
}
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
)
;
}
bool allDone = true;
barrier(CLK_LOCAL_MEM_FENCE);
while (doneIndex < WARPS_PER_GROUP && allDone) {
if (reservedBlocks[doneIndex].x != -1)
allDone = false;
else
doneIndex++;
}
if (allDone)
break;
if (!done) {
// See whether this warp can write its data. This requires that no previous warp
// is trying to write to the same block of the buffer.
//
Write
results
bool canWrite = (writeX != -1);
while (checkIndex < localGroupIndex && canWrite) {
if ((reservedBlocks[checkIndex].x == x || reservedBlocks[checkIndex].y == x) ||
(writeY != -1 && (reservedBlocks[checkIndex].x == y |
|
reservedBlocks[checkIndex].y
==
y
)
))
canWrite
=
false
;
else
checkIndex++
;
}
if
(
canWrite
)
{
//
Write
the
data
to
global
memory,
then
mark
this
warp
as
done.
reserveBuffer
(
x,
forceBufferFlags
)
;
unsigned
int
offset1
=
x*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
forceBuffers[offset1].xyz
+=
force.xyz
;
global_bornForce[offset1]
+=
force.w
;
releaseBuffer
(
x,
forceBufferFlags
)
;
reserveBuffer
(
y,
forceBufferFlags
)
;
unsigned
int
offset2
=
y*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
forceBuffers[offset2]
+=
(
float4
)
(
localData[get_local_id
(
0
)
].fx,
localData[get_local_id
(
0
)
].fy,
localData[get_local_id
(
0
)
].fz,
0
)
;
global_bornForce[offset2]
+=
localData[get_local_id
(
0
)
].fw
;
releaseBuffer
(
y,
forceBufferFlags
)
;
if
(
writeX
>
-1
)
{
const
unsigned
int
offset
=
x*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
forceBuffers[offset].xyz
+=
force.xyz
;
global_bornForce[offset]
+=
force.w
;
}
if
(
writeY
>
-1
)
{
const
unsigned
int
offset
=
y*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
forceBuffers[offset]
+=
(
float4
)
(
localData[get_local_id
(
0
)
].fx,
localData[get_local_id
(
0
)
].fy,
localData[get_local_id
(
0
)
].fz,
0.0f
)
;
global_bornForce[offset]
+=
localData[get_local_id
(
0
)
].fw
;
}
done
=
true
;
if
(
tgx
==
0
)
reservedBlocks[localGroupIndex]
=
(
int2
)(
-1
,
-1
)
;
}
}
}
lasty
=
y
;
pos++
;
}
}
while
(
pos
<
end
)
;
energyBuffer[get_global_id
(
0
)
]
+=
energy
;
}
platforms/opencl/src/kernels/nonbonded_cpu.cl
View file @
7943a339
...
...
@@ -13,7 +13,7 @@ typedef struct {
__kernel
void
computeNonbonded
(
__global
float4*
forceBuffers,
__global
float*
energyBuffer,
__global
float4*
posq,
__global
unsigned
int*
exclusions,
__global
unsigned
int*
exclusionIndices,
__global
unsigned
int*
exclusionRowIndices,
__local
AtomData*
localData,
__local
float4*
tempBuffer,
unsigned
int
startTileIndex,
unsigned
int
endTileIndex,
__global
unsigned
int*
forceBufferFlags,
unsigned
int
startTileIndex,
unsigned
int
endTileIndex,
#
ifdef
USE_CUTOFF
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
unsigned
int*
interactionFlags
#
else
...
...
platforms/opencl/src/kernels/nonbonded_default.cl
View file @
7943a339
...
...
@@ -14,7 +14,7 @@ typedef struct {
__kernel
__attribute__
((
reqd_work_group_size
(
WORK_GROUP_SIZE,
1
,
1
)))
void
computeNonbonded
(
__global
float4*
forceBuffers,
__global
float*
energyBuffer,
__global
float4*
posq,
__global
unsigned
int*
exclusions,
__global
unsigned
int*
exclusionIndices,
__global
unsigned
int*
exclusionRowIndices,
__local
AtomData*
localData,
__local
float4*
tempBuffer,
unsigned
int
startTileIndex,
unsigned
int
endTileIndex,
__global
unsigned
int*
forceBufferFlags,
unsigned
int
startTileIndex,
unsigned
int
endTileIndex,
#
ifdef
USE_CUTOFF
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
unsigned
int*
interactionFlags
#
else
...
...
platforms/opencl/src/kernels/nonbonded_nvidia.cl
View file @
7943a339
...
...
@@ -8,31 +8,12 @@ typedef struct {
ATOM_PARAMETER_DATA
}
AtomData
;
/**
 * Mark a block of the force buffer as in use.
 *
 * The flag for the block lives at forceBufferFlags[block+NUM_BLOCKS*get_group_id(0)].
 * Only the work-item whose local id is a multiple of TILE_SIZE touches the flag: it
 * spins on an atomic compare-and-exchange until it succeeds in changing the flag
 * from 0 to 1. Every work-item that calls this function then issues a global
 * memory fence.
 *
 * @param block             index of the force buffer block to reserve
 * @param forceBufferFlags  array holding one flag per block per work-group
 */
void reserveBuffer(unsigned int block, __global unsigned int* forceBufferFlags) {
    const bool isFirstInTile = ((get_local_id(0) & (TILE_SIZE-1)) == 0);
    if (isFirstInTile) {
        __global unsigned int* flag = &forceBufferFlags[block+NUM_BLOCKS*get_group_id(0)];
        // Busy-wait: atom_cmpxchg returns the previous value, so anything other
        // than 0 means the flag was already set and the swap did not happen.
        while (atom_cmpxchg(flag, 0, 1) != 0) {
        }
    }
    mem_fence(CLK_GLOBAL_MEM_FENCE);
}
/**
 * Mark a block of the force buffer as no longer in use.
 *
 * Every calling work-item first issues a global memory fence. The work-item whose
 * local id is a multiple of TILE_SIZE then resets the block's flag, stored at
 * forceBufferFlags[block+NUM_BLOCKS*get_group_id(0)], to 0.
 *
 * @param block             index of the force buffer block to release
 * @param forceBufferFlags  array holding one flag per block per work-group
 */
void releaseBuffer(unsigned int block, __global unsigned int* forceBufferFlags) {
    // The fence comes before the flag is cleared, matching the original ordering.
    mem_fence(CLK_GLOBAL_MEM_FENCE);
    const bool isFirstInTile = ((get_local_id(0) & (TILE_SIZE-1)) == 0);
    if (isFirstInTile) {
        forceBufferFlags[block+NUM_BLOCKS*get_group_id(0)] = 0;
    }
}
/**
*
Compute
nonbonded
interactions.
*/
__kernel
void
computeNonbonded
(
__global
float4*
forceBuffers,
__global
float*
energyBuffer,
__global
float4*
posq,
__global
unsigned
int*
exclusions,
__global
unsigned
int*
exclusionIndices,
__global
unsigned
int*
exclusionRowIndices,
__local
AtomData*
localData,
__local
float*
tempBuffer,
unsigned
int
startTileIndex,
unsigned
int
endTileIndex,
__global
unsigned
int*
forceBufferFlags,
unsigned
int
startTileIndex,
unsigned
int
endTileIndex,
#
ifdef
USE_CUTOFF
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
unsigned
int*
interactionFlags
#
else
...
...
@@ -53,264 +34,306 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
unsigned
int
lasty
=
0xFFFFFFFF
;
__local
unsigned
int
exclusionRange[2*WARPS_PER_GROUP]
;
__local
int
exclusionIndex[WARPS_PER_GROUP]
;
while
(
pos
<
end
)
{
__local
int2*
reservedBlocks
=
(
__local
int2*
)
exclusionRange
;
do
{
//
Extract
the
coordinates
of
this
tile
const
unsigned
int
tgx
=
get_local_id
(
0
)
&
(
TILE_SIZE-1
)
;
const
unsigned
int
tbx
=
get_local_id
(
0
)
-
tgx
;
const
unsigned
int
localGroupIndex
=
get_local_id
(
0
)
/TILE_SIZE
;
unsigned
int
x,
y
;
float4
force
=
0.0f
;
if
(
pos
<
end
)
{
#
ifdef
USE_CUTOFF
if
(
numTiles
<=
maxTiles
)
{
ushort2
tileIndices
=
tiles[pos]
;
x
=
tileIndices.x
;
y
=
tileIndices.y
;
}
else
if
(
numTiles
<=
maxTiles
)
{
ushort2
tileIndices
=
tiles[pos]
;
x
=
tileIndices.x
;
y
=
tileIndices.y
;
}
else
#
endif
{
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS+0.5f-SQRT
((
NUM_BLOCKS+0.5f
)
*
(
NUM_BLOCKS+0.5f
)
-2*pos
))
;
x
=
(
pos-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
if
(
x
<
y
|
| x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
{
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS+0.5f-SQRT
((
NUM_BLOCKS+0.5f
)
*
(
NUM_BLOCKS+0.5f
)
-2*pos
))
;
x
=
(
pos-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
if
(
x
<
y
|
| x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
}
const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
const unsigned int tbx = get_local_id(0) - tgx;
const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
unsigned int atom1 = x*TILE_SIZE + tgx;
float4 force = 0.0f;
float4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
unsigned int atom1 = x*TILE_SIZE + tgx;
float4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
// Locate the exclusion data for this tile.
// Locate the exclusion data for this tile.
#ifdef USE_EXCLUSIONS
if (tgx < 2)
exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
if (tgx == 0)
exclusionIndex[localGroupIndex] = -1;
for (int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
if (exclusionIndices[i] == y)
exclusionIndex[localGroupIndex] = i*TILE_SIZE;
bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
if (tgx < 2)
exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
if (tgx == 0)
exclusionIndex[localGroupIndex] = -1;
for (int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
if (exclusionIndices[i] == y)
exclusionIndex[localGroupIndex] = i*TILE_SIZE;
bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
#else
bool hasExclusions = false;
bool hasExclusions = false;
#endif
if (x == y) {
// This tile is on the diagonal.
if (pos >= end)
; // This warp is done.
else if (x == y) {
// This tile is on the diagonal.
const unsigned int localAtomIndex = get_local_id(0);
localData[localAtomIndex].x = posq1.x;
localData[localAtomIndex].y = posq1.y;
localData[localAtomIndex].z = posq1.z;
localData[localAtomIndex].q = posq1.w;
LOAD_LOCAL_PARAMETERS_FROM_1
const unsigned int localAtomIndex = get_local_id(0);
localData[localAtomIndex].x = posq1.x;
localData[localAtomIndex].y = posq1.y;
localData[localAtomIndex].z = posq1.z;
localData[localAtomIndex].q = posq1.w;
LOAD_LOCAL_PARAMETERS_FROM_1
#ifdef USE_EXCLUSIONS
unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx];
unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx];
#endif
for (unsigned int j = 0; j < TILE_SIZE; j++) {
for (unsigned int j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
bool isExcluded = !(excl & 0x1);
#endif
int atom2 = tbx+j;
float4 posq2 = (float4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
int atom2 = tbx+j;
float4 posq2 = (float4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float invR = RSQRT(r2);
float r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j;
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float invR = RSQRT(r2);
float r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j;
#ifdef USE_SYMMETRIC
float dEdR = 0.0f;
float dEdR = 0.0f;
#else
float4 dEdR1 = (float4) 0.0f;
float4 dEdR2 = (float4) 0.0f;
float4 dEdR1 = (float4) 0.0f;
float4 dEdR2 = (float4) 0.0f;
#endif
float tempEnergy = 0.0f;
COMPUTE_INTERACTION
energy += 0.5f*tempEnergy;
float tempEnergy = 0.0f;
COMPUTE_INTERACTION
energy += 0.5f*tempEnergy;
#ifdef USE_SYMMETRIC
force.xyz -= delta.xyz*dEdR;
force.xyz -= delta.xyz*dEdR;
#else
force.xyz -= dEdR1.xyz;
force.xyz -= dEdR1.xyz;
#endif
excl >>= 1;
#ifdef USE_EXCLUSIONS
excl >>= 1;
#endif
}
}
else {
// This is an off-diagonal tile.
// Write results
reserveBuffer(x, forceBufferFlags);
unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz += force.xyz;
releaseBuffer(x, forceBufferFlags);
}
else {
// This is an off-diagonal tile.
const unsigned int localAtomIndex = get_local_id(0);
if (lasty != y) {
unsigned int j = y*TILE_SIZE + tgx;
float4 tempPosq = posq[j];
localData[localAtomIndex].x = tempPosq.x;
localData[localAtomIndex].y = tempPosq.y;
localData[localAtomIndex].z = tempPosq.z;
localData[localAtomIndex].q = tempPosq.w;
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
localData[localAtomIndex].fx = 0.0f;
localData[localAtomIndex].fy = 0.0f;
localData[localAtomIndex].fz = 0.0f;
#ifdef USE_CUTOFF
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
if (!hasExclusions && flags != 0xFFFFFFFF) {
if (flags == 0) {
// No interactions in this tile.
const unsigned int localAtomIndex = get_local_id(0);
if (lasty != y) {
unsigned int j = y*TILE_SIZE + tgx;
float4 tempPosq = posq[j];
localData[localAtomIndex].x = tempPosq.x;
localData[localAtomIndex].y = tempPosq.y;
localData[localAtomIndex].z = tempPosq.z;
localData[localAtomIndex].q = tempPosq.w;
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
else {
// Compute only a subset of the interactions in this tile.
localData[localAtomIndex].fx = 0.0f;
localData[localAtomIndex].fy = 0.0f;
localData[localAtomIndex].fz = 0.0f;
#ifdef USE_CUTOFF
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
if (!hasExclusions && flags != 0xFFFFFFFF) {
if (flags == 0) {
// No interactions in this tile.
}
else {
// Compute only a subset of the interactions in this tile.
for (unsigned int j = 0; j < TILE_SIZE; j++) {
if ((flags&(1<<j)) != 0) {
bool isExcluded = false;
int atom2 = tbx+j;
float4 posq2 = (float4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
for (unsigned int j = 0; j < TILE_SIZE; j++) {
if ((flags&(1<<j)) != 0) {
bool isExcluded = false;
int atom2 = tbx+j;
float4 posq2 = (float4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float invR = RSQRT(r2);
float r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j;
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float invR = RSQRT(r2);
float r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j;
#ifdef USE_SYMMETRIC
float dEdR = 0.0f;
float dEdR = 0.0f;
#else
float4 dEdR1 = (float4) 0.0f;
float4 dEdR2 = (float4) 0.0f;
float4 dEdR1 = (float4) 0.0f;
float4 dEdR2 = (float4) 0.0f;
#endif
float tempEnergy = 0.0f;
COMPUTE_INTERACTION
energy += tempEnergy;
int bufferIndex = 3*get_local_id(0);
float tempEnergy = 0.0f;
COMPUTE_INTERACTION
energy += tempEnergy;
int bufferIndex = 3*get_local_id(0);
#ifdef USE_SYMMETRIC
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
tempBuffer[bufferIndex] = delta.x;
tempBuffer[bufferIndex+1] = delta.y;
tempBuffer[bufferIndex+2] = delta.z;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
tempBuffer[bufferIndex] = delta.x;
tempBuffer[bufferIndex+1] = delta.y;
tempBuffer[bufferIndex+2] = delta.z;
#else
force.xyz -= dEdR1.xyz;
tempBuffer[bufferIndex] = dEdR2.x;
tempBuffer[bufferIndex+1] = dEdR2.y;
tempBuffer[bufferIndex+2] = dEdR2.z;
force.xyz -= dEdR1.xyz;
tempBuffer[bufferIndex] = dEdR2.x;
tempBuffer[bufferIndex+1] = dEdR2.y;
tempBuffer[bufferIndex+2] = dEdR2.z;
#endif
// Sum the forces on atom2.
// Sum the forces on atom2.
if (tgx % 2 == 0) {
tempBuffer[bufferIndex] += tempBuffer[bufferIndex+3];
tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+4];
tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+5];
}
if (tgx % 4 == 0) {
tempBuffer[bufferIndex] += tempBuffer[bufferIndex+6];
tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+7];
tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+8];
}
if (tgx % 8 == 0) {
tempBuffer[bufferIndex] += tempBuffer[bufferIndex+12];
tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+13];
tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+14];
}
if (tgx % 16 == 0) {
tempBuffer[bufferIndex] += tempBuffer[bufferIndex+24];
tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+25];
tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+26];
}
if (tgx == 0) {
localData[tbx+j].fx += tempBuffer[bufferIndex] + tempBuffer[bufferIndex+48];
localData[tbx+j].fy += tempBuffer[bufferIndex+1] + tempBuffer[bufferIndex+49];
localData[tbx+j].fz += tempBuffer[bufferIndex+2] + tempBuffer[bufferIndex+50];
if (tgx % 2 == 0) {
tempBuffer[bufferIndex] += tempBuffer[bufferIndex+3];
tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+4];
tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+5];
}
if (tgx % 4 == 0) {
tempBuffer[bufferIndex] += tempBuffer[bufferIndex+6];
tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+7];
tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+8];
}
if (tgx % 8 == 0) {
tempBuffer[bufferIndex] += tempBuffer[bufferIndex+12];
tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+13];
tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+14];
}
if (tgx % 16 == 0) {
tempBuffer[bufferIndex] += tempBuffer[bufferIndex+24];
tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+25];
tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+26];
}
if (tgx == 0) {
localData[tbx+j].fx += tempBuffer[bufferIndex] + tempBuffer[bufferIndex+48];
localData[tbx+j].fy += tempBuffer[bufferIndex+1] + tempBuffer[bufferIndex+49];
localData[tbx+j].fz += tempBuffer[bufferIndex+2] + tempBuffer[bufferIndex+50];
}
}
}
}
}
}
else
else
#endif
{
// Compute the full set of interactions in this tile.
{
// Compute the full set of interactions in this tile.
#ifdef USE_EXCLUSIONS
unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF);
excl = (excl >> tgx) |
(
excl
<<
(
TILE_SIZE
-
tgx
))
;
unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF);
excl = (excl >> tgx) |
(
excl
<<
(
TILE_SIZE
-
tgx
))
;
#
endif
unsigned
int
tj
=
tgx
;
for
(
unsigned
int
j
=
0
; j < TILE_SIZE; j++) {
unsigned
int
tj
=
tgx
;
for
(
unsigned
int
j
=
0
; j < TILE_SIZE; j++) {
#
ifdef
USE_EXCLUSIONS
bool
isExcluded
=
!
(
excl
&
0x1
)
;
bool
isExcluded
=
!
(
excl
&
0x1
)
;
#
endif
int
atom2
=
tbx+tj
;
float4
posq2
=
(
float4
)
(
localData[atom2].x,
localData[atom2].y,
localData[atom2].z,
localData[atom2].q
)
;
float4
delta
=
(
float4
)
(
posq2.xyz
-
posq1.xyz,
0.0f
)
;
int
atom2
=
tbx+tj
;
float4
posq2
=
(
float4
)
(
localData[atom2].x,
localData[atom2].y,
localData[atom2].z,
localData[atom2].q
)
;
float4
delta
=
(
float4
)
(
posq2.xyz
-
posq1.xyz,
0.0f
)
;
#
ifdef
USE_PERIODIC
delta.x
-=
floor
(
delta.x*invPeriodicBoxSize.x+0.5f
)
*periodicBoxSize.x
;
delta.y
-=
floor
(
delta.y*invPeriodicBoxSize.y+0.5f
)
*periodicBoxSize.y
;
delta.z
-=
floor
(
delta.z*invPeriodicBoxSize.z+0.5f
)
*periodicBoxSize.z
;
delta.x
-=
floor
(
delta.x*invPeriodicBoxSize.x+0.5f
)
*periodicBoxSize.x
;
delta.y
-=
floor
(
delta.y*invPeriodicBoxSize.y+0.5f
)
*periodicBoxSize.y
;
delta.z
-=
floor
(
delta.z*invPeriodicBoxSize.z+0.5f
)
*periodicBoxSize.z
;
#
endif
float
r2
=
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
;
float
invR
=
RSQRT
(
r2
)
;
float
r
=
RECIP
(
invR
)
;
LOAD_ATOM2_PARAMETERS
atom2
=
y*TILE_SIZE+tj
;
float
r2
=
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
;
float
invR
=
RSQRT
(
r2
)
;
float
r
=
RECIP
(
invR
)
;
LOAD_ATOM2_PARAMETERS
atom2
=
y*TILE_SIZE+tj
;
#
ifdef
USE_SYMMETRIC
float
dEdR
=
0.0f
;
float
dEdR
=
0.0f
;
#
else
float4
dEdR1
=
(
float4
)
0.0f
;
float4
dEdR2
=
(
float4
)
0.0f
;
float4
dEdR1
=
(
float4
)
0.0f
;
float4
dEdR2
=
(
float4
)
0.0f
;
#
endif
float
tempEnergy
=
0.0f
;
COMPUTE_INTERACTION
energy
+=
tempEnergy
;
float
tempEnergy
=
0.0f
;
COMPUTE_INTERACTION
energy
+=
tempEnergy
;
#
ifdef
USE_SYMMETRIC
delta.xyz
*=
dEdR
;
force.xyz
-=
delta.xyz
;
localData[tbx+tj].fx
+=
delta.x
;
localData[tbx+tj].fy
+=
delta.y
;
localData[tbx+tj].fz
+=
delta.z
;
delta.xyz
*=
dEdR
;
force.xyz
-=
delta.xyz
;
localData[tbx+tj].fx
+=
delta.x
;
localData[tbx+tj].fy
+=
delta.y
;
localData[tbx+tj].fz
+=
delta.z
;
#
else
force.xyz
-=
dEdR1.xyz
;
localData[tbx+tj].fx
+=
dEdR2.x
;
localData[tbx+tj].fy
+=
dEdR2.y
;
localData[tbx+tj].fz
+=
dEdR2.z
;
force.xyz
-=
dEdR1.xyz
;
localData[tbx+tj].fx
+=
dEdR2.x
;
localData[tbx+tj].fy
+=
dEdR2.y
;
localData[tbx+tj].fz
+=
dEdR2.z
;
#
endif
#
ifdef
USE_EXCLUSIONS
excl
>>=
1
;
excl
>>=
1
;
#
endif
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
)
;
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
)
;
}
}
}
}
//
Write
results.
We
need
to
coordinate
between
warps
to
make
sure
no
two
of
them
//
ever
try
to
write
to
the
same
piece
of
memory
at
the
same
time.
int
writeX
=
(
pos
<
end
?
x
:
-1
)
;
int
writeY
=
(
pos
<
end
&&
x
!=
y
?
y
:
-1
)
;
if
(
tgx
==
0
)
reservedBlocks[localGroupIndex]
=
(
int2
)(
writeX,
writeY
)
;
bool
done
=
false
;
int
doneIndex
=
0
;
int
checkIndex
=
0
;
while
(
true
)
{
//
See
if
any
warp
still
needs
to
write
its
data.
//
Write
results
bool
allDone
=
true
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
while
(
doneIndex
<
WARPS_PER_GROUP
&&
allDone
)
{
if
(
reservedBlocks[doneIndex].x
!=
-1
)
allDone
=
false
;
else
doneIndex++
;
}
if
(
allDone
)
break
;
if
(
!done
)
{
//
See
whether
this
warp
can
write
its
data.
This
requires
that
no
previous
warp
//
is
trying
to
write
to
the
same
block
of
the
buffer.
reserveBuffer
(
x,
forceBufferFlags
)
;
unsigned
int
offset1
=
x*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
forceBuffers[offset1].xyz
+=
force.xyz
;
releaseBuffer
(
x,
forceBufferFlags
)
;
reserveBuffer
(
y,
forceBufferFlags
)
;
unsigned
int
offset2
=
y*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
forceBuffers[offset2]
+=
(
float4
)
(
localData[get_local_id
(
0
)
].fx,
localData[get_local_id
(
0
)
].fy,
localData[get_local_id
(
0
)
].fz,
0.0f
)
;
releaseBuffer
(
y,
forceBufferFlags
)
;
bool
canWrite
=
(
writeX
!=
-1
)
;
while
(
checkIndex
<
localGroupIndex
&&
canWrite
)
{
if
((
reservedBlocks[checkIndex].x
==
x
|
| reservedBlocks[checkIndex].y == x) ||
(writeY != -1 && (reservedBlocks[checkIndex].x == y |
|
reservedBlocks[checkIndex].y
==
y
)))
canWrite
=
false
;
else
checkIndex++
;
}
if
(
canWrite
)
{
//
Write
the
data
to
global
memory,
then
mark
this
warp
as
done.
if
(
writeX
>
-1
)
{
const
unsigned
int
offset
=
x*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
forceBuffers[offset].xyz
+=
force.xyz
;
}
if
(
writeY
>
-1
)
{
const
unsigned
int
offset
=
y*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
forceBuffers[offset]
+=
(
float4
)
(
localData[get_local_id
(
0
)
].fx,
localData[get_local_id
(
0
)
].fy,
localData[get_local_id
(
0
)
].fz,
0.0f
)
;
}
done
=
true
;
if
(
tgx
==
0
)
reservedBlocks[localGroupIndex]
=
(
int2
)(
-1
,
-1
)
;
}
}
}
lasty
=
y
;
pos++
;
}
}
while
(
pos
<
end
)
;
energyBuffer[get_global_id
(
0
)
]
+=
energy
;
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment