Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
38a60fbf
"openmmapi/vscode:/vscode.git/clone" did not exist on "2507ee7f2f30842c1ef729c9a57b7fbdf2f9bf41"
Commit
38a60fbf
authored
Oct 30, 2010
by
Peter Eastman
Browse files
Created GBSA kernels optimized for CPU
parent
d28df828
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
379 additions
and
9 deletions
+379
-9
platforms/opencl/src/OpenCLKernels.cpp
platforms/opencl/src/OpenCLKernels.cpp
+14
-7
platforms/opencl/src/kernels/gbsaObc_cpu.cl
platforms/opencl/src/kernels/gbsaObc_cpu.cl
+365
-0
platforms/opencl/src/kernels/gbsaObc_default.cl
platforms/opencl/src/kernels/gbsaObc_default.cl
+0
-1
platforms/opencl/src/kernels/gbsaObc_nvidia.cl
platforms/opencl/src/kernels/gbsaObc_nvidia.cl
+0
-1
No files found.
platforms/opencl/src/OpenCLKernels.cpp
View file @
38a60fbf
...
...
@@ -1683,6 +1683,7 @@ void OpenCLCalcGBSAOBCForceKernel::initialize(const System& system, const GBSAOB
double
OpenCLCalcGBSAOBCForceKernel
::
execute
(
ContextImpl
&
context
,
bool
includeForces
,
bool
includeEnergy
)
{
OpenCLNonbondedUtilities
&
nb
=
cl
.
getNonbondedUtilities
();
bool
deviceIsCpu
=
(
cl
.
getDevice
().
getInfo
<
CL_DEVICE_TYPE
>
()
==
CL_DEVICE_TYPE_CPU
);
if
(
!
hasCreatedKernels
)
{
// These Kernels cannot be created in initialize(), because the OpenCLNonbondedUtilities has not been initialized yet then.
...
...
@@ -1700,21 +1701,27 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
defines
[
"NUM_ATOMS"
]
=
intToString
(
cl
.
getNumAtoms
());
defines
[
"PADDED_NUM_ATOMS"
]
=
intToString
(
cl
.
getPaddedNumAtoms
());
defines
[
"NUM_BLOCKS"
]
=
OpenCLExpressionUtilities
::
intToString
(
cl
.
getNumAtomBlocks
());
string
file
=
(
cl
.
getSIMDWidth
()
==
32
?
OpenCLKernelSources
::
gbsaObc_nvidia
:
OpenCLKernelSources
::
gbsaObc_default
);
string
file
;
if
(
deviceIsCpu
)
file
=
OpenCLKernelSources
::
gbsaObc_cpu
;
else
if
(
cl
.
getSIMDWidth
()
==
32
)
file
=
OpenCLKernelSources
::
gbsaObc_nvidia
;
else
file
=
OpenCLKernelSources
::
gbsaObc_default
;
cl
::
Program
program
=
cl
.
createProgram
(
file
,
defines
);
int
index
=
0
;
computeBornSumKernel
=
cl
::
Kernel
(
program
,
"computeBornSum"
);
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
bornSum
->
getDeviceBuffer
());
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
cl
.
getPosq
().
getDeviceBuffer
());
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
params
->
getDeviceBuffer
());
computeBornSumKernel
.
setArg
(
index
++
,
OpenCLContext
::
ThreadBlockSize
*
13
*
sizeof
(
cl_float
),
NULL
);
computeBornSumKernel
.
setArg
(
index
++
,
(
deviceIsCpu
?
OpenCLContext
::
TileSize
:
OpenCLContext
::
ThreadBlockSize
)
*
13
*
sizeof
(
cl_float
),
NULL
);
computeBornSumKernel
.
setArg
(
index
++
,
OpenCLContext
::
ThreadBlockSize
*
sizeof
(
cl_float
),
NULL
);
if
(
nb
.
getUseCutoff
())
{
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractingTiles
().
getDeviceBuffer
());
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractionCount
().
getDeviceBuffer
());
index
+=
2
;
// The periodic box size arguments are set when the kernel is executed.
computeBornSumKernel
.
setArg
<
cl_uint
>
(
index
++
,
maxTiles
);
if
(
cl
.
getSIMDWidth
()
==
32
)
if
(
cl
.
getSIMDWidth
()
==
32
||
deviceIsCpu
)
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractionFlags
().
getDeviceBuffer
());
}
else
...
...
@@ -1726,14 +1733,14 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
cl
.
getPosq
().
getDeviceBuffer
());
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
bornRadii
->
getDeviceBuffer
());
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
bornForce
->
getDeviceBuffer
());
force1Kernel
.
setArg
(
index
++
,
OpenCLContext
::
ThreadBlockSize
*
13
*
sizeof
(
cl_float
),
NULL
);
force1Kernel
.
setArg
(
index
++
,
(
deviceIsCpu
?
OpenCLContext
::
TileSize
:
OpenCLContext
::
ThreadBlockSize
)
*
13
*
sizeof
(
cl_float
),
NULL
);
force1Kernel
.
setArg
(
index
++
,
OpenCLContext
::
ThreadBlockSize
*
sizeof
(
mm_float4
),
NULL
);
if
(
nb
.
getUseCutoff
())
{
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractingTiles
().
getDeviceBuffer
());
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractionCount
().
getDeviceBuffer
());
index
+=
2
;
// The periodic box size arguments are set when the kernel is executed.
force1Kernel
.
setArg
<
cl_uint
>
(
index
++
,
maxTiles
);
if
(
cl
.
getSIMDWidth
()
==
32
)
if
(
cl
.
getSIMDWidth
()
==
32
||
deviceIsCpu
)
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractionFlags
().
getDeviceBuffer
());
}
else
...
...
@@ -1770,9 +1777,9 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
}
}
int
numTiles
=
cl
.
getNumAtomBlocks
()
*
(
cl
.
getNumAtomBlocks
()
+
1
)
/
2
;
cl
.
executeKernel
(
computeBornSumKernel
,
numTiles
*
OpenCLContext
::
TileSize
);
cl
.
executeKernel
(
computeBornSumKernel
,
numTiles
*
OpenCLContext
::
TileSize
,
deviceIsCpu
?
1
:
-
1
);
cl
.
executeKernel
(
reduceBornSumKernel
,
cl
.
getPaddedNumAtoms
());
cl
.
executeKernel
(
force1Kernel
,
numTiles
*
OpenCLContext
::
TileSize
);
cl
.
executeKernel
(
force1Kernel
,
numTiles
*
OpenCLContext
::
TileSize
,
deviceIsCpu
?
1
:
-
1
);
cl
.
executeKernel
(
reduceBornForceKernel
,
cl
.
getPaddedNumAtoms
());
return
0.0
;
}
...
...
platforms/opencl/src/kernels/gbsaObc_cpu.cl
0 → 100644
View file @
38a60fbf
#
define
TILE_SIZE
32
/**
 * Per-atom scratch record held in __local memory while a tile is processed.
 * Thirteen consecutive floats; the member order is part of the layout and
 * must not be changed.
 */
typedef struct {
    /* Position and the fourth posq component (copied from posq[j].xyzw). */
    float x;
    float y;
    float z;
    float q;
    /* Accumulators used by computeGBSAForce1 for the tile's second atom. */
    float fx;
    float fy;
    float fz;
    float fw;
    /* Copied from global_params[j].x and .y by computeBornSum. */
    float radius;
    float scaledRadius;
    /* Accumulator used by computeBornSum for the tile's second atom. */
    float bornSum;
    /* Copied from global_bornRadii[j] by computeGBSAForce1. */
    float bornRadius;
    /* Not referenced by the kernels in this file. */
    float bornForce;
} AtomData;
/**
 * Compute the Born sum.
 *
 * Each work-group handles a contiguous range [pos, end) of tiles and walks the
 * TILE_SIZE x TILE_SIZE atom pairs of every tile with plain loops. The kernel
 * contains no barriers and never queries the local id, so it presumably expects
 * to be launched with a work-group size of 1 -- TODO confirm against the host code.
 *
 * Results are accumulated (+=) into global_bornSum at index
 * atom + get_group_id(0)*PADDED_NUM_ATOMS, i.e. one PADDED_NUM_ATOMS-sized
 * segment per work-group.
 *
 * NUM_ATOMS, PADDED_NUM_ATOMS, NUM_BLOCKS, WORK_GROUP_SIZE, CUTOFF_SQUARED and the
 * RSQRT / RECIP / LOG macros are expected to be supplied when the program is built.
 *
 * @param global_bornSum     output accumulation buffer (see indexing above)
 * @param posq               per-atom float4; xyz is used as the position
 * @param global_params      per-atom float2; .x is used as the radius, .y as the scaled radius
 * @param localData          local scratch holding the TILE_SIZE atoms of tile row y
 * @param tempBuffer         not used by this kernel
 * @param tiles              (USE_CUTOFF) list of (x, y) tile indices to process
 * @param interactionCount   (USE_CUTOFF) element 0 is the number of entries in tiles
 * @param periodicBoxSize    (USE_CUTOFF) box size, used when USE_PERIODIC is defined
 * @param invPeriodicBoxSize (USE_CUTOFF) inverse box size, used when USE_PERIODIC is defined
 * @param maxTiles           (USE_CUTOFF) if interactionCount[0] exceeds this, the tile list
 *                           is ignored and all NUM_BLOCKS*(NUM_BLOCKS+1)/2 tiles are processed
 * @param interactionFlags   (USE_CUTOFF) not used by this kernel
 * @param numTiles           (no cutoff) total number of tiles to process
 */
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeBornSum(__global float* global_bornSum, __global float4* posq, __global float2* global_params, __local AtomData* localData, __local float* tempBuffer,
#ifdef USE_CUTOFF
        __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags) {
#else
        unsigned int numTiles) {
#endif
#ifdef USE_CUTOFF
    unsigned int numTiles = interactionCount[0];
    unsigned int pos = get_group_id(0)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
    unsigned int end = (get_group_id(0)+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
#else
    unsigned int pos = get_group_id(0)*numTiles/get_num_groups(0);
    unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0);
#endif
    // Start of this work-group's segment of global_bornSum.
    const unsigned int groupOffset = get_group_id(0)*PADDED_NUM_ATOMS;
    unsigned int lasty = 0xFFFFFFFF;

    while (pos < end) {
        // Extract the coordinates of this tile
        unsigned int x, y;
#ifdef USE_CUTOFF
        if (numTiles <= maxTiles) {
            ushort2 tileIndices = tiles[pos];
            x = tileIndices.x;
            y = tileIndices.y;
        }
        else
#endif
        {
            // Recover (x, y) from the linear index into the triangular tile array.
            y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
            if (x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
                y++;
                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
            }
        }

        // Load the data for this tile if we don't already have it cached.

        if (lasty != y) {
            for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
                unsigned int j = y*TILE_SIZE + localAtomIndex;
                float4 tempPosq = posq[j];
                localData[localAtomIndex].x = tempPosq.x;
                localData[localAtomIndex].y = tempPosq.y;
                localData[localAtomIndex].z = tempPosq.z;
                localData[localAtomIndex].q = tempPosq.w;
                float2 tempParams = global_params[j];
                localData[localAtomIndex].radius = tempParams.x;
                localData[localAtomIndex].scaledRadius = tempParams.y;
            }
        }
        if (x == y) {
            // This tile is on the diagonal.

            for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
                unsigned int atom1 = x*TILE_SIZE+tgx;
                float bornSum = 0.0f;
                float4 posq1 = posq[atom1];
                float2 params1 = global_params[atom1];
                for (unsigned int j = 0; j < TILE_SIZE; j++) {
                    float4 posq2 = (float4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
                    float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
                    delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif
                    float r2 = dot(delta.xyz, delta.xyz);
#ifdef USE_CUTOFF
                    if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else
                    if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
#endif
                        float invR = RSQRT(r2);
                        float r = RECIP(invR);
                        float2 params2 = (float2) (localData[j].radius, localData[j].scaledRadius);
                        float rScaledRadiusJ = r+params2.y;
                        // j != tgx skips the atom's interaction with itself.
                        if ((j != tgx) && (params1.x < rScaledRadiusJ)) {
                            float l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
                            float u_ij = RECIP(rScaledRadiusJ);
                            float l_ij2 = l_ij*l_ij;
                            float u_ij2 = u_ij*u_ij;
                            float ratio = LOG(u_ij * RECIP(l_ij));
                            bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) + (0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2);
                            if (params1.x < params2.x-r)
                                bornSum += 2.0f*(RECIP(params1.x)-l_ij);
                        }
                    }
                }

                // Write results.

                unsigned int offset = x*TILE_SIZE + tgx + groupOffset;
                global_bornSum[offset] += bornSum;
            }
        }
        else {
            // This is an off-diagonal tile.

            for (int tgx = 0; tgx < TILE_SIZE; tgx++)
                localData[tgx].bornSum = 0.0f;

            // Compute the full set of interactions in this tile.

            for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
                unsigned int atom1 = x*TILE_SIZE+tgx;
                float bornSum = 0.0f;
                float4 posq1 = posq[atom1];
                float2 params1 = global_params[atom1];
                for (unsigned int j = 0; j < TILE_SIZE; j++) {
                    float4 posq2 = (float4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
                    float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
                    delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif
                    float r2 = dot(delta.xyz, delta.xyz);
#ifdef USE_CUTOFF
                    if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else
                    if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
#endif
                        float invR = RSQRT(r2);
                        float r = RECIP(invR);
                        float2 params2 = (float2) (localData[j].radius, localData[j].scaledRadius);

                        // Contribution of atom j to atom1's sum.
                        float rScaledRadiusJ = r+params2.y;
                        if (params1.x < rScaledRadiusJ) {
                            float l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
                            float u_ij = RECIP(rScaledRadiusJ);
                            float l_ij2 = l_ij*l_ij;
                            float u_ij2 = u_ij*u_ij;
                            float ratio = LOG(u_ij * RECIP(l_ij));
                            bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) + (0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2);
                            if (params1.x < params2.x-r)
                                bornSum += 2.0f*(RECIP(params1.x)-l_ij);
                        }

                        // Contribution of atom1 to atom j's sum, accumulated in local memory.
                        float rScaledRadiusI = r+params1.y;
                        if (params2.x < rScaledRadiusI) {
                            float l_ij = RECIP(max(params2.x, fabs(r-params1.y)));
                            float u_ij = RECIP(rScaledRadiusI);
                            float l_ij2 = l_ij*l_ij;
                            float u_ij2 = u_ij*u_ij;
                            float ratio = LOG(u_ij * RECIP(l_ij));
                            float term = l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) + (0.25f*params1.y*params1.y*invR)*(l_ij2-u_ij2);
                            if (params2.x < params1.x-r)
                                term += 2.0f*(RECIP(params2.x)-l_ij);
                            localData[j].bornSum += term;
                        }
                    }
                }

                // Write results for atom1. This must be the sum accumulated for atom1
                // itself; localData[tgx].bornSum belongs to atom y*TILE_SIZE+tgx.

                unsigned int offset = atom1 + groupOffset;
                global_bornSum[offset] += bornSum;
            }

            // Write results for the atoms of tile row y. This is only valid for
            // off-diagonal tiles: the accumulators are not reset on the diagonal path.

            for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
                unsigned int offset = y*TILE_SIZE+tgx + groupOffset;
                global_bornSum[offset] += localData[tgx].bornSum;
            }
        }
        lasty = y;
        pos++;
    }
}
/**
 * First part of computing the GBSA interaction.
 *
 * Each work-group handles a contiguous range [pos, end) of tiles and walks the
 * TILE_SIZE x TILE_SIZE atom pairs of every tile with plain loops. The kernel
 * contains no barriers and never queries the local id, so it presumably expects
 * to be launched with a work-group size of 1 -- TODO confirm against the host code.
 *
 * Forces and Born-force terms are accumulated into forceBuffers / global_bornForce
 * at index atom + get_group_id(0)*PADDED_NUM_ATOMS, i.e. one PADDED_NUM_ATOMS-sized
 * segment per work-group. The energy is accumulated into
 * energyBuffer[get_global_id(0)].
 *
 * NUM_ATOMS, PADDED_NUM_ATOMS, NUM_BLOCKS, WORK_GROUP_SIZE, CUTOFF_SQUARED, PREFACTOR
 * and the RSQRT / RECIP / EXP / SQRT macros are expected to be supplied when the
 * program is built.
 *
 * @param forceBuffers       output force accumulation buffer (xyz components are updated)
 * @param energyBuffer       output energy accumulation buffer
 * @param posq               per-atom float4; xyz is used as the position, w enters the
 *                           energy as the product posq1.w*posq2.w
 * @param global_bornRadii   per-atom Born radius
 * @param global_bornForce   output accumulation buffer for the force.w / fw terms
 * @param localData          local scratch holding the TILE_SIZE atoms of tile row y
 * @param tempBuffer         not used by this kernel
 * @param tiles              (USE_CUTOFF) list of (x, y) tile indices to process
 * @param interactionCount   (USE_CUTOFF) element 0 is the number of entries in tiles
 * @param periodicBoxSize    (USE_CUTOFF) box size, used when USE_PERIODIC is defined
 * @param invPeriodicBoxSize (USE_CUTOFF) inverse box size, used when USE_PERIODIC is defined
 * @param maxTiles           (USE_CUTOFF) if interactionCount[0] exceeds this, the tile list
 *                           is ignored and all NUM_BLOCKS*(NUM_BLOCKS+1)/2 tiles are processed
 * @param interactionFlags   (USE_CUTOFF) not used by this kernel
 * @param numTiles           (no cutoff) total number of tiles to process
 */
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __global float* global_bornRadii,
        __global float* global_bornForce, __local AtomData* localData, __local float4* tempBuffer,
#ifdef USE_CUTOFF
        __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags) {
#else
        unsigned int numTiles) {
#endif
#ifdef USE_CUTOFF
    unsigned int numTiles = interactionCount[0];
    unsigned int pos = get_group_id(0)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
    unsigned int end = (get_group_id(0)+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
#else
    unsigned int pos = get_group_id(0)*numTiles/get_num_groups(0);
    unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0);
#endif
    // Start of this work-group's segment of forceBuffers / global_bornForce.
    const unsigned int groupOffset = get_group_id(0)*PADDED_NUM_ATOMS;
    float energy = 0.0f;
    unsigned int lasty = 0xFFFFFFFF;

    while (pos < end) {
        // Extract the coordinates of this tile
        unsigned int x, y;
#ifdef USE_CUTOFF
        if (numTiles <= maxTiles) {
            ushort2 tileIndices = tiles[pos];
            x = tileIndices.x;
            y = tileIndices.y;
        }
        else
#endif
        {
            // Recover (x, y) from the linear index into the triangular tile array.
            y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
            if (x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
                y++;
                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
            }
        }

        // Load the data for this tile if we don't already have it cached.

        if (lasty != y) {
            for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
                unsigned int j = y*TILE_SIZE + localAtomIndex;
                float4 tempPosq = posq[j];
                localData[localAtomIndex].x = tempPosq.x;
                localData[localAtomIndex].y = tempPosq.y;
                localData[localAtomIndex].z = tempPosq.z;
                localData[localAtomIndex].q = tempPosq.w;
                localData[localAtomIndex].bornRadius = global_bornRadii[j];
            }
        }
        if (x == y) {
            // This tile is on the diagonal.

            for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
                unsigned int atom1 = x*TILE_SIZE+tgx;
                float4 force = 0.0f;
                float4 posq1 = posq[atom1];
                float bornRadius1 = global_bornRadii[atom1];
                for (unsigned int j = 0; j < TILE_SIZE; j++) {
                    float4 posq2 = (float4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
                    float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
                    delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif
                    float r2 = dot(delta.xyz, delta.xyz);
#ifdef USE_CUTOFF
                    if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else
                    if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
#endif
                        float invR = RSQRT(r2);
                        float r = RECIP(invR);
                        float bornRadius2 = localData[j].bornRadius;
                        float alpha2_ij = bornRadius1*bornRadius2;
                        float D_ij = r2*RECIP(4.0f*alpha2_ij);
                        float expTerm = EXP(-D_ij);
                        float denominator2 = r2 + alpha2_ij*expTerm;
                        float denominator = SQRT(denominator2);
                        float tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
                        float Gpol = tempEnergy*RECIP(denominator2);
                        float dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
                        force.w += dGpol_dalpha2_ij*bornRadius2;
                        float dEdR = Gpol*(1.0f - 0.25f*expTerm);
                        // Every pair in a diagonal tile is visited twice, hence the factor 0.5.
                        energy += 0.5f*tempEnergy;
                        force.xyz -= delta.xyz*dEdR;
                    }
                }

                // Write results.

                unsigned int offset = x*TILE_SIZE + tgx + groupOffset;
                forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz;
                global_bornForce[offset] += force.w;
            }
        }
        else {
            // This is an off-diagonal tile.

            for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
                localData[tgx].fx = 0.0f;
                localData[tgx].fy = 0.0f;
                localData[tgx].fz = 0.0f;
                localData[tgx].fw = 0.0f;
            }

            // Compute the full set of interactions in this tile.

            for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
                unsigned int atom1 = x*TILE_SIZE+tgx;
                float4 force = 0.0f;
                float4 posq1 = posq[atom1];
                float bornRadius1 = global_bornRadii[atom1];
                for (unsigned int j = 0; j < TILE_SIZE; j++) {
                    float4 posq2 = (float4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
                    float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
                    delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif
                    float r2 = dot(delta.xyz, delta.xyz);
#ifdef USE_CUTOFF
                    if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else
                    if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
#endif
                        float invR = RSQRT(r2);
                        float r = RECIP(invR);
                        float bornRadius2 = localData[j].bornRadius;
                        float alpha2_ij = bornRadius1*bornRadius2;
                        float D_ij = r2*RECIP(4.0f*alpha2_ij);
                        float expTerm = EXP(-D_ij);
                        float denominator2 = r2 + alpha2_ij*expTerm;
                        float denominator = SQRT(denominator2);
                        float tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
                        float Gpol = tempEnergy*RECIP(denominator2);
                        float dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
                        force.w += dGpol_dalpha2_ij*bornRadius2;
                        float dEdR = Gpol*(1.0f - 0.25f*expTerm);
                        energy += tempEnergy;
                        delta.xyz *= dEdR;
                        force.xyz -= delta.xyz;
                        // Equal and opposite force on atom j, accumulated in local memory.
                        localData[j].fx += delta.x;
                        localData[j].fy += delta.y;
                        localData[j].fz += delta.z;
                        localData[j].fw += dGpol_dalpha2_ij*bornRadius1;
                    }
                }

                // Write results for atom1, including the Born-force term in force.w
                // (the same pair of writes the diagonal path performs).

                unsigned int offset = atom1 + groupOffset;
                forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz;
                global_bornForce[offset] += force.w;
            }

            // Write results for the atoms of tile row y. This is only valid for
            // off-diagonal tiles: the accumulators are not reset on the diagonal path.

            for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
                unsigned int offset = y*TILE_SIZE+tgx + groupOffset;
                float4 f = forceBuffers[offset];
                f.x += localData[tgx].fx;
                f.y += localData[tgx].fy;
                f.z += localData[tgx].fz;
                forceBuffers[offset] = f;
                global_bornForce[offset] += localData[tgx].fw;
            }
        }
        lasty = y;
        pos++;
    }
    energyBuffer[get_global_id(0)] += energy;
}
platforms/opencl/src/kernels/gbsaObc_default.cl
View file @
38a60fbf
...
...
@@ -29,7 +29,6 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
unsigned
int
pos
=
get_group_id
(
0
)
*numTiles/get_num_groups
(
0
)
;
unsigned
int
end
=
(
get_group_id
(
0
)
+1
)
*numTiles/get_num_groups
(
0
)
;
#
endif
float
energy
=
0.0f
;
unsigned
int
lasty
=
0xFFFFFFFF
;
while
(
pos
<
end
)
{
...
...
platforms/opencl/src/kernels/gbsaObc_nvidia.cl
View file @
38a60fbf
...
...
@@ -31,7 +31,6 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
unsigned
int
pos
=
warp*numTiles/totalWarps
;
unsigned
int
end
=
(
warp+1
)
*numTiles/totalWarps
;
#
endif
float
energy
=
0.0f
;
unsigned
int
lasty
=
0xFFFFFFFF
;
while
(
pos
<
end
)
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment