Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
95e059dc
Commit
95e059dc
authored
Oct 28, 2009
by
Peter Eastman
Browse files
Created OpenCL implementation of GBSAOBCForce that works with non-Nvidia processors
parent
f0cab877
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
516 additions
and
180 deletions
+516
-180
platforms/opencl/src/OpenCLKernels.cpp
platforms/opencl/src/OpenCLKernels.cpp
+32
-44
platforms/opencl/src/OpenCLKernels.h
platforms/opencl/src/OpenCLKernels.h
+1
-1
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
+8
-0
platforms/opencl/src/kernels/gbsaObcReductions.cl
platforms/opencl/src/kernels/gbsaObcReductions.cl
+67
-0
platforms/opencl/src/kernels/gbsaObc_default.cl
platforms/opencl/src/kernels/gbsaObc_default.cl
+340
-0
platforms/opencl/src/kernels/gbsaObc_nvidia.cl
platforms/opencl/src/kernels/gbsaObc_nvidia.cl
+49
-116
platforms/opencl/src/kernels/nonbonded_nvidia.cl
platforms/opencl/src/kernels/nonbonded_nvidia.cl
+19
-19
No files found.
platforms/opencl/src/OpenCLKernels.cpp
View file @
95e059dc
...
...
@@ -853,28 +853,15 @@ void OpenCLCalcGBSAOBCForceKernel::executeForces(ContextImpl& context) {
defines
[
"USE_CUTOFF"
]
=
"1"
;
if
(
nb
.
getUsePeriodic
())
defines
[
"USE_PERIODIC"
]
=
"1"
;
stringstream
xsize
,
ysize
,
zsize
,
cutoffSquared
,
prefac
;
xsize
.
precision
(
8
);
ysize
.
precision
(
8
);
zsize
.
precision
(
8
);
cutoffSquared
.
precision
(
8
);
prefac
.
precision
(
8
);
xsize
<<
scientific
<<
nb
.
getPeriodicBoxSize
().
x
<<
"f"
;
ysize
<<
scientific
<<
nb
.
getPeriodicBoxSize
().
y
<<
"f"
;
zsize
<<
scientific
<<
nb
.
getPeriodicBoxSize
().
z
<<
"f"
;
cutoffSquared
<<
scientific
<<
(
nb
.
getCutoffDistance
()
*
nb
.
getCutoffDistance
())
<<
"f"
;
prefac
<<
scientific
<<
prefactor
<<
"f"
;
defines
[
"PERIODIC_BOX_SIZE_X"
]
=
xsize
.
str
();
defines
[
"PERIODIC_BOX_SIZE_Y"
]
=
ysize
.
str
();
defines
[
"PERIODIC_BOX_SIZE_Z"
]
=
zsize
.
str
();
defines
[
"CUTOFF_SQUARED"
]
=
cutoffSquared
.
str
();
defines
[
"PREFACTOR"
]
=
prefac
.
str
();
stringstream
natom
,
padded
;
natom
<<
cl
.
getNumAtoms
();
padded
<<
cl
.
getPaddedNumAtoms
();
defines
[
"NUM_ATOMS"
]
=
natom
.
str
();
defines
[
"PADDED_NUM_ATOMS"
]
=
padded
.
str
();
cl
::
Program
program
=
cl
.
createProgram
(
cl
.
loadSourceFromFile
(
"gbsaObc.cl"
),
defines
);
defines
[
"PERIODIC_BOX_SIZE_X"
]
=
doubleToString
(
nb
.
getPeriodicBoxSize
().
x
);
defines
[
"PERIODIC_BOX_SIZE_Y"
]
=
doubleToString
(
nb
.
getPeriodicBoxSize
().
y
);
defines
[
"PERIODIC_BOX_SIZE_Z"
]
=
doubleToString
(
nb
.
getPeriodicBoxSize
().
z
);
defines
[
"CUTOFF_SQUARED"
]
=
doubleToString
(
nb
.
getCutoffDistance
()
*
nb
.
getCutoffDistance
());
defines
[
"PREFACTOR"
]
=
doubleToString
(
prefactor
);
defines
[
"NUM_ATOMS"
]
=
intToString
(
cl
.
getNumAtoms
());
defines
[
"PADDED_NUM_ATOMS"
]
=
intToString
(
cl
.
getPaddedNumAtoms
());
string
filename
=
(
cl
.
getSIMDWidth
()
==
32
?
"gbsaObc_nvidia.cl"
:
"gbsaObc_default.cl"
);
cl
::
Program
program
=
cl
.
createProgram
(
cl
.
loadSourceFromFile
(
filename
),
defines
);
computeBornSumKernel
=
cl
::
Kernel
(
program
,
"computeBornSum"
);
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
bornSum
->
getDeviceBuffer
());
computeBornSumKernel
.
setArg
(
1
,
OpenCLContext
::
ThreadBlockSize
*
sizeof
(
cl_float
),
NULL
);
...
...
@@ -882,26 +869,16 @@ void OpenCLCalcGBSAOBCForceKernel::executeForces(ContextImpl& context) {
computeBornSumKernel
.
setArg
(
3
,
OpenCLContext
::
ThreadBlockSize
*
sizeof
(
cl_float4
),
NULL
);
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
4
,
params
->
getDeviceBuffer
());
computeBornSumKernel
.
setArg
(
5
,
OpenCLContext
::
ThreadBlockSize
*
sizeof
(
cl_float2
),
NULL
);
computeBornSumKernel
.
setArg
(
6
,
OpenCLContext
::
ThreadBlockSize
*
sizeof
(
cl_float
),
NULL
);
if
(
nb
.
getUseCutoff
())
{
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
6
,
nb
.
getInteractingTiles
().
getDeviceBuffer
());
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
7
,
nb
.
getInteractionFlags
().
getDeviceBuffer
());
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
8
,
nb
.
getInteractionCount
().
getDeviceBuffer
());
computeBornSumKernel
.
setArg
(
9
,
OpenCLContext
::
ThreadBlockSize
*
sizeof
(
cl_float
),
NULL
);
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
7
,
nb
.
getInteractingTiles
().
getDeviceBuffer
());
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
8
,
nb
.
getInteractionFlags
().
getDeviceBuffer
());
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
9
,
nb
.
getInteractionCount
().
getDeviceBuffer
());
}
else
{
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
6
,
nb
.
getTiles
().
getDeviceBuffer
());
computeBornSumKernel
.
setArg
<
cl_uint
>
(
7
,
nb
.
getTiles
().
getSize
());
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
7
,
nb
.
getTiles
().
getDeviceBuffer
());
computeBornSumKernel
.
setArg
<
cl_uint
>
(
8
,
nb
.
getTiles
().
getSize
());
}
reduceBornSumKernel
=
cl
::
Kernel
(
program
,
"reduceBornSum"
);
reduceBornSumKernel
.
setArg
<
cl_int
>
(
0
,
cl
.
getPaddedNumAtoms
());
reduceBornSumKernel
.
setArg
<
cl_int
>
(
1
,
cl
.
getNumForceBuffers
());
reduceBornSumKernel
.
setArg
<
cl_float
>
(
2
,
1.0
f
);
reduceBornSumKernel
.
setArg
<
cl_float
>
(
3
,
0.8
f
);
reduceBornSumKernel
.
setArg
<
cl_float
>
(
4
,
4.85
f
);
reduceBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
5
,
bornSum
->
getDeviceBuffer
());
reduceBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
6
,
params
->
getDeviceBuffer
());
reduceBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
7
,
bornRadii
->
getDeviceBuffer
());
reduceBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
8
,
obcChain
->
getDeviceBuffer
());
force1Kernel
=
cl
::
Kernel
(
program
,
"computeGBSAForce1"
);
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
0
,
cl
.
getForceBuffers
().
getDeviceBuffer
());
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
1
,
cl
.
getEnergyBuffer
().
getDeviceBuffer
());
...
...
@@ -912,16 +889,27 @@ void OpenCLCalcGBSAOBCForceKernel::executeForces(ContextImpl& context) {
force1Kernel
.
setArg
(
6
,
OpenCLContext
::
ThreadBlockSize
*
sizeof
(
cl_float
),
NULL
);
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
7
,
bornForce
->
getDeviceBuffer
());
force1Kernel
.
setArg
(
8
,
OpenCLContext
::
ThreadBlockSize
*
sizeof
(
cl_float
),
NULL
);
force1Kernel
.
setArg
(
9
,
OpenCLContext
::
ThreadBlockSize
*
sizeof
(
mm_float4
),
NULL
);
if
(
nb
.
getUseCutoff
())
{
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
9
,
nb
.
getInteractingTiles
().
getDeviceBuffer
());
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
10
,
nb
.
getInteractionFlags
().
getDeviceBuffer
());
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
11
,
nb
.
getInteractionCount
().
getDeviceBuffer
());
force1Kernel
.
setArg
(
12
,
OpenCLContext
::
ThreadBlockSize
*
sizeof
(
mm_float4
),
NULL
);
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
10
,
nb
.
getInteractingTiles
().
getDeviceBuffer
());
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
11
,
nb
.
getInteractionFlags
().
getDeviceBuffer
());
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
12
,
nb
.
getInteractionCount
().
getDeviceBuffer
());
}
else
{
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
9
,
nb
.
getTiles
().
getDeviceBuffer
());
force1Kernel
.
setArg
<
cl_uint
>
(
1
0
,
nb
.
getTiles
().
getSize
());
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
10
,
nb
.
getTiles
().
getDeviceBuffer
());
force1Kernel
.
setArg
<
cl_uint
>
(
1
1
,
nb
.
getTiles
().
getSize
());
}
program
=
cl
.
createProgram
(
cl
.
loadSourceFromFile
(
"gbsaObcReductions.cl"
),
defines
);
reduceBornSumKernel
=
cl
::
Kernel
(
program
,
"reduceBornSum"
);
reduceBornSumKernel
.
setArg
<
cl_int
>
(
0
,
cl
.
getPaddedNumAtoms
());
reduceBornSumKernel
.
setArg
<
cl_int
>
(
1
,
cl
.
getNumForceBuffers
());
reduceBornSumKernel
.
setArg
<
cl_float
>
(
2
,
1.0
f
);
reduceBornSumKernel
.
setArg
<
cl_float
>
(
3
,
0.8
f
);
reduceBornSumKernel
.
setArg
<
cl_float
>
(
4
,
4.85
f
);
reduceBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
5
,
bornSum
->
getDeviceBuffer
());
reduceBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
6
,
params
->
getDeviceBuffer
());
reduceBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
7
,
bornRadii
->
getDeviceBuffer
());
reduceBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
8
,
obcChain
->
getDeviceBuffer
());
reduceBornForceKernel
=
cl
::
Kernel
(
program
,
"reduceBornForce"
);
reduceBornForceKernel
.
setArg
<
cl_int
>
(
0
,
cl
.
getPaddedNumAtoms
());
reduceBornForceKernel
.
setArg
<
cl_int
>
(
1
,
cl
.
getNumForceBuffers
());
...
...
platforms/opencl/src/OpenCLKernels.h
View file @
95e059dc
...
...
@@ -300,7 +300,7 @@ private:
class
OpenCLCalcNonbondedForceKernel
:
public
CalcNonbondedForceKernel
{
public:
OpenCLCalcNonbondedForceKernel
(
std
::
string
name
,
const
Platform
&
platform
,
OpenCLContext
&
cl
,
System
&
system
)
:
CalcNonbondedForceKernel
(
name
,
platform
),
cl
(
cl
),
exceptionParams
(
NULL
),
exceptionIndices
(
NULL
)
{
sigmaEpsilon
(
NULL
),
exceptionParams
(
NULL
),
exceptionIndices
(
NULL
)
,
cosSinSums
(
NULL
)
{
}
~
OpenCLCalcNonbondedForceKernel
();
/**
...
...
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
View file @
95e059dc
...
...
@@ -107,6 +107,14 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
if
(
cutoff
==
-
1.0
)
return
;
// There are no nonbonded interactions in the System.
if
(
atomExclusions
.
size
()
==
0
)
{
// No exclusions were specifically requested, so just mark every atom as not interacting with itself.
atomExclusions
.
resize
(
context
.
getNumAtoms
());
for
(
int
i
=
0
;
i
<
atomExclusions
.
size
();
i
++
)
atomExclusions
[
i
].
push_back
(
i
);
}
// Create the list of tiles.
int
numAtomBlocks
=
context
.
getNumAtomBlocks
();
...
...
platforms/opencl/src/kernels/gbsaObcReductions.cl
0 → 100644
View file @
95e059dc
const
float
dielectricOffset
=
0.009f
;
const
float
probeRadius
=
0.14f
;
const
float
surfaceAreaFactor
=
-6.0f*3.14159265358979323846f*0.0216f*1000.0f*0.4184f
;
/**
*
Reduce
the
Born
sums
to
compute
the
Born
radii.
*/
__kernel
void
reduceBornSum
(
int
bufferSize,
int
numBuffers,
float
alpha,
float
beta,
float
gamma,
__global
float*
bornSum,
__global
float2*
params,
__global
float*
bornRadii,
__global
float*
obcChain
)
{
unsigned
int
index
=
get_global_id
(
0
)
;
while
(
index
<
NUM_ATOMS
)
{
//
Get
summed
Born
data
int
totalSize
=
bufferSize*numBuffers
;
float
sum
=
bornSum[index]
;
for
(
int
i
=
index+bufferSize
; i < totalSize; i += bufferSize)
sum
+=
bornSum[i]
;
//
Now
calculate
Born
radius
and
OBC
term.
float
offsetRadius
=
params[index].x
;
sum
*=
0.5f*offsetRadius
;
float
sum2
=
sum*sum
;
float
sum3
=
sum*sum2
;
float
tanhSum
=
tanh
(
alpha*sum
-
beta*sum2
+
gamma*sum3
)
;
float
nonOffsetRadius
=
offsetRadius
+
dielectricOffset
;
float
radius
=
1.0f/
(
1.0f/offsetRadius
-
tanhSum/nonOffsetRadius
)
;
float
chain
=
offsetRadius*
(
alpha
-
2.0f*beta*sum
+
3.0f*gamma*sum2
)
;
chain
=
(
1.0f-tanhSum*tanhSum
)
*chain
/
nonOffsetRadius
;
bornRadii[index]
=
radius
;
obcChain[index]
=
chain
;
index
+=
get_global_size
(
0
)
;
}
}
/**
*
Reduce
the
Born
force.
*/
__kernel
void
reduceBornForce
(
int
bufferSize,
int
numBuffers,
__global
float*
bornForce,
__global
float*
energyBuffer,
__global
float2*
params,
__global
float*
bornRadii,
__global
float*
obcChain
)
{
float
energy
=
0.0f
;
unsigned
int
index
=
get_global_id
(
0
)
;
while
(
index
<
NUM_ATOMS
)
{
//
Sum
the
Born
force
int
totalSize
=
bufferSize*numBuffers
;
float
force
=
bornForce[index]
;
for
(
int
i
=
index+bufferSize
; i < totalSize; i += bufferSize)
force
+=
bornForce[i]
;
//
Now
calculate
the
actual
force
float
offsetRadius
=
params[index].x
;
float
bornRadius
=
bornRadii[index]
;
float
r
=
offsetRadius+dielectricOffset+probeRadius
;
float
ratio6
=
pow
((
offsetRadius+dielectricOffset
)
/bornRadius,
6.0f
)
;
float
saTerm
=
surfaceAreaFactor*r*r*ratio6
;
force
+=
saTerm/bornRadius
;
energy
+=
saTerm
;
force
*=
bornRadius*bornRadius*obcChain[index]
;
bornForce[index]
=
force
;
index
+=
get_global_size
(
0
)
;
}
energyBuffer[get_global_id
(
0
)
]
+=
energy/-6.0f
;
}
\ No newline at end of file
platforms/opencl/src/kernels/gbsaObc_default.cl
0 → 100644
View file @
95e059dc
#
define
TILE_SIZE
32
/**
*
Compute
the
Born
sum.
*/
__kernel
void
computeBornSum
(
__global
float*
global_bornSum,
__local
float*
local_bornSum,
__global
float4*
posq,
__local
float4*
local_posq,
__global
float2*
global_params,
__local
float2*
local_params,
__local
float*
tempBuffer,
__global
unsigned
int*
tiles,
#
ifdef
USE_CUTOFF
__global
unsigned
int*
interactionFlags,
__global
unsigned
int*
interactionCount
)
{
#
else
unsigned
int
numTiles
)
{
#
endif
#
ifdef
USE_CUTOFF
unsigned
int
numTiles
=
interactionCount[0]
;
#
endif
unsigned
int
pos
=
get_group_id
(
0
)
*numTiles/get_num_groups
(
0
)
;
unsigned
int
end
=
(
get_group_id
(
0
)
+1
)
*numTiles/get_num_groups
(
0
)
;
float
energy
=
0.0f
;
unsigned
int
lasty
=
0xFFFFFFFF
;
while
(
pos
<
end
)
{
//
Extract
the
coordinates
of
this
tile
unsigned
int
x
=
tiles[pos]
;
unsigned
int
y
=
((
x
>>
2
)
&
0x7fff
)
*TILE_SIZE
;
x
=
(
x>>17
)
*TILE_SIZE
;
unsigned
int
baseLocalAtom
=
(
get_local_id
(
0
)
<
TILE_SIZE
?
0
:
TILE_SIZE/2
)
;
unsigned
int
tgx
=
get_local_id
(
0
)
&
(
TILE_SIZE-1
)
;
unsigned
int
forceBufferOffset
=
(
tgx
<
TILE_SIZE/2
?
0
:
TILE_SIZE
)
;
unsigned
int
atom1
=
x
+
tgx
;
float
bornSum
=
0.0f
;
float4
posq1
=
posq[atom1]
;
float2
params1
=
global_params[atom1]
;
if
(
x
==
y
)
{
//
This
tile
is
on
the
diagonal.
local_posq[get_local_id
(
0
)
]
=
posq1
;
local_params[get_local_id
(
0
)
]
=
params1
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
unsigned
int
xi
=
x/TILE_SIZE
;
unsigned
int
tile
=
xi+xi*PADDED_NUM_ATOMS/TILE_SIZE-xi*
(
xi+1
)
/2
;
for
(
unsigned
int
j
=
0
; j < TILE_SIZE/2; j++) {
float4
delta
=
(
float4
)
(
local_posq[baseLocalAtom+j].xyz
-
posq1.xyz,
0.0f
)
;
#
ifdef
USE_PERIODIC
delta.x
-=
floor
(
delta.x/PERIODIC_BOX_SIZE_X+0.5f
)
*PERIODIC_BOX_SIZE_X
;
delta.y
-=
floor
(
delta.y/PERIODIC_BOX_SIZE_Y+0.5f
)
*PERIODIC_BOX_SIZE_Y
;
delta.z
-=
floor
(
delta.z/PERIODIC_BOX_SIZE_Z+0.5f
)
*PERIODIC_BOX_SIZE_Z
;
#
endif
float
r2
=
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
;
#
ifdef
USE_CUTOFF
if
(
atom1
<
NUM_ATOMS
&&
y+baseLocalAtom+j
<
NUM_ATOMS
&&
r2
<
CUTOFF_SQUARED
)
{
#
else
if
(
atom1
<
NUM_ATOMS
&&
y+baseLocalAtom+j
<
NUM_ATOMS
)
{
#
endif
float
r
=
sqrt
(
r2
)
;
float
invR
=
1.0f/r
;
float2
params2
=
local_params[baseLocalAtom+j]
;
float
rScaledRadiusJ
=
r+params2.y
;
if
((
j
!=
tgx
)
&&
(
params1.x
<
rScaledRadiusJ
))
{
float
l_ij
=
1.0f/max
(
params1.x,
fabs
(
r-params2.y
))
;
float
u_ij
=
1.0f/rScaledRadiusJ
;
float
l_ij2
=
l_ij*l_ij
;
float
u_ij2
=
u_ij*u_ij
;
float
ratio
=
log
(
u_ij
/
l_ij
)
;
bornSum
+=
l_ij
-
u_ij
+
0.25f*r*
(
u_ij2-l_ij2
)
+
(
0.50f*invR*ratio
)
+
(
0.25f*params2.y*params2.y*invR
)
*
(
l_ij2-u_ij2
)
;
if
(
params1.x
<
params2.x-r
)
bornSum
+=
2.0f*
(
1.0f/params1.x-l_ij
)
;
}
}
}
//
Sum
the
forces
and
write
results.
if
(
get_local_id
(
0
)
>=
TILE_SIZE
)
tempBuffer[get_local_id
(
0
)
]
=
bornSum
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
get_local_id
(
0
)
<
TILE_SIZE
)
{
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
unsigned
int
offset
=
x
+
tgx
+
(
x/TILE_SIZE
)
*PADDED_NUM_ATOMS
;
#
else
unsigned
int
offset
=
x
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
#
endif
global_bornSum[offset]
+=
bornSum+tempBuffer[get_local_id
(
0
)
+TILE_SIZE]
;
}
}
else
{
//
This
is
an
off-diagonal
tile.
if
(
lasty
!=
y
&&
get_local_id
(
0
)
<
TILE_SIZE
)
{
unsigned
int
j
=
y
+
tgx
;
local_posq[get_local_id
(
0
)
]
=
posq[j]
;
local_params[get_local_id
(
0
)
]
=
global_params[j]
;
}
local_bornSum[get_local_id
(
0
)
]
=
0.0f
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
//
Compute
the
full
set
of
interactions
in
this
tile.
unsigned
int
xi
=
x/TILE_SIZE
;
unsigned
int
yi
=
y/TILE_SIZE
;
unsigned
int
tile
=
xi+yi*PADDED_NUM_ATOMS/TILE_SIZE-yi*
(
yi+1
)
/2
;
unsigned
int
tj
=
tgx%
(
TILE_SIZE/2
)
;
for
(
unsigned
int
j
=
0
; j < TILE_SIZE/2; j++) {
float4
delta
=
(
float4
)
(
local_posq[baseLocalAtom+tj].xyz
-
posq1.xyz,
0.0f
)
;
#
ifdef
USE_PERIODIC
delta.x
-=
floor
(
delta.x/PERIODIC_BOX_SIZE_X+0.5f
)
*PERIODIC_BOX_SIZE_X
;
delta.y
-=
floor
(
delta.y/PERIODIC_BOX_SIZE_Y+0.5f
)
*PERIODIC_BOX_SIZE_Y
;
delta.z
-=
floor
(
delta.z/PERIODIC_BOX_SIZE_Z+0.5f
)
*PERIODIC_BOX_SIZE_Z
;
#
endif
float
r2
=
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
;
#
ifdef
USE_CUTOFF
if
(
atom1
<
NUM_ATOMS
&&
y+baseLocalAtom+tj
<
NUM_ATOMS
&&
r2
<
CUTOFF_SQUARED
)
{
#
else
if
(
atom1
<
NUM_ATOMS
&&
y+baseLocalAtom+tj
<
NUM_ATOMS
)
{
#
endif
float
r
=
sqrt
(
r2
)
;
float
invR
=
1.0f/r
;
float2
params2
=
local_params[baseLocalAtom+tj]
;
float
rScaledRadiusJ
=
r+params2.y
;
if
(
params1.x
<
rScaledRadiusJ
)
{
float
l_ij
=
1.0f/max
(
params1.x,
fabs
(
r-params2.y
))
;
float
u_ij
=
1.0f/rScaledRadiusJ
;
float
l_ij2
=
l_ij*l_ij
;
float
u_ij2
=
u_ij*u_ij
;
float
ratio
=
log
(
u_ij
/
l_ij
)
;
bornSum
+=
l_ij
-
u_ij
+
0.25f*r*
(
u_ij2-l_ij2
)
+
(
0.50f*invR*ratio
)
+
(
0.25f*params2.y*params2.y*invR
)
*
(
l_ij2-u_ij2
)
;
if
(
params1.x
<
params2.x-r
)
bornSum
+=
2.0f*
(
1.0f/params1.x-l_ij
)
;
}
float
rScaledRadiusI
=
r+params1.y
;
if
(
params2.x
<
rScaledRadiusJ
)
{
float
l_ij
=
1.0f/max
(
params2.x,
fabs
(
r-params1.y
))
;
float
u_ij
=
1.0f/rScaledRadiusJ
;
float
l_ij2
=
l_ij*l_ij
;
float
u_ij2
=
u_ij*u_ij
;
float
ratio
=
log
(
u_ij
/
l_ij
)
;
float
term
=
l_ij
-
u_ij
+
0.25f*r*
(
u_ij2-l_ij2
)
+
(
0.50f*invR*ratio
)
+
(
0.25f*params1.y*params1.y*invR
)
*
(
l_ij2-u_ij2
)
;
if
(
params2.x
<
params1.x-r
)
term
+=
2.0f*
(
1.0f/params2.x-l_ij
)
;
local_bornSum[baseLocalAtom+tj+forceBufferOffset]
+=
term
;
}
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
tj
=
(
tj+1
)
%
(
TILE_SIZE/2
)
;
}
//
Sum
the
forces
and
write
results.
if
(
get_local_id
(
0
)
>=
TILE_SIZE
)
tempBuffer[get_local_id
(
0
)
]
=
bornSum
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
get_local_id
(
0
)
<
TILE_SIZE
)
{
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
unsigned
int
offset1
=
x
+
tgx
+
(
y/TILE_SIZE
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
+
tgx
+
(
x/TILE_SIZE
)
*PADDED_NUM_ATOMS
;
#
else
unsigned
int
offset1
=
x
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
#
endif
global_bornSum[offset1]
+=
bornSum+tempBuffer[get_local_id
(
0
)
+TILE_SIZE]
;
global_bornSum[offset2]
+=
local_bornSum[get_local_id
(
0
)
]+local_bornSum[get_local_id
(
0
)
+TILE_SIZE]
;
}
lasty
=
y
;
}
pos++
;
}
}
/**
*
First
part
of
computing
the
GBSA
interaction.
*/
__kernel
void
computeGBSAForce1
(
__global
float4*
forceBuffers,
__global
float*
energyBuffer,
__global
float4*
posq,
__local
float4*
local_posq,
__local
float4*
local_force,
__global
float*
global_bornRadii,
__local
float*
local_bornRadii,
__global
float*
global_bornForce,
__local
float*
local_bornForce,
__local
float4*
tempBuffer,
__global
unsigned
int*
tiles,
#
ifdef
USE_CUTOFF
__global
unsigned
int*
interactionFlags,
__global
unsigned
int*
interactionCount
)
{
#
else
unsigned
int
numTiles
)
{
#
endif
#
ifdef
USE_CUTOFF
unsigned
int
numTiles
=
interactionCount[0]
;
#
endif
unsigned
int
pos
=
get_group_id
(
0
)
*numTiles/get_num_groups
(
0
)
;
unsigned
int
end
=
(
get_group_id
(
0
)
+1
)
*numTiles/get_num_groups
(
0
)
;
float
energy
=
0.0f
;
unsigned
int
lasty
=
0xFFFFFFFF
;
while
(
pos
<
end
)
{
//
Extract
the
coordinates
of
this
tile
unsigned
int
x
=
tiles[pos]
;
unsigned
int
y
=
((
x
>>
2
)
&
0x7fff
)
*TILE_SIZE
;
x
=
(
x>>17
)
*TILE_SIZE
;
unsigned
int
baseLocalAtom
=
(
get_local_id
(
0
)
<
TILE_SIZE
?
0
:
TILE_SIZE/2
)
;
unsigned
int
tgx
=
get_local_id
(
0
)
&
(
TILE_SIZE-1
)
;
unsigned
int
forceBufferOffset
=
(
tgx
<
TILE_SIZE/2
?
0
:
TILE_SIZE
)
;
unsigned
int
atom1
=
x
+
tgx
;
float4
force
=
0.0f
;
float4
posq1
=
posq[atom1]
;
float
bornRadius1
=
global_bornRadii[atom1]
;
if
(
x
==
y
)
{
//
This
tile
is
on
the
diagonal.
local_posq[get_local_id
(
0
)
]
=
posq1
;
local_bornRadii[get_local_id
(
0
)
]
=
bornRadius1
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
unsigned
int
xi
=
x/TILE_SIZE
;
unsigned
int
tile
=
xi+xi*PADDED_NUM_ATOMS/TILE_SIZE-xi*
(
xi+1
)
/2
;
for
(
unsigned
int
j
=
0
; j < TILE_SIZE/2; j++) {
if
(
atom1
<
NUM_ATOMS
&&
y+baseLocalAtom+j
<
NUM_ATOMS
)
{
float4
posq2
=
local_posq[baseLocalAtom+j]
;
float4
delta
=
(
float4
)
(
posq2.xyz
-
posq1.xyz,
0.0f
)
;
#
ifdef
USE_PERIODIC
delta.x
-=
floor
(
delta.x/PERIODIC_BOX_SIZE_X+0.5f
)
*PERIODIC_BOX_SIZE_X
;
delta.y
-=
floor
(
delta.y/PERIODIC_BOX_SIZE_Y+0.5f
)
*PERIODIC_BOX_SIZE_Y
;
delta.z
-=
floor
(
delta.z/PERIODIC_BOX_SIZE_Z+0.5f
)
*PERIODIC_BOX_SIZE_Z
;
#
endif
float
r2
=
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
;
float
r
=
sqrt
(
r2
)
;
float
invR
=
1.0f/r
;
float
bornRadius2
=
local_bornRadii[baseLocalAtom+j]
;
float
alpha2_ij
=
bornRadius1*bornRadius2
;
float
D_ij
=
r2/
(
4.0f*alpha2_ij
)
;
float
expTerm
=
exp
(
-D_ij
)
;
float
denominator2
=
r2
+
alpha2_ij*expTerm
;
float
denominator
=
sqrt
(
denominator2
)
;
float
tempEnergy
=
(
PREFACTOR*posq1.w*posq2.w
)
/denominator
;
float
Gpol
=
tempEnergy/denominator2
;
float
dGpol_dalpha2_ij
=
-0.5f*Gpol*expTerm*
(
1.0f+D_ij
)
;
force.w
+=
dGpol_dalpha2_ij*bornRadius2
;
float
dEdR
=
Gpol*
(
1.0f
-
0.25f*expTerm
)
;
#
ifdef
USE_CUTOFF
if
(
r2
>
CUTOFF_SQUARED
)
{
dEdR
=
0.0f
;
tempEnergy
=
0.0f
;
}
#
endif
energy
+=
0.5f*tempEnergy
;
delta.xyz
*=
dEdR
;
force.xyz
-=
delta.xyz
;
}
}
//
Sum
the
forces
and
write
results.
if
(
get_local_id
(
0
)
>=
TILE_SIZE
)
tempBuffer[get_local_id
(
0
)
]
=
force
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
get_local_id
(
0
)
<
TILE_SIZE
)
{
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
unsigned
int
offset
=
x
+
tgx
+
(
x/TILE_SIZE
)
*PADDED_NUM_ATOMS
;
#
else
unsigned
int
offset
=
x
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
#
endif
forceBuffers[offset].xyz
=
forceBuffers[offset].xyz+force.xyz+tempBuffer[get_local_id
(
0
)
+TILE_SIZE].xyz
;
global_bornForce[offset]
+=
force.w+tempBuffer[get_local_id
(
0
)
+TILE_SIZE].w
;
}
}
else
{
//
This
is
an
off-diagonal
tile.
if
(
lasty
!=
y
&&
get_local_id
(
0
)
<
TILE_SIZE
)
{
unsigned
int
j
=
y
+
tgx
;
local_posq[get_local_id
(
0
)
]
=
posq[j]
;
local_bornRadii[get_local_id
(
0
)
]
=
global_bornRadii[j]
;
}
local_force[get_local_id
(
0
)
]
=
0.0f
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
//
Compute
the
full
set
of
interactions
in
this
tile.
unsigned
int
xi
=
x/TILE_SIZE
;
unsigned
int
yi
=
y/TILE_SIZE
;
unsigned
int
tile
=
xi+yi*PADDED_NUM_ATOMS/TILE_SIZE-yi*
(
yi+1
)
/2
;
unsigned
int
tj
=
tgx%
(
TILE_SIZE/2
)
;
for
(
unsigned
int
j
=
0
; j < TILE_SIZE/2; j++) {
if
(
atom1
<
NUM_ATOMS
&&
y+baseLocalAtom+tj
<
NUM_ATOMS
)
{
float4
posq2
=
local_posq[baseLocalAtom+tj]
;
float4
delta
=
(
float4
)
(
posq2.xyz
-
posq1.xyz,
0.0f
)
;
#
ifdef
USE_PERIODIC
delta.x
-=
floor
(
delta.x/PERIODIC_BOX_SIZE_X+0.5f
)
*PERIODIC_BOX_SIZE_X
;
delta.y
-=
floor
(
delta.y/PERIODIC_BOX_SIZE_Y+0.5f
)
*PERIODIC_BOX_SIZE_Y
;
delta.z
-=
floor
(
delta.z/PERIODIC_BOX_SIZE_Z+0.5f
)
*PERIODIC_BOX_SIZE_Z
;
#
endif
float
r2
=
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
;
float
r
=
sqrt
(
r2
)
;
float
invR
=
1.0f/r
;
float
bornRadius2
=
local_bornRadii[baseLocalAtom+tj]
;
float
alpha2_ij
=
bornRadius1*bornRadius2
;
float
D_ij
=
r2/
(
4.0f*alpha2_ij
)
;
float
expTerm
=
exp
(
-D_ij
)
;
float
denominator2
=
r2
+
alpha2_ij*expTerm
;
float
denominator
=
sqrt
(
denominator2
)
;
float
tempEnergy
=
(
PREFACTOR*posq1.w*posq2.w
)
/denominator
;
float
Gpol
=
tempEnergy/denominator2
;
float
dGpol_dalpha2_ij
=
-0.5f*Gpol*expTerm*
(
1.0f+D_ij
)
;
force.w
+=
dGpol_dalpha2_ij*bornRadius2
;
float
dEdR
=
Gpol*
(
1.0f
-
0.25f*expTerm
)
;
#
ifdef
USE_CUTOFF
if
(
r2
>
CUTOFF_SQUARED
)
{
dEdR
=
0.0f
;
tempEnergy
=
0.0f
;
}
#
endif
energy
+=
tempEnergy
;
delta.xyz
*=
dEdR
;
force.xyz
-=
delta.xyz
;
local_force[baseLocalAtom+tj+forceBufferOffset]
+=
(
float4
)
(
delta.xyz,
dGpol_dalpha2_ij*bornRadius1
)
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
tj
=
(
tj+1
)
%
(
TILE_SIZE/2
)
;
}
//
Sum
the
forces
and
write
results.
if
(
get_local_id
(
0
)
>=
TILE_SIZE
)
tempBuffer[get_local_id
(
0
)
]
=
force
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
get_local_id
(
0
)
<
TILE_SIZE
)
{
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
unsigned
int
offset1
=
x
+
tgx
+
(
y/TILE_SIZE
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
+
tgx
+
(
x/TILE_SIZE
)
*PADDED_NUM_ATOMS
;
#
else
unsigned
int
offset1
=
x
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
#
endif
forceBuffers[offset1].xyz
=
forceBuffers[offset1].xyz+force.xyz+tempBuffer[get_local_id
(
0
)
+TILE_SIZE].xyz
;
forceBuffers[offset2].xyz
=
forceBuffers[offset2].xyz+local_force[get_local_id
(
0
)
].xyz+local_force[get_local_id
(
0
)
+TILE_SIZE].xyz
;
global_bornForce[offset1]
+=
force.w+tempBuffer[get_local_id
(
0
)
+TILE_SIZE].w
;
global_bornForce[offset2]
+=
local_force[get_local_id
(
0
)
].w+local_force[get_local_id
(
0
)
+TILE_SIZE].w
;
}
lasty
=
y
;
}
pos++
;
}
energyBuffer[get_global_id
(
0
)
]
+=
energy
;
}
platforms/opencl/src/kernels/gbsaObc.cl
→
platforms/opencl/src/kernels/gbsaObc
_nvidia
.cl
View file @
95e059dc
const
unsigned
int
TileSize
=
32
;
const
float
dielectricOffset
=
0.009f
;
const
float
probeRadius
=
0.14f
;
const
float
surfaceAreaFactor
=
-6.0f*3.14159265358979323846f*0.0216f*1000.0f*0.4184f
;
#
define
TILE_SIZE
32
/**
*
Compute
the
Born
sum.
*/
__kernel
void
computeBornSum
(
__global
float*
global_bornSum,
__local
float*
local_bornSum,
__global
float4*
posq,
__local
float4*
local_posq,
__global
float2*
global_params,
__local
float2*
local_params,
__global
unsigned
int*
tiles,
__local
float4*
local_posq,
__global
float2*
global_params,
__local
float2*
local_params,
__local
float*
tempBuffer,
__global
unsigned
int*
tiles,
#
ifdef
USE_CUTOFF
__global
unsigned
int*
interactionFlags,
__global
unsigned
int*
interactionCount
,
__local
float*
tempBuffer
)
{
__global
unsigned
int*
interactionFlags,
__global
unsigned
int*
interactionCount
)
{
#
else
unsigned
int
numTiles
)
{
#
endif
#
ifdef
USE_CUTOFF
unsigned
int
numTiles
=
interactionCount[0]
;
#
endif
unsigned
int
totalWarps
=
get_global_size
(
0
)
/T
ileSize
;
unsigned
int
warp
=
get_global_id
(
0
)
/T
ileSize
;
unsigned
int
totalWarps
=
get_global_size
(
0
)
/T
ILE_SIZE
;
unsigned
int
warp
=
get_global_id
(
0
)
/T
ILE_SIZE
;
unsigned
int
pos
=
warp*numTiles/totalWarps
;
unsigned
int
end
=
(
warp+1
)
*numTiles/totalWarps
;
float
energy
=
0.0f
;
...
...
@@ -27,11 +24,10 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca
while
(
pos
<
end
)
{
//
Extract
the
coordinates
of
this
tile
unsigned
int
x
=
tiles[pos]
;
unsigned
int
y
=
((
x
>>
2
)
&
0x7fff
)
*T
ileSize
;
x
=
(
x>>17
)
*T
ileSize
;
unsigned
int
tgx
=
get_local_id
(
0
)
&
(
T
ileSize
-1
)
;
unsigned
int
y
=
((
x
>>
2
)
&
0x7fff
)
*T
ILE_SIZE
;
x
=
(
x>>17
)
*T
ILE_SIZE
;
unsigned
int
tgx
=
get_local_id
(
0
)
&
(
T
ILE_SIZE
-1
)
;
unsigned
int
tbx
=
get_local_id
(
0
)
-
tgx
;
unsigned
int
tj
=
tgx
;
unsigned
int
atom1
=
x
+
tgx
;
float
bornSum
=
0.0f
;
float4
posq1
=
posq[atom1]
;
...
...
@@ -41,9 +37,9 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca
local_posq[get_local_id
(
0
)
]
=
posq1
;
local_params[get_local_id
(
0
)
]
=
params1
;
unsigned
int
xi
=
x/T
ileSize
;
unsigned
int
tile
=
xi+xi*PADDED_NUM_ATOMS/T
ileSize
-xi*
(
xi+1
)
/2
;
for
(
unsigned
int
j
=
0
; j < T
ileSize
; j++) {
unsigned
int
xi
=
x/T
ILE_SIZE
;
unsigned
int
tile
=
xi+xi*PADDED_NUM_ATOMS/T
ILE_SIZE
-xi*
(
xi+1
)
/2
;
for
(
unsigned
int
j
=
0
; j < T
ILE_SIZE
; j++) {
float4
delta
=
(
float4
)
(
local_posq[tbx+j].xyz
-
posq1.xyz,
0.0f
)
;
#
ifdef
USE_PERIODIC
delta.x
-=
floor
(
delta.x/PERIODIC_BOX_SIZE_X+0.5f
)
*PERIODIC_BOX_SIZE_X
;
...
...
@@ -76,7 +72,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca
//
Write
results
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
unsigned
int
offset
=
x
+
tgx
+
(
x/T
ileSize
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset
=
x
+
tgx
+
(
x/T
ILE_SIZE
)
*PADDED_NUM_ATOMS
;
#
else
unsigned
int
offset
=
x
+
tgx
+
warp*PADDED_NUM_ATOMS
;
#
endif
...
...
@@ -100,7 +96,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca
else
{
//
Compute
only
a
subset
of
the
interactions
in
this
tile.
for
(
unsigned
int
j
=
0
; j < T
ileSize
; j++) {
for
(
unsigned
int
j
=
0
; j < T
ILE_SIZE
; j++) {
if
((
flags&
(
1<<j
))
!=
0
)
{
float4
delta
=
(
float4
)
(
local_posq[tbx+j].xyz
-
posq1.xyz,
0.0f
)
;
#
ifdef
USE_PERIODIC
...
...
@@ -166,10 +162,11 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca
{
//
Compute
the
full
set
of
interactions
in
this
tile.
unsigned
int
xi
=
x/TileSize
;
unsigned
int
yi
=
y/TileSize
;
unsigned
int
tile
=
xi+yi*PADDED_NUM_ATOMS/TileSize-yi*
(
yi+1
)
/2
;
for
(
unsigned
int
j
=
0
; j < TileSize; j++) {
unsigned
int
xi
=
x/TILE_SIZE
;
unsigned
int
yi
=
y/TILE_SIZE
;
unsigned
int
tile
=
xi+yi*PADDED_NUM_ATOMS/TILE_SIZE-yi*
(
yi+1
)
/2
;
unsigned
int
tj
=
tgx
;
for
(
unsigned
int
j
=
0
; j < TILE_SIZE; j++) {
float4
delta
=
(
float4
)
(
local_posq[tbx+tj].xyz
-
posq1.xyz,
0.0f
)
;
#
ifdef
USE_PERIODIC
delta.x
-=
floor
(
delta.x/PERIODIC_BOX_SIZE_X+0.5f
)
*PERIODIC_BOX_SIZE_X
;
...
...
@@ -211,15 +208,15 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca
local_bornSum[tbx+tj]
+=
term
;
}
}
tj
=
(
tj
+
1
)
&
(
T
ileSize
-
1
)
;
tj
=
(
tj
+
1
)
&
(
T
ILE_SIZE
-
1
)
;
}
}
//
Write
results
float4
of
;
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
unsigned
int
offset1
=
x
+
tgx
+
(
y/T
ileSize
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
+
tgx
+
(
x/T
ileSize
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset1
=
x
+
tgx
+
(
y/T
ILE_SIZE
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
+
tgx
+
(
x/T
ILE_SIZE
)
*PADDED_NUM_ATOMS
;
#
else
unsigned
int
offset1
=
x
+
tgx
+
warp*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
+
tgx
+
warp*PADDED_NUM_ATOMS
;
...
...
@@ -232,55 +229,23 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca
}
}
/**
*
Reduce
the
Born
sums
to
compute
the
Born
radii.
*/
__kernel
void
reduceBornSum
(
int
bufferSize,
int
numBuffers,
float
alpha,
float
beta,
float
gamma,
__global
float*
bornSum,
__global
float2*
params,
__global
float*
bornRadii,
__global
float*
obcChain
)
{
unsigned
int
index
=
get_global_id
(
0
)
;
while
(
index
<
NUM_ATOMS
)
{
//
Get
summed
Born
data
int
totalSize
=
bufferSize*numBuffers
;
float
sum
=
bornSum[index]
;
for
(
int
i
=
index+bufferSize
; i < totalSize; i += bufferSize)
sum
+=
bornSum[i]
;
//
Now
calculate
Born
radius
and
OBC
term.
float
offsetRadius
=
params[index].x
;
sum
*=
0.5f*offsetRadius
;
float
sum2
=
sum*sum
;
float
sum3
=
sum*sum2
;
float
tanhSum
=
tanh
(
alpha*sum
-
beta*sum2
+
gamma*sum3
)
;
float
nonOffsetRadius
=
offsetRadius
+
dielectricOffset
;
float
radius
=
1.0f/
(
1.0f/offsetRadius
-
tanhSum/nonOffsetRadius
)
;
float
chain
=
offsetRadius*
(
alpha
-
2.0f*beta*sum
+
3.0f*gamma*sum2
)
;
chain
=
(
1.0f-tanhSum*tanhSum
)
*chain
/
nonOffsetRadius
;
bornRadii[index]
=
radius
;
obcChain[index]
=
chain
;
index
+=
get_global_size
(
0
)
;
}
}
/**
*
First
part
of
computing
the
GBSA
interaction.
*/
__kernel
void
computeGBSAForce1
(
__global
float4*
forceBuffers,
__global
float*
energyBuffer,
__global
float4*
posq,
__local
float4*
local_posq,
__local
float4*
local_force,
__global
float*
global_bornRadii,
__local
float*
local_bornRadii,
__global
float*
global_bornForce,
__local
float*
local_bornForce,
__global
unsigned
int*
tiles,
__global
float*
global_bornForce,
__local
float*
local_bornForce,
__local
float4*
tempBuffer,
__global
unsigned
int*
tiles,
#
ifdef
USE_CUTOFF
__global
unsigned
int*
interactionFlags,
__global
unsigned
int*
interactionCount
,
__local
float4*
tempBuffer
)
{
__global
unsigned
int*
interactionFlags,
__global
unsigned
int*
interactionCount
)
{
#
else
unsigned
int
numTiles
)
{
#
endif
#
ifdef
USE_CUTOFF
unsigned
int
numTiles
=
interactionCount[0]
;
#
endif
unsigned
int
totalWarps
=
get_global_size
(
0
)
/T
ileSize
;
unsigned
int
warp
=
get_global_id
(
0
)
/T
ileSize
;
unsigned
int
totalWarps
=
get_global_size
(
0
)
/T
ILE_SIZE
;
unsigned
int
warp
=
get_global_id
(
0
)
/T
ILE_SIZE
;
unsigned
int
pos
=
warp*numTiles/totalWarps
;
unsigned
int
end
=
(
warp+1
)
*numTiles/totalWarps
;
float
energy
=
0.0f
;
...
...
@@ -289,11 +254,10 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
while
(
pos
<
end
)
{
//
Extract
the
coordinates
of
this
tile
unsigned
int
x
=
tiles[pos]
;
unsigned
int
y
=
((
x
>>
2
)
&
0x7fff
)
*T
ileSize
;
x
=
(
x>>17
)
*T
ileSize
;
unsigned
int
tgx
=
get_local_id
(
0
)
&
(
T
ileSize
-1
)
;
unsigned
int
y
=
((
x
>>
2
)
&
0x7fff
)
*T
ILE_SIZE
;
x
=
(
x>>17
)
*T
ILE_SIZE
;
unsigned
int
tgx
=
get_local_id
(
0
)
&
(
T
ILE_SIZE
-1
)
;
unsigned
int
tbx
=
get_local_id
(
0
)
-
tgx
;
unsigned
int
tj
=
tgx
;
unsigned
int
atom1
=
x
+
tgx
;
float4
force
=
0.0f
;
float4
posq1
=
posq[atom1]
;
...
...
@@ -303,11 +267,12 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
local_posq[get_local_id
(
0
)
]
=
posq1
;
local_bornRadii[get_local_id
(
0
)
]
=
bornRadius1
;
unsigned
int
xi
=
x/T
ileSize
;
unsigned
int
tile
=
xi+xi*PADDED_NUM_ATOMS/T
ileSize
-xi*
(
xi+1
)
/2
;
for
(
unsigned
int
j
=
0
; j < T
ileSize
; j++) {
unsigned
int
xi
=
x/T
ILE_SIZE
;
unsigned
int
tile
=
xi+xi*PADDED_NUM_ATOMS/T
ILE_SIZE
-xi*
(
xi+1
)
/2
;
for
(
unsigned
int
j
=
0
; j < T
ILE_SIZE
; j++) {
if
(
atom1
<
NUM_ATOMS
&&
y+j
<
NUM_ATOMS
)
{
float4
delta
=
(
float4
)
(
local_posq[tbx+j].xyz
-
posq1.xyz,
0.0f
)
;
float4
posq2
=
local_posq[tbx+j]
;
float4
delta
=
(
float4
)
(
posq2.xyz
-
posq1.xyz,
0.0f
)
;
#
ifdef
USE_PERIODIC
delta.x
-=
floor
(
delta.x/PERIODIC_BOX_SIZE_X+0.5f
)
*PERIODIC_BOX_SIZE_X
;
delta.y
-=
floor
(
delta.y/PERIODIC_BOX_SIZE_Y+0.5f
)
*PERIODIC_BOX_SIZE_Y
;
...
...
@@ -316,7 +281,6 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
float
r2
=
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
;
float
r
=
sqrt
(
r2
)
;
float
invR
=
1.0f/r
;
float4
posq2
=
local_posq[tbx+j]
;
float
bornRadius2
=
local_bornRadii[tbx+j]
;
float
alpha2_ij
=
bornRadius1*bornRadius2
;
float
D_ij
=
r2/
(
4.0f*alpha2_ij
)
;
...
...
@@ -342,7 +306,7 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
//
Write
results
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
unsigned
int
offset
=
x
+
tgx
+
(
x/T
ileSize
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset
=
x
+
tgx
+
(
x/T
ILE_SIZE
)
*PADDED_NUM_ATOMS
;
#
else
unsigned
int
offset
=
x
+
tgx
+
warp*PADDED_NUM_ATOMS
;
#
endif
...
...
@@ -367,9 +331,10 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
else
{
//
Compute
only
a
subset
of
the
interactions
in
this
tile.
for
(
unsigned
int
j
=
0
; j < T
ileSize
; j++) {
for
(
unsigned
int
j
=
0
; j < T
ILE_SIZE
; j++) {
if
((
flags&
(
1<<j
))
!=
0
)
{
float4
delta
=
(
float4
)
(
local_posq[tbx+j].xyz
-
posq1.xyz,
0.0f
)
;
float4
posq2
=
local_posq[tbx+j]
;
float4
delta
=
(
float4
)
(
posq2.xyz
-
posq1.xyz,
0.0f
)
;
#
ifdef
USE_PERIODIC
delta.x
-=
floor
(
delta.x/PERIODIC_BOX_SIZE_X+0.5f
)
*PERIODIC_BOX_SIZE_X
;
delta.y
-=
floor
(
delta.y/PERIODIC_BOX_SIZE_Y+0.5f
)
*PERIODIC_BOX_SIZE_Y
;
...
...
@@ -377,8 +342,7 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
#
endif
float
r2
=
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
;
float
r
=
sqrt
(
r2
)
;
float
invR
=
1.0f
/
r
;
float4
posq2
=
local_posq[tbx+j]
;
float
invR
=
1.0f/r
;
float
bornRadius2
=
local_bornRadii[tbx+j]
;
float
alpha2_ij
=
bornRadius1*bornRadius2
;
float
D_ij
=
r2/
(
4.0f*alpha2_ij
)
;
...
...
@@ -424,12 +388,14 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
{
//
Compute
the
full
set
of
interactions
in
this
tile.
unsigned
int
xi
=
x/TileSize
;
unsigned
int
yi
=
y/TileSize
;
unsigned
int
tile
=
xi+yi*PADDED_NUM_ATOMS/TileSize-yi*
(
yi+1
)
/2
;
for
(
unsigned
int
j
=
0
; j < TileSize; j++) {
unsigned
int
xi
=
x/TILE_SIZE
;
unsigned
int
yi
=
y/TILE_SIZE
;
unsigned
int
tile
=
xi+yi*PADDED_NUM_ATOMS/TILE_SIZE-yi*
(
yi+1
)
/2
;
unsigned
int
tj
=
tgx
;
for
(
unsigned
int
j
=
0
; j < TILE_SIZE; j++) {
if
(
atom1
<
NUM_ATOMS
&&
y+tj
<
NUM_ATOMS
)
{
float4
delta
=
(
float4
)
(
local_posq[tbx+tj].xyz
-
posq1.xyz,
0.0f
)
;
float4
posq2
=
local_posq[tbx+tj]
;
float4
delta
=
(
float4
)
(
posq2.xyz
-
posq1.xyz,
0.0f
)
;
#
ifdef
USE_PERIODIC
delta.x
-=
floor
(
delta.x/PERIODIC_BOX_SIZE_X+0.5f
)
*PERIODIC_BOX_SIZE_X
;
delta.y
-=
floor
(
delta.y/PERIODIC_BOX_SIZE_Y+0.5f
)
*PERIODIC_BOX_SIZE_Y
;
...
...
@@ -437,8 +403,7 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
#
endif
float
r2
=
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
;
float
r
=
sqrt
(
r2
)
;
float
invR
=
1.0f
/
r
;
float4
posq2
=
local_posq[tbx+tj]
;
float
invR
=
1.0f/r
;
float
bornRadius2
=
local_bornRadii[tbx+tj]
;
float
alpha2_ij
=
bornRadius1*bornRadius2
;
float
D_ij
=
r2/
(
4.0f*alpha2_ij
)
;
...
...
@@ -461,14 +426,14 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
force.xyz
-=
delta.xyz
;
local_force[tbx+tj]
+=
(
float4
)
(
delta.xyz,
dGpol_dalpha2_ij*bornRadius1
)
;
}
tj
=
(
tj
+
1
)
&
(
T
ileSize
-
1
)
;
tj
=
(
tj
+
1
)
&
(
T
ILE_SIZE
-
1
)
;
}
}
//
Write
results
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
unsigned
int
offset1
=
x
+
tgx
+
(
y/T
ileSize
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
+
tgx
+
(
x/T
ileSize
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset1
=
x
+
tgx
+
(
y/T
ILE_SIZE
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
+
tgx
+
(
x/T
ILE_SIZE
)
*PADDED_NUM_ATOMS
;
#
else
unsigned
int
offset1
=
x
+
tgx
+
warp*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
+
tgx
+
warp*PADDED_NUM_ATOMS
;
...
...
@@ -483,35 +448,3 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
}
energyBuffer[get_global_id
(
0
)
]
+=
energy
;
}
/**
*
Reduce
the
Born
force.
*/
__kernel
void
reduceBornForce
(
int
bufferSize,
int
numBuffers,
__global
float*
bornForce,
__global
float*
energyBuffer,
__global
float2*
params,
__global
float*
bornRadii,
__global
float*
obcChain
)
{
float
energy
=
0.0f
;
unsigned
int
index
=
get_global_id
(
0
)
;
while
(
index
<
NUM_ATOMS
)
{
//
Sum
the
Born
force
int
totalSize
=
bufferSize*numBuffers
;
float
force
=
bornForce[index]
;
for
(
int
i
=
index+bufferSize
; i < totalSize; i += bufferSize)
force
+=
bornForce[i]
;
//
Now
calculate
the
actual
force
float
offsetRadius
=
params[index].x
;
float
bornRadius
=
bornRadii[index]
;
float
r
=
offsetRadius+dielectricOffset+probeRadius
;
float
ratio6
=
pow
((
offsetRadius+dielectricOffset
)
/bornRadius,
6.0f
)
;
float
saTerm
=
surfaceAreaFactor*r*r*ratio6
;
force
+=
saTerm/bornRadius
;
energy
+=
saTerm
;
force
*=
bornRadius*bornRadius*obcChain[index]
;
bornForce[index]
=
force
;
index
+=
get_global_size
(
0
)
;
}
energyBuffer[get_global_id
(
0
)
]
+=
energy/-6.0f
;
}
platforms/opencl/src/kernels/nonbonded_nvidia.cl
View file @
95e059dc
const
unsigned
int
TileSize
=
32
;
#
define
TILE_SIZE
32
/**
*
Compute
nonbonded
interactions.
...
...
@@ -15,8 +15,8 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
#
ifdef
USE_CUTOFF
unsigned
int
numTiles
=
interactionCount[0]
;
#
endif
unsigned
int
totalWarps
=
get_global_size
(
0
)
/T
ileSize
;
unsigned
int
warp
=
get_global_id
(
0
)
/T
ileSize
;
unsigned
int
totalWarps
=
get_global_size
(
0
)
/T
ILE_SIZE
;
unsigned
int
warp
=
get_global_id
(
0
)
/T
ILE_SIZE
;
unsigned
int
pos
=
warp*numTiles/totalWarps
;
unsigned
int
end
=
(
warp+1
)
*numTiles/totalWarps
;
float
energy
=
0.0f
;
...
...
@@ -25,10 +25,10 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
while
(
pos
<
end
)
{
//
Extract
the
coordinates
of
this
tile
unsigned
int
x
=
tiles[pos]
;
unsigned
int
y
=
((
x
>>
2
)
&
0x7fff
)
*T
ileSize
;
unsigned
int
y
=
((
x
>>
2
)
&
0x7fff
)
*T
ILE_SIZE
;
bool
hasExclusions
=
(
x
&
0x1
)
;
x
=
(
x>>17
)
*T
ileSize
;
unsigned
int
tgx
=
get_local_id
(
0
)
&
(
T
ileSize
-1
)
;
x
=
(
x>>17
)
*T
ILE_SIZE
;
unsigned
int
tgx
=
get_local_id
(
0
)
&
(
T
ILE_SIZE
-1
)
;
unsigned
int
tbx
=
get_local_id
(
0
)
-
tgx
;
unsigned
int
atom1
=
x
+
tgx
;
float4
force
=
0.0f
;
...
...
@@ -39,12 +39,12 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
local_posq[get_local_id
(
0
)
]
=
posq1
;
LOAD_LOCAL_PARAMETERS_FROM_1
unsigned
int
xi
=
x/T
ileSize
;
unsigned
int
tile
=
xi+xi*PADDED_NUM_ATOMS/T
ileSize
-xi*
(
xi+1
)
/2
;
unsigned
int
xi
=
x/T
ILE_SIZE
;
unsigned
int
tile
=
xi+xi*PADDED_NUM_ATOMS/T
ILE_SIZE
-xi*
(
xi+1
)
/2
;
#
ifdef
USE_EXCLUSIONS
unsigned
int
excl
=
exclusions[exclusionIndices[tile]+tgx]
;
#
endif
for
(
unsigned
int
j
=
0
; j < T
ileSize
; j++) {
for
(
unsigned
int
j
=
0
; j < T
ILE_SIZE
; j++) {
#
ifdef
USE_EXCLUSIONS
bool
isExcluded
=
!
(
excl
&
0x1
)
;
#
endif
...
...
@@ -72,7 +72,7 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
//
Write
results
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
unsigned
int
offset
=
x
+
tgx
+
(
x/T
ileSize
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset
=
x
+
tgx
+
(
x/T
ILE_SIZE
)
*PADDED_NUM_ATOMS
;
#
else
unsigned
int
offset
=
x
+
tgx
+
warp*PADDED_NUM_ATOMS
;
#
endif
...
...
@@ -96,7 +96,7 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
else
{
//
Compute
only
a
subset
of
the
interactions
in
this
tile.
for
(
unsigned
int
j
=
0
; j < T
ileSize
; j++) {
for
(
unsigned
int
j
=
0
; j < T
ILE_SIZE
; j++) {
if
((
flags&
(
1<<j
))
!=
0
)
{
bool
isExcluded
=
false
;
int
atom2
=
tbx+j
;
...
...
@@ -141,15 +141,15 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
{
//
Compute
the
full
set
of
interactions
in
this
tile.
unsigned
int
xi
=
x/T
ileSize
;
unsigned
int
yi
=
y/T
ileSize
;
unsigned
int
tile
=
xi+yi*PADDED_NUM_ATOMS/T
ileSize
-yi*
(
yi+1
)
/2
;
unsigned
int
xi
=
x/T
ILE_SIZE
;
unsigned
int
yi
=
y/T
ILE_SIZE
;
unsigned
int
tile
=
xi+yi*PADDED_NUM_ATOMS/T
ILE_SIZE
-yi*
(
yi+1
)
/2
;
#
ifdef
USE_EXCLUSIONS
unsigned
int
excl
=
(
hasExclusions
?
exclusions[exclusionIndices[tile]+tgx]
:
0xFFFFFFFF
)
;
excl
=
(
excl
>>
tgx
)
|
(
excl
<<
(
T
ileSize
-
tgx
))
;
excl
=
(
excl
>>
tgx
)
|
(
excl
<<
(
T
ILE_SIZE
-
tgx
))
;
#
endif
unsigned
int
tj
=
tgx
;
for
(
unsigned
int
j
=
0
; j < T
ileSize
; j++) {
for
(
unsigned
int
j
=
0
; j < T
ILE_SIZE
; j++) {
#
ifdef
USE_EXCLUSIONS
bool
isExcluded
=
!
(
excl
&
0x1
)
;
#
endif
...
...
@@ -174,14 +174,14 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
force.xyz
-=
delta.xyz
;
local_force[tbx+tj].xyz
+=
delta.xyz
;
excl
>>=
1
;
tj
=
(
tj
+
1
)
&
(
T
ileSize
-
1
)
;
tj
=
(
tj
+
1
)
&
(
T
ILE_SIZE
-
1
)
;
}
}
//
Write
results
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
unsigned
int
offset1
=
x
+
tgx
+
(
y/T
ileSize
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
+
tgx
+
(
x/T
ileSize
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset1
=
x
+
tgx
+
(
y/T
ILE_SIZE
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
+
tgx
+
(
x/T
ILE_SIZE
)
*PADDED_NUM_ATOMS
;
#
else
unsigned
int
offset1
=
x
+
tgx
+
warp*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
+
tgx
+
warp*PADDED_NUM_ATOMS
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment