Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
95e059dc
"wrappers/python/vscode:/vscode.git/clone" did not exist on "4ad1af2a6d1c489c91d15e9267ae92530d6f41d1"
Commit
95e059dc
authored
Oct 28, 2009
by
Peter Eastman
Browse files
Created OpenCL implementation of GBSAOBCForce that works with non-Nvidia processors
parent
f0cab877
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
516 additions
and
180 deletions
+516
-180
platforms/opencl/src/OpenCLKernels.cpp
platforms/opencl/src/OpenCLKernels.cpp
+32
-44
platforms/opencl/src/OpenCLKernels.h
platforms/opencl/src/OpenCLKernels.h
+1
-1
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
+8
-0
platforms/opencl/src/kernels/gbsaObcReductions.cl
platforms/opencl/src/kernels/gbsaObcReductions.cl
+67
-0
platforms/opencl/src/kernels/gbsaObc_default.cl
platforms/opencl/src/kernels/gbsaObc_default.cl
+340
-0
platforms/opencl/src/kernels/gbsaObc_nvidia.cl
platforms/opencl/src/kernels/gbsaObc_nvidia.cl
+49
-116
platforms/opencl/src/kernels/nonbonded_nvidia.cl
platforms/opencl/src/kernels/nonbonded_nvidia.cl
+19
-19
No files found.
platforms/opencl/src/OpenCLKernels.cpp
View file @
95e059dc
...
...
@@ -853,28 +853,15 @@ void OpenCLCalcGBSAOBCForceKernel::executeForces(ContextImpl& context) {
defines
[
"USE_CUTOFF"
]
=
"1"
;
if
(
nb
.
getUsePeriodic
())
defines
[
"USE_PERIODIC"
]
=
"1"
;
stringstream
xsize
,
ysize
,
zsize
,
cutoffSquared
,
prefac
;
xsize
.
precision
(
8
);
ysize
.
precision
(
8
);
zsize
.
precision
(
8
);
cutoffSquared
.
precision
(
8
);
prefac
.
precision
(
8
);
xsize
<<
scientific
<<
nb
.
getPeriodicBoxSize
().
x
<<
"f"
;
ysize
<<
scientific
<<
nb
.
getPeriodicBoxSize
().
y
<<
"f"
;
zsize
<<
scientific
<<
nb
.
getPeriodicBoxSize
().
z
<<
"f"
;
cutoffSquared
<<
scientific
<<
(
nb
.
getCutoffDistance
()
*
nb
.
getCutoffDistance
())
<<
"f"
;
prefac
<<
scientific
<<
prefactor
<<
"f"
;
defines
[
"PERIODIC_BOX_SIZE_X"
]
=
xsize
.
str
();
defines
[
"PERIODIC_BOX_SIZE_Y"
]
=
ysize
.
str
();
defines
[
"PERIODIC_BOX_SIZE_Z"
]
=
zsize
.
str
();
defines
[
"CUTOFF_SQUARED"
]
=
cutoffSquared
.
str
();
defines
[
"PREFACTOR"
]
=
prefac
.
str
();
stringstream
natom
,
padded
;
natom
<<
cl
.
getNumAtoms
();
padded
<<
cl
.
getPaddedNumAtoms
();
defines
[
"NUM_ATOMS"
]
=
natom
.
str
();
defines
[
"PADDED_NUM_ATOMS"
]
=
padded
.
str
();
cl
::
Program
program
=
cl
.
createProgram
(
cl
.
loadSourceFromFile
(
"gbsaObc.cl"
),
defines
);
defines
[
"PERIODIC_BOX_SIZE_X"
]
=
doubleToString
(
nb
.
getPeriodicBoxSize
().
x
);
defines
[
"PERIODIC_BOX_SIZE_Y"
]
=
doubleToString
(
nb
.
getPeriodicBoxSize
().
y
);
defines
[
"PERIODIC_BOX_SIZE_Z"
]
=
doubleToString
(
nb
.
getPeriodicBoxSize
().
z
);
defines
[
"CUTOFF_SQUARED"
]
=
doubleToString
(
nb
.
getCutoffDistance
()
*
nb
.
getCutoffDistance
());
defines
[
"PREFACTOR"
]
=
doubleToString
(
prefactor
);
defines
[
"NUM_ATOMS"
]
=
intToString
(
cl
.
getNumAtoms
());
defines
[
"PADDED_NUM_ATOMS"
]
=
intToString
(
cl
.
getPaddedNumAtoms
());
string
filename
=
(
cl
.
getSIMDWidth
()
==
32
?
"gbsaObc_nvidia.cl"
:
"gbsaObc_default.cl"
);
cl
::
Program
program
=
cl
.
createProgram
(
cl
.
loadSourceFromFile
(
filename
),
defines
);
computeBornSumKernel
=
cl
::
Kernel
(
program
,
"computeBornSum"
);
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
bornSum
->
getDeviceBuffer
());
computeBornSumKernel
.
setArg
(
1
,
OpenCLContext
::
ThreadBlockSize
*
sizeof
(
cl_float
),
NULL
);
...
...
@@ -882,26 +869,16 @@ void OpenCLCalcGBSAOBCForceKernel::executeForces(ContextImpl& context) {
computeBornSumKernel
.
setArg
(
3
,
OpenCLContext
::
ThreadBlockSize
*
sizeof
(
cl_float4
),
NULL
);
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
4
,
params
->
getDeviceBuffer
());
computeBornSumKernel
.
setArg
(
5
,
OpenCLContext
::
ThreadBlockSize
*
sizeof
(
cl_float2
),
NULL
);
computeBornSumKernel
.
setArg
(
6
,
OpenCLContext
::
ThreadBlockSize
*
sizeof
(
cl_float
),
NULL
);
if
(
nb
.
getUseCutoff
())
{
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
6
,
nb
.
getInteractingTiles
().
getDeviceBuffer
());
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
7
,
nb
.
getInteractionFlags
().
getDeviceBuffer
());
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
8
,
nb
.
getInteractionCount
().
getDeviceBuffer
());
computeBornSumKernel
.
setArg
(
9
,
OpenCLContext
::
ThreadBlockSize
*
sizeof
(
cl_float
),
NULL
);
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
7
,
nb
.
getInteractingTiles
().
getDeviceBuffer
());
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
8
,
nb
.
getInteractionFlags
().
getDeviceBuffer
());
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
9
,
nb
.
getInteractionCount
().
getDeviceBuffer
());
}
else
{
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
6
,
nb
.
getTiles
().
getDeviceBuffer
());
computeBornSumKernel
.
setArg
<
cl_uint
>
(
7
,
nb
.
getTiles
().
getSize
());
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
7
,
nb
.
getTiles
().
getDeviceBuffer
());
computeBornSumKernel
.
setArg
<
cl_uint
>
(
8
,
nb
.
getTiles
().
getSize
());
}
reduceBornSumKernel
=
cl
::
Kernel
(
program
,
"reduceBornSum"
);
reduceBornSumKernel
.
setArg
<
cl_int
>
(
0
,
cl
.
getPaddedNumAtoms
());
reduceBornSumKernel
.
setArg
<
cl_int
>
(
1
,
cl
.
getNumForceBuffers
());
reduceBornSumKernel
.
setArg
<
cl_float
>
(
2
,
1.0
f
);
reduceBornSumKernel
.
setArg
<
cl_float
>
(
3
,
0.8
f
);
reduceBornSumKernel
.
setArg
<
cl_float
>
(
4
,
4.85
f
);
reduceBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
5
,
bornSum
->
getDeviceBuffer
());
reduceBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
6
,
params
->
getDeviceBuffer
());
reduceBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
7
,
bornRadii
->
getDeviceBuffer
());
reduceBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
8
,
obcChain
->
getDeviceBuffer
());
force1Kernel
=
cl
::
Kernel
(
program
,
"computeGBSAForce1"
);
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
0
,
cl
.
getForceBuffers
().
getDeviceBuffer
());
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
1
,
cl
.
getEnergyBuffer
().
getDeviceBuffer
());
...
...
@@ -912,16 +889,27 @@ void OpenCLCalcGBSAOBCForceKernel::executeForces(ContextImpl& context) {
force1Kernel
.
setArg
(
6
,
OpenCLContext
::
ThreadBlockSize
*
sizeof
(
cl_float
),
NULL
);
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
7
,
bornForce
->
getDeviceBuffer
());
force1Kernel
.
setArg
(
8
,
OpenCLContext
::
ThreadBlockSize
*
sizeof
(
cl_float
),
NULL
);
force1Kernel
.
setArg
(
9
,
OpenCLContext
::
ThreadBlockSize
*
sizeof
(
mm_float4
),
NULL
);
if
(
nb
.
getUseCutoff
())
{
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
9
,
nb
.
getInteractingTiles
().
getDeviceBuffer
());
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
10
,
nb
.
getInteractionFlags
().
getDeviceBuffer
());
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
11
,
nb
.
getInteractionCount
().
getDeviceBuffer
());
force1Kernel
.
setArg
(
12
,
OpenCLContext
::
ThreadBlockSize
*
sizeof
(
mm_float4
),
NULL
);
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
10
,
nb
.
getInteractingTiles
().
getDeviceBuffer
());
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
11
,
nb
.
getInteractionFlags
().
getDeviceBuffer
());
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
12
,
nb
.
getInteractionCount
().
getDeviceBuffer
());
}
else
{
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
9
,
nb
.
getTiles
().
getDeviceBuffer
());
force1Kernel
.
setArg
<
cl_uint
>
(
1
0
,
nb
.
getTiles
().
getSize
());
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
10
,
nb
.
getTiles
().
getDeviceBuffer
());
force1Kernel
.
setArg
<
cl_uint
>
(
1
1
,
nb
.
getTiles
().
getSize
());
}
program
=
cl
.
createProgram
(
cl
.
loadSourceFromFile
(
"gbsaObcReductions.cl"
),
defines
);
reduceBornSumKernel
=
cl
::
Kernel
(
program
,
"reduceBornSum"
);
reduceBornSumKernel
.
setArg
<
cl_int
>
(
0
,
cl
.
getPaddedNumAtoms
());
reduceBornSumKernel
.
setArg
<
cl_int
>
(
1
,
cl
.
getNumForceBuffers
());
reduceBornSumKernel
.
setArg
<
cl_float
>
(
2
,
1.0
f
);
reduceBornSumKernel
.
setArg
<
cl_float
>
(
3
,
0.8
f
);
reduceBornSumKernel
.
setArg
<
cl_float
>
(
4
,
4.85
f
);
reduceBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
5
,
bornSum
->
getDeviceBuffer
());
reduceBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
6
,
params
->
getDeviceBuffer
());
reduceBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
7
,
bornRadii
->
getDeviceBuffer
());
reduceBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
8
,
obcChain
->
getDeviceBuffer
());
reduceBornForceKernel
=
cl
::
Kernel
(
program
,
"reduceBornForce"
);
reduceBornForceKernel
.
setArg
<
cl_int
>
(
0
,
cl
.
getPaddedNumAtoms
());
reduceBornForceKernel
.
setArg
<
cl_int
>
(
1
,
cl
.
getNumForceBuffers
());
...
...
platforms/opencl/src/OpenCLKernels.h
View file @
95e059dc
...
...
@@ -300,7 +300,7 @@ private:
class
OpenCLCalcNonbondedForceKernel
:
public
CalcNonbondedForceKernel
{
public:
OpenCLCalcNonbondedForceKernel
(
std
::
string
name
,
const
Platform
&
platform
,
OpenCLContext
&
cl
,
System
&
system
)
:
CalcNonbondedForceKernel
(
name
,
platform
),
cl
(
cl
),
exceptionParams
(
NULL
),
exceptionIndices
(
NULL
)
{
sigmaEpsilon
(
NULL
),
exceptionParams
(
NULL
),
exceptionIndices
(
NULL
)
,
cosSinSums
(
NULL
)
{
}
~
OpenCLCalcNonbondedForceKernel
();
/**
...
...
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
View file @
95e059dc
...
...
@@ -107,6 +107,14 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
if
(
cutoff
==
-
1.0
)
return
;
// There are no nonbonded interactions in the System.
if
(
atomExclusions
.
size
()
==
0
)
{
// No exclusions were specifically requested, so just mark every atom as not interacting with itself.
atomExclusions
.
resize
(
context
.
getNumAtoms
());
for
(
int
i
=
0
;
i
<
atomExclusions
.
size
();
i
++
)
atomExclusions
[
i
].
push_back
(
i
);
}
// Create the list of tiles.
int
numAtomBlocks
=
context
.
getNumAtomBlocks
();
...
...
platforms/opencl/src/kernels/gbsaObcReductions.cl
0 → 100644
View file @
95e059dc
const
float
dielectricOffset
=
0.009f
;
const
float
probeRadius
=
0.14f
;
const
float
surfaceAreaFactor
=
-6.0f*3.14159265358979323846f*0.0216f*1000.0f*0.4184f
;
/**
*
Reduce
the
Born
sums
to
compute
the
Born
radii.
*/
__kernel
void
reduceBornSum
(
int
bufferSize,
int
numBuffers,
float
alpha,
float
beta,
float
gamma,
__global
float*
bornSum,
__global
float2*
params,
__global
float*
bornRadii,
__global
float*
obcChain
)
{
unsigned
int
index
=
get_global_id
(
0
)
;
while
(
index
<
NUM_ATOMS
)
{
//
Get
summed
Born
data
int
totalSize
=
bufferSize*numBuffers
;
float
sum
=
bornSum[index]
;
for
(
int
i
=
index+bufferSize
; i < totalSize; i += bufferSize)
sum
+=
bornSum[i]
;
//
Now
calculate
Born
radius
and
OBC
term.
float
offsetRadius
=
params[index].x
;
sum
*=
0.5f*offsetRadius
;
float
sum2
=
sum*sum
;
float
sum3
=
sum*sum2
;
float
tanhSum
=
tanh
(
alpha*sum
-
beta*sum2
+
gamma*sum3
)
;
float
nonOffsetRadius
=
offsetRadius
+
dielectricOffset
;
float
radius
=
1.0f/
(
1.0f/offsetRadius
-
tanhSum/nonOffsetRadius
)
;
float
chain
=
offsetRadius*
(
alpha
-
2.0f*beta*sum
+
3.0f*gamma*sum2
)
;
chain
=
(
1.0f-tanhSum*tanhSum
)
*chain
/
nonOffsetRadius
;
bornRadii[index]
=
radius
;
obcChain[index]
=
chain
;
index
+=
get_global_size
(
0
)
;
}
}
/**
*
Reduce
the
Born
force.
*/
__kernel
void
reduceBornForce
(
int
bufferSize,
int
numBuffers,
__global
float*
bornForce,
__global
float*
energyBuffer,
__global
float2*
params,
__global
float*
bornRadii,
__global
float*
obcChain
)
{
float
energy
=
0.0f
;
unsigned
int
index
=
get_global_id
(
0
)
;
while
(
index
<
NUM_ATOMS
)
{
//
Sum
the
Born
force
int
totalSize
=
bufferSize*numBuffers
;
float
force
=
bornForce[index]
;
for
(
int
i
=
index+bufferSize
; i < totalSize; i += bufferSize)
force
+=
bornForce[i]
;
//
Now
calculate
the
actual
force
float
offsetRadius
=
params[index].x
;
float
bornRadius
=
bornRadii[index]
;
float
r
=
offsetRadius+dielectricOffset+probeRadius
;
float
ratio6
=
pow
((
offsetRadius+dielectricOffset
)
/bornRadius,
6.0f
)
;
float
saTerm
=
surfaceAreaFactor*r*r*ratio6
;
force
+=
saTerm/bornRadius
;
energy
+=
saTerm
;
force
*=
bornRadius*bornRadius*obcChain[index]
;
bornForce[index]
=
force
;
index
+=
get_global_size
(
0
)
;
}
energyBuffer[get_global_id
(
0
)
]
+=
energy/-6.0f
;
}
\ No newline at end of file
platforms/opencl/src/kernels/gbsaObc_default.cl
0 → 100644
View file @
95e059dc
#
define
TILE_SIZE
32
/**
*
Compute
the
Born
sum.
*/
__kernel
void
computeBornSum
(
__global
float*
global_bornSum,
__local
float*
local_bornSum,
__global
float4*
posq,
__local
float4*
local_posq,
__global
float2*
global_params,
__local
float2*
local_params,
__local
float*
tempBuffer,
__global
unsigned
int*
tiles,
#
ifdef
USE_CUTOFF
__global
unsigned
int*
interactionFlags,
__global
unsigned
int*
interactionCount
)
{
#
else
unsigned
int
numTiles
)
{
#
endif
#
ifdef
USE_CUTOFF
unsigned
int
numTiles
=
interactionCount[0]
;
#
endif
unsigned
int
pos
=
get_group_id
(
0
)
*numTiles/get_num_groups
(
0
)
;
unsigned
int
end
=
(
get_group_id
(
0
)
+1
)
*numTiles/get_num_groups
(
0
)
;
float
energy
=
0.0f
;
unsigned
int
lasty
=
0xFFFFFFFF
;
while
(
pos
<
end
)
{
//
Extract
the
coordinates
of
this
tile
unsigned
int
x
=
tiles[pos]
;
unsigned
int
y
=
((
x
>>
2
)
&
0x7fff
)
*TILE_SIZE
;
x
=
(
x>>17
)
*TILE_SIZE
;
unsigned
int
baseLocalAtom
=
(
get_local_id
(
0
)
<
TILE_SIZE
?
0
:
TILE_SIZE/2
)
;
unsigned
int
tgx
=
get_local_id
(
0
)
&
(
TILE_SIZE-1
)
;
unsigned
int
forceBufferOffset
=
(
tgx
<
TILE_SIZE/2
?
0
:
TILE_SIZE
)
;
unsigned
int
atom1
=
x
+
tgx
;
float
bornSum
=
0.0f
;
float4
posq1
=
posq[atom1]
;
float2
params1
=
global_params[atom1]
;
if
(
x
==
y
)
{
//
This
tile
is
on
the
diagonal.
local_posq[get_local_id
(
0
)
]
=
posq1
;
local_params[get_local_id
(
0
)
]
=
params1
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
unsigned
int
xi
=
x/TILE_SIZE
;
unsigned
int
tile
=
xi+xi*PADDED_NUM_ATOMS/TILE_SIZE-xi*
(
xi+1
)
/2
;
for
(
unsigned
int
j
=
0
; j < TILE_SIZE/2; j++) {
float4
delta
=
(
float4
)
(
local_posq[baseLocalAtom+j].xyz
-
posq1.xyz,
0.0f
)
;
#
ifdef
USE_PERIODIC
delta.x
-=
floor
(
delta.x/PERIODIC_BOX_SIZE_X+0.5f
)
*PERIODIC_BOX_SIZE_X
;
delta.y
-=
floor
(
delta.y/PERIODIC_BOX_SIZE_Y+0.5f
)
*PERIODIC_BOX_SIZE_Y
;
delta.z
-=
floor
(
delta.z/PERIODIC_BOX_SIZE_Z+0.5f
)
*PERIODIC_BOX_SIZE_Z
;
#
endif
float
r2
=
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
;
#
ifdef
USE_CUTOFF
if
(
atom1
<
NUM_ATOMS
&&
y+baseLocalAtom+j
<
NUM_ATOMS
&&
r2
<
CUTOFF_SQUARED
)
{
#
else
if
(
atom1
<
NUM_ATOMS
&&
y+baseLocalAtom+j
<
NUM_ATOMS
)
{
#
endif
float
r
=
sqrt
(
r2
)
;
float
invR
=
1.0f/r
;
float2
params2
=
local_params[baseLocalAtom+j]
;
float
rScaledRadiusJ
=
r+params2.y
;
if
((
j
!=
tgx
)
&&
(
params1.x
<
rScaledRadiusJ
))
{
float
l_ij
=
1.0f/max
(
params1.x,
fabs
(
r-params2.y
))
;
float
u_ij
=
1.0f/rScaledRadiusJ
;
float
l_ij2
=
l_ij*l_ij
;
float
u_ij2
=
u_ij*u_ij
;
float
ratio
=
log
(
u_ij
/
l_ij
)
;
bornSum
+=
l_ij
-
u_ij
+
0.25f*r*
(
u_ij2-l_ij2
)
+
(
0.50f*invR*ratio
)
+
(
0.25f*params2.y*params2.y*invR
)
*
(
l_ij2-u_ij2
)
;
if
(
params1.x
<
params2.x-r
)
bornSum
+=
2.0f*
(
1.0f/params1.x-l_ij
)
;
}
}
}
//
Sum
the
forces
and
write
results.
if
(
get_local_id
(
0
)
>=
TILE_SIZE
)
tempBuffer[get_local_id
(
0
)
]
=
bornSum
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
get_local_id
(
0
)
<
TILE_SIZE
)
{
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
unsigned
int
offset
=
x
+
tgx
+
(
x/TILE_SIZE
)
*PADDED_NUM_ATOMS
;
#
else
unsigned
int
offset
=
x
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
#
endif
global_bornSum[offset]
+=
bornSum+tempBuffer[get_local_id
(
0
)
+TILE_SIZE]
;
}
}
else
{
//
This
is
an
off-diagonal
tile.
if
(
lasty
!=
y
&&
get_local_id
(
0
)
<
TILE_SIZE
)
{
unsigned
int
j
=
y
+
tgx
;
local_posq[get_local_id
(
0
)
]
=
posq[j]
;
local_params[get_local_id
(
0
)
]
=
global_params[j]
;
}
local_bornSum[get_local_id
(
0
)
]
=
0.0f
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
//
Compute
the
full
set
of
interactions
in
this
tile.
unsigned
int
xi
=
x/TILE_SIZE
;
unsigned
int
yi
=
y/TILE_SIZE
;
unsigned
int
tile
=
xi+yi*PADDED_NUM_ATOMS/TILE_SIZE-yi*
(
yi+1
)
/2
;
unsigned
int
tj
=
tgx%
(
TILE_SIZE/2
)
;
for
(
unsigned
int
j
=
0
; j < TILE_SIZE/2; j++) {
float4
delta
=
(
float4
)
(
local_posq[baseLocalAtom+tj].xyz
-
posq1.xyz,
0.0f
)
;
#
ifdef
USE_PERIODIC
delta.x
-=
floor
(
delta.x/PERIODIC_BOX_SIZE_X+0.5f
)
*PERIODIC_BOX_SIZE_X
;
delta.y
-=
floor
(
delta.y/PERIODIC_BOX_SIZE_Y+0.5f
)
*PERIODIC_BOX_SIZE_Y
;
delta.z
-=
floor
(
delta.z/PERIODIC_BOX_SIZE_Z+0.5f
)
*PERIODIC_BOX_SIZE_Z
;
#
endif
float
r2
=
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
;
#
ifdef
USE_CUTOFF
if
(
atom1
<
NUM_ATOMS
&&
y+baseLocalAtom+tj
<
NUM_ATOMS
&&
r2
<
CUTOFF_SQUARED
)
{
#
else
if
(
atom1
<
NUM_ATOMS
&&
y+baseLocalAtom+tj
<
NUM_ATOMS
)
{
#
endif
float
r
=
sqrt
(
r2
)
;
float
invR
=
1.0f/r
;
float2
params2
=
local_params[baseLocalAtom+tj]
;
float
rScaledRadiusJ
=
r+params2.y
;
if
(
params1.x
<
rScaledRadiusJ
)
{
float
l_ij
=
1.0f/max
(
params1.x,
fabs
(
r-params2.y
))
;
float
u_ij
=
1.0f/rScaledRadiusJ
;
float
l_ij2
=
l_ij*l_ij
;
float
u_ij2
=
u_ij*u_ij
;
float
ratio
=
log
(
u_ij
/
l_ij
)
;
bornSum
+=
l_ij
-
u_ij
+
0.25f*r*
(
u_ij2-l_ij2
)
+
(
0.50f*invR*ratio
)
+
(
0.25f*params2.y*params2.y*invR
)
*
(
l_ij2-u_ij2
)
;
if
(
params1.x
<
params2.x-r
)
bornSum
+=
2.0f*
(
1.0f/params1.x-l_ij
)
;
}
float
rScaledRadiusI
=
r+params1.y
;
if
(
params2.x
<
rScaledRadiusJ
)
{
float
l_ij
=
1.0f/max
(
params2.x,
fabs
(
r-params1.y
))
;
float
u_ij
=
1.0f/rScaledRadiusJ
;
float
l_ij2
=
l_ij*l_ij
;
float
u_ij2
=
u_ij*u_ij
;
float
ratio
=
log
(
u_ij
/
l_ij
)
;
float
term
=
l_ij
-
u_ij
+
0.25f*r*
(
u_ij2-l_ij2
)
+
(
0.50f*invR*ratio
)
+
(
0.25f*params1.y*params1.y*invR
)
*
(
l_ij2-u_ij2
)
;
if
(
params2.x
<
params1.x-r
)
term
+=
2.0f*
(
1.0f/params2.x-l_ij
)
;
local_bornSum[baseLocalAtom+tj+forceBufferOffset]
+=
term
;
}
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
tj
=
(
tj+1
)
%
(
TILE_SIZE/2
)
;
}
//
Sum
the
forces
and
write
results.
if
(
get_local_id
(
0
)
>=
TILE_SIZE
)
tempBuffer[get_local_id
(
0
)
]
=
bornSum
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
get_local_id
(
0
)
<
TILE_SIZE
)
{
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
unsigned
int
offset1
=
x
+
tgx
+
(
y/TILE_SIZE
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
+
tgx
+
(
x/TILE_SIZE
)
*PADDED_NUM_ATOMS
;
#
else
unsigned
int
offset1
=
x
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
#
endif
global_bornSum[offset1]
+=
bornSum+tempBuffer[get_local_id
(
0
)
+TILE_SIZE]
;
global_bornSum[offset2]
+=
local_bornSum[get_local_id
(
0
)
]+local_bornSum[get_local_id
(
0
)
+TILE_SIZE]
;
}
lasty
=
y
;
}
pos++
;
}
}
/**
*
First
part
of
computing
the
GBSA
interaction.
*/
__kernel
void
computeGBSAForce1
(
__global
float4*
forceBuffers,
__global
float*
energyBuffer,
__global
float4*
posq,
__local
float4*
local_posq,
__local
float4*
local_force,
__global
float*
global_bornRadii,
__local
float*
local_bornRadii,
__global
float*
global_bornForce,
__local
float*
local_bornForce,
__local
float4*
tempBuffer,
__global
unsigned
int*
tiles,
#
ifdef
USE_CUTOFF
__global
unsigned
int*
interactionFlags,
__global
unsigned
int*
interactionCount
)
{
#
else
unsigned
int
numTiles
)
{
#
endif
#
ifdef
USE_CUTOFF
unsigned
int
numTiles
=
interactionCount[0]
;
#
endif
unsigned
int
pos
=
get_group_id
(
0
)
*numTiles/get_num_groups
(
0
)
;
unsigned
int
end
=
(
get_group_id
(
0
)
+1
)
*numTiles/get_num_groups
(
0
)
;
float
energy
=
0.0f
;
unsigned
int
lasty
=
0xFFFFFFFF
;
while
(
pos
<
end
)
{
//
Extract
the
coordinates
of
this
tile
unsigned
int
x
=
tiles[pos]
;
unsigned
int
y
=
((
x
>>
2
)
&
0x7fff
)
*TILE_SIZE
;
x
=
(
x>>17
)
*TILE_SIZE
;
unsigned
int
baseLocalAtom
=
(
get_local_id
(
0
)
<
TILE_SIZE
?
0
:
TILE_SIZE/2
)
;
unsigned
int
tgx
=
get_local_id
(
0
)
&
(
TILE_SIZE-1
)
;
unsigned
int
forceBufferOffset
=
(
tgx
<
TILE_SIZE/2
?
0
:
TILE_SIZE
)
;
unsigned
int
atom1
=
x
+
tgx
;
float4
force
=
0.0f
;
float4
posq1
=
posq[atom1]
;
float
bornRadius1
=
global_bornRadii[atom1]
;
if
(
x
==
y
)
{
//
This
tile
is
on
the
diagonal.
local_posq[get_local_id
(
0
)
]
=
posq1
;
local_bornRadii[get_local_id
(
0
)
]
=
bornRadius1
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
unsigned
int
xi
=
x/TILE_SIZE
;
unsigned
int
tile
=
xi+xi*PADDED_NUM_ATOMS/TILE_SIZE-xi*
(
xi+1
)
/2
;
for
(
unsigned
int
j
=
0
; j < TILE_SIZE/2; j++) {
if
(
atom1
<
NUM_ATOMS
&&
y+baseLocalAtom+j
<
NUM_ATOMS
)
{
float4
posq2
=
local_posq[baseLocalAtom+j]
;
float4
delta
=
(
float4
)
(
posq2.xyz
-
posq1.xyz,
0.0f
)
;
#
ifdef
USE_PERIODIC
delta.x
-=
floor
(
delta.x/PERIODIC_BOX_SIZE_X+0.5f
)
*PERIODIC_BOX_SIZE_X
;
delta.y
-=
floor
(
delta.y/PERIODIC_BOX_SIZE_Y+0.5f
)
*PERIODIC_BOX_SIZE_Y
;
delta.z
-=
floor
(
delta.z/PERIODIC_BOX_SIZE_Z+0.5f
)
*PERIODIC_BOX_SIZE_Z
;
#
endif
float
r2
=
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
;
float
r
=
sqrt
(
r2
)
;
float
invR
=
1.0f/r
;
float
bornRadius2
=
local_bornRadii[baseLocalAtom+j]
;
float
alpha2_ij
=
bornRadius1*bornRadius2
;
float
D_ij
=
r2/
(
4.0f*alpha2_ij
)
;
float
expTerm
=
exp
(
-D_ij
)
;
float
denominator2
=
r2
+
alpha2_ij*expTerm
;
float
denominator
=
sqrt
(
denominator2
)
;
float
tempEnergy
=
(
PREFACTOR*posq1.w*posq2.w
)
/denominator
;
float
Gpol
=
tempEnergy/denominator2
;
float
dGpol_dalpha2_ij
=
-0.5f*Gpol*expTerm*
(
1.0f+D_ij
)
;
force.w
+=
dGpol_dalpha2_ij*bornRadius2
;
float
dEdR
=
Gpol*
(
1.0f
-
0.25f*expTerm
)
;
#
ifdef
USE_CUTOFF
if
(
r2
>
CUTOFF_SQUARED
)
{
dEdR
=
0.0f
;
tempEnergy
=
0.0f
;
}
#
endif
energy
+=
0.5f*tempEnergy
;
delta.xyz
*=
dEdR
;
force.xyz
-=
delta.xyz
;
}
}
//
Sum
the
forces
and
write
results.
if
(
get_local_id
(
0
)
>=
TILE_SIZE
)
tempBuffer[get_local_id
(
0
)
]
=
force
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
get_local_id
(
0
)
<
TILE_SIZE
)
{
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
unsigned
int
offset
=
x
+
tgx
+
(
x/TILE_SIZE
)
*PADDED_NUM_ATOMS
;
#
else
unsigned
int
offset
=
x
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
#
endif
forceBuffers[offset].xyz
=
forceBuffers[offset].xyz+force.xyz+tempBuffer[get_local_id
(
0
)
+TILE_SIZE].xyz
;
global_bornForce[offset]
+=
force.w+tempBuffer[get_local_id
(
0
)
+TILE_SIZE].w
;
}
}
else
{
//
This
is
an
off-diagonal
tile.
if
(
lasty
!=
y
&&
get_local_id
(
0
)
<
TILE_SIZE
)
{
unsigned
int
j
=
y
+
tgx
;
local_posq[get_local_id
(
0
)
]
=
posq[j]
;
local_bornRadii[get_local_id
(
0
)
]
=
global_bornRadii[j]
;
}
local_force[get_local_id
(
0
)
]
=
0.0f
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
//
Compute
the
full
set
of
interactions
in
this
tile.
unsigned
int
xi
=
x/TILE_SIZE
;
unsigned
int
yi
=
y/TILE_SIZE
;
unsigned
int
tile
=
xi+yi*PADDED_NUM_ATOMS/TILE_SIZE-yi*
(
yi+1
)
/2
;
unsigned
int
tj
=
tgx%
(
TILE_SIZE/2
)
;
for
(
unsigned
int
j
=
0
; j < TILE_SIZE/2; j++) {
if
(
atom1
<
NUM_ATOMS
&&
y+baseLocalAtom+tj
<
NUM_ATOMS
)
{
float4
posq2
=
local_posq[baseLocalAtom+tj]
;
float4
delta
=
(
float4
)
(
posq2.xyz
-
posq1.xyz,
0.0f
)
;
#
ifdef
USE_PERIODIC
delta.x
-=
floor
(
delta.x/PERIODIC_BOX_SIZE_X+0.5f
)
*PERIODIC_BOX_SIZE_X
;
delta.y
-=
floor
(
delta.y/PERIODIC_BOX_SIZE_Y+0.5f
)
*PERIODIC_BOX_SIZE_Y
;
delta.z
-=
floor
(
delta.z/PERIODIC_BOX_SIZE_Z+0.5f
)
*PERIODIC_BOX_SIZE_Z
;
#
endif
float
r2
=
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
;
float
r
=
sqrt
(
r2
)
;
float
invR
=
1.0f/r
;
float
bornRadius2
=
local_bornRadii[baseLocalAtom+tj]
;
float
alpha2_ij
=
bornRadius1*bornRadius2
;
float
D_ij
=
r2/
(
4.0f*alpha2_ij
)
;
float
expTerm
=
exp
(
-D_ij
)
;
float
denominator2
=
r2
+
alpha2_ij*expTerm
;
float
denominator
=
sqrt
(
denominator2
)
;
float
tempEnergy
=
(
PREFACTOR*posq1.w*posq2.w
)
/denominator
;
float
Gpol
=
tempEnergy/denominator2
;
float
dGpol_dalpha2_ij
=
-0.5f*Gpol*expTerm*
(
1.0f+D_ij
)
;
force.w
+=
dGpol_dalpha2_ij*bornRadius2
;
float
dEdR
=
Gpol*
(
1.0f
-
0.25f*expTerm
)
;
#
ifdef
USE_CUTOFF
if
(
r2
>
CUTOFF_SQUARED
)
{
dEdR
=
0.0f
;
tempEnergy
=
0.0f
;
}
#
endif
energy
+=
tempEnergy
;
delta.xyz
*=
dEdR
;
force.xyz
-=
delta.xyz
;
local_force[baseLocalAtom+tj+forceBufferOffset]
+=
(
float4
)
(
delta.xyz,
dGpol_dalpha2_ij*bornRadius1
)
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
tj
=
(
tj+1
)
%
(
TILE_SIZE/2
)
;
}
//
Sum
the
forces
and
write
results.
if
(
get_local_id
(
0
)
>=
TILE_SIZE
)
tempBuffer[get_local_id
(
0
)
]
=
force
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
get_local_id
(
0
)
<
TILE_SIZE
)
{
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
unsigned
int
offset1
=
x
+
tgx
+
(
y/TILE_SIZE
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
+
tgx
+
(
x/TILE_SIZE
)
*PADDED_NUM_ATOMS
;
#
else
unsigned
int
offset1
=
x
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
#
endif
forceBuffers[offset1].xyz
=
forceBuffers[offset1].xyz+force.xyz+tempBuffer[get_local_id
(
0
)
+TILE_SIZE].xyz
;
forceBuffers[offset2].xyz
=
forceBuffers[offset2].xyz+local_force[get_local_id
(
0
)
].xyz+local_force[get_local_id
(
0
)
+TILE_SIZE].xyz
;
global_bornForce[offset1]
+=
force.w+tempBuffer[get_local_id
(
0
)
+TILE_SIZE].w
;
global_bornForce[offset2]
+=
local_force[get_local_id
(
0
)
].w+local_force[get_local_id
(
0
)
+TILE_SIZE].w
;
}
lasty
=
y
;
}
pos++
;
}
energyBuffer[get_global_id
(
0
)
]
+=
energy
;
}
platforms/opencl/src/kernels/gbsaObc.cl
→
platforms/opencl/src/kernels/gbsaObc
_nvidia
.cl
View file @
95e059dc
const
unsigned
int
TileSize
=
32
;
const
float
dielectricOffset
=
0.009f
;
const
float
probeRadius
=
0.14f
;
const
float
surfaceAreaFactor
=
-6.0f*3.14159265358979323846f*0.0216f*1000.0f*0.4184f
;
#
define
TILE_SIZE
32
/**
*
Compute
the
Born
sum.
*/
__kernel
void
computeBornSum
(
__global
float*
global_bornSum,
__local
float*
local_bornSum,
__global
float4*
posq,
__local
float4*
local_posq,
__global
float2*
global_params,
__local
float2*
local_params,
__global
unsigned
int*
tiles,
__local
float4*
local_posq,
__global
float2*
global_params,
__local
float2*
local_params,
__local
float*
tempBuffer,
__global
unsigned
int*
tiles,
#
ifdef
USE_CUTOFF
__global
unsigned
int*
interactionFlags,
__global
unsigned
int*
interactionCount
,
__local
float*
tempBuffer
)
{
__global
unsigned
int*
interactionFlags,
__global
unsigned
int*
interactionCount
)
{
#
else
unsigned
int
numTiles
)
{
#
endif
#
ifdef
USE_CUTOFF
unsigned
int
numTiles
=
interactionCount[0]
;
#
endif
unsigned
int
totalWarps
=
get_global_size
(
0
)
/T
ileSize
;
unsigned
int
warp
=
get_global_id
(
0
)
/T
ileSize
;
unsigned
int
totalWarps
=
get_global_size
(
0
)
/T
ILE_SIZE
;
unsigned
int
warp
=
get_global_id
(
0
)
/T
ILE_SIZE
;
unsigned
int
pos
=
warp*numTiles/totalWarps
;
unsigned
int
end
=
(
warp+1
)
*numTiles/totalWarps
;
float
energy
=
0.0f
;
...
...
@@ -27,11 +24,10 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca
while
(
pos
<
end
)
{
//
Extract
the
coordinates
of
this
tile
unsigned
int
x
=
tiles[pos]
;
unsigned
int
y
=
((
x
>>
2
)
&
0x7fff
)
*T
ileSize
;
x
=
(
x>>17
)
*T
ileSize
;
unsigned
int
tgx
=
get_local_id
(
0
)
&
(
T
ileSize
-1
)
;
unsigned
int
y
=
((
x
>>
2
)
&
0x7fff
)
*T
ILE_SIZE
;
x
=
(
x>>17
)
*T
ILE_SIZE
;
unsigned
int
tgx
=
get_local_id
(
0
)
&
(
T
ILE_SIZE
-1
)
;
unsigned
int
tbx
=
get_local_id
(
0
)
-
tgx
;
unsigned
int
tj
=
tgx
;
unsigned
int
atom1
=
x
+
tgx
;
float
bornSum
=
0.0f
;
float4
posq1
=
posq[atom1]
;
...
...
@@ -41,9 +37,9 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca
local_posq[get_local_id
(
0
)
]
=
posq1
;
local_params[get_local_id
(
0
)
]
=
params1
;
unsigned
int
xi
=
x/T
ileSize
;
unsigned
int
tile
=
xi+xi*PADDED_NUM_ATOMS/T
ileSize
-xi*
(
xi+1
)
/2
;
for
(
unsigned
int
j
=
0
; j < T
ileSize
; j++) {
unsigned
int
xi
=
x/T
ILE_SIZE
;
unsigned
int
tile
=
xi+xi*PADDED_NUM_ATOMS/T
ILE_SIZE
-xi*
(
xi+1
)
/2
;
for
(
unsigned
int
j
=
0
; j < T
ILE_SIZE
; j++) {
float4
delta
=
(
float4
)
(
local_posq[tbx+j].xyz
-
posq1.xyz,
0.0f
)
;
#
ifdef
USE_PERIODIC
delta.x
-=
floor
(
delta.x/PERIODIC_BOX_SIZE_X+0.5f
)
*PERIODIC_BOX_SIZE_X
;
...
...
@@ -76,7 +72,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca
//
Write
results
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
unsigned
int
offset
=
x
+
tgx
+
(
x/T
ileSize
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset
=
x
+
tgx
+
(
x/T
ILE_SIZE
)
*PADDED_NUM_ATOMS
;
#
else
unsigned
int
offset
=
x
+
tgx
+
warp*PADDED_NUM_ATOMS
;
#
endif
...
...
@@ -100,7 +96,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca
else
{
//
Compute
only
a
subset
of
the
interactions
in
this
tile.
for
(
unsigned
int
j
=
0
; j < T
ileSize
; j++) {
for
(
unsigned
int
j
=
0
; j < T
ILE_SIZE
; j++) {
if
((
flags&
(
1<<j
))
!=
0
)
{
float4
delta
=
(
float4
)
(
local_posq[tbx+j].xyz
-
posq1.xyz,
0.0f
)
;
#
ifdef
USE_PERIODIC
...
...
@@ -166,10 +162,11 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca
{
//
Compute
the
full
set
of
interactions
in
this
tile.
unsigned
int
xi
=
x/TileSize
;
unsigned
int
yi
=
y/TileSize
;
unsigned
int
tile
=
xi+yi*PADDED_NUM_ATOMS/TileSize-yi*
(
yi+1
)
/2
;
for
(
unsigned
int
j
=
0
; j < TileSize; j++) {
unsigned
int
xi
=
x/TILE_SIZE
;
unsigned
int
yi
=
y/TILE_SIZE
;
unsigned
int
tile
=
xi+yi*PADDED_NUM_ATOMS/TILE_SIZE-yi*
(
yi+1
)
/2
;
unsigned
int
tj
=
tgx
;
for
(
unsigned
int
j
=
0
; j < TILE_SIZE; j++) {
float4
delta
=
(
float4
)
(
local_posq[tbx+tj].xyz
-
posq1.xyz,
0.0f
)
;
#
ifdef
USE_PERIODIC
delta.x
-=
floor
(
delta.x/PERIODIC_BOX_SIZE_X+0.5f
)
*PERIODIC_BOX_SIZE_X
;
...
...
@@ -211,15 +208,15 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca
local_bornSum[tbx+tj]
+=
term
;
}
}
tj
=
(
tj
+
1
)
&
(
T
ileSize
-
1
)
;
tj
=
(
tj
+
1
)
&
(
T
ILE_SIZE
-
1
)
;
}
}
//
Write
results
float4
of
;
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
unsigned
int
offset1
=
x
+
tgx
+
(
y/T
ileSize
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
+
tgx
+
(
x/T
ileSize
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset1
=
x
+
tgx
+
(
y/T
ILE_SIZE
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
+
tgx
+
(
x/T
ILE_SIZE
)
*PADDED_NUM_ATOMS
;
#
else
unsigned
int
offset1
=
x
+
tgx
+
warp*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
+
tgx
+
warp*PADDED_NUM_ATOMS
;
...
...
@@ -232,55 +229,23 @@ __kernel void computeBornSum(__global float* global_bornSum, __local float* loca
}
}
/**
*
Reduce
the
Born
sums
to
compute
the
Born
radii.
*/
__kernel
void
reduceBornSum
(
int
bufferSize,
int
numBuffers,
float
alpha,
float
beta,
float
gamma,
__global
float*
bornSum,
__global
float2*
params,
__global
float*
bornRadii,
__global
float*
obcChain
)
{
unsigned
int
index
=
get_global_id
(
0
)
;
while
(
index
<
NUM_ATOMS
)
{
//
Get
summed
Born
data
int
totalSize
=
bufferSize*numBuffers
;
float
sum
=
bornSum[index]
;
for
(
int
i
=
index+bufferSize
; i < totalSize; i += bufferSize)
sum
+=
bornSum[i]
;
//
Now
calculate
Born
radius
and
OBC
term.
float
offsetRadius
=
params[index].x
;
sum
*=
0.5f*offsetRadius
;
float
sum2
=
sum*sum
;
float
sum3
=
sum*sum2
;
float
tanhSum
=
tanh
(
alpha*sum
-
beta*sum2
+
gamma*sum3
)
;
float
nonOffsetRadius
=
offsetRadius
+
dielectricOffset
;
float
radius
=
1.0f/
(
1.0f/offsetRadius
-
tanhSum/nonOffsetRadius
)
;
float
chain
=
offsetRadius*
(
alpha
-
2.0f*beta*sum
+
3.0f*gamma*sum2
)
;
chain
=
(
1.0f-tanhSum*tanhSum
)
*chain
/
nonOffsetRadius
;
bornRadii[index]
=
radius
;
obcChain[index]
=
chain
;
index
+=
get_global_size
(
0
)
;
}
}
/**
*
First
part
of
computing
the
GBSA
interaction.
*/
__kernel
void
computeGBSAForce1
(
__global
float4*
forceBuffers,
__global
float*
energyBuffer,
__global
float4*
posq,
__local
float4*
local_posq,
__local
float4*
local_force,
__global
float*
global_bornRadii,
__local
float*
local_bornRadii,
__global
float*
global_bornForce,
__local
float*
local_bornForce,
__global
unsigned
int*
tiles,
__global
float*
global_bornForce,
__local
float*
local_bornForce,
__local
float4*
tempBuffer,
__global
unsigned
int*
tiles,
#
ifdef
USE_CUTOFF
__global
unsigned
int*
interactionFlags,
__global
unsigned
int*
interactionCount
,
__local
float4*
tempBuffer
)
{
__global
unsigned
int*
interactionFlags,
__global
unsigned
int*
interactionCount
)
{
#
else
unsigned
int
numTiles
)
{
#
endif
#
ifdef
USE_CUTOFF
unsigned
int
numTiles
=
interactionCount[0]
;
#
endif
unsigned
int
totalWarps
=
get_global_size
(
0
)
/T
ileSize
;
unsigned
int
warp
=
get_global_id
(
0
)
/T
ileSize
;
unsigned
int
totalWarps
=
get_global_size
(
0
)
/T
ILE_SIZE
;
unsigned
int
warp
=
get_global_id
(
0
)
/T
ILE_SIZE
;
unsigned
int
pos
=
warp*numTiles/totalWarps
;
unsigned
int
end
=
(
warp+1
)
*numTiles/totalWarps
;
float
energy
=
0.0f
;
...
...
@@ -289,11 +254,10 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
while
(
pos
<
end
)
{
//
Extract
the
coordinates
of
this
tile
unsigned
int
x
=
tiles[pos]
;
unsigned
int
y
=
((
x
>>
2
)
&
0x7fff
)
*T
ileSize
;
x
=
(
x>>17
)
*T
ileSize
;
unsigned
int
tgx
=
get_local_id
(
0
)
&
(
T
ileSize
-1
)
;
unsigned
int
y
=
((
x
>>
2
)
&
0x7fff
)
*T
ILE_SIZE
;
x
=
(
x>>17
)
*T
ILE_SIZE
;
unsigned
int
tgx
=
get_local_id
(
0
)
&
(
T
ILE_SIZE
-1
)
;
unsigned
int
tbx
=
get_local_id
(
0
)
-
tgx
;
unsigned
int
tj
=
tgx
;
unsigned
int
atom1
=
x
+
tgx
;
float4
force
=
0.0f
;
float4
posq1
=
posq[atom1]
;
...
...
@@ -303,11 +267,12 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
local_posq[get_local_id
(
0
)
]
=
posq1
;
local_bornRadii[get_local_id
(
0
)
]
=
bornRadius1
;
unsigned
int
xi
=
x/T
ileSize
;
unsigned
int
tile
=
xi+xi*PADDED_NUM_ATOMS/T
ileSize
-xi*
(
xi+1
)
/2
;
for
(
unsigned
int
j
=
0
; j < T
ileSize
; j++) {
unsigned
int
xi
=
x/T
ILE_SIZE
;
unsigned
int
tile
=
xi+xi*PADDED_NUM_ATOMS/T
ILE_SIZE
-xi*
(
xi+1
)
/2
;
for
(
unsigned
int
j
=
0
; j < T
ILE_SIZE
; j++) {
if
(
atom1
<
NUM_ATOMS
&&
y+j
<
NUM_ATOMS
)
{
float4
delta
=
(
float4
)
(
local_posq[tbx+j].xyz
-
posq1.xyz,
0.0f
)
;
float4
posq2
=
local_posq[tbx+j]
;
float4
delta
=
(
float4
)
(
posq2.xyz
-
posq1.xyz,
0.0f
)
;
#
ifdef
USE_PERIODIC
delta.x
-=
floor
(
delta.x/PERIODIC_BOX_SIZE_X+0.5f
)
*PERIODIC_BOX_SIZE_X
;
delta.y
-=
floor
(
delta.y/PERIODIC_BOX_SIZE_Y+0.5f
)
*PERIODIC_BOX_SIZE_Y
;
...
...
@@ -316,7 +281,6 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
float
r2
=
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
;
float
r
=
sqrt
(
r2
)
;
float
invR
=
1.0f/r
;
float4
posq2
=
local_posq[tbx+j]
;
float
bornRadius2
=
local_bornRadii[tbx+j]
;
float
alpha2_ij
=
bornRadius1*bornRadius2
;
float
D_ij
=
r2/
(
4.0f*alpha2_ij
)
;
...
...
@@ -342,7 +306,7 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
//
Write
results
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
unsigned
int
offset
=
x
+
tgx
+
(
x/T
ileSize
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset
=
x
+
tgx
+
(
x/T
ILE_SIZE
)
*PADDED_NUM_ATOMS
;
#
else
unsigned
int
offset
=
x
+
tgx
+
warp*PADDED_NUM_ATOMS
;
#
endif
...
...
@@ -367,9 +331,10 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
else
{
//
Compute
only
a
subset
of
the
interactions
in
this
tile.
for
(
unsigned
int
j
=
0
; j < T
ileSize
; j++) {
for
(
unsigned
int
j
=
0
; j < T
ILE_SIZE
; j++) {
if
((
flags&
(
1<<j
))
!=
0
)
{
float4
delta
=
(
float4
)
(
local_posq[tbx+j].xyz
-
posq1.xyz,
0.0f
)
;
float4
posq2
=
local_posq[tbx+j]
;
float4
delta
=
(
float4
)
(
posq2.xyz
-
posq1.xyz,
0.0f
)
;
#
ifdef
USE_PERIODIC
delta.x
-=
floor
(
delta.x/PERIODIC_BOX_SIZE_X+0.5f
)
*PERIODIC_BOX_SIZE_X
;
delta.y
-=
floor
(
delta.y/PERIODIC_BOX_SIZE_Y+0.5f
)
*PERIODIC_BOX_SIZE_Y
;
...
...
@@ -377,8 +342,7 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
#
endif
float
r2
=
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
;
float
r
=
sqrt
(
r2
)
;
float
invR
=
1.0f
/
r
;
float4
posq2
=
local_posq[tbx+j]
;
float
invR
=
1.0f/r
;
float
bornRadius2
=
local_bornRadii[tbx+j]
;
float
alpha2_ij
=
bornRadius1*bornRadius2
;
float
D_ij
=
r2/
(
4.0f*alpha2_ij
)
;
...
...
@@ -424,12 +388,14 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
{
//
Compute
the
full
set
of
interactions
in
this
tile.
unsigned
int
xi
=
x/TileSize
;
unsigned
int
yi
=
y/TileSize
;
unsigned
int
tile
=
xi+yi*PADDED_NUM_ATOMS/TileSize-yi*
(
yi+1
)
/2
;
for
(
unsigned
int
j
=
0
; j < TileSize; j++) {
unsigned
int
xi
=
x/TILE_SIZE
;
unsigned
int
yi
=
y/TILE_SIZE
;
unsigned
int
tile
=
xi+yi*PADDED_NUM_ATOMS/TILE_SIZE-yi*
(
yi+1
)
/2
;
unsigned
int
tj
=
tgx
;
for
(
unsigned
int
j
=
0
; j < TILE_SIZE; j++) {
if
(
atom1
<
NUM_ATOMS
&&
y+tj
<
NUM_ATOMS
)
{
float4
delta
=
(
float4
)
(
local_posq[tbx+tj].xyz
-
posq1.xyz,
0.0f
)
;
float4
posq2
=
local_posq[tbx+tj]
;
float4
delta
=
(
float4
)
(
posq2.xyz
-
posq1.xyz,
0.0f
)
;
#
ifdef
USE_PERIODIC
delta.x
-=
floor
(
delta.x/PERIODIC_BOX_SIZE_X+0.5f
)
*PERIODIC_BOX_SIZE_X
;
delta.y
-=
floor
(
delta.y/PERIODIC_BOX_SIZE_Y+0.5f
)
*PERIODIC_BOX_SIZE_Y
;
...
...
@@ -437,8 +403,7 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
#
endif
float
r2
=
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
;
float
r
=
sqrt
(
r2
)
;
float
invR
=
1.0f
/
r
;
float4
posq2
=
local_posq[tbx+tj]
;
float
invR
=
1.0f/r
;
float
bornRadius2
=
local_bornRadii[tbx+tj]
;
float
alpha2_ij
=
bornRadius1*bornRadius2
;
float
D_ij
=
r2/
(
4.0f*alpha2_ij
)
;
...
...
@@ -461,14 +426,14 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
force.xyz
-=
delta.xyz
;
local_force[tbx+tj]
+=
(
float4
)
(
delta.xyz,
dGpol_dalpha2_ij*bornRadius1
)
;
}
tj
=
(
tj
+
1
)
&
(
T
ileSize
-
1
)
;
tj
=
(
tj
+
1
)
&
(
T
ILE_SIZE
-
1
)
;
}
}
//
Write
results
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
unsigned
int
offset1
=
x
+
tgx
+
(
y/T
ileSize
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
+
tgx
+
(
x/T
ileSize
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset1
=
x
+
tgx
+
(
y/T
ILE_SIZE
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
+
tgx
+
(
x/T
ILE_SIZE
)
*PADDED_NUM_ATOMS
;
#
else
unsigned
int
offset1
=
x
+
tgx
+
warp*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
+
tgx
+
warp*PADDED_NUM_ATOMS
;
...
...
@@ -483,35 +448,3 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
}
energyBuffer[get_global_id
(
0
)
]
+=
energy
;
}
/**
*
Reduce
the
Born
force.
*/
__kernel
void
reduceBornForce
(
int
bufferSize,
int
numBuffers,
__global
float*
bornForce,
__global
float*
energyBuffer,
__global
float2*
params,
__global
float*
bornRadii,
__global
float*
obcChain
)
{
float
energy
=
0.0f
;
unsigned
int
index
=
get_global_id
(
0
)
;
while
(
index
<
NUM_ATOMS
)
{
//
Sum
the
Born
force
int
totalSize
=
bufferSize*numBuffers
;
float
force
=
bornForce[index]
;
for
(
int
i
=
index+bufferSize
; i < totalSize; i += bufferSize)
force
+=
bornForce[i]
;
//
Now
calculate
the
actual
force
float
offsetRadius
=
params[index].x
;
float
bornRadius
=
bornRadii[index]
;
float
r
=
offsetRadius+dielectricOffset+probeRadius
;
float
ratio6
=
pow
((
offsetRadius+dielectricOffset
)
/bornRadius,
6.0f
)
;
float
saTerm
=
surfaceAreaFactor*r*r*ratio6
;
force
+=
saTerm/bornRadius
;
energy
+=
saTerm
;
force
*=
bornRadius*bornRadius*obcChain[index]
;
bornForce[index]
=
force
;
index
+=
get_global_size
(
0
)
;
}
energyBuffer[get_global_id
(
0
)
]
+=
energy/-6.0f
;
}
platforms/opencl/src/kernels/nonbonded_nvidia.cl
View file @
95e059dc
const
unsigned
int
TileSize
=
32
;
#
define
TILE_SIZE
32
/**
*
Compute
nonbonded
interactions.
...
...
@@ -15,8 +15,8 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
#
ifdef
USE_CUTOFF
unsigned
int
numTiles
=
interactionCount[0]
;
#
endif
unsigned
int
totalWarps
=
get_global_size
(
0
)
/T
ileSize
;
unsigned
int
warp
=
get_global_id
(
0
)
/T
ileSize
;
unsigned
int
totalWarps
=
get_global_size
(
0
)
/T
ILE_SIZE
;
unsigned
int
warp
=
get_global_id
(
0
)
/T
ILE_SIZE
;
unsigned
int
pos
=
warp*numTiles/totalWarps
;
unsigned
int
end
=
(
warp+1
)
*numTiles/totalWarps
;
float
energy
=
0.0f
;
...
...
@@ -25,10 +25,10 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
while
(
pos
<
end
)
{
//
Extract
the
coordinates
of
this
tile
unsigned
int
x
=
tiles[pos]
;
unsigned
int
y
=
((
x
>>
2
)
&
0x7fff
)
*T
ileSize
;
unsigned
int
y
=
((
x
>>
2
)
&
0x7fff
)
*T
ILE_SIZE
;
bool
hasExclusions
=
(
x
&
0x1
)
;
x
=
(
x>>17
)
*T
ileSize
;
unsigned
int
tgx
=
get_local_id
(
0
)
&
(
T
ileSize
-1
)
;
x
=
(
x>>17
)
*T
ILE_SIZE
;
unsigned
int
tgx
=
get_local_id
(
0
)
&
(
T
ILE_SIZE
-1
)
;
unsigned
int
tbx
=
get_local_id
(
0
)
-
tgx
;
unsigned
int
atom1
=
x
+
tgx
;
float4
force
=
0.0f
;
...
...
@@ -39,12 +39,12 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
local_posq[get_local_id
(
0
)
]
=
posq1
;
LOAD_LOCAL_PARAMETERS_FROM_1
unsigned
int
xi
=
x/T
ileSize
;
unsigned
int
tile
=
xi+xi*PADDED_NUM_ATOMS/T
ileSize
-xi*
(
xi+1
)
/2
;
unsigned
int
xi
=
x/T
ILE_SIZE
;
unsigned
int
tile
=
xi+xi*PADDED_NUM_ATOMS/T
ILE_SIZE
-xi*
(
xi+1
)
/2
;
#
ifdef
USE_EXCLUSIONS
unsigned
int
excl
=
exclusions[exclusionIndices[tile]+tgx]
;
#
endif
for
(
unsigned
int
j
=
0
; j < T
ileSize
; j++) {
for
(
unsigned
int
j
=
0
; j < T
ILE_SIZE
; j++) {
#
ifdef
USE_EXCLUSIONS
bool
isExcluded
=
!
(
excl
&
0x1
)
;
#
endif
...
...
@@ -72,7 +72,7 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
//
Write
results
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
unsigned
int
offset
=
x
+
tgx
+
(
x/T
ileSize
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset
=
x
+
tgx
+
(
x/T
ILE_SIZE
)
*PADDED_NUM_ATOMS
;
#
else
unsigned
int
offset
=
x
+
tgx
+
warp*PADDED_NUM_ATOMS
;
#
endif
...
...
@@ -96,7 +96,7 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
else
{
//
Compute
only
a
subset
of
the
interactions
in
this
tile.
for
(
unsigned
int
j
=
0
; j < T
ileSize
; j++) {
for
(
unsigned
int
j
=
0
; j < T
ILE_SIZE
; j++) {
if
((
flags&
(
1<<j
))
!=
0
)
{
bool
isExcluded
=
false
;
int
atom2
=
tbx+j
;
...
...
@@ -141,15 +141,15 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
{
//
Compute
the
full
set
of
interactions
in
this
tile.
unsigned
int
xi
=
x/T
ileSize
;
unsigned
int
yi
=
y/T
ileSize
;
unsigned
int
tile
=
xi+yi*PADDED_NUM_ATOMS/T
ileSize
-yi*
(
yi+1
)
/2
;
unsigned
int
xi
=
x/T
ILE_SIZE
;
unsigned
int
yi
=
y/T
ILE_SIZE
;
unsigned
int
tile
=
xi+yi*PADDED_NUM_ATOMS/T
ILE_SIZE
-yi*
(
yi+1
)
/2
;
#
ifdef
USE_EXCLUSIONS
unsigned
int
excl
=
(
hasExclusions
?
exclusions[exclusionIndices[tile]+tgx]
:
0xFFFFFFFF
)
;
excl
=
(
excl
>>
tgx
)
|
(
excl
<<
(
T
ileSize
-
tgx
))
;
excl
=
(
excl
>>
tgx
)
|
(
excl
<<
(
T
ILE_SIZE
-
tgx
))
;
#
endif
unsigned
int
tj
=
tgx
;
for
(
unsigned
int
j
=
0
; j < T
ileSize
; j++) {
for
(
unsigned
int
j
=
0
; j < T
ILE_SIZE
; j++) {
#
ifdef
USE_EXCLUSIONS
bool
isExcluded
=
!
(
excl
&
0x1
)
;
#
endif
...
...
@@ -174,14 +174,14 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
force.xyz
-=
delta.xyz
;
local_force[tbx+tj].xyz
+=
delta.xyz
;
excl
>>=
1
;
tj
=
(
tj
+
1
)
&
(
T
ileSize
-
1
)
;
tj
=
(
tj
+
1
)
&
(
T
ILE_SIZE
-
1
)
;
}
}
//
Write
results
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
unsigned
int
offset1
=
x
+
tgx
+
(
y/T
ileSize
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
+
tgx
+
(
x/T
ileSize
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset1
=
x
+
tgx
+
(
y/T
ILE_SIZE
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
+
tgx
+
(
x/T
ILE_SIZE
)
*PADDED_NUM_ATOMS
;
#
else
unsigned
int
offset1
=
x
+
tgx
+
warp*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
+
tgx
+
warp*PADDED_NUM_ATOMS
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment