Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
bd7cce70
Commit
bd7cce70
authored
Jun 07, 2010
by
Peter Eastman
Browse files
Minor optimizations to OpenCL
parent
803af89b
Changes
6
Show whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
38 additions
and
42 deletions
+38
-42
platforms/opencl/src/OpenCLContext.cpp
platforms/opencl/src/OpenCLContext.cpp
+1
-1
platforms/opencl/src/kernels/gbsaObc2.cl
platforms/opencl/src/kernels/gbsaObc2.cl
+9
-13
platforms/opencl/src/kernels/gbsaObc_default.cl
platforms/opencl/src/kernels/gbsaObc_default.cl
+9
-9
platforms/opencl/src/kernels/gbsaObc_nvidia.cl
platforms/opencl/src/kernels/gbsaObc_nvidia.cl
+15
-15
platforms/opencl/src/kernels/nonbonded_nvidia.cl
platforms/opencl/src/kernels/nonbonded_nvidia.cl
+1
-1
platforms/opencl/src/kernels/settle.cl
platforms/opencl/src/kernels/settle.cl
+3
-3
No files found.
platforms/opencl/src/OpenCLContext.cpp
View file @
bd7cce70
...
...
@@ -83,7 +83,7 @@ OpenCLContext::OpenCLContext(int numParticles, int deviceIndex) : time(0.0), ste
numAtoms
=
numParticles
;
paddedNumAtoms
=
TileSize
*
((
numParticles
+
TileSize
-
1
)
/
TileSize
);
numAtomBlocks
=
(
paddedNumAtoms
+
(
TileSize
-
1
))
/
TileSize
;
numThreadBlocks
=
4
*
device
.
getInfo
<
CL_DEVICE_MAX_COMPUTE_UNITS
>
();
numThreadBlocks
=
6
*
device
.
getInfo
<
CL_DEVICE_MAX_COMPUTE_UNITS
>
();
nonbonded
=
new
OpenCLNonbondedUtilities
(
*
this
);
posq
=
new
OpenCLArray
<
mm_float4
>
(
*
this
,
paddedNumAtoms
,
"posq"
,
true
);
velm
=
new
OpenCLArray
<
mm_float4
>
(
*
this
,
paddedNumAtoms
,
"velm"
,
true
);
...
...
platforms/opencl/src/kernels/gbsaObc2.cl
View file @
bd7cce70
...
...
@@ -3,13 +3,13 @@ if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2 && r2 < CUTOFF_SQUA
#
else
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
&&
atom1
!=
atom2
)
{
#
endif
float
invRSquared
=
1.0f/
r2
;
float
invRSquared
=
RECIP
(
r2
)
;
float
rScaledRadiusJ
=
r+obcParams2.y
;
float
rScaledRadiusI
=
r+obcParams1.y
;
float
l_ijJ
=
1.0f/
max
(
obcParams1.x,
fabs
(
r-obcParams2.y
))
;
float
l_ijI
=
1.0f/
max
(
obcParams2.x,
fabs
(
r-obcParams1.y
))
;
float
u_ijJ
=
1.0f/
rScaledRadiusJ
;
float
u_ijI
=
1.0f/
rScaledRadiusI
;
float
l_ijJ
=
RECIP
(
max
(
obcParams1.x,
fabs
(
r-obcParams2.y
))
)
;
float
l_ijI
=
RECIP
(
max
(
obcParams2.x,
fabs
(
r-obcParams1.y
))
)
;
float
u_ijJ
=
RECIP
(
rScaledRadiusJ
)
;
float
u_ijI
=
RECIP
(
rScaledRadiusI
)
;
float
l_ij2J
=
l_ijJ*l_ijJ
;
float
l_ij2I
=
l_ijI*l_ijI
;
float
u_ij2J
=
u_ijJ*u_ijJ
;
...
...
@@ -22,12 +22,8 @@ if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
float
t3I
=
t2I*invR
;
t1J
*=
invR
;
t1I
*=
invR
;
if
(
obcParams1.x
<
rScaledRadiusJ
)
{
float
term
=
0.125f*
(
1.0f+obcParams2.y*obcParams2.y*invRSquared
)
*t3J
+
0.25f*t1J*invRSquared
;
dEdR
+=
bornForce1*term
;
}
if
(
obcParams2.x
<
rScaledRadiusJ
)
{
float
term
=
0.125f*
(
1.0f+obcParams1.y*obcParams1.y*invRSquared
)
*t3I
+
0.25f*t1I*invRSquared
;
dEdR
+=
bornForce2*term
;
}
float
term1
=
0.125f*
(
1.0f+obcParams2.y*obcParams2.y*invRSquared
)
*t3J
+
0.25f*t1J*invRSquared
;
float
term2
=
0.125f*
(
1.0f+obcParams1.y*obcParams1.y*invRSquared
)
*t3I
+
0.25f*t1I*invRSquared
;
dEdR
+=
(
obcParams1.x
<
rScaledRadiusJ
?
bornForce1*term1
:
0.0f
)
;
dEdR
+=
(
obcParams2.x
<
rScaledRadiusJ
?
bornForce2*term2
:
0.0f
)
;
}
platforms/opencl/src/kernels/gbsaObc_default.cl
View file @
bd7cce70
...
...
@@ -71,15 +71,15 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
float2
params2
=
(
float2
)
(
localData[baseLocalAtom+j].radius,
localData[baseLocalAtom+j].scaledRadius
)
;
float
rScaledRadiusJ
=
r+params2.y
;
if
((
j
!=
tgx
)
&&
(
params1.x
<
rScaledRadiusJ
))
{
float
l_ij
=
1.0f/
max
(
params1.x,
fabs
(
r-params2.y
))
;
float
u_ij
=
1.0f/
rScaledRadiusJ
;
float
l_ij
=
RECIP
(
max
(
params1.x,
fabs
(
r-params2.y
))
)
;
float
u_ij
=
RECIP
(
rScaledRadiusJ
)
;
float
l_ij2
=
l_ij*l_ij
;
float
u_ij2
=
u_ij*u_ij
;
float
ratio
=
log
(
u_ij
/
l_ij
)
;
bornSum
+=
l_ij
-
u_ij
+
0.25f*r*
(
u_ij2-l_ij2
)
+
(
0.50f*invR*ratio
)
+
(
0.25f*params2.y*params2.y*invR
)
*
(
l_ij2-u_ij2
)
;
if
(
params1.x
<
params2.x-r
)
bornSum
+=
2.0f*
(
1.0f/
params1.x-l_ij
)
;
bornSum
+=
2.0f*
RECIP
(
params1.x-l_ij
)
;
}
}
}
...
...
@@ -139,27 +139,27 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
float2
params2
=
(
float2
)
(
localData[baseLocalAtom+tj].radius,
localData[baseLocalAtom+tj].scaledRadius
)
;
float
rScaledRadiusJ
=
r+params2.y
;
if
(
params1.x
<
rScaledRadiusJ
)
{
float
l_ij
=
1.0f/
max
(
params1.x,
fabs
(
r-params2.y
))
;
float
u_ij
=
1.0f/
rScaledRadiusJ
;
float
l_ij
=
RECIP
(
max
(
params1.x,
fabs
(
r-params2.y
))
)
;
float
u_ij
=
RECIP
(
rScaledRadiusJ
)
;
float
l_ij2
=
l_ij*l_ij
;
float
u_ij2
=
u_ij*u_ij
;
float
ratio
=
log
(
u_ij
/
l_ij
)
;
bornSum
+=
l_ij
-
u_ij
+
0.25f*r*
(
u_ij2-l_ij2
)
+
(
0.50f*invR*ratio
)
+
(
0.25f*params2.y*params2.y*invR
)
*
(
l_ij2-u_ij2
)
;
if
(
params1.x
<
params2.x-r
)
bornSum
+=
2.0f*
(
1.0f/
params1.x-l_ij
)
;
bornSum
+=
2.0f*
RECIP
(
params1.x-l_ij
)
;
}
float
rScaledRadiusI
=
r+params1.y
;
if
(
params2.x
<
rScaledRadiusI
)
{
float
l_ij
=
1.0f/
max
(
params2.x,
fabs
(
r-params1.y
))
;
float
u_ij
=
1.0f/
rScaledRadiusI
;
float
l_ij
=
RECIP
(
max
(
params2.x,
fabs
(
r-params1.y
))
)
;
float
u_ij
=
RECIP
(
rScaledRadiusI
)
;
float
l_ij2
=
l_ij*l_ij
;
float
u_ij2
=
u_ij*u_ij
;
float
ratio
=
log
(
u_ij
/
l_ij
)
;
float
term
=
l_ij
-
u_ij
+
0.25f*r*
(
u_ij2-l_ij2
)
+
(
0.50f*invR*ratio
)
+
(
0.25f*params1.y*params1.y*invR
)
*
(
l_ij2-u_ij2
)
;
if
(
params2.x
<
params1.x-r
)
term
+=
2.0f*
(
1.0f/
params2.x-l_ij
)
;
term
+=
2.0f*
RECIP
(
params2.x-l_ij
)
;
localData[baseLocalAtom+tj+forceBufferOffset].bornSum
+=
term
;
}
}
...
...
platforms/opencl/src/kernels/gbsaObc_nvidia.cl
View file @
bd7cce70
...
...
@@ -71,15 +71,15 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
float2
params2
=
(
float2
)
(
localData[tbx+j].radius,
localData[tbx+j].scaledRadius
)
;
float
rScaledRadiusJ
=
r+params2.y
;
if
((
j
!=
tgx
)
&&
(
params1.x
<
rScaledRadiusJ
))
{
float
l_ij
=
1.0f/
max
(
params1.x,
fabs
(
r-params2.y
))
;
float
u_ij
=
1.0f/
rScaledRadiusJ
;
float
l_ij
=
RECIP
(
max
(
params1.x,
fabs
(
r-params2.y
))
)
;
float
u_ij
=
RECIP
(
rScaledRadiusJ
)
;
float
l_ij2
=
l_ij*l_ij
;
float
u_ij2
=
u_ij*u_ij
;
float
ratio
=
log
(
u_ij
/
l_ij
)
;
bornSum
+=
l_ij
-
u_ij
+
0.25f*r*
(
u_ij2-l_ij2
)
+
(
0.50f*invR*ratio
)
+
(
0.25f*params2.y*params2.y*invR
)
*
(
l_ij2-u_ij2
)
;
if
(
params1.x
<
params2.x-r
)
bornSum
+=
2.0f*
(
1.0f/
params1.x-l_ij
)
;
bornSum
+=
2.0f*
RECIP
(
params1.x-l_ij
)
;
}
}
}
...
...
@@ -136,27 +136,27 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
float2
params2
=
(
float2
)
(
localData[tbx+j].radius,
localData[tbx+j].scaledRadius
)
;
float
rScaledRadiusJ
=
r+params2.y
;
if
(
params1.x
<
rScaledRadiusJ
)
{
float
l_ij
=
1.0f/
max
(
params1.x,
fabs
(
r-params2.y
))
;
float
u_ij
=
1.0f/
rScaledRadiusJ
;
float
l_ij
=
RECIP
(
max
(
params1.x,
fabs
(
r-params2.y
))
)
;
float
u_ij
=
RECIP
(
rScaledRadiusJ
)
;
float
l_ij2
=
l_ij*l_ij
;
float
u_ij2
=
u_ij*u_ij
;
float
ratio
=
log
(
u_ij
/
l_ij
)
;
bornSum
+=
l_ij
-
u_ij
+
0.25f*r*
(
u_ij2-l_ij2
)
+
(
0.50f*invR*ratio
)
+
(
0.25f*params2.y*params2.y*invR
)
*
(
l_ij2-u_ij2
)
;
if
(
params1.x
<
params2.x-r
)
bornSum
+=
2.0f*
(
1.0f/
params1.x-l_ij
)
;
bornSum
+=
2.0f*
RECIP
(
params1.x-l_ij
)
;
}
float
rScaledRadiusI
=
r+params1.y
;
if
(
params2.x
<
rScaledRadiusI
)
{
float
l_ij
=
1.0f/
max
(
params2.x,
fabs
(
r-params1.y
))
;
float
u_ij
=
1.0f/
rScaledRadiusI
;
float
l_ij
=
RECIP
(
max
(
params2.x,
fabs
(
r-params1.y
))
)
;
float
u_ij
=
RECIP
(
rScaledRadiusI
)
;
float
l_ij2
=
l_ij*l_ij
;
float
u_ij2
=
u_ij*u_ij
;
float
ratio
=
log
(
u_ij
/
l_ij
)
;
float
term
=
l_ij
-
u_ij
+
0.25f*r*
(
u_ij2-l_ij2
)
+
(
0.50f*invR*ratio
)
+
(
0.25f*params1.y*params1.y*invR
)
*
(
l_ij2-u_ij2
)
;
if
(
params2.x
<
params1.x-r
)
term
+=
2.0f*
(
1.0f/
params2.x-l_ij
)
;
term
+=
2.0f*
RECIP
(
params2.x-l_ij
)
;
tempBuffer[get_local_id
(
0
)
]
=
term
;
}
}
...
...
@@ -204,27 +204,27 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
float2
params2
=
(
float2
)
(
localData[tbx+tj].radius,
localData[tbx+tj].scaledRadius
)
;
float
rScaledRadiusJ
=
r+params2.y
;
if
(
params1.x
<
rScaledRadiusJ
)
{
float
l_ij
=
1.0f/
max
(
params1.x,
fabs
(
r-params2.y
))
;
float
u_ij
=
1.0f/
rScaledRadiusJ
;
float
l_ij
=
RECIP
(
max
(
params1.x,
fabs
(
r-params2.y
))
)
;
float
u_ij
=
RECIP
(
rScaledRadiusJ
)
;
float
l_ij2
=
l_ij*l_ij
;
float
u_ij2
=
u_ij*u_ij
;
float
ratio
=
log
(
u_ij
/
l_ij
)
;
bornSum
+=
l_ij
-
u_ij
+
0.25f*r*
(
u_ij2-l_ij2
)
+
(
0.50f*invR*ratio
)
+
(
0.25f*params2.y*params2.y*invR
)
*
(
l_ij2-u_ij2
)
;
if
(
params1.x
<
params2.x-r
)
bornSum
+=
2.0f*
(
1.0f/
params1.x-l_ij
)
;
bornSum
+=
2.0f*
RECIP
(
params1.x-l_ij
)
;
}
float
rScaledRadiusI
=
r+params1.y
;
if
(
params2.x
<
rScaledRadiusI
)
{
float
l_ij
=
1.0f/
max
(
params2.x,
fabs
(
r-params1.y
))
;
float
u_ij
=
1.0f/
rScaledRadiusI
;
float
l_ij
=
RECIP
(
max
(
params2.x,
fabs
(
r-params1.y
))
)
;
float
u_ij
=
RECIP
(
rScaledRadiusI
)
;
float
l_ij2
=
l_ij*l_ij
;
float
u_ij2
=
u_ij*u_ij
;
float
ratio
=
log
(
u_ij
/
l_ij
)
;
float
term
=
l_ij
-
u_ij
+
0.25f*r*
(
u_ij2-l_ij2
)
+
(
0.50f*invR*ratio
)
+
(
0.25f*params1.y*params1.y*invR
)
*
(
l_ij2-u_ij2
)
;
if
(
params2.x
<
params1.x-r
)
term
+=
2.0f*
(
1.0f/
params2.x-l_ij
)
;
term
+=
2.0f*
RECIP
(
params2.x-l_ij
)
;
localData[tbx+tj].bornSum
+=
term
;
}
}
...
...
platforms/opencl/src/kernels/nonbonded_nvidia.cl
View file @
bd7cce70
...
...
@@ -69,7 +69,7 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
#
endif
float
r2
=
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
;
float
r
=
sqrt
(
r2
)
;
float
invR
=
1.0f/r
;
float
invR
=
RECIP
(
r
)
;
LOAD_ATOM2_PARAMETERS
atom2
=
y+j
;
#
ifdef
USE_SYMMETRIC
...
...
platforms/opencl/src/kernels/settle.cl
View file @
bd7cce70
...
...
@@ -15,9 +15,9 @@ __kernel void applySettle(int numClusters, float tol, __global float4* oldPos, _
float4
xp1
=
posDelta[atoms.y]
;
float4
apos2
=
oldPos[atoms.z]
;
float4
xp2
=
posDelta[atoms.z]
;
float
m0
=
1.0f/
velm[atoms.x].w
;
float
m1
=
1.0f/
velm[atoms.y].w
;
float
m2
=
1.0f/
velm[atoms.z].w
;
float
m0
=
RECIP
(
velm[atoms.x].w
)
;
float
m1
=
RECIP
(
velm[atoms.y].w
)
;
float
m2
=
RECIP
(
velm[atoms.z].w
)
;
//
Translate
the
molecule
to
the
origin
to
improve
numerical
precision.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment