Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
c8dac206
Commit
c8dac206
authored
Oct 17, 2012
by
Peter Eastman
Browse files
Continuing to implement double precision in OpenCL
parent
34938e2c
Changes
36
Hide whitespace changes
Inline
Side-by-side
Showing
16 changed files
with
389 additions
and
381 deletions
+389
-381
platforms/opencl/src/kernels/customHbondForce.cl
platforms/opencl/src/kernels/customHbondForce.cl
+51
-51
platforms/opencl/src/kernels/ewald.cl
platforms/opencl/src/kernels/ewald.cl
+29
-29
platforms/opencl/src/kernels/fft.cl
platforms/opencl/src/kernels/fft.cl
+5
-5
platforms/opencl/src/kernels/findInteractingBlocks.cl
platforms/opencl/src/kernels/findInteractingBlocks.cl
+39
-39
platforms/opencl/src/kernels/findInteractingBlocks_cpu.cl
platforms/opencl/src/kernels/findInteractingBlocks_cpu.cl
+23
-23
platforms/opencl/src/kernels/harmonicAngleForce.cl
platforms/opencl/src/kernels/harmonicAngleForce.cl
+2
-2
platforms/opencl/src/kernels/harmonicBondForce.cl
platforms/opencl/src/kernels/harmonicBondForce.cl
+2
-2
platforms/opencl/src/kernels/nonbondedExceptions.cl
platforms/opencl/src/kernels/nonbondedExceptions.cl
+9
-9
platforms/opencl/src/kernels/nonbonded_cpu.cl
platforms/opencl/src/kernels/nonbonded_cpu.cl
+44
-44
platforms/opencl/src/kernels/nonbonded_default.cl
platforms/opencl/src/kernels/nonbonded_default.cl
+40
-40
platforms/opencl/src/kernels/nonbonded_nvidia.cl
platforms/opencl/src/kernels/nonbonded_nvidia.cl
+43
-43
platforms/opencl/src/kernels/periodicTorsionForce.cl
platforms/opencl/src/kernels/periodicTorsionForce.cl
+3
-3
platforms/opencl/src/kernels/pme.cl
platforms/opencl/src/kernels/pme.cl
+69
-61
platforms/opencl/src/kernels/rbTorsionForce.cl
platforms/opencl/src/kernels/rbTorsionForce.cl
+3
-3
platforms/opencl/src/kernels/torsionForce.cl
platforms/opencl/src/kernels/torsionForce.cl
+22
-22
platforms/opencl/src/kernels/utilities.cl
platforms/opencl/src/kernels/utilities.cl
+5
-5
No files found.
platforms/opencl/src/kernels/customHbondForce.cl
View file @
c8dac206
/**
*
Compute
the
difference
between
two
vectors,
setting
the
fourth
component
to
the
squared
magnitude.
*/
float
4
delta
(
float
4
vec1,
float
4
vec2
)
{
float
4
result
=
(
float
4
)
(
vec1.x-vec2.x,
vec1.y-vec2.y,
vec1.z-vec2.z,
0
.0f
)
;
real
4
delta
(
real
4
vec1,
real
4
vec2
)
{
real
4
result
=
(
real
4
)
(
vec1.x-vec2.x,
vec1.y-vec2.y,
vec1.z-vec2.z,
0
)
;
result.w
=
result.x*result.x
+
result.y*result.y
+
result.z*result.z
;
return
result
;
}
...
...
@@ -11,8 +11,8 @@ float4 delta(float4 vec1, float4 vec2) {
*
Compute
the
difference
between
two
vectors,
taking
periodic
boundary
conditions
into
account
*
and
setting
the
fourth
component
to
the
squared
magnitude.
*/
float
4
deltaPeriodic
(
float
4
vec1,
float
4
vec2,
float
4
periodicBoxSize,
float
4
invPeriodicBoxSize
)
{
float
4
result
=
(
float
4
)
(
vec1.x-vec2.x,
vec1.y-vec2.y,
vec1.z-vec2.z,
0
.0f
)
;
real
4
deltaPeriodic
(
real
4
vec1,
real
4
vec2,
real
4
periodicBoxSize,
real
4
invPeriodicBoxSize
)
{
real
4
result
=
(
real
4
)
(
vec1.x-vec2.x,
vec1.y-vec2.y,
vec1.z-vec2.z,
0
)
;
#
ifdef
USE_PERIODIC
result.x
-=
floor
(
result.x*invPeriodicBoxSize.x+0.5f
)
*periodicBoxSize.x
;
result.y
-=
floor
(
result.y*invPeriodicBoxSize.y+0.5f
)
*periodicBoxSize.y
;
...
...
@@ -25,15 +25,15 @@ float4 deltaPeriodic(float4 vec1, float4 vec2, float4 periodicBoxSize, float4 in
/**
*
Compute
the
angle
between
two
vectors.
The
w
component
of
each
vector
should
contain
the
squared
magnitude.
*/
float
computeAngle
(
float
4
vec1,
float
4
vec2
)
{
float
dotProduct
=
vec1.x*vec2.x
+
vec1.y*vec2.y
+
vec1.z*vec2.z
;
float
cosine
=
dotProduct*RSQRT
(
vec1.w*vec2.w
)
;
float
angle
;
real
computeAngle
(
real
4
vec1,
real
4
vec2
)
{
real
dotProduct
=
vec1.x*vec2.x
+
vec1.y*vec2.y
+
vec1.z*vec2.z
;
real
cosine
=
dotProduct*RSQRT
(
vec1.w*vec2.w
)
;
real
angle
;
if
(
cosine
>
0.99f
|
| cosine < -0.99f) {
// We're close to the singularity in acos(), so take the cross product and use asin() instead.
float
4 crossProduct = cross(vec1, vec2);
float
scale = vec1.w*vec2.w;
real
4 crossProduct = cross(vec1, vec2);
real
scale = vec1.w*vec2.w;
angle = asin(SQRT(dot(crossProduct, crossProduct)/scale));
if (cosine < 0.0f)
angle = PI-angle;
...
...
@@ -46,8 +46,8 @@ float computeAngle(float4 vec1, float4 vec2) {
/**
* Compute the cross product of two vectors, setting the fourth component to the squared magnitude.
*/
float
4 computeCross(
float
4 vec1,
float
4 vec2) {
float
4 result = cross(vec1, vec2);
real
4 computeCross(
real
4 vec1,
real
4 vec2) {
real
4 result = cross(vec1, vec2);
result.w = result.x*result.x + result.y*result.y + result.z*result.z;
return result;
}
...
...
@@ -55,24 +55,24 @@ float4 computeCross(float4 vec1, float4 vec2) {
/**
* Compute forces on donors.
*/
__kernel void computeDonorForces(__global
float
4* restrict forceBuffers, __global
float
* restrict energyBuffer, __global const
float
4* restrict posq, __global const int4* restrict exclusions,
__global const int4* restrict donorAtoms, __global const int4* restrict acceptorAtoms, __global const int4* restrict donorBufferIndices, __local
float
4* posBuffer,
float
4 periodicBoxSize,
float
4 invPeriodicBoxSize
__kernel void computeDonorForces(__global
real
4* restrict forceBuffers, __global
real
* restrict energyBuffer, __global const
real
4* restrict posq, __global const int4* restrict exclusions,
__global const int4* restrict donorAtoms, __global const int4* restrict acceptorAtoms, __global const int4* restrict donorBufferIndices, __local
real
4* posBuffer,
real
4 periodicBoxSize,
real
4 invPeriodicBoxSize
PARAMETER_ARGUMENTS) {
float
energy = 0
.0f
;
float
4 f1 = (
float
4) 0;
float
4 f2 = (
float
4) 0;
float
4 f3 = (
float
4) 0;
real
energy = 0;
real
4 f1 = (
real
4) 0;
real
4 f2 = (
real
4) 0;
real
4 f3 = (
real
4) 0;
for (int donorStart = 0; donorStart < NUM_DONORS; donorStart += get_global_size(0)) {
// Load information about the donor this thread will compute forces on.
int donorIndex = donorStart+get_global_id(0);
int4 atoms, exclusionIndices;
float
4 d1, d2, d3;
real
4 d1, d2, d3;
if (donorIndex < NUM_DONORS) {
atoms = donorAtoms[donorIndex];
d1 = (atoms.x > -1 ? posq[atoms.x] : (
float
4) 0);
d2 = (atoms.y > -1 ? posq[atoms.y] : (
float
4) 0);
d3 = (atoms.z > -1 ? posq[atoms.z] : (
float
4) 0);
d1 = (atoms.x > -1 ? posq[atoms.x] : (
real
4) 0);
d2 = (atoms.y > -1 ? posq[atoms.y] : (
real
4) 0);
d3 = (atoms.z > -1 ? posq[atoms.z] : (
real
4) 0);
#ifdef USE_EXCLUSIONS
exclusionIndices = exclusions[donorIndex];
#endif
...
...
@@ -85,9 +85,9 @@ __kernel void computeDonorForces(__global float4* restrict forceBuffers, __globa
int blockSize = min((int) get_local_size(0), NUM_ACCEPTORS-acceptorStart);
if (get_local_id(0) < blockSize) {
int4 atoms2 = acceptorAtoms[acceptorStart+get_local_id(0)];
posBuffer[3*get_local_id(0)] = (atoms2.x > -1 ? posq[atoms2.x] : (
float
4) 0);
posBuffer[3*get_local_id(0)+1] = (atoms2.y > -1 ? posq[atoms2.y] : (
float
4) 0);
posBuffer[3*get_local_id(0)+2] = (atoms2.z > -1 ? posq[atoms2.z] : (
float
4) 0);
posBuffer[3*get_local_id(0)] = (atoms2.x > -1 ? posq[atoms2.x] : (
real
4) 0);
posBuffer[3*get_local_id(0)+1] = (atoms2.y > -1 ? posq[atoms2.y] : (
real
4) 0);
posBuffer[3*get_local_id(0)+2] = (atoms2.z > -1 ? posq[atoms2.z] : (
real
4) 0);
}
barrier(CLK_LOCAL_MEM_FENCE);
if (donorIndex < NUM_DONORS) {
...
...
@@ -99,10 +99,10 @@ __kernel void computeDonorForces(__global float4* restrict forceBuffers, __globa
#endif
// Compute the interaction between a donor and an acceptor.
float
4 a1 = posBuffer[3*index];
float
4 a2 = posBuffer[3*index+1];
float
4 a3 = posBuffer[3*index+2];
float
4 deltaD1A1 = deltaPeriodic(d1, a1, periodicBoxSize, invPeriodicBoxSize);
real
4 a1 = posBuffer[3*index];
real
4 a2 = posBuffer[3*index+1];
real
4 a3 = posBuffer[3*index+2];
real
4 deltaD1A1 = deltaPeriodic(d1, a1, periodicBoxSize, invPeriodicBoxSize);
#ifdef USE_CUTOFF
if (deltaD1A1.w < CUTOFF_SQUARED) {
#endif
...
...
@@ -120,19 +120,19 @@ __kernel void computeDonorForces(__global float4* restrict forceBuffers, __globa
int4 bufferIndices = donorBufferIndices[donorIndex];
if (atoms.x > -1) {
unsigned int offset = atoms.x+bufferIndices.x*PADDED_NUM_ATOMS;
float
4 force = forceBuffers[offset];
real
4 force = forceBuffers[offset];
force.xyz += f1.xyz;
forceBuffers[offset] = force;
}
if (atoms.y > -1) {
unsigned int offset = atoms.y+bufferIndices.y*PADDED_NUM_ATOMS;
float
4 force = forceBuffers[offset];
real
4 force = forceBuffers[offset];
force.xyz += f2.xyz;
forceBuffers[offset] = force;
}
if (atoms.z > -1) {
unsigned int offset = atoms.z+bufferIndices.z*PADDED_NUM_ATOMS;
float
4 force = forceBuffers[offset];
real
4 force = forceBuffers[offset];
force.xyz += f3.xyz;
forceBuffers[offset] = force;
}
...
...
@@ -143,23 +143,23 @@ __kernel void computeDonorForces(__global float4* restrict forceBuffers, __globa
/**
* Compute forces on acceptors.
*/
__kernel void computeAcceptorForces(__global
float
4* restrict forceBuffers, __global
float
* restrict energyBuffer, __global const
float
4* restrict posq, __global const int4* restrict exclusions,
__global const int4* restrict donorAtoms, __global const int4* restrict acceptorAtoms, __global const int4* restrict acceptorBufferIndices, __local
float
4* restrict posBuffer,
float
4 periodicBoxSize,
float
4 invPeriodicBoxSize
__kernel void computeAcceptorForces(__global
real
4* restrict forceBuffers, __global
real
* restrict energyBuffer, __global const
real
4* restrict posq, __global const int4* restrict exclusions,
__global const int4* restrict donorAtoms, __global const int4* restrict acceptorAtoms, __global const int4* restrict acceptorBufferIndices, __local
real
4* restrict posBuffer,
real
4 periodicBoxSize,
real
4 invPeriodicBoxSize
PARAMETER_ARGUMENTS) {
float
4 f1 = (
float
4) 0;
float
4 f2 = (
float
4) 0;
float
4 f3 = (
float
4) 0;
real
4 f1 = (
real
4) 0;
real
4 f2 = (
real
4) 0;
real
4 f3 = (
real
4) 0;
for (int acceptorStart = 0; acceptorStart < NUM_ACCEPTORS; acceptorStart += get_global_size(0)) {
// Load information about the acceptor this thread will compute forces on.
int acceptorIndex = acceptorStart+get_global_id(0);
int4 atoms, exclusionIndices;
float
4 a1, a2, a3;
real
4 a1, a2, a3;
if (acceptorIndex < NUM_ACCEPTORS) {
atoms = acceptorAtoms[acceptorIndex];
a1 = (atoms.x > -1 ? posq[atoms.x] : (
float
4) 0);
a2 = (atoms.y > -1 ? posq[atoms.y] : (
float
4) 0);
a3 = (atoms.z > -1 ? posq[atoms.z] : (
float
4) 0);
a1 = (atoms.x > -1 ? posq[atoms.x] : (
real
4) 0);
a2 = (atoms.y > -1 ? posq[atoms.y] : (
real
4) 0);
a3 = (atoms.z > -1 ? posq[atoms.z] : (
real
4) 0);
#ifdef USE_EXCLUSIONS
exclusionIndices = exclusions[acceptorIndex];
#endif
...
...
@@ -172,9 +172,9 @@ __kernel void computeAcceptorForces(__global float4* restrict forceBuffers, __gl
int blockSize = min((int) get_local_size(0), NUM_DONORS-donorStart);
if (get_local_id(0) < blockSize) {
int4 atoms2 = donorAtoms[donorStart+get_local_id(0)];
posBuffer[3*get_local_id(0)] = (atoms2.x > -1 ? posq[atoms2.x] : (
float
4) 0);
posBuffer[3*get_local_id(0)+1] = (atoms2.y > -1 ? posq[atoms2.y] : (
float
4) 0);
posBuffer[3*get_local_id(0)+2] = (atoms2.z > -1 ? posq[atoms2.z] : (
float
4) 0);
posBuffer[3*get_local_id(0)] = (atoms2.x > -1 ? posq[atoms2.x] : (
real
4) 0);
posBuffer[3*get_local_id(0)+1] = (atoms2.y > -1 ? posq[atoms2.y] : (
real
4) 0);
posBuffer[3*get_local_id(0)+2] = (atoms2.z > -1 ? posq[atoms2.z] : (
real
4) 0);
}
barrier(CLK_LOCAL_MEM_FENCE);
if (acceptorIndex < NUM_ACCEPTORS) {
...
...
@@ -186,10 +186,10 @@ __kernel void computeAcceptorForces(__global float4* restrict forceBuffers, __gl
#
endif
//
Compute
the
interaction
between
a
donor
and
an
acceptor.
float
4
d1
=
posBuffer[3*index]
;
float
4
d2
=
posBuffer[3*index+1]
;
float
4
d3
=
posBuffer[3*index+2]
;
float
4
deltaD1A1
=
deltaPeriodic
(
d1,
a1,
periodicBoxSize,
invPeriodicBoxSize
)
;
real
4
d1
=
posBuffer[3*index]
;
real
4
d2
=
posBuffer[3*index+1]
;
real
4
d3
=
posBuffer[3*index+2]
;
real
4
deltaD1A1
=
deltaPeriodic
(
d1,
a1,
periodicBoxSize,
invPeriodicBoxSize
)
;
#
ifdef
USE_CUTOFF
if
(
deltaD1A1.w
<
CUTOFF_SQUARED
)
{
#
endif
...
...
@@ -207,19 +207,19 @@ __kernel void computeAcceptorForces(__global float4* restrict forceBuffers, __gl
int4
bufferIndices
=
acceptorBufferIndices[acceptorIndex]
;
if
(
atoms.x
>
-1
)
{
unsigned
int
offset
=
atoms.x+bufferIndices.x*PADDED_NUM_ATOMS
;
float
4
force
=
forceBuffers[offset]
;
real
4
force
=
forceBuffers[offset]
;
force.xyz
+=
f1.xyz
;
forceBuffers[offset]
=
force
;
}
if
(
atoms.y
>
-1
)
{
unsigned
int
offset
=
atoms.y+bufferIndices.y*PADDED_NUM_ATOMS
;
float
4
force
=
forceBuffers[offset]
;
real
4
force
=
forceBuffers[offset]
;
force.xyz
+=
f2.xyz
;
forceBuffers[offset]
=
force
;
}
if
(
atoms.z
>
-1
)
{
unsigned
int
offset
=
atoms.z+bufferIndices.z*PADDED_NUM_ATOMS
;
float
4
force
=
forceBuffers[offset]
;
real
4
force
=
forceBuffers[offset]
;
force.xyz
+=
f3.xyz
;
forceBuffers[offset]
=
force
;
}
...
...
platforms/opencl/src/kernels/ewald.cl
View file @
c8dac206
float
2
multof
Float2
(
float2
a,
float
2
b
)
{
return
(
float
2
)
(
a.x*b.x
-
a.y*b.y,
a.x*b.y
+
a.y*b.x
)
;
real
2
multof
Real2
(
real2
a,
real
2
b
)
{
return
(
real
2
)
(
a.x*b.x
-
a.y*b.y,
a.x*b.y
+
a.y*b.x
)
;
}
/**
*
Precompute
the
cosine
and
sine
sums
which
appear
in
each
force
term.
*/
__kernel
void
calculateEwaldCosSinSums
(
__global
float
*
restrict
energyBuffer,
__global
const
float
4*
restrict
posq,
__global
float
2*
restrict
cosSinSum,
float
4
reciprocalPeriodicBoxSize,
float
reciprocalCoefficient
)
{
__kernel
void
calculateEwaldCosSinSums
(
__global
real
*
restrict
energyBuffer,
__global
const
real
4*
restrict
posq,
__global
real
2*
restrict
cosSinSum,
real
4
reciprocalPeriodicBoxSize,
real
reciprocalCoefficient
)
{
const
unsigned
int
ksizex
=
2*KMAX_X-1
;
const
unsigned
int
ksizey
=
2*KMAX_Y-1
;
const
unsigned
int
ksizez
=
2*KMAX_Z-1
;
const
unsigned
int
totalK
=
ksizex*ksizey*ksizez
;
unsigned
int
index
=
get_global_id
(
0
)
;
float
energy
=
0.0f
;
real
energy
=
0.0f
;
while
(
index
<
(
KMAX_Y-1
)
*ksizez+KMAX_Z
)
index
+=
get_global_size
(
0
)
;
while
(
index
<
totalK
)
{
...
...
@@ -23,29 +23,29 @@ __kernel void calculateEwaldCosSinSums(__global float* restrict energyBuffer, __
int
ry
=
remainder/ksizez
;
int
rz
=
remainder
-
ry*ksizez
-
KMAX_Z
+
1
;
ry
+=
-KMAX_Y
+
1
;
float
kx
=
rx*reciprocalPeriodicBoxSize.x
;
float
ky
=
ry*reciprocalPeriodicBoxSize.y
;
float
kz
=
rz*reciprocalPeriodicBoxSize.z
;
real
kx
=
rx*reciprocalPeriodicBoxSize.x
;
real
ky
=
ry*reciprocalPeriodicBoxSize.y
;
real
kz
=
rz*reciprocalPeriodicBoxSize.z
;
//
Compute
the
sum
for
this
wave
vector.
float
2
sum
=
0.0f
;
real
2
sum
=
0.0f
;
for
(
int
atom
=
0
; atom < NUM_ATOMS; atom++) {
float
4
apos
=
posq[atom]
;
float
phase
=
apos.x*kx
;
float
2
structureFactor
=
(
float
2
)
(
cos
(
phase
)
,
sin
(
phase
))
;
real
4
apos
=
posq[atom]
;
real
phase
=
apos.x*kx
;
real
2
structureFactor
=
(
real
2
)
(
cos
(
phase
)
,
sin
(
phase
))
;
phase
=
apos.y*ky
;
structureFactor
=
multof
Float
2
(
structureFactor,
(
float
2
)
(
cos
(
phase
)
,
sin
(
phase
)))
;
structureFactor
=
multof
Real
2
(
structureFactor,
(
real
2
)
(
cos
(
phase
)
,
sin
(
phase
)))
;
phase
=
apos.z*kz
;
structureFactor
=
multof
Float
2
(
structureFactor,
(
float
2
)
(
cos
(
phase
)
,
sin
(
phase
)))
;
structureFactor
=
multof
Real
2
(
structureFactor,
(
real
2
)
(
cos
(
phase
)
,
sin
(
phase
)))
;
sum
+=
apos.w*structureFactor
;
}
cosSinSum[index]
=
sum
;
//
Compute
the
contribution
to
the
energy.
float
k2
=
kx*kx
+
ky*ky
+
kz*kz
;
float
ak
=
EXP
(
k2*EXP_COEFFICIENT
)
/
k2
;
real
k2
=
kx*kx
+
ky*ky
+
kz*kz
;
real
ak
=
EXP
(
k2*EXP_COEFFICIENT
)
/
k2
;
energy
+=
reciprocalCoefficient*ak*
(
sum.x*sum.x
+
sum.y*sum.y
)
;
index
+=
get_global_size
(
0
)
;
}
...
...
@@ -57,36 +57,36 @@ __kernel void calculateEwaldCosSinSums(__global float* restrict energyBuffer, __
*
previous
routine.
*/
__kernel
void
calculateEwaldForces
(
__global
float
4*
restrict
forceBuffers,
__global
const
float
4*
restrict
posq,
__global
const
float
2*
restrict
cosSinSum,
float
4
reciprocalPeriodicBoxSize,
float
reciprocalCoefficient
)
{
__kernel
void
calculateEwaldForces
(
__global
real
4*
restrict
forceBuffers,
__global
const
real
4*
restrict
posq,
__global
const
real
2*
restrict
cosSinSum,
real
4
reciprocalPeriodicBoxSize,
real
reciprocalCoefficient
)
{
unsigned
int
atom
=
get_global_id
(
0
)
;
while
(
atom
<
NUM_ATOMS
)
{
float
4
force
=
forceBuffers[atom]
;
float
4
apos
=
posq[atom]
;
real
4
force
=
forceBuffers[atom]
;
real
4
apos
=
posq[atom]
;
//
Loop
over
all
wave
vectors.
int
lowry
=
0
;
int
lowrz
=
1
;
for
(
int
rx
=
0
; rx < KMAX_X; rx++) {
float
kx
=
rx*reciprocalPeriodicBoxSize.x
;
real
kx
=
rx*reciprocalPeriodicBoxSize.x
;
for
(
int
ry
=
lowry
; ry < KMAX_Y; ry++) {
float
ky
=
ry*reciprocalPeriodicBoxSize.y
;
float
phase
=
apos.x*kx
;
float
2
tab_xy
=
(
float
2
)
(
cos
(
phase
)
,
sin
(
phase
))
;
real
ky
=
ry*reciprocalPeriodicBoxSize.y
;
real
phase
=
apos.x*kx
;
real
2
tab_xy
=
(
real
2
)
(
cos
(
phase
)
,
sin
(
phase
))
;
phase
=
apos.y*ky
;
tab_xy
=
multof
Float
2
(
tab_xy,
(
float
2
)
(
cos
(
phase
)
,
sin
(
phase
)))
;
tab_xy
=
multof
Real
2
(
tab_xy,
(
real
2
)
(
cos
(
phase
)
,
sin
(
phase
)))
;
for
(
int
rz
=
lowrz
; rz < KMAX_Z; rz++) {
float
kz
=
rz*reciprocalPeriodicBoxSize.z
;
real
kz
=
rz*reciprocalPeriodicBoxSize.z
;
//
Compute
the
force
contribution
of
this
wave
vector.
int
index
=
rx*
(
KMAX_Y*2-1
)
*
(
KMAX_Z*2-1
)
+
(
ry+KMAX_Y-1
)
*
(
KMAX_Z*2-1
)
+
(
rz+KMAX_Z-1
)
;
float
k2
=
kx*kx
+
ky*ky
+
kz*kz
;
float
ak
=
EXP
(
k2*EXP_COEFFICIENT
)
/k2
;
real
k2
=
kx*kx
+
ky*ky
+
kz*kz
;
real
ak
=
EXP
(
k2*EXP_COEFFICIENT
)
/k2
;
phase
=
apos.z*kz
;
float
2
structureFactor
=
multof
Float
2
(
tab_xy,
(
float
2
)
(
cos
(
phase
)
,
sin
(
phase
)))
;
float
2
sum
=
cosSinSum[index]
;
float
dEdR
=
2*reciprocalCoefficient*ak*apos.w*
(
sum.x*structureFactor.y
-
sum.y*structureFactor.x
)
;
real
2
structureFactor
=
multof
Real
2
(
tab_xy,
(
real
2
)
(
cos
(
phase
)
,
sin
(
phase
)))
;
real
2
sum
=
cosSinSum[index]
;
real
dEdR
=
2*reciprocalCoefficient*ak*apos.w*
(
sum.x*structureFactor.y
-
sum.y*structureFactor.x
)
;
force.x
+=
dEdR*kx
;
force.y
+=
dEdR*ky
;
force.z
+=
dEdR*kz
;
...
...
platforms/opencl/src/kernels/fft.cl
View file @
c8dac206
float
2
multiplyComplex
(
float
2
c1,
float
2
c2
)
{
return
(
float
2
)
(
c1.x*c2.x-c1.y*c2.y,
c1.x*c2.y+c1.y*c2.x
)
;
real
2
multiplyComplex
(
real
2
c1,
real
2
c2
)
{
return
(
real
2
)
(
c1.x*c2.x-c1.y*c2.y,
c1.x*c2.y+c1.y*c2.x
)
;
}
/**
*
Perform
a
1D
FFT
on
each
row
along
one
axis.
*/
__kernel
void
execFFT
(
__global
const
float
2*
restrict
in,
__global
float
2*
restrict
out,
floa
t
sign,
__local
float
2*
restrict
w,
__local
float
2*
restrict
data0,
__local
float
2*
restrict
data1
)
{
__kernel
void
execFFT
(
__global
const
real
2*
restrict
in,
__global
real
2*
restrict
out,
in
t
sign,
__local
real
2*
restrict
w,
__local
real
2*
restrict
data0,
__local
real
2*
restrict
data1
)
{
for
(
int
i
=
get_local_id
(
0
)
; i < ZSIZE; i += get_local_size(0))
w[i]
=
(
float
2
)
(
cos
(
-sign*i*2*M_PI/ZSIZE
)
,
sin
(
-sign*i*2*M_PI/ZSIZE
))
;
w[i]
=
(
real
2
)
(
cos
(
-sign*i*2*M_PI/ZSIZE
)
,
sin
(
-sign*i*2*M_PI/ZSIZE
))
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
for
(
int
index
=
get_group_id
(
0
)
; index < XSIZE*YSIZE; index += get_num_groups(0)) {
int
x
=
index/YSIZE
;
...
...
platforms/opencl/src/kernels/findInteractingBlocks.cl
View file @
c8dac206
...
...
@@ -8,19 +8,19 @@
/**
*
Find
a
bounding
box
for
the
atoms
in
each
block.
*/
__kernel
void
findBlockBounds
(
int
numAtoms,
float
4
periodicBoxSize,
float
4
invPeriodicBoxSize,
__global
const
float
4*
restrict
posq,
__global
float
4*
restrict
blockCenter,
__global
float
4*
restrict
blockBoundingBox,
__global
unsigned
int*
restrict
interactionCount
)
{
__kernel
void
findBlockBounds
(
int
numAtoms,
real
4
periodicBoxSize,
real
4
invPeriodicBoxSize,
__global
const
real
4*
restrict
posq,
__global
real
4*
restrict
blockCenter,
__global
real
4*
restrict
blockBoundingBox,
__global
unsigned
int*
restrict
interactionCount
)
{
int
index
=
get_global_id
(
0
)
;
int
base
=
index*TILE_SIZE
;
while
(
base
<
numAtoms
)
{
float
4
pos
=
posq[base]
;
real
4
pos
=
posq[base]
;
#
ifdef
USE_PERIODIC
pos.x
-=
floor
(
pos.x*invPeriodicBoxSize.x
)
*periodicBoxSize.x
;
pos.y
-=
floor
(
pos.y*invPeriodicBoxSize.y
)
*periodicBoxSize.y
;
pos.z
-=
floor
(
pos.z*invPeriodicBoxSize.z
)
*periodicBoxSize.z
;
float
4
firstPoint
=
pos
;
real
4
firstPoint
=
pos
;
#
endif
float
4
minPos
=
pos
;
float
4
maxPos
=
pos
;
real
4
minPos
=
pos
;
real
4
maxPos
=
pos
;
int
last
=
min
(
base+TILE_SIZE,
numAtoms
)
;
for
(
int
i
=
base+1
; i < last; i++) {
pos
=
posq[i]
;
...
...
@@ -46,8 +46,8 @@ __kernel void findBlockBounds(int numAtoms, float4 periodicBoxSize, float4 invPe
*
to
global
memory.
*/
void
storeInteractionData
(
__local
ushort2*
buffer,
__local
int*
valid,
__local
short*
sum,
__local
ushort2*
temp,
__local
int*
baseIndex,
__global
unsigned
int*
interactionCount,
__global
ushort2*
interactingTiles,
float
cutoffSquared,
float
4
periodicBoxSize,
float
4
invPeriodicBoxSize,
__global
const
float
4*
posq,
__global
const
float
4*
blockCenter,
__global
const
float
4*
blockBoundingBox,
unsigned
int
maxTiles
)
{
__global
unsigned
int*
interactionCount,
__global
ushort2*
interactingTiles,
real
cutoffSquared,
real
4
periodicBoxSize,
real
4
invPeriodicBoxSize,
__global
const
real
4*
posq,
__global
const
real
4*
blockCenter,
__global
const
real
4*
blockBoundingBox,
unsigned
int
maxTiles
)
{
//
The
buffer
is
full,
so
we
need
to
compact
it
and
write
out
results.
Start
by
doing
a
parallel
prefix
sum.
for
(
int
i
=
get_local_id
(
0
)
; i < BUFFER_SIZE; i += GROUP_SIZE)
...
...
@@ -91,7 +91,7 @@ void storeInteractionData(__local ushort2* buffer, __local int* valid, __local s
int
index
=
get_local_id
(
0
)
&
(
TILE_SIZE-1
)
;
int
group
=
get_local_id
(
0
)
/TILE_SIZE
;
float
4
center,
boxSize,
pos
;
real
4
center,
boxSize,
pos
;
for
(
int
tile
=
0
; tile < numValid; tile++) {
int
x
=
temp[tile].x
;
int
y
=
temp[tile].y
;
...
...
@@ -103,12 +103,12 @@ void storeInteractionData(__local ushort2* buffer, __local int* valid, __local s
#
ifdef
MAC_AMD_WORKAROUND
int
box
=
(
group
==
0
?
x
:
y
)
;
int
atom
=
(
group
==
0
?
y
:
x
)
*TILE_SIZE+index
;
__global
float
*
bc
=
(
__global
float
*
)
blockCenter
;
__global
float
*
bb
=
(
__global
float
*
)
blockBoundingBox
;
__global
float
*
ps
=
(
__global
float
*
)
posq
;
center
=
(
float
4
)
(
bc[4*box],
bc[4*box+1],
bc[4*box+2],
0
.0f
)
;
boxSize
=
(
float
4
)
(
bb[4*box],
bb[4*box+1],
bb[4*box+2],
0
.0f
)
;
pos
=
(
float
4
)
(
ps[4*atom],
ps[4*atom+1],
ps[4*atom+2],
0
.0f
)
;
__global
real
*
bc
=
(
__global
real
*
)
blockCenter
;
__global
real
*
bb
=
(
__global
real
*
)
blockBoundingBox
;
__global
real
*
ps
=
(
__global
real
*
)
posq
;
center
=
(
real
4
)
(
bc[4*box],
bc[4*box+1],
bc[4*box+2],
0
)
;
boxSize
=
(
real
4
)
(
bb[4*box],
bb[4*box+1],
bb[4*box+2],
0
)
;
pos
=
(
real
4
)
(
ps[4*atom],
ps[4*atom+1],
ps[4*atom+2],
0
)
;
#
else
center
=
blockCenter[
(
group
==
0
?
x
:
y
)
]
;
boxSize
=
blockBoundingBox[
(
group
==
0
?
x
:
y
)
]
;
...
...
@@ -117,13 +117,13 @@ void storeInteractionData(__local ushort2* buffer, __local int* valid, __local s
//
Find
the
distance
of
the
atom
from
the
bounding
box.
float
4
delta
=
pos-center
;
real
4
delta
=
pos-center
;
#
ifdef
USE_PERIODIC
delta.x
-=
floor
(
delta.x*invPeriodicBoxSize.x+0.5f
)
*periodicBoxSize.x
;
delta.y
-=
floor
(
delta.y*invPeriodicBoxSize.y+0.5f
)
*periodicBoxSize.y
;
delta.z
-=
floor
(
delta.z*invPeriodicBoxSize.z+0.5f
)
*periodicBoxSize.z
;
#
endif
delta
=
max
((
float
4
)
0
.0f
,
fabs
(
delta
)
-boxSize
)
;
delta
=
max
((
real
4
)
0
,
fabs
(
delta
)
-boxSize
)
;
__local
ushort*
flag
=
(
__local
ushort*
)
&buffer[tile]
;
if
(
delta.x*delta.x+delta.y*delta.y+delta.z*delta.z
<
cutoffSquared
)
flag[group]
=
false
;
...
...
@@ -155,9 +155,9 @@ void storeInteractionData(__local ushort2* buffer, __local int* valid, __local s
* Compare the bounding boxes for each pair of blocks. If they are sufficiently far apart,
* mark them as non-interacting.
*/
__kernel void findBlocksWithInteractions(
float
cutoffSquared,
float
4 periodicBoxSize,
float
4 invPeriodicBoxSize, __global const
float
4* restrict blockCenter,
__global const
float
4* restrict blockBoundingBox, __global unsigned int* restrict interactionCount, __global ushort2* restrict interactingTiles,
__global unsigned int* restrict interactionFlags, __global const
float
4* restrict posq, unsigned int maxTiles, unsigned int startTileIndex,
__kernel void findBlocksWithInteractions(
real
cutoffSquared,
real
4 periodicBoxSize,
real
4 invPeriodicBoxSize, __global const
real
4* restrict blockCenter,
__global const
real
4* restrict blockBoundingBox, __global unsigned int* restrict interactionCount, __global ushort2* restrict interactingTiles,
__global unsigned int* restrict interactionFlags, __global const
real
4* restrict posq, unsigned int maxTiles, unsigned int startTileIndex,
unsigned int endTileIndex) {
__local ushort2 buffer[BUFFER_SIZE];
__local int valid[BUFFER_SIZE];
...
...
@@ -194,26 +194,26 @@ __kernel void findBlocksWithInteractions(float cutoffSquared, float4 periodicBox
//
Find
the
distance
between
the
bounding
boxes
of
the
two
cells.
#
ifdef
MAC_AMD_WORKAROUND
__global
float
*
bc
=
(
__global
float
*
)
blockCenter
;
__global
float
*
bb
=
(
__global
float
*
)
blockBoundingBox
;
float
4
bcx
=
(
float
4
)
(
bc[4*x],
bc[4*x+1],
bc[4*x+2],
0
.0f
)
;
float
4
bcy
=
(
float
4
)
(
bc[4*y],
bc[4*y+1],
bc[4*y+2],
0
.0f
)
;
float
4
delta
=
bcx-bcy
;
float
4
boxSizea
=
(
float
4
)
(
bb[4*x],
bb[4*x+1],
bb[4*x+2],
0
.0f
)
;
float
4
boxSizeb
=
(
float
4
)
(
bb[4*y],
bb[4*y+1],
bb[4*y+2],
0
.0f
)
;
__global
real
*
bc
=
(
__global
real
*
)
blockCenter
;
__global
real
*
bb
=
(
__global
real
*
)
blockBoundingBox
;
real
4
bcx
=
(
real
4
)
(
bc[4*x],
bc[4*x+1],
bc[4*x+2],
0
)
;
real
4
bcy
=
(
real
4
)
(
bc[4*y],
bc[4*y+1],
bc[4*y+2],
0
)
;
real
4
delta
=
bcx-bcy
;
real
4
boxSizea
=
(
real
4
)
(
bb[4*x],
bb[4*x+1],
bb[4*x+2],
0
)
;
real
4
boxSizeb
=
(
real
4
)
(
bb[4*y],
bb[4*y+1],
bb[4*y+2],
0
)
;
#
else
float
4
delta
=
blockCenter[x]-blockCenter[y]
;
float
4
boxSizea
=
blockBoundingBox[x]
;
float
4
boxSizeb
=
blockBoundingBox[y]
;
real
4
delta
=
blockCenter[x]-blockCenter[y]
;
real
4
boxSizea
=
blockBoundingBox[x]
;
real
4
boxSizeb
=
blockBoundingBox[y]
;
#
endif
#
ifdef
USE_PERIODIC
delta.x
-=
floor
(
delta.x*invPeriodicBoxSize.x+0.5f
)
*periodicBoxSize.x
;
delta.y
-=
floor
(
delta.y*invPeriodicBoxSize.y+0.5f
)
*periodicBoxSize.y
;
delta.z
-=
floor
(
delta.z*invPeriodicBoxSize.z+0.5f
)
*periodicBoxSize.z
;
#
endif
delta.x
=
max
(
0.0f
,
fabs
(
delta.x
)
-boxSizea.x-boxSizeb.x
)
;
delta.y
=
max
(
0.0f
,
fabs
(
delta.y
)
-boxSizea.y-boxSizeb.y
)
;
delta.z
=
max
(
0.0f
,
fabs
(
delta.z
)
-boxSizea.z-boxSizeb.z
)
;
delta.x
=
max
(
(
real
)
0
,
fabs
(
delta.x
)
-boxSizea.x-boxSizeb.x
)
;
delta.y
=
max
(
(
real
)
0
,
fabs
(
delta.y
)
-boxSizea.y-boxSizeb.y
)
;
delta.z
=
max
(
(
real
)
0
,
fabs
(
delta.z
)
-boxSizea.z-boxSizeb.z
)
;
if
(
delta.x*delta.x+delta.y*delta.y+delta.z*delta.z
<
cutoffSquared
)
{
//
Add
this
tile
to
the
buffer.
...
...
@@ -241,8 +241,8 @@ __kernel void findBlocksWithInteractions(float cutoffSquared, float4 periodicBox
*
Compare
each
atom
in
one
block
to
the
bounding
box
of
another
block,
and
set
*
flags
for
which
ones
are
interacting.
*/
__kernel
void
findInteractionsWithinBlocks
(
float
cutoffSquared,
float
4
periodicBoxSize,
float
4
invPeriodicBoxSize,
__global
const
float
4*
restrict
posq,
__global
const
ushort2*
restrict
tiles,
__global
const
float
4*
restrict
blockCenter,
__global
const
float
4*
restrict
blockBoundingBox,
__global
unsigned
int*
restrict
interactionFlags,
__global
const
unsigned
int*
restrict
interactionCount,
__local
volatile
unsigned
int*
restrict
flags,
unsigned
int
maxTiles
)
{
__kernel
void
findInteractionsWithinBlocks
(
real
cutoffSquared,
real
4
periodicBoxSize,
real
4
invPeriodicBoxSize,
__global
const
real
4*
restrict
posq,
__global
const
ushort2*
restrict
tiles,
__global
const
real
4*
restrict
blockCenter,
__global
const
real
4*
restrict
blockBoundingBox,
__global
unsigned
int*
restrict
interactionFlags,
__global
const
unsigned
int*
restrict
interactionCount,
__local
volatile
unsigned
int*
restrict
flags,
unsigned
int
maxTiles
)
{
unsigned
int
totalWarps
=
get_global_size
(
0
)
/TILE_SIZE
;
unsigned
int
warp
=
get_global_id
(
0
)
/TILE_SIZE
;
unsigned
int
numTiles
=
interactionCount[0]
;
...
...
@@ -253,7 +253,7 @@ __kernel void findInteractionsWithinBlocks(float cutoffSquared, float4 periodicB
if
(
numTiles
>
maxTiles
)
return
;
unsigned
int
lasty
=
0xFFFFFFFF
;
float
4
apos
;
real
4
apos
;
while
(
pos
<
end
)
{
//
Extract
the
coordinates
of
this
tile
ushort2
tileIndices
=
tiles[pos]
;
...
...
@@ -266,20 +266,20 @@ __kernel void findInteractionsWithinBlocks(float cutoffSquared, float4 periodicB
else
{
//
Load
the
bounding
box
for
x
and
the
atom
positions
for
y.
float
4
center
=
blockCenter[x]
;
float
4
boxSize
=
blockBoundingBox[x]
;
real
4
center
=
blockCenter[x]
;
real
4
boxSize
=
blockBoundingBox[x]
;
if
(
y
!=
lasty
)
apos
=
posq[y*TILE_SIZE+index]
;
//
Find
the
distance
of
the
atom
from
the
bounding
box.
float
4
delta
=
apos-center
;
real
4
delta
=
apos-center
;
#
ifdef
USE_PERIODIC
delta.x
-=
floor
(
delta.x*invPeriodicBoxSize.x+0.5f
)
*periodicBoxSize.x
;
delta.y
-=
floor
(
delta.y*invPeriodicBoxSize.y+0.5f
)
*periodicBoxSize.y
;
delta.z
-=
floor
(
delta.z*invPeriodicBoxSize.z+0.5f
)
*periodicBoxSize.z
;
#
endif
delta
=
max
((
float
4
)
0
.0f
,
fabs
(
delta
)
-boxSize
)
;
delta
=
max
((
real
4
)
0
,
fabs
(
delta
)
-boxSize
)
;
int
thread
=
get_local_id
(
0
)
;
flags[thread]
=
(
delta.x*delta.x+delta.y*delta.y+delta.z*delta.z
>
cutoffSquared
?
0
:
1
<<
index
)
;
...
...
platforms/opencl/src/kernels/findInteractingBlocks_cpu.cl
View file @
c8dac206
...
...
@@ -8,19 +8,19 @@
/**
*
Find
a
bounding
box
for
the
atoms
in
each
block.
*/
__kernel
void
findBlockBounds
(
int
numAtoms,
float
4
periodicBoxSize,
float
4
invPeriodicBoxSize,
__global
const
float
4*
restrict
posq,
__global
float
4*
restrict
blockCenter,
__global
float
4*
restrict
blockBoundingBox,
__global
unsigned
int*
restrict
interactionCount
)
{
__kernel
void
findBlockBounds
(
int
numAtoms,
real
4
periodicBoxSize,
real
4
invPeriodicBoxSize,
__global
const
real
4*
restrict
posq,
__global
real
4*
restrict
blockCenter,
__global
real
4*
restrict
blockBoundingBox,
__global
unsigned
int*
restrict
interactionCount
)
{
int
index
=
get_global_id
(
0
)
;
int
base
=
index*TILE_SIZE
;
while
(
base
<
numAtoms
)
{
float
4
pos
=
posq[base]
;
real
4
pos
=
posq[base]
;
#
ifdef
USE_PERIODIC
pos.x
-=
floor
(
pos.x*invPeriodicBoxSize.x
)
*periodicBoxSize.x
;
pos.y
-=
floor
(
pos.y*invPeriodicBoxSize.y
)
*periodicBoxSize.y
;
pos.z
-=
floor
(
pos.z*invPeriodicBoxSize.z
)
*periodicBoxSize.z
;
float
4
firstPoint
=
pos
;
real
4
firstPoint
=
pos
;
#
endif
float
4
minPos
=
pos
;
float
4
maxPos
=
pos
;
real
4
minPos
=
pos
;
real
4
maxPos
=
pos
;
int
last
=
min
(
base+TILE_SIZE,
numAtoms
)
;
for
(
int
i
=
base+1
; i < last; i++) {
pos
=
posq[i]
;
...
...
@@ -46,14 +46,14 @@ __kernel void findBlockBounds(int numAtoms, float4 periodicBoxSize, float4 invPe
*
to
global
memory.
*/
void
storeInteractionData
(
ushort2*
buffer,
int
numValid,
__global
unsigned
int*
interactionCount,
__global
ushort2*
interactingTiles,
__global
unsigned
int*
interactionFlags,
float
cutoffSquared,
float
4
periodicBoxSize,
float
4
invPeriodicBoxSize,
__global
float
4*
posq,
__global
float
4*
blockCenter,
__global
float
4*
blockBoundingBox,
unsigned
int
maxTiles
)
{
__global
unsigned
int*
interactionFlags,
real
cutoffSquared,
real
4
periodicBoxSize,
real
4
invPeriodicBoxSize,
__global
real
4*
posq,
__global
real
4*
blockCenter,
__global
real
4*
blockBoundingBox,
unsigned
int
maxTiles
)
{
//
Filter
the
list
of
tiles
by
comparing
the
distance
from
each
atom
to
the
other
bounding
box.
unsigned
int
flagsBuffer[2*BUFFER_SIZE]
;
float
4
atomPositions[TILE_SIZE]
;
real
4
atomPositions[TILE_SIZE]
;
int
lasty
=
-1
;
float
4
centery,
boxSizey
;
real
4
centery,
boxSizey
;
for
(
int
tile
=
0
; tile < numValid; ) {
int
x
=
buffer[tile].x
;
int
y
=
buffer[tile].y
;
...
...
@@ -64,8 +64,8 @@ void storeInteractionData(ushort2* buffer, int numValid, __global unsigned int*
//
Load
the
atom
positions
and
bounding
boxes.
float
4
centerx
=
blockCenter[x]
;
float
4
boxSizex
=
blockBoundingBox[x]
;
real
4
centerx
=
blockCenter[x]
;
real
4
boxSizex
=
blockBoundingBox[x]
;
if
(
y
!=
lasty
)
{
for
(
int
atom
=
0
; atom < TILE_SIZE; atom++)
atomPositions[atom]
=
posq[y*TILE_SIZE+atom]
;
...
...
@@ -78,18 +78,18 @@ void storeInteractionData(ushort2* buffer, int numValid, __global unsigned int*
unsigned
int
flags1
=
0
,
flags2
=
0
;
for
(
int
atom
=
0
; atom < TILE_SIZE; atom++) {
float
4
delta
=
atomPositions[atom]-centerx
;
real
4
delta
=
atomPositions[atom]-centerx
;
#
ifdef
USE_PERIODIC
delta.xyz
-=
floor
(
delta.xyz*invPeriodicBoxSize.xyz+0.5f
)
*periodicBoxSize.xyz
;
#
endif
delta
=
max
((
float
4
)
0
.0f
,
fabs
(
delta
)
-boxSizex
)
;
delta
=
max
((
real
4
)
0
,
fabs
(
delta
)
-boxSizex
)
;
if
(
dot
(
delta.xyz,
delta.xyz
)
<
cutoffSquared
)
flags1
+=
1
<<
atom
;
delta
=
posq[x*TILE_SIZE+atom]-centery
;
#
ifdef
USE_PERIODIC
delta.xyz
-=
floor
(
delta.xyz*invPeriodicBoxSize.xyz+0.5f
)
*periodicBoxSize.xyz
;
#
endif
delta
=
max
((
float
4
)
0
.0f
,
fabs
(
delta
)
-boxSizey
)
;
delta
=
max
((
real
4
)
0
,
fabs
(
delta
)
-boxSizey
)
;
if
(
dot
(
delta.xyz,
delta.xyz
)
<
cutoffSquared
)
flags2
+=
1
<<
atom
;
}
...
...
@@ -121,9 +121,9 @@ void storeInteractionData(ushort2* buffer, int numValid, __global unsigned int*
* Compare the bounding boxes for each pair of blocks. If they are sufficiently far apart,
* mark them as non-interacting.
*/
__kernel void findBlocksWithInteractions(
float
cutoffSquared,
float
4 periodicBoxSize,
float
4 invPeriodicBoxSize, __global const
float
4* restrict blockCenter,
__global const
float
4* restrict blockBoundingBox, __global unsigned int* restrict interactionCount, __global ushort2* restrict interactingTiles,
__global unsigned int* restrict interactionFlags, __global const
float
4* restrict posq, unsigned int maxTiles, unsigned int startTileIndex,
__kernel void findBlocksWithInteractions(
real
cutoffSquared,
real
4 periodicBoxSize,
real
4 invPeriodicBoxSize, __global const
real
4* restrict blockCenter,
__global const
real
4* restrict blockBoundingBox, __global unsigned int* restrict interactionCount, __global ushort2* restrict interactingTiles,
__global unsigned int* restrict interactionFlags, __global const
real
4* restrict posq, unsigned int maxTiles, unsigned int startTileIndex,
unsigned int endTileIndex) {
ushort2 buffer[BUFFER_SIZE];
int valuesInBuffer = 0;
...
...
@@ -142,17 +142,17 @@ __kernel void findBlocksWithInteractions(float cutoffSquared, float4 periodicBox
//
Find
the
distance
between
the
bounding
boxes
of
the
two
cells.
float
4
delta
=
blockCenter[x]-blockCenter[y]
;
real
4
delta
=
blockCenter[x]-blockCenter[y]
;
#
ifdef
USE_PERIODIC
delta.x
-=
floor
(
delta.x*invPeriodicBoxSize.x+0.5f
)
*periodicBoxSize.x
;
delta.y
-=
floor
(
delta.y*invPeriodicBoxSize.y+0.5f
)
*periodicBoxSize.y
;
delta.z
-=
floor
(
delta.z*invPeriodicBoxSize.z+0.5f
)
*periodicBoxSize.z
;
#
endif
float
4
boxSizea
=
blockBoundingBox[x]
;
float
4
boxSizeb
=
blockBoundingBox[y]
;
delta.x
=
max
(
0.0f
,
fabs
(
delta.x
)
-boxSizea.x-boxSizeb.x
)
;
delta.y
=
max
(
0.0f
,
fabs
(
delta.y
)
-boxSizea.y-boxSizeb.y
)
;
delta.z
=
max
(
0.0f
,
fabs
(
delta.z
)
-boxSizea.z-boxSizeb.z
)
;
real
4
boxSizea
=
blockBoundingBox[x]
;
real
4
boxSizeb
=
blockBoundingBox[y]
;
delta.x
=
max
(
(
real
)
0
,
fabs
(
delta.x
)
-boxSizea.x-boxSizeb.x
)
;
delta.y
=
max
(
(
real
)
0
,
fabs
(
delta.y
)
-boxSizea.y-boxSizeb.y
)
;
delta.z
=
max
(
(
real
)
0
,
fabs
(
delta.z
)
-boxSizea.z-boxSizeb.z
)
;
if
(
delta.x*delta.x+delta.y*delta.y+delta.z*delta.z
<
cutoffSquared
)
{
//
Add
this
tile
to
the
buffer.
...
...
platforms/opencl/src/kernels/harmonicAngleForce.cl
View file @
c8dac206
float2
angleParams
=
PARAMS[index]
;
float
deltaIdeal
=
theta-angleParams.x
;
real
deltaIdeal
=
theta-angleParams.x
;
energy
+=
0.5f*angleParams.y*deltaIdeal*deltaIdeal
;
float
dEdAngle
=
angleParams.y*deltaIdeal
;
real
dEdAngle
=
angleParams.y*deltaIdeal
;
platforms/opencl/src/kernels/harmonicBondForce.cl
View file @
c8dac206
float2
bondParams
=
PARAMS[index]
;
float
deltaIdeal
=
r-bondParams.x
;
real
deltaIdeal
=
r-bondParams.x
;
energy
+=
0.5f
*
bondParams.y*deltaIdeal*deltaIdeal
;
float
dEdR
=
bondParams.y
*
deltaIdeal
;
real
dEdR
=
bondParams.y
*
deltaIdeal
;
platforms/opencl/src/kernels/nonbondedExceptions.cl
View file @
c8dac206
float4
exceptionParams
=
PARAMS[index]
;
float
4
delta
=
pos2-pos1
;
float
r2
=
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
;
float
invR
=
RSQRT
(
r2
)
;
float
sig2
=
invR*exceptionParams.y
;
real
4
delta
=
pos2-pos1
;
real
r2
=
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
;
real
invR
=
RSQRT
(
r2
)
;
real
sig2
=
invR*exceptionParams.y
;
sig2
*=
sig2
;
float
sig6
=
sig2*sig2*sig2
;
float
dEdR
=
exceptionParams.z*
(
12.0f*sig6-6.0f
)
*sig6
;
float
tempEnergy
=
exceptionParams.z*
(
sig6-1.0f
)
*sig6
;
real
sig6
=
sig2*sig2*sig2
;
real
dEdR
=
exceptionParams.z*
(
12.0f*sig6-6.0f
)
*sig6
;
real
tempEnergy
=
exceptionParams.z*
(
sig6-1.0f
)
*sig6
;
dEdR
+=
exceptionParams.x*invR
;
dEdR
*=
invR*invR
;
tempEnergy
+=
exceptionParams.x*invR
;
energy
+=
tempEnergy
;
delta.xyz
*=
dEdR
;
float
4
force1
=
-delta
;
float
4
force2
=
delta
;
real
4
force1
=
-delta
;
real
4
force2
=
delta
;
platforms/opencl/src/kernels/nonbonded_cpu.cl
View file @
c8dac206
#
define
TILE_SIZE
32
typedef
struct
{
float
x,
y,
z
;
float
q
;
float
fx,
fy,
fz
;
real
x,
y,
z
;
real
q
;
real
fx,
fy,
fz
;
ATOM_PARAMETER_DATA
}
AtomData
;
...
...
@@ -11,11 +11,11 @@ typedef struct {
*
Compute
nonbonded
interactions.
*/
__kernel
void
computeNonbonded
(
__global
float
4*
restrict
forceBuffers,
__global
float
*
restrict
energyBuffer,
__global
const
float
4*
restrict
posq,
__global
const
unsigned
int*
restrict
exclusions,
__kernel
void
computeNonbonded
(
__global
real
4*
restrict
forceBuffers,
__global
real
*
restrict
energyBuffer,
__global
const
real
4*
restrict
posq,
__global
const
unsigned
int*
restrict
exclusions,
__global
const
unsigned
int*
restrict
exclusionIndices,
__global
const
unsigned
int*
restrict
exclusionRowIndices,
unsigned
int
startTileIndex,
unsigned
int
endTileIndex,
#
ifdef
USE_CUTOFF
__global
const
ushort2*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
float
4
periodicBoxSize,
float
4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
const
unsigned
int*
restrict
interactionFlags
__global
const
ushort2*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
real
4
periodicBoxSize,
real
4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
const
unsigned
int*
restrict
interactionFlags
#
else
unsigned
int
numTiles
#
endif
...
...
@@ -28,7 +28,7 @@ __kernel void computeNonbonded(__global float4* restrict forceBuffers, __global
unsigned
int
pos
=
startTileIndex+get_group_id
(
0
)
*numTiles/get_num_groups
(
0
)
;
unsigned
int
end
=
startTileIndex+
(
get_group_id
(
0
)
+1
)
*numTiles/get_num_groups
(
0
)
;
#
endif
float
energy
=
0
.0f
;
real
energy
=
0
;
unsigned
int
lasty
=
0xFFFFFFFF
;
__local
AtomData
localData[TILE_SIZE]
;
...
...
@@ -71,7 +71,7 @@ __kernel void computeNonbonded(__global float4* restrict forceBuffers, __global
if (lasty != y) {
for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
unsigned int j = y*TILE_SIZE + localAtomIndex;
float
4 tempPosq = posq[j];
real
4 tempPosq = posq[j];
localData[localAtomIndex].x = tempPosq.x;
localData[localAtomIndex].y = tempPosq.y;
localData[localAtomIndex].z = tempPosq.z;
...
...
@@ -87,34 +87,34 @@ __kernel void computeNonbonded(__global float4* restrict forceBuffers, __global
unsigned int excl = exclusions[exclusionIndex+tgx];
#endif
unsigned int atom1 = x*TILE_SIZE+tgx;
float
4 force = 0
.0f
;
float
4 posq1 = posq[atom1];
real
4 force = 0;
real
4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
for (unsigned int j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
#endif
float
4 posq2 = (
float
4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
float
4 delta = (
float
4) (posq2.xyz - posq1.xyz, 0
.0f
);
real
4 posq2 = (
real
4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
real
4 delta = (
real
4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif
float
r2 = dot(delta.xyz, delta.xyz);
real
r2 = dot(delta.xyz, delta.xyz);
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
float
invR = RSQRT(r2);
float
r = RECIP(invR);
real
invR = RSQRT(r2);
real
r = RECIP(invR);
unsigned int atom2 = j;
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j;
#ifdef USE_SYMMETRIC
float
dEdR = 0
.0f
;
real
dEdR = 0;
#else
float
4 dEdR1 = (
float
4) 0
.0f
;
float
4 dEdR2 = (
float
4) 0
.0f
;
real
4 dEdR1 = (
real
4) 0;
real
4 dEdR2 = (
real
4) 0;
#endif
float
tempEnergy = 0
.0f
;
real
tempEnergy = 0;
COMPUTE_INTERACTION
energy += 0.5f*tempEnergy;
#ifdef USE_SYMMETRIC
...
...
@@ -138,9 +138,9 @@ __kernel void computeNonbonded(__global float4* restrict forceBuffers, __global
// This is an off-diagonal tile.
for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
localData[tgx].fx = 0
.0f
;
localData[tgx].fy = 0
.0f
;
localData[tgx].fz = 0
.0f
;
localData[tgx].fx = 0;
localData[tgx].fy = 0;
localData[tgx].fz = 0;
}
#ifdef USE_CUTOFF
unsigned int flags1 = (numTiles <= maxTiles ? interactionFlags[2*pos] : 0xFFFFFFFF);
...
...
@@ -151,31 +151,31 @@ __kernel void computeNonbonded(__global float4* restrict forceBuffers, __global
for
(
unsigned
int
tgx
=
0
; tgx < TILE_SIZE; tgx++) {
if
((
flags2&
(
1<<tgx
))
!=
0
)
{
unsigned
int
atom1
=
x*TILE_SIZE+tgx
;
float
4
force
=
0
.0f
;
float
4
posq1
=
posq[atom1]
;
real
4
force
=
0
;
real
4
posq1
=
posq[atom1]
;
LOAD_ATOM1_PARAMETERS
for
(
unsigned
int
j
=
0
; j < TILE_SIZE; j++) {
if
((
flags1&
(
1<<j
))
!=
0
)
{
bool
isExcluded
=
false
;
float
4
posq2
=
(
float
4
)
(
localData[j].x,
localData[j].y,
localData[j].z,
localData[j].q
)
;
float
4
delta
=
(
float
4
)
(
posq2.xyz
-
posq1.xyz,
0
.0f
)
;
real
4
posq2
=
(
real
4
)
(
localData[j].x,
localData[j].y,
localData[j].z,
localData[j].q
)
;
real
4
delta
=
(
real
4
)
(
posq2.xyz
-
posq1.xyz,
0
)
;
#
ifdef
USE_PERIODIC
delta.xyz
-=
floor
(
delta.xyz*invPeriodicBoxSize.xyz+0.5f
)
*periodicBoxSize.xyz
;
#
endif
float
r2
=
dot
(
delta.xyz,
delta.xyz
)
;
real
r2
=
dot
(
delta.xyz,
delta.xyz
)
;
if
(
r2
<
CUTOFF_SQUARED
)
{
float
invR
=
RSQRT
(
r2
)
;
float
r
=
RECIP
(
invR
)
;
real
invR
=
RSQRT
(
r2
)
;
real
r
=
RECIP
(
invR
)
;
unsigned
int
atom2
=
j
;
LOAD_ATOM2_PARAMETERS
atom2
=
y*TILE_SIZE+j
;
#
ifdef
USE_SYMMETRIC
float
dEdR
=
0
.0f
;
real
dEdR
=
0
;
#
else
float
4
dEdR1
=
(
float
4
)
0
.0f
;
float
4
dEdR2
=
(
float
4
)
0
.0f
;
real
4
dEdR1
=
(
real
4
)
0
;
real
4
dEdR2
=
(
real
4
)
0
;
#
endif
float
tempEnergy
=
0
.0f
;
real
tempEnergy
=
0
;
COMPUTE_INTERACTION
energy
+=
tempEnergy
;
#
ifdef
USE_SYMMETRIC
...
...
@@ -208,8 +208,8 @@ __kernel void computeNonbonded(__global float4* restrict forceBuffers, __global
for
(
unsigned
int
tgx
=
0
; tgx < TILE_SIZE; tgx++) {
unsigned
int
atom1
=
x*TILE_SIZE+tgx
;
float
4
force
=
0
.0f
;
float
4
posq1
=
posq[atom1]
;
real
4
force
=
0
;
real
4
posq1
=
posq[atom1]
;
LOAD_ATOM1_PARAMETERS
#
ifdef
USE_EXCLUSIONS
unsigned
int
excl
=
(
hasExclusions
?
exclusions[exclusionIndex+tgx]
:
0xFFFFFFFF
)
;
...
...
@@ -218,27 +218,27 @@ __kernel void computeNonbonded(__global float4* restrict forceBuffers, __global
#
ifdef
USE_EXCLUSIONS
bool
isExcluded
=
!
(
excl
&
0x1
)
;
#
endif
float
4
posq2
=
(
float
4
)
(
localData[j].x,
localData[j].y,
localData[j].z,
localData[j].q
)
;
float
4
delta
=
(
float
4
)
(
posq2.xyz
-
posq1.xyz,
0
.0f
)
;
real
4
posq2
=
(
real
4
)
(
localData[j].x,
localData[j].y,
localData[j].z,
localData[j].q
)
;
real
4
delta
=
(
real
4
)
(
posq2.xyz
-
posq1.xyz,
0
)
;
#
ifdef
USE_PERIODIC
delta.xyz
-=
floor
(
delta.xyz*invPeriodicBoxSize.xyz+0.5f
)
*periodicBoxSize.xyz
;
#
endif
float
r2
=
dot
(
delta.xyz,
delta.xyz
)
;
real
r2
=
dot
(
delta.xyz,
delta.xyz
)
;
#
ifdef
USE_CUTOFF
if
(
r2
<
CUTOFF_SQUARED
)
{
#
endif
float
invR
=
RSQRT
(
r2
)
;
float
r
=
RECIP
(
invR
)
;
real
invR
=
RSQRT
(
r2
)
;
real
r
=
RECIP
(
invR
)
;
unsigned
int
atom2
=
j
;
LOAD_ATOM2_PARAMETERS
atom2
=
y*TILE_SIZE+j
;
#
ifdef
USE_SYMMETRIC
float
dEdR
=
0
.0f
;
real
dEdR
=
0
;
#
else
float
4
dEdR1
=
(
float
4
)
0
.0f
;
float
4
dEdR2
=
(
float
4
)
0
.0f
;
real
4
dEdR1
=
(
real
4
)
0
;
real
4
dEdR2
=
(
real
4
)
0
;
#
endif
float
tempEnergy
=
0
.0f
;
real
tempEnergy
=
0
;
COMPUTE_INTERACTION
energy
+=
tempEnergy
;
#
ifdef
USE_SYMMETRIC
...
...
@@ -272,7 +272,7 @@ __kernel void computeNonbonded(__global float4* restrict forceBuffers, __global
for
(
int
tgx
=
0
; tgx < TILE_SIZE; tgx++) {
unsigned
int
offset
=
y*TILE_SIZE+tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
float
4
f
=
forceBuffers[offset]
;
real
4
f
=
forceBuffers[offset]
;
f.x
+=
localData[tgx].fx
;
f.y
+=
localData[tgx].fy
;
f.z
+=
localData[tgx].fz
;
...
...
platforms/opencl/src/kernels/nonbonded_default.cl
View file @
c8dac206
...
...
@@ -10,16 +10,16 @@
//
aligned
which
wastes
space
and
causes
LDS
bank
conflicts
as
stride
is
no
//
longer
odd
DWORDS.
typedef
struct
{
float
x,
y,
z
;
}
Unaligned
Float
3
;
real
x,
y,
z
;
}
Unaligned
Real
3
;
typedef
struct
{
float
x,
y,
z
;
float
q
;
float
fx,
fy,
fz
;
real
x,
y,
z
;
real
q
;
real
fx,
fy,
fz
;
ATOM_PARAMETER_DATA
#
ifndef
PARAMETER_SIZE_IS_EVEN
float
padding
;
real
padding
;
#
endif
}
AtomData
;
...
...
@@ -32,13 +32,13 @@ void computeNonbonded(
#
ifdef
SUPPORTS_64_BIT_ATOMICS
__global
long*
restrict
forceBuffers,
#
else
__global
float
4*
restrict
forceBuffers,
__global
real
4*
restrict
forceBuffers,
#
endif
__global
float
*
restrict
energyBuffer,
__global
const
float
4*
restrict
posq,
__global
const
unsigned
int*
restrict
exclusions,
__global
real
*
restrict
energyBuffer,
__global
const
real
4*
restrict
posq,
__global
const
unsigned
int*
restrict
exclusions,
__global
const
unsigned
int*
restrict
exclusionIndices,
__global
const
unsigned
int*
restrict
exclusionRowIndices,
unsigned
int
startTileIndex,
unsigned
int
endTileIndex,
#
ifdef
USE_CUTOFF
__global
const
ushort2*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
float
4
periodicBoxSize,
float
4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
const
unsigned
int*
restrict
interactionFlags
__global
const
ushort2*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
real
4
periodicBoxSize,
real
4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
const
unsigned
int*
restrict
interactionFlags
#
else
unsigned
int
numTiles
#
endif
...
...
@@ -51,10 +51,10 @@ void computeNonbonded(
unsigned
int
pos
=
startTileIndex+get_group_id
(
0
)
*numTiles/get_num_groups
(
0
)
;
unsigned
int
end
=
startTileIndex+
(
get_group_id
(
0
)
+1
)
*numTiles/get_num_groups
(
0
)
;
#
endif
float
energy
=
0
.0f
;
real
energy
=
0
;
unsigned
int
lasty
=
0xFFFFFFFF
;
__local
AtomData
localData[TILE_SIZE]
;
__local
Unaligned
Float
3
localForce[FORCE_WORK_GROUP_SIZE]
;
__local
Unaligned
Real
3
localForce[FORCE_WORK_GROUP_SIZE]
;
#
ifdef
USE_EXCLUSIONS
__local
unsigned
int
exclusionRange[2]
;
__local
int
exclusionIndex[1]
;
...
...
@@ -83,8 +83,8 @@ void computeNonbonded(
unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
unsigned int localForceOffset = get_local_id(0) & ~(TILE_SIZE-1);
unsigned int atom1 = x*TILE_SIZE + tgx;
float
4 force = 0
.0f
;
float
4 posq1 = posq[atom1];
real
4 force = 0;
real
4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
// Locate the exclusion data for this tile.
...
...
@@ -121,25 +121,25 @@ void computeNonbonded(
bool isExcluded = !(excl & 0x1);
#endif
unsigned int atom2 = baseLocalAtom+j;
float
4 posq2 = (
float
4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
float
4 delta = (
float
4) (posq2.xyz - posq1.xyz, 0
.0f
);
real
4 posq2 = (
real
4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
real
4 delta = (
real
4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
float
r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float
invR = RSQRT(r2);
float
r = RECIP(invR);
real
r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real
invR = RSQRT(r2);
real
r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+baseLocalAtom+j;
#ifdef USE_SYMMETRIC
float
dEdR = 0
.0f
;
real
dEdR = 0;
#else
float
4 dEdR1 = (
float
4) 0
.0f
;
float
4 dEdR2 = (
float
4) 0
.0f
;
real
4 dEdR1 = (
real
4) 0;
real
4 dEdR2 = (
real
4) 0;
#endif
float
tempEnergy = 0
.0f
;
real
tempEnergy = 0;
COMPUTE_INTERACTION
energy += 0.5f*tempEnergy;
#ifdef USE_SYMMETRIC
...
...
@@ -173,8 +173,8 @@ void computeNonbonded(
#else
unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
// Cheaper to load/store
float
4 than
float
3.
float
4 sum = forceBuffers[offset];
// Cheaper to load/store
real
4 than
real
3.
real
4 sum = forceBuffers[offset];
sum.xyz += force.xyz;
forceBuffers[offset] = sum;
#endif
...
...
@@ -187,16 +187,16 @@ void computeNonbonded(
if (lasty != y && get_local_id(0) < TILE_SIZE) {
const unsigned int localAtomIndex = tgx;
unsigned int j = y*TILE_SIZE + tgx;
float
4 tempPosq = posq[j];
real
4 tempPosq = posq[j];
localData[localAtomIndex].x = tempPosq.x;
localData[localAtomIndex].y = tempPosq.y;
localData[localAtomIndex].z = tempPosq.z;
localData[localAtomIndex].q = tempPosq.w;
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
localForce[get_local_id(0)].x = 0
.0f
;
localForce[get_local_id(0)].y = 0
.0f
;
localForce[get_local_id(0)].z = 0
.0f
;
localForce[get_local_id(0)].x = 0;
localForce[get_local_id(0)].y = 0;
localForce[get_local_id(0)].z = 0;
barrier(CLK_LOCAL_MEM_FENCE);
// Compute the full set of interactions in this tile.
...
...
@@ -210,26 +210,26 @@ void computeNonbonded(
#
ifdef
USE_EXCLUSIONS
bool
isExcluded
=
!
(
excl
&
0x1
)
;
#
endif
float
4
posq2
=
(
float
4
)
(
localData[tj].x,
localData[tj].y,
localData[tj].z,
localData[tj].q
)
;
float
4
delta
=
(
float
4
)
(
posq2.xyz
-
posq1.xyz,
0
.0f
)
;
real
4
posq2
=
(
real
4
)
(
localData[tj].x,
localData[tj].y,
localData[tj].z,
localData[tj].q
)
;
real
4
delta
=
(
real
4
)
(
posq2.xyz
-
posq1.xyz,
0
)
;
#
ifdef
USE_PERIODIC
delta.x
-=
floor
(
delta.x*invPeriodicBoxSize.x+0.5f
)
*periodicBoxSize.x
;
delta.y
-=
floor
(
delta.y*invPeriodicBoxSize.y+0.5f
)
*periodicBoxSize.y
;
delta.z
-=
floor
(
delta.z*invPeriodicBoxSize.z+0.5f
)
*periodicBoxSize.z
;
#
endif
float
r2
=
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
;
float
invR
=
RSQRT
(
r2
)
;
float
r
=
RECIP
(
invR
)
;
real
r2
=
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
;
real
invR
=
RSQRT
(
r2
)
;
real
r
=
RECIP
(
invR
)
;
int
atom2
=
tj
;
LOAD_ATOM2_PARAMETERS
atom2
=
y*TILE_SIZE+tj
;
#
ifdef
USE_SYMMETRIC
float
dEdR
=
0
.0f
;
real
dEdR
=
0
;
#
else
float
4
dEdR1
=
(
float
4
)
0
.0f
;
float
4
dEdR2
=
(
float
4
)
0
.0f
;
real
4
dEdR1
=
(
real
4
)
0
;
real
4
dEdR2
=
(
real
4
)
0
;
#
endif
float
tempEnergy
=
0
.0f
;
real
tempEnergy
=
0
;
COMPUTE_INTERACTION
energy
+=
tempEnergy
;
#
ifdef
USE_SYMMETRIC
...
...
@@ -277,9 +277,9 @@ void computeNonbonded(
const
unsigned
int
offset1
=
x*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
const
unsigned
int
offset2
=
y*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
#
endif
//
Cheaper
to
load/store
float
4
than
float
3.
Do
all
loads
before
all
stores
to
minimize
store-load
waits.
float
4
sum1
=
forceBuffers[offset1]
;
float
4
sum2
=
forceBuffers[offset2]
;
//
Cheaper
to
load/store
real
4
than
real
3.
Do
all
loads
before
all
stores
to
minimize
store-load
waits.
real
4
sum1
=
forceBuffers[offset1]
;
real
4
sum2
=
forceBuffers[offset2]
;
sum1.x
+=
localData[tgx].fx
+
force.x
;
sum1.y
+=
localData[tgx].fy
+
force.y
;
sum1.z
+=
localData[tgx].fz
+
force.z
;
...
...
platforms/opencl/src/kernels/nonbonded_nvidia.cl
View file @
c8dac206
...
...
@@ -6,12 +6,12 @@
#
define
WARPS_PER_GROUP
(
FORCE_WORK_GROUP_SIZE/TILE_SIZE
)
typedef
struct
{
float
x,
y,
z
;
float
q
;
float
fx,
fy,
fz
;
real
x,
y,
z
;
real
q
;
real
fx,
fy,
fz
;
ATOM_PARAMETER_DATA
#
ifndef
PARAMETER_SIZE_IS_EVEN
float
padding
;
real
padding
;
#
endif
}
AtomData
;
...
...
@@ -22,13 +22,13 @@ __kernel void computeNonbonded(
#
ifdef
SUPPORTS_64_BIT_ATOMICS
__global
long*
restrict
forceBuffers,
#
else
__global
float
4*
restrict
forceBuffers,
__global
real
4*
restrict
forceBuffers,
#
endif
__global
float
*
restrict
energyBuffer,
__global
const
float
4*
restrict
posq,
__global
const
unsigned
int*
restrict
exclusions,
__global
real
*
restrict
energyBuffer,
__global
const
real
4*
restrict
posq,
__global
const
unsigned
int*
restrict
exclusions,
__global
const
unsigned
int*
restrict
exclusionIndices,
__global
const
unsigned
int*
restrict
exclusionRowIndices,
unsigned
int
startTileIndex,
unsigned
int
endTileIndex,
#
ifdef
USE_CUTOFF
__global
const
ushort2*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
float
4
periodicBoxSize,
float
4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
const
unsigned
int*
restrict
interactionFlags
__global
const
ushort2*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
real
4
periodicBoxSize,
real
4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
const
unsigned
int*
restrict
interactionFlags
#
else
unsigned
int
numTiles
#
endif
...
...
@@ -43,9 +43,9 @@ __kernel void computeNonbonded(
unsigned
int
pos
=
startTileIndex+warp*numTiles/totalWarps
;
unsigned
int
end
=
startTileIndex+
(
warp+1
)
*numTiles/totalWarps
;
#
endif
float
energy
=
0
.0f
;
real
energy
=
0
;
__local
AtomData
localData[FORCE_WORK_GROUP_SIZE]
;
__local
float
tempBuffer[3*FORCE_WORK_GROUP_SIZE]
;
__local
real
tempBuffer[3*FORCE_WORK_GROUP_SIZE]
;
__local
unsigned
int
exclusionRange[2*WARPS_PER_GROUP]
;
__local
int
exclusionIndex[WARPS_PER_GROUP]
;
__local
int2*
reservedBlocks
=
(
__local
int2*
)
exclusionRange
;
...
...
@@ -56,7 +56,7 @@ __kernel void computeNonbonded(
const
unsigned
int
tbx
=
get_local_id
(
0
)
-
tgx
;
const
unsigned
int
localGroupIndex
=
get_local_id
(
0
)
/TILE_SIZE
;
unsigned
int
x,
y
;
float
4
force
=
0
.0f
;
real
4
force
=
0
;
if
(
pos
<
end
)
{
#
ifdef
USE_CUTOFF
if
(
numTiles
<=
maxTiles
)
{
...
...
@@ -75,7 +75,7 @@ __kernel void computeNonbonded(
}
}
unsigned int atom1 = x*TILE_SIZE + tgx;
float
4 posq1 = posq[atom1];
real
4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
// Locate the exclusion data for this tile.
...
...
@@ -111,25 +111,25 @@ __kernel void computeNonbonded(
bool isExcluded = !(excl & 0x1);
#endif
int atom2 = tbx+j;
float
4 posq2 = (
float
4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
float
4 delta = (
float
4) (posq2.xyz - posq1.xyz, 0
.0f
);
real
4 posq2 = (
real
4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
real
4 delta = (
real
4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
float
r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float
invR = RSQRT(r2);
float
r = RECIP(invR);
real
r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real
invR = RSQRT(r2);
real
r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j;
#ifdef USE_SYMMETRIC
float
dEdR = 0
.0f
;
real
dEdR = 0;
#else
float
4 dEdR1 = (
float
4) 0
.0f
;
float
4 dEdR2 = (
float
4) 0
.0f
;
real
4 dEdR1 = (
real
4) 0;
real
4 dEdR2 = (
real
4) 0;
#endif
float
tempEnergy = 0
.0f
;
real
tempEnergy = 0;
COMPUTE_INTERACTION
energy += 0.5f*tempEnergy;
#ifdef USE_SYMMETRIC
...
...
@@ -147,15 +147,15 @@ __kernel void computeNonbonded(
const unsigned int localAtomIndex = get_local_id(0);
unsigned int j = y*TILE_SIZE + tgx;
float
4 tempPosq = posq[j];
real
4 tempPosq = posq[j];
localData[localAtomIndex].x = tempPosq.x;
localData[localAtomIndex].y = tempPosq.y;
localData[localAtomIndex].z = tempPosq.z;
localData[localAtomIndex].q = tempPosq.w;
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
localData[localAtomIndex].fx = 0
.0f
;
localData[localAtomIndex].fy = 0
.0f
;
localData[localAtomIndex].fz = 0
.0f
;
localData[localAtomIndex].fx = 0;
localData[localAtomIndex].fy = 0;
localData[localAtomIndex].fz = 0;
#ifdef USE_CUTOFF
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
if (!hasExclusions && flags != 0xFFFFFFFF) {
...
...
@@ -171,25 +171,25 @@ __kernel void computeNonbonded(
int atom2 = tbx+j;
int bufferIndex = 3*get_local_id(0);
#ifdef USE_SYMMETRIC
float
dEdR = 0
.0f
;
real
dEdR = 0;
#else
float
4 dEdR1 = (
float
4) 0
.0f
;
float
4 dEdR2 = (
float
4) 0
.0f
;
real
4 dEdR1 = (
real
4) 0;
real
4 dEdR2 = (
real
4) 0;
#endif
float
tempEnergy = 0
.0f
;
float
4 posq2 = (
float
4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
float
4 delta = (
float
4) (posq2.xyz - posq1.xyz, 0
.0f
);
real
tempEnergy = 0;
real
4 posq2 = (
real
4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
real
4 delta = (
real
4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
float
r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real
r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
float
invR = RSQRT(r2);
float
r = RECIP(invR);
real
invR = RSQRT(r2);
real
r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j;
COMPUTE_INTERACTION
...
...
@@ -241,28 +241,28 @@ __kernel void computeNonbonded(
bool
isExcluded
=
!
(
excl
&
0x1
)
;
#
endif
int
atom2
=
tbx+tj
;
float
4
posq2
=
(
float
4
)
(
localData[atom2].x,
localData[atom2].y,
localData[atom2].z,
localData[atom2].q
)
;
float
4
delta
=
(
float
4
)
(
posq2.xyz
-
posq1.xyz,
0
.0f
)
;
real
4
posq2
=
(
real
4
)
(
localData[atom2].x,
localData[atom2].y,
localData[atom2].z,
localData[atom2].q
)
;
real
4
delta
=
(
real
4
)
(
posq2.xyz
-
posq1.xyz,
0
)
;
#
ifdef
USE_PERIODIC
delta.x
-=
floor
(
delta.x*invPeriodicBoxSize.x+0.5f
)
*periodicBoxSize.x
;
delta.y
-=
floor
(
delta.y*invPeriodicBoxSize.y+0.5f
)
*periodicBoxSize.y
;
delta.z
-=
floor
(
delta.z*invPeriodicBoxSize.z+0.5f
)
*periodicBoxSize.z
;
#
endif
float
r2
=
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
;
real
r2
=
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
;
#
ifdef
USE_CUTOFF
if
(
r2
<
CUTOFF_SQUARED
)
{
#
endif
float
invR
=
RSQRT
(
r2
)
;
float
r
=
RECIP
(
invR
)
;
real
invR
=
RSQRT
(
r2
)
;
real
r
=
RECIP
(
invR
)
;
LOAD_ATOM2_PARAMETERS
atom2
=
y*TILE_SIZE+tj
;
#
ifdef
USE_SYMMETRIC
float
dEdR
=
0
.0f
;
real
dEdR
=
0
;
#
else
float
4
dEdR1
=
(
float
4
)
0
.0f
;
float
4
dEdR2
=
(
float
4
)
0
.0f
;
real
4
dEdR1
=
(
real
4
)
0
;
real
4
dEdR2
=
(
real
4
)
0
;
#
endif
float
tempEnergy
=
0
.0f
;
real
tempEnergy
=
0
;
COMPUTE_INTERACTION
energy
+=
tempEnergy
;
#
ifdef
USE_SYMMETRIC
...
...
@@ -347,7 +347,7 @@ __kernel void computeNonbonded(
}
if
(
writeY
>
-1
)
{
const
unsigned
int
offset
=
y*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
forceBuffers[offset]
+=
(
float
4
)
(
localData[get_local_id
(
0
)
].fx,
localData[get_local_id
(
0
)
].fy,
localData[get_local_id
(
0
)
].fz,
0
.0f
)
;
forceBuffers[offset]
+=
(
real
4
)
(
localData[get_local_id
(
0
)
].fx,
localData[get_local_id
(
0
)
].fy,
localData[get_local_id
(
0
)
].fz,
0
)
;
}
done
=
true
;
if
(
tgx
==
0
)
...
...
platforms/opencl/src/kernels/periodicTorsionForce.cl
View file @
c8dac206
float4
torsionParams
=
PARAMS[index]
;
float
deltaAngle
=
torsionParams.z*theta-torsionParams.y
;
real
deltaAngle
=
torsionParams.z*theta-torsionParams.y
;
energy
+=
torsionParams.x*
(
1.0f+cos
(
deltaAngle
))
;
float
sinDeltaAngle
=
sin
(
deltaAngle
)
;
float
dEdAngle
=
-torsionParams.x*torsionParams.z*sinDeltaAngle
;
real
sinDeltaAngle
=
sin
(
deltaAngle
)
;
real
dEdAngle
=
-torsionParams.x*torsionParams.z*sinDeltaAngle
;
platforms/opencl/src/kernels/pme.cl
View file @
c8dac206
__kernel
void
updateBsplines
(
__global
const
float
4*
restrict
posq,
__global
float
4*
restrict
pmeBsplineTheta,
__local
float
4*
restrict
bsplinesCache,
__global
int2*
restrict
pmeAtomGridIndex,
float
4
periodicBoxSize,
float
4
invPeriodicBoxSize
)
{
const
float
4
scale
=
1
.0f/
(
PME_ORDER-1
)
;
__kernel
void
updateBsplines
(
__global
const
real
4*
restrict
posq,
__global
real
4*
restrict
pmeBsplineTheta,
__local
real
4*
restrict
bsplinesCache,
__global
int2*
restrict
pmeAtomGridIndex,
real
4
periodicBoxSize,
real
4
invPeriodicBoxSize
)
{
const
real
4
scale
=
1
/
(
real
)
(
PME_ORDER-1
)
;
for
(
int
i
=
get_global_id
(
0
)
; i < NUM_ATOMS; i += get_global_size(0)) {
__local
float
4*
data
=
&bsplinesCache[get_local_id
(
0
)
*PME_ORDER]
;
float
4
pos
=
posq[i]
;
__local
real
4*
data
=
&bsplinesCache[get_local_id
(
0
)
*PME_ORDER]
;
real
4
pos
=
posq[i]
;
pos.x
-=
floor
(
pos.x*invPeriodicBoxSize.x
)
*periodicBoxSize.x
;
pos.y
-=
floor
(
pos.y*invPeriodicBoxSize.y
)
*periodicBoxSize.y
;
pos.z
-=
floor
(
pos.z*invPeriodicBoxSize.z
)
*periodicBoxSize.z
;
float
4
t
=
(
float
4
)
((
pos.x*invPeriodicBoxSize.x
)
*GRID_SIZE_X,
real
4
t
=
(
real
4
)
((
pos.x*invPeriodicBoxSize.x
)
*GRID_SIZE_X,
(
pos.y*invPeriodicBoxSize.y
)
*GRID_SIZE_Y,
(
pos.z*invPeriodicBoxSize.z
)
*GRID_SIZE_Z,
0.0f
)
;
float
4
dr
=
(
float
4
)
(
t.x-
(
int
)
t.x,
t.y-
(
int
)
t.y,
t.z-
(
int
)
t.z,
0.0f
)
;
real
4
dr
=
(
real
4
)
(
t.x-
(
int
)
t.x,
t.y-
(
int
)
t.y,
t.z-
(
int
)
t.z,
0.0f
)
;
int4
gridIndex
=
(
int4
)
(((
int
)
t.x
)
%
GRID_SIZE_X,
((
int
)
t.y
)
%
GRID_SIZE_Y,
((
int
)
t.z
)
%
GRID_SIZE_Z,
0
)
;
...
...
@@ -19,15 +19,15 @@ __kernel void updateBsplines(__global const float4* restrict posq, __global floa
data[1]
=
dr
;
data[0]
=
1.0f-dr
;
for
(
int
j
=
3
; j < PME_ORDER; j++) {
float
div
=
1.0f/
(
j-1.0f
)
;
real
div
=
RECIP
(
j-1.0f
)
;
data[j-1]
=
div*dr*data[j-2]
;
for
(
int
k
=
1
; k < (j-1); k++)
data[j-k-1]
=
div*
((
dr+
(
float
4
)
k
)
*data[j-k-2]
+
(
-dr+
(
float
4
)
(
j-k
))
*data[j-k-1]
)
;
data[j-k-1]
=
div*
((
dr+
(
real
4
)
k
)
*data[j-k-2]
+
(
-dr+
(
real
4
)
(
j-k
))
*data[j-k-1]
)
;
data[0]
=
div*
(
-
dr+1.0f
)
*data[0]
;
}
data[PME_ORDER-1]
=
scale*dr*data[PME_ORDER-2]
;
for
(
int
j
=
1
; j < (PME_ORDER-1); j++)
data[PME_ORDER-j-1]
=
scale*
((
dr+
(
float
4
)
j
)
*data[PME_ORDER-j-2]
+
(
-dr+
(
float
4
)
(
PME_ORDER-j
))
*data[PME_ORDER-j-1]
)
;
data[PME_ORDER-j-1]
=
scale*
((
dr+
(
real
4
)
j
)
*data[PME_ORDER-j-2]
+
(
-dr+
(
real
4
)
(
PME_ORDER-j
))
*data[PME_ORDER-j-1]
)
;
data[0]
=
scale*
(
-dr+1.0f
)
*data[0]
;
for
(
int
j
=
0
; j < PME_ORDER; j++) {
data[j].w
=
pos.w
; // Storing the charge here improves cache coherency in the charge spreading kernel
...
...
@@ -39,7 +39,7 @@ __kernel void updateBsplines(__global const float4* restrict posq, __global floa
/**
*
For
each
grid
point,
find
the
range
of
sorted
atoms
associated
with
that
point.
*/
__kernel
void
findAtomRangeForGrid
(
__global
int2*
restrict
pmeAtomGridIndex,
__global
int*
restrict
pmeAtomRange,
__global
const
float
4*
restrict
posq,
float
4
periodicBoxSize,
float
4
invPeriodicBoxSize
)
{
__kernel
void
findAtomRangeForGrid
(
__global
int2*
restrict
pmeAtomGridIndex,
__global
int*
restrict
pmeAtomRange,
__global
const
real
4*
restrict
posq,
real
4
periodicBoxSize,
real
4
invPeriodicBoxSize
)
{
int
start
=
(
NUM_ATOMS*get_global_id
(
0
))
/get_global_size
(
0
)
;
int
end
=
(
NUM_ATOMS*
(
get_global_id
(
0
)
+1
))
/get_global_size
(
0
)
;
int
last
=
(
start
==
0
?
-1
:
pmeAtomGridIndex[start-1].y
)
;
...
...
@@ -66,11 +66,11 @@ __kernel void findAtomRangeForGrid(__global int2* restrict pmeAtomGridIndex, __g
*
The
grid
index
won
't
be
needed
again.
Reuse
that
component
to
hold
the
z
index,
thus
saving
*
some
work
in
the
charge
spreading
kernel.
*/
__kernel
void
recordZIndex
(
__global
int2*
restrict
pmeAtomGridIndex,
__global
const
float
4*
restrict
posq,
float
4
periodicBoxSize,
float
4
invPeriodicBoxSize
)
{
__kernel
void
recordZIndex
(
__global
int2*
restrict
pmeAtomGridIndex,
__global
const
real
4*
restrict
posq,
real
4
periodicBoxSize,
real
4
invPeriodicBoxSize
)
{
int
start
=
(
NUM_ATOMS*get_global_id
(
0
))
/get_global_size
(
0
)
;
int
end
=
(
NUM_ATOMS*
(
get_global_id
(
0
)
+1
))
/get_global_size
(
0
)
;
for
(
int
i
=
start
; i < end; ++i) {
float
posz
=
posq[pmeAtomGridIndex[i].x].z
;
real
posz
=
posq[pmeAtomGridIndex[i].x].z
;
posz
-=
floor
(
posz*invPeriodicBoxSize.z
)
*periodicBoxSize.z
;
int
z
=
((
int
)
((
posz*invPeriodicBoxSize.z
)
*GRID_SIZE_Z
))
%
GRID_SIZE_Z
;
pmeAtomGridIndex[i].y
=
z
;
...
...
@@ -83,14 +83,14 @@ __kernel void recordZIndex(__global int2* restrict pmeAtomGridIndex, __global co
#
define
BUFFER_SIZE
(
PME_ORDER*PME_ORDER*PME_ORDER
)
__kernel
__attribute__
((
reqd_work_group_size
(
BUFFER_SIZE,
1
,
1
)))
void
gridSpreadCharge
(
__global
const
float
4*
restrict
posq,
__global
const
int2*
restrict
pmeAtomGridIndex,
__global
const
int*
restrict
pmeAtomRange,
__global
long*
restrict
pmeGrid,
__global
const
float
4*
restrict
pmeBsplineTheta,
float
4
periodicBoxSize,
float
4
invPeriodicBoxSize
)
{
void
gridSpreadCharge
(
__global
const
real
4*
restrict
posq,
__global
const
int2*
restrict
pmeAtomGridIndex,
__global
const
int*
restrict
pmeAtomRange,
__global
long*
restrict
pmeGrid,
__global
const
real
4*
restrict
pmeBsplineTheta,
real
4
periodicBoxSize,
real
4
invPeriodicBoxSize
)
{
int
ix
=
get_local_id
(
0
)
/
(
PME_ORDER*PME_ORDER
)
;
int
remainder
=
get_local_id
(
0
)
-ix*PME_ORDER*PME_ORDER
;
int
iy
=
remainder/PME_ORDER
;
int
iz
=
remainder-iy*PME_ORDER
;
__local
float
4
theta[PME_ORDER]
;
__local
float
charge[BUFFER_SIZE]
;
__local
real
4
theta[PME_ORDER]
;
__local
real
charge[BUFFER_SIZE]
;
__local
int
basex[BUFFER_SIZE]
;
__local
int
basey[BUFFER_SIZE]
;
__local
int
basez[BUFFER_SIZE]
;
...
...
@@ -101,7 +101,7 @@ void gridSpreadCharge(__global const float4* restrict posq, __global const int2*
if
(
get_local_id
(
0
)
<
BUFFER_SIZE
)
{
int
atomIndex
=
baseIndex+get_local_id
(
0
)
;
if
(
atomIndex
<
NUM_ATOMS
)
{
float
4
pos
=
posq[atomIndex]
;
real
4
pos
=
posq[atomIndex]
;
charge[get_local_id
(
0
)
]
=
pos.w
;
pos.x
-=
floor
(
pos.x*invPeriodicBoxSize.x
)
*periodicBoxSize.x
;
pos.y
-=
floor
(
pos.y*invPeriodicBoxSize.y
)
*periodicBoxSize.y
;
...
...
@@ -118,32 +118,40 @@ void gridSpreadCharge(__global const float4* restrict posq, __global const int2*
if
(
get_local_id
(
0
)
<
PME_ORDER
)
theta[get_local_id
(
0
)
]
=
pmeBsplineTheta[atomIndex+get_local_id
(
0
)
*NUM_ATOMS]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
float
add
=
charge[index]*theta[ix].x*theta[iy].y*theta[iz].z
;
real
add
=
charge[index]*theta[ix].x*theta[iy].y*theta[iz].z
;
int
x
=
basex[index]+ix
;
int
y
=
basey[index]+iy
;
int
z
=
basez[index]+iz
;
x
-=
(
x
>=
GRID_SIZE_X
?
GRID_SIZE_X
:
0
)
;
y
-=
(
y
>=
GRID_SIZE_Y
?
GRID_SIZE_Y
:
0
)
;
z
-=
(
z
>=
GRID_SIZE_Z
?
GRID_SIZE_Z
:
0
)
;
#
ifdef
USE_DOUBLE_PRECISION
atom_add
(
&pmeGrid[2*
(
x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z
)
],
(
long
)
(
add*0xFFFFFFFF
))
;
#
else
atom_add
(
&pmeGrid[x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z],
(
long
)
(
add*0xFFFFFFFF
))
;
#
endif
}
}
}
}
__kernel
void
finishSpreadCharge
(
__global
long*
restrict
pmeGrid
)
{
__global
float2*
float
Grid
=
(
__global
float
2*
)
pmeGrid
;
__global
real2*
real
Grid
=
(
__global
real
2*
)
pmeGrid
;
const
unsigned
int
gridSize
=
GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z
;
float
scale
=
EPSILON_FACTOR/
(
float
)
0xFFFFFFFF
;
real
scale
=
EPSILON_FACTOR/
(
real
)
0xFFFFFFFF
;
for
(
int
index
=
get_global_id
(
0
)
; index < gridSize; index += get_global_size(0)) {
#
ifdef
USE_DOUBLE_PRECISION
long
value
=
pmeGrid[2*index]
;
#
else
long
value
=
pmeGrid[index]
;
float2
floatValue
=
(
float2
)
((
float
)
(
value*scale
)
,
0.0f
)
;
floatGrid[index]
=
floatValue
;
#
endif
real2
realValue
=
(
real2
)
((
real
)
(
value*scale
)
,
0
)
;
realGrid[index]
=
realValue
;
}
}
#
else
__kernel
void
gridSpreadCharge
(
__global
const
float
4*
restrict
posq,
__global
const
int2*
restrict
pmeAtomGridIndex,
__global
const
int*
restrict
pmeAtomRange,
__global
float
2*
restrict
pmeGrid,
__global
const
float
4*
restrict
pmeBsplineTheta
)
{
__kernel
void
gridSpreadCharge
(
__global
const
real
4*
restrict
posq,
__global
const
int2*
restrict
pmeAtomGridIndex,
__global
const
int*
restrict
pmeAtomRange,
__global
real
2*
restrict
pmeGrid,
__global
const
real
4*
restrict
pmeBsplineTheta
)
{
unsigned
int
numGridPoints
=
GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z
;
for
(
int
gridIndex
=
get_global_id
(
0
)
; gridIndex < numGridPoints; gridIndex += get_global_size(0)) {
//
Compute
the
charge
on
a
grid
point.
...
...
@@ -153,7 +161,7 @@ __kernel void gridSpreadCharge(__global const float4* restrict posq, __global co
int
remainder
=
gridIndex-gridPoint.x*GRID_SIZE_Y*GRID_SIZE_Z
;
gridPoint.y
=
remainder/GRID_SIZE_Z
;
gridPoint.z
=
remainder-gridPoint.y*GRID_SIZE_Z
;
float
result
=
0.0f
;
real
result
=
0.0f
;
//
Loop
over
all
atoms
that
affect
this
grid
point.
...
...
@@ -174,7 +182,7 @@ __kernel void gridSpreadCharge(__global const float4* restrict posq, __global co
int
atomIndex
=
atomData.x
;
int
z
=
atomData.y
;
int
iz
=
gridPoint.z-z+
(
gridPoint.z
>=
z
?
0
:
GRID_SIZE_Z
)
;
float
atomCharge
=
pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].w
;
real
atomCharge
=
pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].w
;
result
+=
atomCharge*pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].x*pmeBsplineTheta[atomIndex+iy*NUM_ATOMS].y*pmeBsplineTheta[atomIndex+iz*NUM_ATOMS].z
;
}
if
(
z1
>
gridPoint.z
)
...
...
@@ -189,21 +197,21 @@ __kernel void gridSpreadCharge(__global const float4* restrict posq, __global co
int
atomIndex
=
atomData.x
;
int
z
=
atomData.y
;
int
iz
=
gridPoint.z-z+
(
gridPoint.z
>=
z
?
0
:
GRID_SIZE_Z
)
;
float
atomCharge
=
pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].w
;
real
atomCharge
=
pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].w
;
result
+=
atomCharge*pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].x*pmeBsplineTheta[atomIndex+iy*NUM_ATOMS].y*pmeBsplineTheta[atomIndex+iz*NUM_ATOMS].z
;
}
}
}
}
pmeGrid[gridIndex]
=
(
float
2
)
(
result*EPSILON_FACTOR,
0
.0f
)
;
pmeGrid[gridIndex]
=
(
real
2
)
(
result*EPSILON_FACTOR,
0
)
;
}
}
#
endif
__kernel
void
reciprocalConvolution
(
__global
float
2*
restrict
pmeGrid,
__global
float
*
restrict
energyBuffer,
__global
const
float
*
restrict
pmeBsplineModuliX,
__global
const
float
*
restrict
pmeBsplineModuliY,
__global
const
float
*
restrict
pmeBsplineModuliZ,
float
4
invPeriodicBoxSize,
float
recipScaleFactor
)
{
__kernel
void
reciprocalConvolution
(
__global
real
2*
restrict
pmeGrid,
__global
real
*
restrict
energyBuffer,
__global
const
real
*
restrict
pmeBsplineModuliX,
__global
const
real
*
restrict
pmeBsplineModuliY,
__global
const
real
*
restrict
pmeBsplineModuliZ,
real
4
invPeriodicBoxSize,
real
recipScaleFactor
)
{
const
unsigned
int
gridSize
=
GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z
;
float
energy
=
0.0f
;
real
energy
=
0.0f
;
for
(
int
index
=
get_global_id
(
0
)
; index < gridSize; index += get_global_size(0)) {
int
kx
=
index/
(
GRID_SIZE_Y*GRID_SIZE_Z
)
;
int
remainder
=
index-kx*GRID_SIZE_Y*GRID_SIZE_Z
;
...
...
@@ -214,36 +222,36 @@ __kernel void reciprocalConvolution(__global float2* restrict pmeGrid, __global
int
mx
=
(
kx
<
(
GRID_SIZE_X+1
)
/2
)
?
kx
:
(
kx-GRID_SIZE_X
)
;
int
my
=
(
ky
<
(
GRID_SIZE_Y+1
)
/2
)
?
ky
:
(
ky-GRID_SIZE_Y
)
;
int
mz
=
(
kz
<
(
GRID_SIZE_Z+1
)
/2
)
?
kz
:
(
kz-GRID_SIZE_Z
)
;
float
mhx
=
mx*invPeriodicBoxSize.x
;
float
mhy
=
my*invPeriodicBoxSize.y
;
float
mhz
=
mz*invPeriodicBoxSize.z
;
float
bx
=
pmeBsplineModuliX[kx]
;
float
by
=
pmeBsplineModuliY[ky]
;
float
bz
=
pmeBsplineModuliZ[kz]
;
float
2
grid
=
pmeGrid[index]
;
float
m2
=
mhx*mhx+mhy*mhy+mhz*mhz
;
float
denom
=
m2*bx*by*bz
;
float
eterm
=
recipScaleFactor*EXP
(
-RECIP_EXP_FACTOR*m2
)
/denom
;
pmeGrid[index]
=
(
float
2
)
(
grid.x*eterm,
grid.y*eterm
)
;
real
mhx
=
mx*invPeriodicBoxSize.x
;
real
mhy
=
my*invPeriodicBoxSize.y
;
real
mhz
=
mz*invPeriodicBoxSize.z
;
real
bx
=
pmeBsplineModuliX[kx]
;
real
by
=
pmeBsplineModuliY[ky]
;
real
bz
=
pmeBsplineModuliZ[kz]
;
real
2
grid
=
pmeGrid[index]
;
real
m2
=
mhx*mhx+mhy*mhy+mhz*mhz
;
real
denom
=
m2*bx*by*bz
;
real
eterm
=
recipScaleFactor*EXP
(
-RECIP_EXP_FACTOR*m2
)
/denom
;
pmeGrid[index]
=
(
real
2
)
(
grid.x*eterm,
grid.y*eterm
)
;
energy
+=
eterm*
(
grid.x*grid.x
+
grid.y*grid.y
)
;
}
energyBuffer[get_global_id
(
0
)
]
+=
0.5f*energy
;
}
__kernel
void
gridInterpolateForce
(
__global
const
float
4*
restrict
posq,
__global
float
4*
restrict
forceBuffers,
__global
const
float
2*
restrict
pmeGrid,
float
4
periodicBoxSize,
float
4
invPeriodicBoxSize,
__local
float
4*
restrict
bsplinesCache
)
{
const
float
4
scale
=
1
.0f/
(
PME_ORDER-1
)
;
__local
float
4*
data
=
&bsplinesCache[get_local_id
(
0
)
*PME_ORDER]
;
__local
float
4*
ddata
=
&bsplinesCache[get_local_id
(
0
)
*PME_ORDER
+
get_local_size
(
0
)
*PME_ORDER]
;
__kernel
void
gridInterpolateForce
(
__global
const
real
4*
restrict
posq,
__global
real
4*
restrict
forceBuffers,
__global
const
real
2*
restrict
pmeGrid,
real
4
periodicBoxSize,
real
4
invPeriodicBoxSize,
__local
real
4*
restrict
bsplinesCache
)
{
const
real
4
scale
=
1
/
(
real
)
(
PME_ORDER-1
)
;
__local
real
4*
data
=
&bsplinesCache[get_local_id
(
0
)
*PME_ORDER]
;
__local
real
4*
ddata
=
&bsplinesCache[get_local_id
(
0
)
*PME_ORDER
+
get_local_size
(
0
)
*PME_ORDER]
;
for
(
int
atom
=
get_global_id
(
0
)
; atom < NUM_ATOMS; atom += get_global_size(0)) {
float
4
force
=
0.0f
;
float
4
pos
=
posq[atom]
;
real
4
force
=
0.0f
;
real
4
pos
=
posq[atom]
;
pos.x
-=
floor
(
pos.x*invPeriodicBoxSize.x
)
*periodicBoxSize.x
;
pos.y
-=
floor
(
pos.y*invPeriodicBoxSize.y
)
*periodicBoxSize.y
;
pos.z
-=
floor
(
pos.z*invPeriodicBoxSize.z
)
*periodicBoxSize.z
;
float
4
t
=
(
float
4
)
((
pos.x*invPeriodicBoxSize.x
)
*GRID_SIZE_X,
(
pos.y*invPeriodicBoxSize.y
)
*GRID_SIZE_Y,
(
pos.z*invPeriodicBoxSize.z
)
*GRID_SIZE_Z,
0.0f
)
;
real
4
t
=
(
real
4
)
((
pos.x*invPeriodicBoxSize.x
)
*GRID_SIZE_X,
(
pos.y*invPeriodicBoxSize.y
)
*GRID_SIZE_Y,
(
pos.z*invPeriodicBoxSize.z
)
*GRID_SIZE_Z,
0.0f
)
;
int4
gridIndex
=
(
int4
)
(((
int
)
t.x
)
%
GRID_SIZE_X,
((
int
)
t.y
)
%
GRID_SIZE_Y,
((
int
)
t.z
)
%
GRID_SIZE_Z,
0
)
;
...
...
@@ -251,15 +259,15 @@ __kernel void gridInterpolateForce(__global const float4* restrict posq, __globa
//
Since
we
need
the
full
set
of
thetas,
it
's
faster
to
compute
them
here
than
load
them
//
from
global
memory.
float
4
dr
=
(
float
4
)
(
t.x-
(
int
)
t.x,
t.y-
(
int
)
t.y,
t.z-
(
int
)
t.z,
0.0f
)
;
real
4
dr
=
(
real
4
)
(
t.x-
(
int
)
t.x,
t.y-
(
int
)
t.y,
t.z-
(
int
)
t.z,
0.0f
)
;
data[PME_ORDER-1]
=
0.0f
;
data[1]
=
dr
;
data[0]
=
1.0f-dr
;
for
(
int
j
=
3
; j < PME_ORDER; j++) {
float
div
=
1.0f/
(
j-1.0f
)
;
real
div
=
RECIP
(
j-1.0f
)
;
data[j-1]
=
div*dr*data[j-2]
;
for
(
int
k
=
1
; k < (j-1); k++)
data[j-k-1]
=
div*
((
dr+
(
float
4
)
k
)
*data[j-k-2]
+
(
-dr+
(
float
4
)
(
j-k
))
*data[j-k-1]
)
;
data[j-k-1]
=
div*
((
dr+
(
real
4
)
k
)
*data[j-k-2]
+
(
-dr+
(
real
4
)
(
j-k
))
*data[j-k-1]
)
;
data[0]
=
div*
(
-
dr+1.0f
)
*data[0]
;
}
ddata[0]
=
-data[0]
;
...
...
@@ -267,7 +275,7 @@ __kernel void gridInterpolateForce(__global const float4* restrict posq, __globa
ddata[j]
=
data[j-1]-data[j]
;
data[PME_ORDER-1]
=
scale*dr*data[PME_ORDER-2]
;
for
(
int
j
=
1
; j < (PME_ORDER-1); j++)
data[PME_ORDER-j-1]
=
scale*
((
dr+
(
float
4
)
j
)
*data[PME_ORDER-j-2]
+
(
-dr+
(
float
4
)
(
PME_ORDER-j
))
*data[PME_ORDER-j-1]
)
;
data[PME_ORDER-j-1]
=
scale*
((
dr+
(
real
4
)
j
)
*data[PME_ORDER-j-2]
+
(
-dr+
(
real
4
)
(
PME_ORDER-j
))
*data[PME_ORDER-j-1]
)
;
data[0]
=
scale*
(
-dr+1.0f
)
*data[0]
;
//
Compute
the
force
on
this
atom.
...
...
@@ -282,7 +290,7 @@ __kernel void gridInterpolateForce(__global const float4* restrict posq, __globa
int
zindex
=
gridIndex.z+iz
;
zindex
-=
(
zindex
>=
GRID_SIZE_Z
?
GRID_SIZE_Z
:
0
)
;
int
index
=
xindex*GRID_SIZE_Y*GRID_SIZE_Z
+
yindex*GRID_SIZE_Z
+
zindex
;
float
gridvalue
=
pmeGrid[index].x
;
real
gridvalue
=
pmeGrid[index].x
;
force.x
+=
ddata[ix].x*data[iy].y*data[iz].z*gridvalue
;
force.y
+=
data[ix].x*ddata[iy].y*data[iz].z*gridvalue
;
#
ifndef
MAC_AMD_WORKAROUND
...
...
@@ -302,14 +310,14 @@ __kernel void gridInterpolateForce(__global const float4* restrict posq, __globa
int
zindex
=
gridIndex.z+iz
;
zindex
-=
(
zindex
>=
GRID_SIZE_Z
?
GRID_SIZE_Z
:
0
)
;
int
index
=
xindex*GRID_SIZE_Y*GRID_SIZE_Z
+
yindex*GRID_SIZE_Z
+
zindex
;
float
gridvalue
=
pmeGrid[index].x
;
real
gridvalue
=
pmeGrid[index].x
;
force.z
+=
data[ix].x*data[iy].y*ddata[iz].z*gridvalue
;
}
}
}
#
endif
float
4
totalForce
=
forceBuffers[atom]
;
float
q
=
pos.w*EPSILON_FACTOR
;
real
4
totalForce
=
forceBuffers[atom]
;
real
q
=
pos.w*EPSILON_FACTOR
;
totalForce.x
-=
q*force.x*GRID_SIZE_X*invPeriodicBoxSize.x
;
totalForce.y
-=
q*force.y*GRID_SIZE_Y*invPeriodicBoxSize.y
;
totalForce.z
-=
q*force.z*GRID_SIZE_Z*invPeriodicBoxSize.z
;
...
...
platforms/opencl/src/kernels/rbTorsionForce.cl
View file @
c8dac206
...
...
@@ -4,9 +4,9 @@ if (theta < 0.0f)
else
theta
-=
PI
;
cosangle
=
-cosangle
;
float
cosFactor
=
cosangle
;
float
dEdAngle
=
-torsionParams.s1
;
float
rbEnergy
=
torsionParams.s0
;
real
cosFactor
=
cosangle
;
real
dEdAngle
=
-torsionParams.s1
;
real
rbEnergy
=
torsionParams.s0
;
rbEnergy
+=
torsionParams.s1*cosFactor
;
dEdAngle
-=
2.0f*torsionParams.s2*cosFactor
;
cosFactor
*=
cosangle
;
...
...
platforms/opencl/src/kernels/torsionForce.cl
View file @
c8dac206
const
float
PI
=
3.14159265358979323846f
;
float
4
v0
=
(
float
4
)
(
pos1.xyz-pos2.xyz,
0.0f
)
;
float
4
v1
=
(
float
4
)
(
pos3.xyz-pos2.xyz,
0.0f
)
;
float
4
v2
=
(
float
4
)
(
pos3.xyz-pos4.xyz,
0.0f
)
;
float
4
cp0
=
cross
(
v0,
v1
)
;
float
4
cp1
=
cross
(
v1,
v2
)
;
float
cosangle
=
dot
(
normalize
(
cp0
)
,
normalize
(
cp1
))
;
float
theta
;
const
real
PI
=
3.14159265358979323846f
;
real
4
v0
=
(
real
4
)
(
pos1.xyz-pos2.xyz,
0.0f
)
;
real
4
v1
=
(
real
4
)
(
pos3.xyz-pos2.xyz,
0.0f
)
;
real
4
v2
=
(
real
4
)
(
pos3.xyz-pos4.xyz,
0.0f
)
;
real
4
cp0
=
cross
(
v0,
v1
)
;
real
4
cp1
=
cross
(
v1,
v2
)
;
real
cosangle
=
dot
(
normalize
(
cp0
)
,
normalize
(
cp1
))
;
real
theta
;
if
(
cosangle
>
0.99f
||
cosangle
<
-0.99f
)
{
//
We
're
close
to
the
singularity
in
acos
()
,
so
take
the
cross
product
and
use
asin
()
instead.
float
4
cross_prod
=
cross
(
cp0,
cp1
)
;
float
scale
=
dot
(
cp0,
cp0
)
*dot
(
cp1,
cp1
)
;
real
4
cross_prod
=
cross
(
cp0,
cp1
)
;
real
scale
=
dot
(
cp0,
cp0
)
*dot
(
cp1,
cp1
)
;
theta
=
asin
(
sqrt
(
dot
(
cross_prod,
cross_prod
)
/scale
))
;
if
(
cosangle
<
0
.0f
)
if
(
cosangle
<
0
)
theta
=
PI-theta
;
}
else
theta
=
acos
(
cosangle
)
;
theta
=
(
dot
(
v0,
cp1
)
>=
0
?
theta
:
-theta
)
;
COMPUTE_FORCE
float
normCross1
=
dot
(
cp0,
cp0
)
;
float
normSqrBC
=
dot
(
v1,
v1
)
;
float
normBC
=
sqrt
(
normSqrBC
)
;
float
normCross2
=
dot
(
cp1,
cp1
)
;
float
dp
=
1.0f/normSqrBC
;
float
4
ff
=
(
float
4
)
((
-dEdAngle*normBC
)
/normCross1,
dot
(
v0,
v1
)
*dp,
dot
(
v2,
v1
)
*dp,
(
dEdAngle*normBC
)
/normCross2
)
;
float
4
force1
=
ff.x*cp0
;
float
4
force4
=
ff.w*cp1
;
float
4
s
=
ff.y*force1
-
ff.z*force4
;
float
4
force2
=
s-force1
;
float
4
force3
=
-s-force4
;
real
normCross1
=
dot
(
cp0,
cp0
)
;
real
normSqrBC
=
dot
(
v1,
v1
)
;
real
normBC
=
sqrt
(
normSqrBC
)
;
real
normCross2
=
dot
(
cp1,
cp1
)
;
real
dp
=
1.0f/normSqrBC
;
real
4
ff
=
(
real
4
)
((
-dEdAngle*normBC
)
/normCross1,
dot
(
v0,
v1
)
*dp,
dot
(
v2,
v1
)
*dp,
(
dEdAngle*normBC
)
/normCross2
)
;
real
4
force1
=
ff.x*cp0
;
real
4
force4
=
ff.w*cp1
;
real
4
s
=
ff.y*force1
-
ff.z*force4
;
real
4
force2
=
s-force1
;
real
4
force3
=
-s-force4
;
platforms/opencl/src/kernels/utilities.cl
View file @
c8dac206
...
...
@@ -69,11 +69,11 @@ __kernel void clearSixBuffers(__global int* restrict buffer1, int size1, __globa
*
Sum
a
collection
of
buffers
into
the
first
one.
*/
__kernel
void
reduce
Float
4Buffer
(
__global
float
4*
restrict
buffer,
int
bufferSize,
int
numBuffers
)
{
__kernel
void
reduce
Real
4Buffer
(
__global
real
4*
restrict
buffer,
int
bufferSize,
int
numBuffers
)
{
int
index
=
get_global_id
(
0
)
;
int
totalSize
=
bufferSize*numBuffers
;
while
(
index
<
bufferSize
)
{
float
4
sum
=
buffer[index]
;
real
4
sum
=
buffer[index]
;
for
(
int
i
=
index+bufferSize
; i < totalSize; i += bufferSize)
sum
+=
buffer[i]
;
buffer[index]
=
sum
;
...
...
@@ -84,11 +84,11 @@ __kernel void reduceFloat4Buffer(__global float4* restrict buffer, int bufferSiz
/**
*
Sum
the
various
buffers
containing
forces.
*/
__kernel
void
reduceForces
(
__global
const
long*
restrict
longBuffer,
__global
float
4*
restrict
buffer,
int
bufferSize,
int
numBuffers
)
{
__kernel
void
reduceForces
(
__global
const
long*
restrict
longBuffer,
__global
real
4*
restrict
buffer,
int
bufferSize,
int
numBuffers
)
{
int
totalSize
=
bufferSize*numBuffers
;
float
scale
=
1
.0f/
(
float
)
0xFFFFFFFF
;
real
scale
=
1
/
(
real
)
0xFFFFFFFF
;
for
(
int
index
=
get_global_id
(
0
)
; index < bufferSize; index += get_global_size(0)) {
float
4
sum
=
(
float
4
)
(
scale*longBuffer[index],
scale*longBuffer[index+bufferSize],
scale*longBuffer[index+2*bufferSize],
0
.0f
)
;
real
4
sum
=
(
real
4
)
(
scale*longBuffer[index],
scale*longBuffer[index+bufferSize],
scale*longBuffer[index+2*bufferSize],
0
)
;
for
(
int
i
=
index
; i < totalSize; i += bufferSize)
sum
+=
buffer[i]
;
buffer[index]
=
sum
;
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment