Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
5a06df78
Commit
5a06df78
authored
Mar 04, 2020
by
tic20
Browse files
Merge
https://github.com/openmm/openmm
parents
8dd60914
a9223eea
Changes
335
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
675 additions
and
469 deletions
+675
-469
platforms/common/src/kernels/andersenThermostat.cc
platforms/common/src/kernels/andersenThermostat.cc
+5
-5
platforms/common/src/kernels/angleForce.cc
platforms/common/src/kernels/angleForce.cc
+18
-0
platforms/common/src/kernels/bondForce.cc
platforms/common/src/kernels/bondForce.cc
+10
-0
platforms/common/src/kernels/brownian.cc
platforms/common/src/kernels/brownian.cc
+11
-8
platforms/common/src/kernels/cmapTorsionForce.cc
platforms/common/src/kernels/cmapTorsionForce.cc
+0
-0
platforms/common/src/kernels/customCentroidBond.cc
platforms/common/src/kernels/customCentroidBond.cc
+32
-39
platforms/common/src/kernels/customCompoundBond.cc
platforms/common/src/kernels/customCompoundBond.cc
+7
-13
platforms/common/src/kernels/customExternalForce.cc
platforms/common/src/kernels/customExternalForce.cc
+0
-0
platforms/common/src/kernels/customGBChainRule.cc
platforms/common/src/kernels/customGBChainRule.cc
+0
-0
platforms/common/src/kernels/customGBEnergyN2.cc
platforms/common/src/kernels/customGBEnergyN2.cc
+95
-85
platforms/common/src/kernels/customGBEnergyN2_cpu.cc
platforms/common/src/kernels/customGBEnergyN2_cpu.cc
+80
-70
platforms/common/src/kernels/customGBEnergyPerParticle.cc
platforms/common/src/kernels/customGBEnergyPerParticle.cc
+11
-6
platforms/common/src/kernels/customGBGradientChainRule.cc
platforms/common/src/kernels/customGBGradientChainRule.cc
+31
-0
platforms/common/src/kernels/customGBValueN2.cc
platforms/common/src/kernels/customGBValueN2.cc
+53
-55
platforms/common/src/kernels/customGBValueN2_cpu.cc
platforms/common/src/kernels/customGBValueN2_cpu.cc
+39
-41
platforms/common/src/kernels/customGBValuePerParticle.cc
platforms/common/src/kernels/customGBValuePerParticle.cc
+5
-7
platforms/common/src/kernels/customHbondForce.cc
platforms/common/src/kernels/customHbondForce.cc
+110
-55
platforms/common/src/kernels/customIntegrator.cc
platforms/common/src/kernels/customIntegrator.cc
+19
-17
platforms/common/src/kernels/customIntegratorPerDof.cc
platforms/common/src/kernels/customIntegratorPerDof.cc
+74
-0
platforms/common/src/kernels/customManyParticle.cc
platforms/common/src/kernels/customManyParticle.cc
+75
-68
No files found.
platforms/c
uda
/src/kernels/andersenThermostat.c
u
→
platforms/c
ommon
/src/kernels/andersenThermostat.c
c
View file @
5a06df78
...
...
@@ -2,11 +2,11 @@
* Apply the Andersen thermostat to adjust particle velocities.
*/
extern
"C"
__global__
void
applyAndersenThermostat
(
int
numAtoms
,
float
collisionFrequency
,
float
kT
,
mixed4
*
velm
,
const
mixed4
*
__restrict__
stepSize
,
const
float4
*
__restrict__
random
,
unsigned
int
randomIndex
,
const
int
*
__restrict__
atomGroups
)
{
float
collisionProbability
=
1.0
f
-
expf
(
-
(
float
)
(
collisionFrequency
*
stepSize
[
0
].
y
));
float
randomRange
=
erf
f
(
collisionProbability
/
sqrtf
(
2.0
f
));
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
numAtoms
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
KERNEL
void
applyAndersenThermostat
(
int
numAtoms
,
float
collisionFrequency
,
float
kT
,
GLOBAL
mixed4
*
velm
,
real
stepSize
,
GLOBAL
const
float4
*
RESTRICT
random
,
unsigned
int
randomIndex
,
GLOBAL
const
int
*
RESTRICT
atomGroups
)
{
float
collisionProbability
=
(
float
)
(
1
-
EXP
(
-
collisionFrequency
*
stepSize
));
float
randomRange
=
(
float
)
erf
(
collisionProbability
/
SQRT
(
2.0
f
));
for
(
int
index
=
GLOBAL_ID
;
index
<
numAtoms
;
index
+=
GLOBAL_SIZE
)
{
mixed4
velocity
=
velm
[
index
];
float4
selectRand
=
random
[
randomIndex
+
atomGroups
[
index
]];
float4
velRand
=
random
[
randomIndex
+
index
];
...
...
platforms/common/src/kernels/angleForce.cc
0 → 100644
View file @
5a06df78
real3
v0
=
make_real3
(
pos2
.
x
-
pos1
.
x
,
pos2
.
y
-
pos1
.
y
,
pos2
.
z
-
pos1
.
z
);
real3
v1
=
make_real3
(
pos2
.
x
-
pos3
.
x
,
pos2
.
y
-
pos3
.
y
,
pos2
.
z
-
pos3
.
z
);
#if APPLY_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
v0
)
APPLY_PERIODIC_TO_DELTA
(
v1
)
#endif
real3
cp
=
cross
(
v0
,
v1
);
real
rp
=
cp
.
x
*
cp
.
x
+
cp
.
y
*
cp
.
y
+
cp
.
z
*
cp
.
z
;
rp
=
max
(
SQRT
(
rp
),
(
real
)
1.0e-06
f
);
real
r21
=
v0
.
x
*
v0
.
x
+
v0
.
y
*
v0
.
y
+
v0
.
z
*
v0
.
z
;
real
r23
=
v1
.
x
*
v1
.
x
+
v1
.
y
*
v1
.
y
+
v1
.
z
*
v1
.
z
;
real
dot
=
v0
.
x
*
v1
.
x
+
v0
.
y
*
v1
.
y
+
v0
.
z
*
v1
.
z
;
real
cosine
=
min
(
max
(
dot
*
RSQRT
(
r21
*
r23
),
(
real
)
-
1
),
(
real
)
1
);
real
theta
=
ACOS
(
cosine
);
COMPUTE_FORCE
real3
force1
=
cross
(
v0
,
cp
)
*
(
dEdAngle
/
(
r21
*
rp
));
real3
force3
=
cross
(
cp
,
v1
)
*
(
dEdAngle
/
(
r23
*
rp
));
real3
force2
=
-
force1
-
force3
;
platforms/common/src/kernels/bondForce.cc
0 → 100644
View file @
5a06df78
real3
delta
=
make_real3
(
pos2
.
x
-
pos1
.
x
,
pos2
.
y
-
pos1
.
y
,
pos2
.
z
-
pos1
.
z
);
#if APPLY_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
delta
)
#endif
real
r
=
SQRT
(
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
);
COMPUTE_FORCE
dEdR
=
(
r
>
0
)
?
(
dEdR
/
r
)
:
0
;
delta
*=
dEdR
;
real3
force1
=
delta
;
real3
force2
=
-
delta
;
platforms/c
uda
/src/kernels/brownian.c
u
→
platforms/c
ommon
/src/kernels/brownian.c
c
View file @
5a06df78
...
...
@@ -2,18 +2,18 @@
* Perform the first step of Brownian integration.
*/
extern
"C"
__global__
void
integrateBrownianPart1
(
int
numAtoms
,
int
paddedNumAtoms
,
mixed
tauDeltaT
,
mixed
noiseAmplitude
,
const
long
long
*
__restrict__
force
,
mixed4
*
__restrict__
posDelta
,
const
mixed4
*
__restrict__
velm
,
const
float4
*
__restrict__
random
,
unsigned
int
randomIndex
)
{
randomIndex
+=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
KERNEL
void
integrateBrownianPart1
(
int
numAtoms
,
int
paddedNumAtoms
,
mixed
tauDeltaT
,
mixed
noiseAmplitude
,
GLOBAL
const
mm_
long
*
RESTRICT
force
,
GLOBAL
mixed4
*
RESTRICT
posDelta
,
GLOBAL
const
mixed4
*
RESTRICT
velm
,
GLOBAL
const
float4
*
RESTRICT
random
,
unsigned
int
randomIndex
)
{
randomIndex
+=
GLOBAL_ID
;
const
mixed
fscale
=
tauDeltaT
/
(
mixed
)
0x100000000
;
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
numAtoms
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
for
(
int
index
=
GLOBAL_ID
;
index
<
numAtoms
;
index
+=
GLOBAL_SIZE
)
{
mixed
invMass
=
velm
[
index
].
w
;
if
(
invMass
!=
0
)
{
posDelta
[
index
].
x
=
fscale
*
invMass
*
force
[
index
]
+
noiseAmplitude
*
SQRT
(
invMass
)
*
random
[
randomIndex
].
x
;
posDelta
[
index
].
y
=
fscale
*
invMass
*
force
[
index
+
paddedNumAtoms
]
+
noiseAmplitude
*
SQRT
(
invMass
)
*
random
[
randomIndex
].
y
;
posDelta
[
index
].
z
=
fscale
*
invMass
*
force
[
index
+
paddedNumAtoms
*
2
]
+
noiseAmplitude
*
SQRT
(
invMass
)
*
random
[
randomIndex
].
z
;
}
randomIndex
+=
blockDim
.
x
*
gridDim
.
x
;
randomIndex
+=
GLOBAL_SIZE
;
}
}
...
...
@@ -21,9 +21,12 @@ extern "C" __global__ void integrateBrownianPart1(int numAtoms, int paddedNumAto
* Perform the second step of Brownian integration.
*/
extern
"C"
__global__
void
integrateBrownianPart2
(
int
numAtoms
,
mixed
deltaT
,
real4
*
posq
,
real4
*
__restrict__
posqCorrection
,
mixed4
*
velm
,
const
mixed4
*
__restrict__
posDelta
)
{
const
mixed
oneOverDeltaT
=
RECIP
(
deltaT
);
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
numAtoms
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
KERNEL
void
integrateBrownianPart2
(
int
numAtoms
,
mixed
oneOverDeltaT
,
GLOBAL
real4
*
posq
,
GLOBAL
mixed4
*
velm
,
GLOBAL
const
mixed4
*
RESTRICT
posDelta
#ifdef USE_MIXED_PRECISION
,
GLOBAL
real4
*
RESTRICT
posqCorrection
#endif
)
{
for
(
int
index
=
GLOBAL_ID
;
index
<
numAtoms
;
index
+=
GLOBAL_SIZE
)
{
if
(
velm
[
index
].
w
!=
0
)
{
mixed4
delta
=
posDelta
[
index
];
velm
[
index
].
x
=
oneOverDeltaT
*
delta
.
x
;
...
...
platforms/c
uda
/src/kernels/cmapTorsionForce.c
u
→
platforms/c
ommon
/src/kernels/cmapTorsionForce.c
c
View file @
5a06df78
File moved
platforms/
opencl
/src/kernels/customCentroidBond.c
l
→
platforms/
common
/src/kernels/customCentroidBond.c
c
View file @
5a06df78
#
pragma
OPENCL
EXTENSION
cl_khr_int64_base_atomics
:
enable
/**
* Compute the center of each group.
*/
__kernel
void
computeGroupCenters
(
__global
const
real4*
restrict
posq,
__global
const
int*
restrict
groupParticles,
__global
const
real*
restrict
groupWeights,
__global
const
int*
restrict
groupOffsets,
__global
real4*
restrict
centerPositions
)
{
__local
volatile
real3
temp[64]
;
for
(
int
group
=
get_group_id
(
0
)
; group
<
NUM_GROUPS
; group += get_num_groups(0)
) {
KERNEL
void
computeGroupCenters
(
int
numParticleGroups
,
GLOBAL
const
real4
*
RESTRICT
posq
,
GLOBAL
const
int
*
RESTRICT
groupParticles
,
GLOBAL
const
real
*
RESTRICT
groupWeights
,
GLOBAL
const
int
*
RESTRICT
groupOffsets
,
GLOBAL
real4
*
RESTRICT
centerPositions
)
{
LOCAL
volatile
real3
temp
[
64
];
for
(
int
group
=
GROUP_ID
;
group
<
numParticleGroups
;
group
+=
NUM_GROUPS
)
{
// The threads in this block work together to compute the center one group.
int
firstIndex
=
groupOffsets
[
group
];
int
lastIndex
=
groupOffsets
[
group
+
1
];
real3
center
=
(
real3
)
0
;
for
(
int
index
=
get_local_id
(
0
)
; index < lastIndex-firstIndex; index +=
get_local_size(0)
) {
real3
center
=
make_
real3
(
0
)
;
for
(
int
index
=
LOCAL_ID
;
index
<
lastIndex
-
firstIndex
;
index
+=
LOCAL_SIZE
)
{
int
atom
=
groupParticles
[
firstIndex
+
index
];
real
weight
=
groupWeights
[
firstIndex
+
index
];
real4
pos
=
posq
[
atom
];
...
...
@@ -23,18 +21,16 @@ __kernel void computeGroupCenters(__global const real4* restrict posq, __global
// Sum the values.
int
thread
=
get_local_id
(
0
)
;
int
thread
=
LOCAL_ID
;
temp
[
thread
].
x
=
center
.
x
;
temp
[
thread
].
y
=
center
.
y
;
temp
[
thread
].
z
=
center
.
z
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
SYNC_THREADS
;
if
(
thread
<
32
)
{
temp
[
thread
].
x
+=
temp
[
thread
+
32
].
x
;
temp
[
thread
].
y
+=
temp
[
thread
+
32
].
y
;
temp
[
thread
].
z
+=
temp
[
thread
+
32
].
z
;
}
SYNC_WARPS
;
if
(
thread
<
16
)
{
temp
[
thread
].
x
+=
temp
[
thread
+
16
].
x
;
...
...
@@ -47,7 +43,6 @@ __kernel void computeGroupCenters(__global const real4* restrict posq, __global
temp
[
thread
].
y
+=
temp
[
thread
+
8
].
y
;
temp
[
thread
].
z
+=
temp
[
thread
+
8
].
z
;
}
SYNC_WARPS
;
if
(
thread
<
4
)
{
temp
[
thread
].
x
+=
temp
[
thread
+
4
].
x
;
...
...
@@ -60,19 +55,18 @@ __kernel void computeGroupCenters(__global const real4* restrict posq, __global
temp
[
thread
].
y
+=
temp
[
thread
+
2
].
y
;
temp
[
thread
].
z
+=
temp
[
thread
+
2
].
z
;
}
SYNC_WARPS
;
if
(
thread
==
0
)
centerPositions[group]
=
(
real4
)
(
temp[0].x+temp[1].x,
temp[0].y+temp[1].y,
temp[0].z+temp[1].z,
0
)
;
centerPositions
[
group
]
=
make_
real4
(
temp
[
0
].
x
+
temp
[
1
].
x
,
temp
[
0
].
y
+
temp
[
1
].
y
,
temp
[
0
].
z
+
temp
[
1
].
z
,
0
);
}
}
/**
* Compute the difference between two vectors, setting the fourth component to the squared magnitude.
*/
real4
delta
(
real4
vec1,
real4
vec2,
bool
periodic,
real4
periodicBoxSize,
real4
invPeriodicBoxSize,
DEVICE
real4
delta
(
real4
vec1
,
real4
vec2
,
bool
periodic
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
)
{
real4
result
=
(
real4
)
(
vec1.x-vec2.x,
vec1.y-vec2.y,
vec1.z-vec2.z,
0
)
;
real4
result
=
make_
real4
(
vec1
.
x
-
vec2
.
x
,
vec1
.
y
-
vec2
.
y
,
vec1
.
z
-
vec2
.
z
,
0
);
if
(
periodic
)
APPLY_PERIODIC_TO_DELTA
(
result
);
result
.
w
=
result
.
x
*
result
.
x
+
result
.
y
*
result
.
y
+
result
.
z
*
result
.
z
;
...
...
@@ -82,65 +76,64 @@ real4 delta(real4 vec1, real4 vec2, bool periodic, real4 periodicBoxSize, real4
/**
* Compute the angle between two vectors. The w component of each vector should contain the squared magnitude.
*/
real
computeAngle
(
real4
vec1,
real4
vec2
)
{
DEVICE
real
computeAngle
(
real4
vec1
,
real4
vec2
)
{
real
dotProduct
=
vec1
.
x
*
vec2
.
x
+
vec1
.
y
*
vec2
.
y
+
vec1
.
z
*
vec2
.
z
;
real
cosine
=
dotProduct
*
RSQRT
(
vec1
.
w
*
vec2
.
w
);
real
angle
;
if
(
cosine
>
0.99
f
||
cosine
<
-
0.99
f
)
{
// We're close to the singularity in acos(), so take the cross product and use asin() instead.
real
4
crossProduct
=
cross
(
vec1,
vec2
)
;
real
3
crossProduct
=
cross
(
trimTo3
(
vec1
)
,
trimTo3
(
vec2
)
)
;
real
scale
=
vec1
.
w
*
vec2
.
w
;
angle
=
asin
(
SQRT
(
dot
(
crossProduct,
crossProduct
)
/scale
))
;
angle
=
ASIN
(
SQRT
(
dot
(
crossProduct
,
crossProduct
)
/
scale
));
if
(
cosine
<
0
)
angle
=
M_PI
-
angle
;
}
else
angle
=
acos
(
cosine
)
;
angle
=
ACOS
(
cosine
);
return
angle
;
}
/**
* Compute the cross product of two vectors, setting the fourth component to the squared magnitude.
*/
real4
computeCross
(
real4
vec1,
real4
vec2
)
{
real4
result
=
cross
(
vec1,
vec2
)
;
result.w
=
result.x*result.x
+
result.y*result.y
+
result.z*result.z
;
return
result
;
DEVICE
real4
computeCross
(
real4
vec1
,
real4
vec2
)
{
real3
cp
=
cross
(
trimTo3
(
vec1
),
trimTo3
(
vec2
));
return
make_real4
(
cp
.
x
,
cp
.
y
,
cp
.
z
,
cp
.
x
*
cp
.
x
+
cp
.
y
*
cp
.
y
+
cp
.
z
*
cp
.
z
);
}
/**
* Compute the forces on groups based on the bonds.
*/
__kernel
void
computeGroupForces
(
__global
long*
restrict
groupForce,
__global
mixed*
restrict
energyBuffer,
__global
const
real4*
restrict
centerPositions,
__global
const
int*
restrict
bondGroups,
real4
periodicBoxSize,
real4
invPeriodicBoxSize,
real4
periodicBoxVecX,
real4
periodicBoxVecY,
real4
periodicBoxVecZ
KERNEL
void
computeGroupForces
(
int
numParticleGroups
,
GLOBAL
mm_ulong
*
RESTRICT
groupForce
,
GLOBAL
mixed
*
RESTRICT
energyBuffer
,
GLOBAL
const
real4
*
RESTRICT
centerPositions
,
GLOBAL
const
int
*
RESTRICT
bondGroups
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
EXTRA_ARGS
)
{
mixed
energy
=
0
;
INIT_PARAM_DERIVS
for
(
int
index
=
get_global_id
(
0
)
; index < NUM_BONDS; index +=
get_global_size(0)
) {
for
(
int
index
=
GLOBAL_ID
;
index
<
NUM_BONDS
;
index
+=
GLOBAL_SIZE
)
{
COMPUTE_FORCE
}
energyBuffer[
get_global_id
(
0
)
]
+=
energy
;
energyBuffer
[
GLOBAL_ID
]
+=
energy
;
SAVE_PARAM_DERIVS
}
/**
* Apply the forces from the group centers to the individual atoms.
*/
__kernel
void
applyForcesToAtoms
(
__global
const
int*
restrict
groupParticles,
__global
const
real*
restrict
groupWeights,
__global
const
int*
restrict
groupOffsets,
__global
const
long*
restrict
groupForce,
__global
long*
restrict
atomForce
)
{
for
(
int
group
=
get_group_id
(
0
)
; group
<
NUM_GROUPS
; group += get_num_groups(0)
) {
long
fx
=
groupForce[group]
;
long
fy
=
groupForce[group+
NUM_GROUPS
]
;
long
fz
=
groupForce[group+
NUM_GROUPS
*2]
;
KERNEL
void
applyForcesToAtoms
(
int
numParticleGroups
,
GLOBAL
const
int
*
RESTRICT
groupParticles
,
GLOBAL
const
real
*
RESTRICT
groupWeights
,
GLOBAL
const
int
*
RESTRICT
groupOffsets
,
GLOBAL
const
mm_
long
*
RESTRICT
groupForce
,
GLOBAL
mm_ulong
*
RESTRICT
atomForce
)
{
for
(
int
group
=
GROUP_ID
;
group
<
numParticleGroups
;
group
+=
NUM_GROUPS
)
{
mm_
long
fx
=
groupForce
[
group
];
mm_
long
fy
=
groupForce
[
group
+
numParticleGroups
];
mm_
long
fz
=
groupForce
[
group
+
numParticleGroups
*
2
];
int
firstIndex
=
groupOffsets
[
group
];
int
lastIndex
=
groupOffsets
[
group
+
1
];
for
(
int
index
=
get_local_id
(
0
)
; index < lastIndex-firstIndex; index +=
get_local_size(0)
) {
for
(
int
index
=
LOCAL_ID
;
index
<
lastIndex
-
firstIndex
;
index
+=
LOCAL_SIZE
)
{
int
atom
=
groupParticles
[
firstIndex
+
index
];
real
weight
=
groupWeights
[
firstIndex
+
index
];
atom_add
(
&atomForce[atom],
(
long
)
(
fx*weight
))
;
atom_add
(
&atomForce[atom+PADDED_NUM_ATOMS],
(
long
)
(
fy*weight
))
;
atom_add
(
&atomForce[atom+2*PADDED_NUM_ATOMS],
(
long
)
(
fz*weight
))
;
ATOMIC_ADD
(
&
atomForce
[
atom
],
(
mm_ulong
)
((
mm_
long
)
(
fx
*
weight
))
)
;
ATOMIC_ADD
(
&
atomForce
[
atom
+
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_
long
)
(
fy
*
weight
))
)
;
ATOMIC_ADD
(
&
atomForce
[
atom
+
2
*
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_
long
)
(
fz
*
weight
))
)
;
}
}
}
platforms/c
uda
/src/kernels/customCompoundBond.c
u
→
platforms/c
ommon
/src/kernels/customCompoundBond.c
c
View file @
5a06df78
/**
* Convert a real4 to a real3 by removing its last element.
*/
inline
__device__
real3
ccb_trim
(
real4
v
)
{
return
make_real3
(
v
.
x
,
v
.
y
,
v
.
z
);
}
/**
* Compute the difference between two vectors, setting the fourth component to the squared magnitude.
*/
inline
__device__
real4
ccb_delta
(
real4
vec1
,
real4
vec2
,
bool
periodic
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
DEVICE
real4
ccb_delta
(
real4
vec1
,
real4
vec2
,
bool
periodic
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
)
{
real4
result
=
make_real4
(
vec1
.
x
-
vec2
.
x
,
vec1
.
y
-
vec2
.
y
,
vec1
.
z
-
vec2
.
z
,
0
);
if
(
periodic
)
...
...
@@ -20,17 +13,17 @@ inline __device__ real4 ccb_delta(real4 vec1, real4 vec2, bool periodic, real4 p
/**
* Compute the angle between two vectors. The w component of each vector should contain the squared magnitude.
*/
__device__
real
ccb_computeAngle
(
real4
vec1
,
real4
vec2
)
{
DEVICE
real
ccb_computeAngle
(
real4
vec1
,
real4
vec2
)
{
real
dotProduct
=
vec1
.
x
*
vec2
.
x
+
vec1
.
y
*
vec2
.
y
+
vec1
.
z
*
vec2
.
z
;
real
cosine
=
dotProduct
*
RSQRT
(
vec1
.
w
*
vec2
.
w
);
real
angle
;
if
(
cosine
>
0.99
f
||
cosine
<
-
0.99
f
)
{
// We're close to the singularity in acos(), so take the cross product and use asin() instead.
real3
crossProduct
=
cross
(
vec1
,
vec2
);
real3
crossProduct
=
cross
(
trimTo3
(
vec1
)
,
trimTo3
(
vec2
)
)
;
real
scale
=
vec1
.
w
*
vec2
.
w
;
angle
=
ASIN
(
SQRT
(
dot
(
crossProduct
,
crossProduct
)
/
scale
));
if
(
cosine
<
0
.0
f
)
if
(
cosine
<
0
)
angle
=
M_PI
-
angle
;
}
else
...
...
@@ -41,7 +34,8 @@ __device__ real ccb_computeAngle(real4 vec1, real4 vec2) {
/**
* Compute the cross product of two vectors, setting the fourth component to the squared magnitude.
*/
inline
__device__
real4
ccb_computeCross
(
real4
vec1
,
real4
vec2
)
{
real3
cp
=
cross
(
vec1
,
vec2
);
DEVICE
real4
ccb_computeCross
(
real4
vec1
,
real4
vec2
)
{
real3
cp
=
cross
(
trimTo3
(
vec1
)
,
trimTo3
(
vec2
)
)
;
return
make_real4
(
cp
.
x
,
cp
.
y
,
cp
.
z
,
cp
.
x
*
cp
.
x
+
cp
.
y
*
cp
.
y
+
cp
.
z
*
cp
.
z
);
}
platforms/c
uda
/src/kernels/customExternalForce.c
u
→
platforms/c
ommon
/src/kernels/customExternalForce.c
c
View file @
5a06df78
File moved
platforms/c
uda
/src/kernels/customGBChainRule.c
u
→
platforms/c
ommon
/src/kernels/customGBChainRule.c
c
View file @
5a06df78
File moved
platforms/
opencl
/src/kernels/customGBEnergyN2.c
l
→
platforms/
common
/src/kernels/customGBEnergyN2.c
c
View file @
5a06df78
#ifdef SUPPORTS_64_BIT_ATOMICS
#
pragma
OPENCL
EXTENSION
cl_khr_int64_base_atomics
:
enable
#
define
STORE_DERIVATIVE_1
(
INDEX
)
atom_add
(
&derivBuffers[offset+
(
INDEX-1
)
*PADDED_NUM_ATOMS],
(
long
)
(
deriv##INDEX##_1*0x100000000
))
;
#
define
STORE_DERIVATIVE_2
(
INDEX
)
atom_add
(
&derivBuffers[offset+
(
INDEX-1
)
*PADDED_NUM_ATOMS],
(
long
)
(
local_deriv##INDEX[get_local_id
(
0
)
]*0x100000000
))
;
#define STORE_DERIVATIVE_1(INDEX) ATOMIC_ADD(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (deriv##INDEX##_1*0x100000000)));
#define STORE_DERIVATIVE_2(INDEX) ATOMIC_ADD(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (local_deriv##INDEX[LOCAL_ID]*0x100000000)));
#else
#define STORE_DERIVATIVE_1(INDEX) derivBuffers##INDEX[offset] += deriv##INDEX##_1;
#
define
STORE_DERIVATIVE_2
(
INDEX
)
derivBuffers##INDEX[offset]
+=
local_deriv##INDEX[
get_local_id
(
0
)
]
;
#define STORE_DERIVATIVE_2(INDEX) derivBuffers##INDEX[offset] += local_deriv##INDEX[
LOCAL_ID
];
#endif
/**
* Compute a force based on pair interactions.
*/
__kernel
void
computeN2Energy
(
KERNEL
void
computeN2Energy
(
#ifdef SUPPORTS_64_BIT_ATOMICS
__global
long*
restrict
forceBuffers,
GLOBAL
mm_ulong
*
RESTRICT
forceBuffers
,
#else
__global
real4*
restrict
forceBuffers,
GLOBAL
real4
*
RESTRICT
forceBuffers
,
#endif
__global
mixed*
restrict
energyBuffer,
__local
real4*
restrict
local_force
,
__global
const
real4*
restrict
posq,
__local
real4*
restrict
local_posq,
__global
const
unsigned
int*
restrict
exclusions,
__global
const
ushort2*
exclusionTiles,
int
needEnergy,
GLOBAL
mixed
*
RESTRICT
energyBuffer
,
GLOBAL
const
real4
*
RESTRICT
posq
,
GLOBAL
const
unsigned
int
*
RESTRICT
exclusions
,
GLOBAL
const
ushort2
*
exclusionTiles
,
int
needEnergy
,
#ifdef USE_CUTOFF
__global
const
int*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
real4
periodicBoxSize,
real4
invPeriodicBoxSize,
real4
periodicBoxVecX,
real4
periodicBoxVecY,
real4
periodicBoxVecZ,
unsigned
int
maxTiles,
__global
const
real4*
restrict
blockCenter,
__global
const
real4*
restrict
blockSize,
__global
const
int*
restrict
interactingAtoms
GLOBAL
const
int
*
RESTRICT
tiles
,
GLOBAL
const
unsigned
int
*
RESTRICT
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
unsigned
int
maxTiles
,
GLOBAL
const
real4
*
RESTRICT
blockCenter
,
GLOBAL
const
real4
*
RESTRICT
blockSize
,
GLOBAL
const
int
*
RESTRICT
interactingAtoms
#else
unsigned
int
numTiles
#endif
PARAMETER_ARGUMENTS
)
{
const
unsigned
int
totalWarps
=
get_global_size
(
0
)
/TILE_SIZE
;
const
unsigned
int
warp
=
get_global_id
(
0
)
/TILE_SIZE
;
const
unsigned
int
tgx
=
get_local_id
(
0
)
&
(
TILE_SIZE-1
)
;
const
unsigned
int
tbx
=
get_local_id
(
0
)
-
tgx
;
const
unsigned
int
totalWarps
=
GLOBAL_SIZE
/
TILE_SIZE
;
const
unsigned
int
warp
=
GLOBAL_ID
/
TILE_SIZE
;
const
unsigned
int
tgx
=
LOCAL_ID
&
(
TILE_SIZE
-
1
);
const
unsigned
int
tbx
=
LOCAL_ID
-
tgx
;
mixed
energy
=
0
;
INIT_PARAM_DERIVS
LOCAL
real3
local_pos
[
LOCAL_BUFFER_SIZE
];
LOCAL
real3
local_force
[
LOCAL_BUFFER_SIZE
];
ATOM_PARAMETER_DATA
// First loop: process tiles that contain exclusions.
const
unsigned
int
firstExclusionTile
=
FIRST_EXCLUSION_TILE+warp*
(
LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE
)
/totalWarps
;
const
unsigned
int
lastExclusionTile
=
FIRST_EXCLUSION_TILE+
(
warp+1
)
*
(
LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE
)
/totalWarps
;
const
int
firstExclusionTile
=
FIRST_EXCLUSION_TILE
+
warp
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
totalWarps
;
const
int
lastExclusionTile
=
FIRST_EXCLUSION_TILE
+
(
warp
+
1
)
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
totalWarps
;
for
(
int
pos
=
firstExclusionTile
;
pos
<
lastExclusionTile
;
pos
++
)
{
const
ushort2
tileIndices
=
exclusionTiles
[
pos
];
const
unsigned
int
x
=
tileIndices
.
x
;
const
unsigned
int
y
=
tileIndices
.
y
;
real
4
force
=
0
;
real
3
force
=
make_real3
(
0
)
;
DECLARE_ATOM1_DERIVATIVES
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
real
4
pos
q
1
=
posq[atom1]
;
real
3
pos1
=
trimTo3
(
posq
[
atom1
]
)
;
LOAD_ATOM1_PARAMETERS
#ifdef USE_EXCLUSIONS
unsigned
int
excl
=
exclusions
[
pos
*
TILE_SIZE
+
tgx
];
...
...
@@ -53,14 +55,14 @@ __kernel void computeN2Energy(
if
(
x
==
y
)
{
// This tile is on the diagonal.
const
unsigned
int
localAtomIndex
=
get_local_id
(
0
)
;
local_pos
q
[localAtomIndex]
=
pos
q
1
;
const
unsigned
int
localAtomIndex
=
LOCAL_ID
;
local_pos
[
localAtomIndex
]
=
pos1
;
LOAD_LOCAL_PARAMETERS_FROM_1
SYNC_WARPS
;
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
tbx
+
j
;
real
4
pos
q
2
=
local_pos
q
[atom2]
;
real
4
delta
=
(
real
4
)
(
pos
q
2.x
yz
-
pos
q
1.
xyz,
0
)
;
real
3
pos2
=
local_pos
[
atom2
];
real
3
delta
=
make_
real
3
(
pos2
.
x
-
pos1
.
x
,
pos2
.
y
-
pos1
.
y
,
pos2
.
z
-
pos1
.
z
);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
delta
)
#endif
...
...
@@ -84,8 +86,10 @@ __kernel void computeN2Energy(
}
if
(
needEnergy
)
energy
+=
0.5
f
*
tempEnergy
;
delta.xyz
*=
dEdR
;
force.xyz
-=
delta.xyz
;
delta
*=
dEdR
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
z
-=
delta
.
z
;
#ifdef USE_CUTOFF
}
#endif
...
...
@@ -98,11 +102,11 @@ __kernel void computeN2Energy(
else
{
// This is an off-diagonal tile.
const
unsigned
int
localAtomIndex
=
get_local_id
(
0
)
;
const
unsigned
int
localAtomIndex
=
LOCAL_ID
;
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
local_pos
q
[localAtomIndex]
=
posq[j]
;
local_pos
[
localAtomIndex
]
=
trimTo3
(
posq
[
j
]
)
;
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
local_force[localAtomIndex]
=
0
;
local_force
[
localAtomIndex
]
=
make_real3
(
0
)
;
CLEAR_LOCAL_DERIVATIVES
SYNC_WARPS
;
#ifdef USE_EXCLUSIONS
...
...
@@ -111,8 +115,8 @@ __kernel void computeN2Energy(
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
tbx
+
tj
;
real
4
pos
q
2 = local_pos
q
[atom2];
real
4
delta =
(
real
4)
(pos
q
2.x
yz -
pos
q
1.
xyz, 0
);
real
3
pos2
=
local_pos
[
atom2
];
real
3
delta
=
make_
real
3
(
pos2
.
x
-
pos1
.
x
,
pos2
.
y
-
pos1
.
y
,
pos2
.
z
-
pos1
.
z
);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
delta
)
#endif
...
...
@@ -126,7 +130,7 @@ __kernel void computeN2Energy(
atom2
=
y
*
TILE_SIZE
+
tj
;
real
dEdR
=
0
;
real
tempEnergy
=
0
;
const real interactionScale = 1
.0f
;
const
real
interactionScale
=
1
;
#ifdef USE_EXCLUSIONS
bool
isExcluded
=
!
(
excl
&
0x1
);
#endif
...
...
@@ -136,10 +140,12 @@ __kernel void computeN2Energy(
}
if
(
needEnergy
)
energy
+=
tempEnergy
;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
delta
*=
dEdR
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
z
-=
delta
.
z
;
atom2
=
tbx
+
tj
;
local_force[atom2]
.xyz
+= delta
.xyz
;
local_force
[
atom2
]
+=
delta
;
RECORD_DERIVATIVE_2
#ifdef USE_CUTOFF
}
...
...
@@ -151,20 +157,20 @@ __kernel void computeN2Energy(
SYNC_WARPS
;
}
}
// Write results.
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
atom_add
(&forceBuffers[offset], (long) (force.x*0x100000000));
atom_add
(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
atom_add
(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
ATOMIC_ADD
(
&
forceBuffers
[
offset
],
(
mm_ulong
)
((
mm_
long
)
(
force
.
x
*
0x100000000
))
)
;
ATOMIC_ADD
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_
long
)
(
force
.
y
*
0x100000000
))
)
;
ATOMIC_ADD
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_
long
)
(
force
.
z
*
0x100000000
))
)
;
STORE_DERIVATIVES_1
if
(
x
!=
y
)
{
offset
=
y
*
TILE_SIZE
+
tgx
;
atom_add
(&forceBuffers[offset], (long) (local_force[
get_local_id(0)
].x*0x100000000));
atom_add
(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (local_force[
get_local_id(0)
].y*0x100000000));
atom_add
(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (local_force[
get_local_id(0)
].z*0x100000000));
ATOMIC_ADD
(
&
forceBuffers
[
offset
],
(
mm_ulong
)
((
mm_
long
)
(
local_force
[
LOCAL_ID
].
x
*
0x100000000
))
)
;
ATOMIC_ADD
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_
long
)
(
local_force
[
LOCAL_ID
].
y
*
0x100000000
))
)
;
ATOMIC_ADD
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_
long
)
(
local_force
[
LOCAL_ID
].
z
*
0x100000000
))
)
;
STORE_DERIVATIVES_2
}
#else
...
...
@@ -175,7 +181,7 @@ __kernel void computeN2Energy(
STORE_DERIVATIVES_1
if
(
x
!=
y
)
{
offset
=
offset2
;
forceBuffers[offset2] += (real4) (local_force[
get_local_id(0)
].x, local_force[
get_local_id(0)
].y, local_force[
get_local_id(0)
].z, 0.0f);
forceBuffers
[
offset2
]
+=
(
real4
)
(
local_force
[
LOCAL_ID
].
x
,
local_force
[
LOCAL_ID
].
y
,
local_force
[
LOCAL_ID
].
z
,
0.0
f
);
STORE_DERIVATIVES_2
}
#endif
...
...
@@ -188,21 +194,21 @@ __kernel void computeN2Energy(
unsigned
int
numTiles
=
interactionCount
[
0
];
if
(
numTiles
>
maxTiles
)
return
;
// There wasn't enough memory for the neighbor list.
int pos = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
int
pos
=
(
int
)
(
warp
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
((
mm_
long
)
NUM_BLOCKS
+
1
)
/
2
:
(
mm_
long
)
numTiles
)
/
totalWarps
);
int
end
=
(
int
)
((
warp
+
1
)
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
((
mm_
long
)
NUM_BLOCKS
+
1
)
/
2
:
(
mm_
long
)
numTiles
)
/
totalWarps
);
#else
int pos = (int) (warp*(long)numTiles/totalWarps);
int end = (int) ((warp+1)*(long)numTiles/totalWarps);
int
pos
=
(
int
)
(
warp
*
(
mm_
long
)
numTiles
/
totalWarps
);
int
end
=
(
int
)
((
warp
+
1
)
*
(
mm_
long
)
numTiles
/
totalWarps
);
#endif
int
skipBase
=
0
;
int
currentSkipIndex
=
tbx
;
__local
int atomIndices[
FORCE_WORK_GROUP
_SIZE];
__local
volatile int skipTiles[
FORCE_WORK_GROUP
_SIZE];
skipTiles[
get_local_id(0)
] = -1;
LOCAL
int
atomIndices
[
LOCAL_BUFFER
_SIZE
];
LOCAL
volatile
int
skipTiles
[
LOCAL_BUFFER
_SIZE
];
skipTiles
[
LOCAL_ID
]
=
-
1
;
while
(
pos
<
end
)
{
const
bool
isExcluded
=
false
;
real
4
force =
0
;
real
3
force
=
make_real3
(
0
)
;
DECLARE_ATOM1_DERIVATIVES
bool
includeTile
=
true
;
...
...
@@ -231,10 +237,10 @@ __kernel void computeN2Energy(
SYNC_WARPS
;
if
(
skipBase
+
tgx
<
NUM_TILES_WITH_EXCLUSIONS
)
{
ushort2
tile
=
exclusionTiles
[
skipBase
+
tgx
];
skipTiles[
get_local_id
(
0
)
]
=
tile.x
+
tile.y*NUM_BLOCKS
-
tile.y*
(
tile.y+1
)
/2
;
skipTiles
[
LOCAL_ID
]
=
tile
.
x
+
tile
.
y
*
NUM_BLOCKS
-
tile
.
y
*
(
tile
.
y
+
1
)
/
2
;
}
else
skipTiles[
get_local_id
(
0
)
]
=
end
;
skipTiles
[
LOCAL_ID
]
=
end
;
skipBase
+=
TILE_SIZE
;
currentSkipIndex
=
tbx
;
SYNC_WARPS
;
...
...
@@ -247,20 +253,20 @@ __kernel void computeN2Energy(
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
// Load atom data for this tile.
real
4
pos
q
1
=
posq[atom1]
;
real
3
pos1
=
trimTo3
(
posq
[
atom1
]
)
;
LOAD_ATOM1_PARAMETERS
const
unsigned
int
localAtomIndex
=
get_local_id
(
0
)
;
const
unsigned
int
localAtomIndex
=
LOCAL_ID
;
#ifdef USE_CUTOFF
unsigned
int
j
=
interactingAtoms
[
pos
*
TILE_SIZE
+
tgx
];
#else
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
#endif
atomIndices[
get_local_id
(
0
)
]
=
j
;
atomIndices
[
LOCAL_ID
]
=
j
;
if
(
j
<
PADDED_NUM_ATOMS
)
{
local_pos
q
[localAtomIndex]
=
posq[j]
;
local_pos
[
localAtomIndex
]
=
trimTo3
(
posq
[
j
]
)
;
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
local_force[localAtomIndex]
=
0
;
local_force
[
localAtomIndex
]
=
make_real3
(
0
)
;
CLEAR_LOCAL_DERIVATIVES
}
SYNC_WARPS
;
...
...
@@ -270,14 +276,14 @@ __kernel void computeN2Energy(
// box, then skip having to apply periodic boundary conditions later.
real4
blockCenterX
=
blockCenter
[
x
];
APPLY_PERIODIC_TO_POS_WITH_CENTER
(
pos
q
1,
blockCenterX
)
APPLY_PERIODIC_TO_POS_WITH_CENTER
(
local_pos
q[get_local_id
(
0
)
],
blockCenterX
)
APPLY_PERIODIC_TO_POS_WITH_CENTER
(
pos1
,
blockCenterX
)
APPLY_PERIODIC_TO_POS_WITH_CENTER
(
local_pos
[
LOCAL_ID
],
blockCenterX
)
SYNC_WARPS
;
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
tbx
+
tj
;
real
4
pos
q
2
=
local_pos
q
[atom2]
;
real
4
delta
=
(
real
4
)
(
pos
q
2.x
yz
-
pos
q
1.
xyz,
0
)
;
real
3
pos2
=
local_pos
[
atom2
];
real
3
delta
=
make_
real
3
(
pos2
.
x
-
pos1
.
x
,
pos2
.
y
-
pos1
.
y
,
pos2
.
z
-
pos1
.
z
);
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
if
(
r2
<
CUTOFF_SQUARED
)
{
real
invR
=
RSQRT
(
r2
);
...
...
@@ -286,17 +292,19 @@ __kernel void computeN2Energy(
atom2
=
atomIndices
[
tbx
+
tj
];
real
dEdR
=
0
;
real
tempEnergy
=
0
;
const
real
interactionScale
=
1
.0f
;
const
real
interactionScale
=
1
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
COMPUTE_INTERACTION
dEdR
/=
-
r
;
}
if
(
needEnergy
)
energy
+=
tempEnergy
;
delta.xyz
*=
dEdR
;
force.xyz
-=
delta.xyz
;
delta
*=
dEdR
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
z
-=
delta
.
z
;
atom2
=
tbx
+
tj
;
local_force[atom2]
.xyz
+=
delta
.xyz
;
local_force
[
atom2
]
+=
delta
;
RECORD_DERIVATIVE_2
}
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
...
...
@@ -311,8 +319,8 @@ __kernel void computeN2Energy(
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
tbx
+
tj
;
real
4
pos
q
2
=
local_pos
q
[atom2]
;
real
4
delta
=
(
real
4
)
(
pos
q
2.x
yz
-
pos
q
1.
xyz,
0
)
;
real
3
pos2
=
local_pos
[
atom2
];
real
3
delta
=
make_
real
3
(
pos2
.
x
-
pos1
.
x
,
pos2
.
y
-
pos1
.
y
,
pos2
.
z
-
pos1
.
z
);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
delta
)
#endif
...
...
@@ -326,17 +334,19 @@ __kernel void computeN2Energy(
atom2
=
atomIndices
[
tbx
+
tj
];
real
dEdR
=
0
;
real
tempEnergy
=
0
;
const
real
interactionScale
=
1
.0f
;
const
real
interactionScale
=
1
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
COMPUTE_INTERACTION
dEdR
/=
-
r
;
}
if
(
needEnergy
)
energy
+=
tempEnergy
;
delta.xyz
*=
dEdR
;
force.xyz
-=
delta.xyz
;
delta
*=
dEdR
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
z
-=
delta
.
z
;
atom2
=
tbx
+
tj
;
local_force[atom2]
.xyz
+=
delta
.xyz
;
local_force
[
atom2
]
+=
delta
;
RECORD_DERIVATIVE_2
#ifdef USE_CUTOFF
}
...
...
@@ -347,22 +357,22 @@ __kernel void computeN2Energy(
}
// Write results.
#ifdef USE_CUTOFF
unsigned
int
atom2
=
atomIndices[
get_local_id
(
0
)
]
;
unsigned
int
atom2
=
atomIndices
[
LOCAL_ID
];
#else
unsigned
int
atom2
=
y
*
TILE_SIZE
+
tgx
;
#endif
#ifdef SUPPORTS_64_BIT_ATOMICS
atom_add
(
&forceBuffers[atom1],
(
long
)
(
force.x*0x100000000
))
;
atom_add
(
&forceBuffers[atom1+PADDED_NUM_ATOMS],
(
long
)
(
force.y*0x100000000
))
;
atom_add
(
&forceBuffers[atom1+2*PADDED_NUM_ATOMS],
(
long
)
(
force.z*0x100000000
))
;
ATOMIC_ADD
(
&
forceBuffers
[
atom1
],
(
mm_ulong
)
((
mm_
long
)
(
force
.
x
*
0x100000000
))
)
;
ATOMIC_ADD
(
&
forceBuffers
[
atom1
+
PADDED_NUM_ATOMS
],
(
mm_u
long
)
((
mm_long
)
(
force
.
y
*
0x100000000
))
)
;
ATOMIC_ADD
(
&
forceBuffers
[
atom1
+
2
*
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_
long
)
(
force
.
z
*
0x100000000
))
)
;
unsigned
int
offset
=
atom1
;
STORE_DERIVATIVES_1
if
(
atom2
<
PADDED_NUM_ATOMS
)
{
atom_add
(
&forceBuffers[atom2],
(
long
)
(
local_force[
get_local_id
(
0
)
].x*0x100000000
))
;
atom_add
(
&forceBuffers[atom2+PADDED_NUM_ATOMS],
(
long
)
(
local_force[
get_local_id
(
0
)
].y*0x100000000
))
;
atom_add
(
&forceBuffers[atom2+2*PADDED_NUM_ATOMS],
(
long
)
(
local_force[
get_local_id
(
0
)
].z*0x100000000
))
;
ATOMIC_ADD
(
&
forceBuffers
[
atom2
],
(
mm_ulong
)
((
mm_
long
)
(
local_force
[
LOCAL_ID
].
x
*
0x100000000
))
)
;
ATOMIC_ADD
(
&
forceBuffers
[
atom2
+
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_
long
)
(
local_force
[
LOCAL_ID
].
y
*
0x100000000
))
)
;
ATOMIC_ADD
(
&
forceBuffers
[
atom2
+
2
*
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_
long
)
(
local_force
[
LOCAL_ID
].
z
*
0x100000000
))
)
;
offset
=
atom2
;
STORE_DERIVATIVES_2
}
...
...
@@ -373,7 +383,7 @@ __kernel void computeN2Energy(
unsigned
int
offset
=
offset1
;
STORE_DERIVATIVES_1
if
(
atom2
<
PADDED_NUM_ATOMS
)
{
forceBuffers[offset2]
+=
(
real4
)
(
local_force[
get_local_id
(
0
)
].x,
local_force[
get_local_id
(
0
)
].y,
local_force[
get_local_id
(
0
)
].z,
0.0f
)
;
forceBuffers
[
offset2
]
+=
(
real4
)
(
local_force
[
LOCAL_ID
].
x
,
local_force
[
LOCAL_ID
].
y
,
local_force
[
LOCAL_ID
].
z
,
0.0
f
);
offset
=
offset2
;
STORE_DERIVATIVES_2
}
...
...
@@ -381,6 +391,6 @@ __kernel void computeN2Energy(
}
pos
++
;
}
energyBuffer[
get_global_id
(
0
)
]
+=
energy
;
energyBuffer
[
GLOBAL_ID
]
+=
energy
;
SAVE_PARAM_DERIVS
}
platforms/
opencl
/src/kernels/customGBEnergyN2_cpu.c
l
→
platforms/
common
/src/kernels/customGBEnergyN2_cpu.c
c
View file @
5a06df78
#ifdef SUPPORTS_64_BIT_ATOMICS
#
pragma
OPENCL
EXTENSION
cl_khr_int64_base_atomics
:
enable
#
define
STORE_DERIVATIVE_1
(
INDEX
)
atom_add
(
&derivBuffers[offset+
(
INDEX-1
)
*PADDED_NUM_ATOMS],
(
long
)
(
deriv##INDEX##_1*0x100000000
))
;
#
define
STORE_DERIVATIVE_2
(
INDEX
)
atom_add
(
&derivBuffers[offset+
(
INDEX-1
)
*PADDED_NUM_ATOMS],
(
long
)
(
local_deriv##INDEX[tgx]*0x100000000
))
;
#define STORE_DERIVATIVE_1(INDEX) ATOMIC_ADD(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (deriv##INDEX##_1*0x100000000)));
#define STORE_DERIVATIVE_2(INDEX) ATOMIC_ADD(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (local_deriv##INDEX[tgx]*0x100000000)));
#else
#define STORE_DERIVATIVE_1(INDEX) derivBuffers##INDEX[offset] += deriv##INDEX##_1;
#define STORE_DERIVATIVE_2(INDEX) derivBuffers##INDEX[offset] += local_deriv##INDEX[tgx];
...
...
@@ -10,30 +9,33 @@
/**
* Compute a force based on pair interactions.
*/
__kernel
void
computeN2Energy
(
KERNEL
void
computeN2Energy
(
#ifdef SUPPORTS_64_BIT_ATOMICS
__global
long*
restrict
forceBuffers,
GLOBAL
mm_ulong
*
RESTRICT
forceBuffers
,
#else
__global
real4*
restrict
forceBuffers,
GLOBAL
real4
*
RESTRICT
forceBuffers
,
#endif
__global
mixed*
restrict
energyBuffer,
__local
real4*
restrict
local_force
,
__global
const
real4*
restrict
posq,
__local
real4*
restrict
local_posq,
__global
const
unsigned
int*
restrict
exclusions,
__global
const
ushort2*
exclusionTiles,
int
needEnergy,
GLOBAL
mixed
*
RESTRICT
energyBuffer
,
GLOBAL
const
real4
*
RESTRICT
posq
,
GLOBAL
const
unsigned
int
*
RESTRICT
exclusions
,
GLOBAL
const
ushort2
*
exclusionTiles
,
int
needEnergy
,
#ifdef USE_CUTOFF
__global
const
int*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
real4
periodicBoxSize,
real4
invPeriodicBoxSize,
real4
periodicBoxVecX,
real4
periodicBoxVecY,
real4
periodicBoxVecZ,
unsigned
int
maxTiles,
__global
const
real4*
restrict
blockCenter,
__global
const
real4*
restrict
blockSize,
__global
const
int*
restrict
interactingAtoms
GLOBAL
const
int
*
RESTRICT
tiles
,
GLOBAL
const
unsigned
int
*
RESTRICT
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
unsigned
int
maxTiles
,
GLOBAL
const
real4
*
RESTRICT
blockCenter
,
GLOBAL
const
real4
*
RESTRICT
blockSize
,
GLOBAL
const
int
*
RESTRICT
interactingAtoms
#else
unsigned
int
numTiles
#endif
PARAMETER_ARGUMENTS
)
{
mixed
energy
=
0
;
INIT_PARAM_DERIVS
LOCAL
real3
local_pos
[
LOCAL_BUFFER_SIZE
];
LOCAL
real3
local_force
[
LOCAL_BUFFER_SIZE
];
ATOM_PARAMETER_DATA
// First loop: process tiles that contain exclusions.
const
unsigned
int
firstExclusionTile
=
FIRST_EXCLUSION_TILE+
get_group_id
(
0
)
*
(
LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE
)
/
get_num_groups
(
0
)
;
const
unsigned
int
lastExclusionTile
=
FIRST_EXCLUSION_TILE+
(
get_group_id
(
0
)
+1
)
*
(
LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE
)
/
get_num_groups
(
0
)
;
const
int
firstExclusionTile
=
FIRST_EXCLUSION_TILE
+
GROUP_ID
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
NUM_GROUPS
;
const
int
lastExclusionTile
=
FIRST_EXCLUSION_TILE
+
(
GROUP_ID
+
1
)
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
NUM_GROUPS
;
for
(
int
pos
=
firstExclusionTile
;
pos
<
lastExclusionTile
;
pos
++
)
{
const
ushort2
tileIndices
=
exclusionTiles
[
pos
];
const
unsigned
int
x
=
tileIndices
.
x
;
...
...
@@ -43,7 +45,7 @@ __kernel void computeN2Energy(
for
(
int
localAtomIndex
=
0
;
localAtomIndex
<
TILE_SIZE
;
localAtomIndex
++
)
{
unsigned
int
j
=
y
*
TILE_SIZE
+
localAtomIndex
;
local_pos
q
[localAtomIndex]
=
posq[j]
;
local_pos
[
localAtomIndex
]
=
trimTo3
(
posq
[
j
]
)
;
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
if
(
x
==
y
)
{
...
...
@@ -56,15 +58,15 @@ __kernel void computeN2Energy(
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
real4
force
=
0
;
DECLARE_ATOM1_DERIVATIVES
real
4
pos
q
1
=
posq[atom1]
;
real
3
pos1
=
trimTo3
(
posq
[
atom1
]
)
;
LOAD_ATOM1_PARAMETERS
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
real
4
pos
q
2
=
local_pos
q
[j]
;
real
4
delta
=
(
real
4
)
(
pos
q
2.x
yz
-
pos
q
1.
xyz,
0
)
;
real
3
pos2
=
local_pos
[
j
];
real
3
delta
=
make_
real
3
(
pos2
.
x
-
pos1
.
x
,
pos2
.
y
-
pos1
.
y
,
pos2
.
z
-
pos1
.
z
);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
delta
)
#endif
real
r2
=
dot
(
delta
.xyz
,
delta
.xyz
)
;
real
r2
=
dot
(
delta
,
delta
);
#ifdef USE_CUTOFF
if
(
r2
<
CUTOFF_SQUARED
)
{
#endif
...
...
@@ -84,8 +86,10 @@ __kernel void computeN2Energy(
dEdR
/=
-
r
;
}
energy
+=
0.5
f
*
tempEnergy
;
delta.xyz
*=
dEdR
;
force.xyz
-=
delta.xyz
;
delta
*=
dEdR
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
z
-=
delta
.
z
;
#ifdef USE_CUTOFF
}
#endif
...
...
@@ -98,12 +102,12 @@ __kernel void computeN2Energy(
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned
int
offset
=
atom1
;
atom_add
(
&forceBuffers[offset],
(
long
)
(
force.x*0x100000000
))
;
atom_add
(
&forceBuffers[offset+PADDED_NUM_ATOMS],
(
long
)
(
force.y*0x100000000
))
;
atom_add
(
&forceBuffers[offset+2*PADDED_NUM_ATOMS],
(
long
)
(
force.z*0x100000000
))
;
ATOMIC_ADD
(
&
forceBuffers
[
offset
],
(
mm_ulong
)
((
mm_
long
)
(
force
.
x
*
0x100000000
))
)
;
ATOMIC_ADD
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_
long
)
(
force
.
y
*
0x100000000
))
)
;
ATOMIC_ADD
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_
long
)
(
force
.
z
*
0x100000000
))
)
;
STORE_DERIVATIVES_1
#else
unsigned
int
offset
=
atom1
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset
=
atom1
+
GROUP_ID
*
PADDED_NUM_ATOMS
;
forceBuffers
[
offset
].
xyz
+=
force
.
xyz
;
STORE_DERIVATIVES_1
#endif
...
...
@@ -123,11 +127,11 @@ __kernel void computeN2Energy(
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
real4
force
=
0
;
DECLARE_ATOM1_DERIVATIVES
real
4
pos
q
1
=
posq[atom1]
;
real
3
pos1
=
trimTo3
(
posq
[
atom1
]
)
;
LOAD_ATOM1_PARAMETERS
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
real
4
pos
q
2
=
local_pos
q
[j]
;
real
4
delta
=
(
real
4
)
(
pos
q
2.x
yz
-
pos
q
1.
xyz,
0
)
;
real
3
pos2
=
local_pos
[
j
];
real
3
delta
=
make_
real
3
(
pos2
.
x
-
pos1
.
x
,
pos2
.
y
-
pos1
.
y
,
pos2
.
z
-
pos1
.
z
);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
delta
)
#endif
...
...
@@ -153,8 +157,10 @@ __kernel void computeN2Energy(
dEdR
/=
-
r
;
}
energy
+=
tempEnergy
;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
delta
*=
dEdR
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
z
-=
delta
.
z
;
atom2
=
j
;
local_force
[
atom2
].
xyz
+=
delta
.
xyz
;
RECORD_DERIVATIVE_2
...
...
@@ -170,12 +176,12 @@ __kernel void computeN2Energy(
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned
int
offset
=
atom1
;
atom_add
(&forceBuffers[offset], (long) (force.x*0x100000000));
atom_add
(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
atom_add
(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
ATOMIC_ADD
(
&
forceBuffers
[
offset
],
(
mm_ulong
)
((
mm_
long
)
(
force
.
x
*
0x100000000
))
)
;
ATOMIC_ADD
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_
long
)
(
force
.
y
*
0x100000000
))
)
;
ATOMIC_ADD
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_
long
)
(
force
.
z
*
0x100000000
))
)
;
STORE_DERIVATIVES_1
#else
unsigned int offset = atom1 +
get_group_id(0)
*PADDED_NUM_ATOMS;
unsigned
int
offset
=
atom1
+
GROUP_ID
*
PADDED_NUM_ATOMS
;
forceBuffers
[
offset
].
xyz
+=
force
.
xyz
;
STORE_DERIVATIVES_1
#endif
...
...
@@ -186,12 +192,12 @@ __kernel void computeN2Energy(
for
(
int
tgx
=
0
;
tgx
<
TILE_SIZE
;
tgx
++
)
{
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned
int
offset
=
y
*
TILE_SIZE
+
tgx
;
atom_add
(&forceBuffers[offset], (long) (local_force[tgx].x*0x100000000));
atom_add
(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (local_force[tgx].y*0x100000000));
atom_add
(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (local_force[tgx].z*0x100000000));
ATOMIC_ADD
(
&
forceBuffers
[
offset
],
(
mm_ulong
)
((
mm_
long
)
(
local_force
[
tgx
].
x
*
0x100000000
))
)
;
ATOMIC_ADD
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_
long
)
(
local_force
[
tgx
].
y
*
0x100000000
))
)
;
ATOMIC_ADD
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_
long
)
(
local_force
[
tgx
].
z
*
0x100000000
))
)
;
STORE_DERIVATIVES_2
#else
unsigned int offset = y*TILE_SIZE+tgx +
get_group_id(0)
*PADDED_NUM_ATOMS;
unsigned
int
offset
=
y
*
TILE_SIZE
+
tgx
+
GROUP_ID
*
PADDED_NUM_ATOMS
;
forceBuffers
[
offset
].
xyz
+=
local_force
[
tgx
].
xyz
;
STORE_DERIVATIVES_2
#endif
...
...
@@ -206,15 +212,15 @@ __kernel void computeN2Energy(
const
unsigned
int
numTiles
=
interactionCount
[
0
];
if
(
numTiles
>
maxTiles
)
return
;
// There wasn't enough memory for the neighbor list.
int pos = (int) (
get_group_id(0)
*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : numTiles)/
get_num_groups(0)
);
int end = (int) ((
get_group_id(0)
+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : numTiles)/
get_num_groups(0)
);
int
pos
=
(
int
)
(
GROUP_ID
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
((
mm_
long
)
NUM_BLOCKS
+
1
)
/
2
:
numTiles
)
/
NUM_GROUPS
);
int
end
=
(
int
)
((
GROUP_ID
+
1
)
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
((
mm_
long
)
NUM_BLOCKS
+
1
)
/
2
:
numTiles
)
/
NUM_GROUPS
);
#else
int pos = (int) (
get_group_id(0)*(long)numTiles/get_num_groups(0)
);
int end = (int) ((
get_group_id(0)
+1)*(long)numTiles/
get_num_groups(0)
);
int
pos
=
(
int
)
(
GROUP_ID
*
(
mm_long
)
numTiles
/
NUM_GROUPS
);
int
end
=
(
int
)
((
GROUP_ID
+
1
)
*
(
mm_
long
)
numTiles
/
NUM_GROUPS
);
#endif
int
nextToSkip
=
-
1
;
int
currentSkipIndex
=
0
;
__local
int atomIndices[TILE_SIZE];
LOCAL
int
atomIndices
[
TILE_SIZE
];
while
(
pos
<
end
)
{
const
bool
isExcluded
=
false
;
...
...
@@ -261,7 +267,7 @@ __kernel void computeN2Energy(
#endif
atomIndices
[
localAtomIndex
]
=
j
;
if
(
j
<
PADDED_NUM_ATOMS
)
{
local_pos
q
[localAtomIndex]
=
posq[j]
;
local_pos
[
localAtomIndex
]
=
trimTo3
(
posq
[
j
]
)
;
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
local_force
[
localAtomIndex
]
=
0
;
CLEAR_LOCAL_DERIVATIVES
...
...
@@ -274,17 +280,17 @@ __kernel void computeN2Energy(
real4
blockCenterX
=
blockCenter
[
x
];
for
(
unsigned
int
tgx
=
0
;
tgx
<
TILE_SIZE
;
tgx
++
)
APPLY_PERIODIC_TO_POS_WITH_CENTER
(
local_pos
q
[tgx],
blockCenterX
)
APPLY_PERIODIC_TO_POS_WITH_CENTER
(
local_pos
[
tgx
],
blockCenterX
)
for
(
unsigned
int
tgx
=
0
;
tgx
<
TILE_SIZE
;
tgx
++
)
{
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
real4
force
=
0
;
DECLARE_ATOM1_DERIVATIVES
real
4
pos
q
1
=
posq[atom1]
;
APPLY_PERIODIC_TO_POS_WITH_CENTER
(
pos
q
1,
blockCenterX
)
real
3
pos1
=
trimTo3
(
posq
[
atom1
]
)
;
APPLY_PERIODIC_TO_POS_WITH_CENTER
(
pos1
,
blockCenterX
)
LOAD_ATOM1_PARAMETERS
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
real
4
pos
q
2
=
local_pos
q
[j]
;
real
4
delta
=
(
real
4
)
(
pos
q
2.x
yz
-
pos
q
1.
xyz,
0
)
;
real
3
pos2
=
local_pos
[
j
];
real
3
delta
=
make_
real
3
(
pos2
.
x
-
pos1
.
x
,
pos2
.
y
-
pos1
.
y
,
pos2
.
z
-
pos1
.
z
);
real
r2
=
dot
(
delta
.
xyz
,
delta
.
xyz
);
if
(
atom1
<
NUM_ATOMS
&&
atomIndices
[
j
]
<
NUM_ATOMS
&&
r2
<
CUTOFF_SQUARED
)
{
real
invR
=
RSQRT
(
r2
);
...
...
@@ -298,8 +304,10 @@ __kernel void computeN2Energy(
COMPUTE_INTERACTION
dEdR
/=
-
r
;
energy
+=
tempEnergy
;
delta.xyz
*=
dEdR
;
force.xyz
-=
delta.xyz
;
delta
*=
dEdR
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
z
-=
delta
.
z
;
atom2
=
j
;
local_force
[
atom2
].
xyz
+=
delta
.
xyz
;
RECORD_DERIVATIVE_2
...
...
@@ -310,12 +318,12 @@ __kernel void computeN2Energy(
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned
int
offset
=
atom1
;
atom_add
(
&forceBuffers[offset],
(
long
)
(
force.x*0x100000000
))
;
atom_add
(
&forceBuffers[offset+PADDED_NUM_ATOMS],
(
long
)
(
force.y*0x100000000
))
;
atom_add
(
&forceBuffers[offset+2*PADDED_NUM_ATOMS],
(
long
)
(
force.z*0x100000000
))
;
ATOMIC_ADD
(
&
forceBuffers
[
offset
],
(
mm_ulong
)
((
mm_
long
)
(
force
.
x
*
0x100000000
))
)
;
ATOMIC_ADD
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_
long
)
(
force
.
y
*
0x100000000
))
)
;
ATOMIC_ADD
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_
long
)
(
force
.
z
*
0x100000000
))
)
;
STORE_DERIVATIVES_1
#else
unsigned
int
offset
=
atom1
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset
=
atom1
+
GROUP_ID
*
PADDED_NUM_ATOMS
;
forceBuffers
[
offset
].
xyz
+=
force
.
xyz
;
STORE_DERIVATIVES_1
#endif
...
...
@@ -330,11 +338,11 @@ __kernel void computeN2Energy(
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
real4
force
=
0
;
DECLARE_ATOM1_DERIVATIVES
real
4
pos
q
1
=
posq[atom1]
;
real
3
pos1
=
trimTo3
(
posq
[
atom1
]
)
;
LOAD_ATOM1_PARAMETERS
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
real
4
pos
q
2
=
local_pos
q
[j]
;
real
4
delta
=
(
real
4
)
(
pos
q
2.x
yz
-
pos
q
1.
xyz,
0
)
;
real
3
pos2
=
local_pos
[
j
];
real
3
delta
=
make_
real
3
(
pos2
.
x
-
pos1
.
x
,
pos2
.
y
-
pos1
.
y
,
pos2
.
z
-
pos1
.
z
);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
delta
)
#endif
...
...
@@ -355,10 +363,12 @@ __kernel void computeN2Energy(
COMPUTE_INTERACTION
dEdR
/=
-
r
;
energy
+=
tempEnergy
;
delta.xyz
*=
dEdR
;
force.xyz
-=
delta.xyz
;
delta
*=
dEdR
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
z
-=
delta
.
z
;
atom2
=
j
;
local_force[atom2]
.xyz
+=
delta
.xyz
;
local_force
[
atom2
]
+=
delta
;
RECORD_DERIVATIVE_2
}
}
...
...
@@ -367,12 +377,12 @@ __kernel void computeN2Energy(
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned
int
offset
=
atom1
;
atom_add
(
&forceBuffers[offset],
(
long
)
(
force.x*0x100000000
))
;
atom_add
(
&forceBuffers[offset+PADDED_NUM_ATOMS],
(
long
)
(
force.y*0x100000000
))
;
atom_add
(
&forceBuffers[offset+2*PADDED_NUM_ATOMS],
(
long
)
(
force.z*0x100000000
))
;
ATOMIC_ADD
(
&
forceBuffers
[
offset
],
(
mm_ulong
)
((
mm_
long
)
(
force
.
x
*
0x100000000
))
)
;
ATOMIC_ADD
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_
long
)
(
force
.
y
*
0x100000000
))
)
;
ATOMIC_ADD
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_
long
)
(
force
.
z
*
0x100000000
))
)
;
STORE_DERIVATIVES_1
#else
unsigned
int
offset
=
atom1
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset
=
atom1
+
GROUP_ID
*
PADDED_NUM_ATOMS
;
forceBuffers
[
offset
].
xyz
+=
force
.
xyz
;
STORE_DERIVATIVES_1
#endif
...
...
@@ -389,13 +399,13 @@ __kernel void computeN2Energy(
#endif
if
(
atom2
<
PADDED_NUM_ATOMS
)
{
#ifdef SUPPORTS_64_BIT_ATOMICS
atom_add
(
&forceBuffers[atom2],
(
long
)
(
local_force[tgx].x*0x100000000
))
;
atom_add
(
&forceBuffers[atom2+PADDED_NUM_ATOMS],
(
long
)
(
local_force[tgx].y*0x100000000
))
;
atom_add
(
&forceBuffers[atom2+2*PADDED_NUM_ATOMS],
(
long
)
(
local_force[tgx].z*0x100000000
))
;
ATOMIC_ADD
(
&
forceBuffers
[
atom2
],
(
mm_ulong
)
((
mm_
long
)
(
local_force
[
tgx
].
x
*
0x100000000
))
)
;
ATOMIC_ADD
(
&
forceBuffers
[
atom2
+
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_
long
)
(
local_force
[
tgx
].
y
*
0x100000000
))
)
;
ATOMIC_ADD
(
&
forceBuffers
[
atom2
+
2
*
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_
long
)
(
local_force
[
tgx
].
z
*
0x100000000
))
)
;
unsigned
int
offset
=
atom2
;
STORE_DERIVATIVES_2
#else
unsigned
int
offset
=
atom2
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset
=
atom2
+
GROUP_ID
*
PADDED_NUM_ATOMS
;
forceBuffers
[
offset
].
xyz
+=
local_force
[
tgx
].
xyz
;
STORE_DERIVATIVES_2
#endif
...
...
@@ -404,6 +414,6 @@ __kernel void computeN2Energy(
}
pos
++
;
}
energyBuffer[
get_global_id
(
0
)
]
+=
energy
;
energyBuffer
[
GLOBAL_ID
]
+=
energy
;
SAVE_PARAM_DERIVS
}
platforms/
opencl
/src/kernels/customGBEnergyPerParticle.c
l
→
platforms/
common
/src/kernels/customGBEnergyPerParticle.c
c
View file @
5a06df78
...
...
@@ -9,24 +9,29 @@
* Reduce the derivatives computed in the N^2 energy kernel, and compute all per-particle energy terms.
*/
__kernel
void
computePerParticleEnergy
(
int
bufferSize,
int
numBuffers,
__global
real4*
restrict
forceBuffers,
__global
mixed*
restrict
energyBuffer,
__global
const
real4*
restrict
posq
KERNEL
void
computePerParticleEnergy
(
GLOBAL
mixed
*
RESTRICT
energyBuffer
,
GLOBAL
const
real4
*
RESTRICT
posq
,
#ifdef SUPPORTS_64_BIT_ATOMICS
GLOBAL
mm_long
*
RESTRICT
forceBuffers
#else
GLOBAL
real4
*
RESTRICT
forceBuffers
,
int
bufferSize
,
int
numBuffers
#endif
PARAMETER_ARGUMENTS
)
{
mixed
energy
=
0
;
INIT_PARAM_DERIVS
unsigned
int
index
=
get_global_id
(
0
)
;
while
(
index
<
NUM_ATOMS
)
{
for
(
int
index
=
GLOBAL_ID
;
index
<
NUM_ATOMS
;
index
+=
GLOBAL_SIZE
)
{
// Reduce the derivatives
#ifndef SUPPORTS_64_BIT_ATOMICS
int
totalSize
=
bufferSize
*
numBuffers
;
#endif
REDUCE_DERIVATIVES
// Now calculate the per-particle energy terms.
real4
pos
=
posq
[
index
];
real
4
force
=
(
real4
)
0
;
real
3
force
=
make_real3
(
0
,
0
,
0
)
;
COMPUTE_ENERGY
index
+=
get_global_size
(
0
)
;
}
energyBuffer[
get_global_id
(
0
)
]
+=
energy
;
energyBuffer
[
GLOBAL_ID
]
+=
energy
;
SAVE_PARAM_DERIVS
}
platforms/c
uda
/src/kernels/customGBGradientChainRule.c
u
→
platforms/c
ommon
/src/kernels/customGBGradientChainRule.c
c
View file @
5a06df78
...
...
@@ -2,17 +2,30 @@
* Compute chain rule terms for computed values that depend explicitly on particle coordinates.
*/
extern
"C"
__global__
void
computeGradientChainRuleTerms
(
long
long
*
__restrict__
forceBuffers
,
const
real4
*
__restrict__
posq
KERNEL
void
computeGradientChainRuleTerms
(
GLOBAL
const
real4
*
RESTRICT
posq
,
#ifdef SUPPORTS_64_BIT_ATOMICS
GLOBAL
mm_long
*
RESTRICT
forceBuffers
#else
GLOBAL
real4
*
RESTRICT
forceBuffers
#endif
PARAMETER_ARGUMENTS
)
{
INIT_PARAM_DERIVS
const
real
scale
=
RECIP
((
real
)
0x100000000
);
for
(
unsigned
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
NUM_ATOMS
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
for
(
int
index
=
GLOBAL_ID
;
index
<
NUM_ATOMS
;
index
+=
GLOBAL_SIZE
)
{
real4
pos
=
posq
[
index
];
#ifdef SUPPORTS_64_BIT_ATOMICS
real3
force
=
make_real3
(
scale
*
forceBuffers
[
index
],
scale
*
forceBuffers
[
index
+
PADDED_NUM_ATOMS
],
scale
*
forceBuffers
[
index
+
PADDED_NUM_ATOMS
*
2
]);
#else
real3
force
=
trimTo3
(
forceBuffers
[
index
]);
#endif
COMPUTE_FORCES
forceBuffers
[
index
]
=
(
long
long
)
(
force
.
x
*
0x100000000
);
forceBuffers
[
index
+
PADDED_NUM_ATOMS
]
=
(
long
long
)
(
force
.
y
*
0x100000000
);
forceBuffers
[
index
+
PADDED_NUM_ATOMS
*
2
]
=
(
long
long
)
(
force
.
z
*
0x100000000
);
#ifdef SUPPORTS_64_BIT_ATOMICS
forceBuffers
[
index
]
=
(
mm_long
)
(
force
.
x
*
0x100000000
);
forceBuffers
[
index
+
PADDED_NUM_ATOMS
]
=
(
mm_long
)
(
force
.
y
*
0x100000000
);
forceBuffers
[
index
+
PADDED_NUM_ATOMS
*
2
]
=
(
mm_long
)
(
force
.
z
*
0x100000000
);
#else
forceBuffers
[
index
]
=
make_real4
(
force
.
x
,
force
.
y
,
force
.
z
,
0
);
#endif
}
SAVE_PARAM_DERIVS
}
platforms/
opencl
/src/kernels/customGBValueN2.c
l
→
platforms/
common
/src/kernels/customGBValueN2.c
c
View file @
5a06df78
#
ifdef
SUPPORTS_64_BIT_ATOMICS
#
pragma
OPENCL
EXTENSION
cl_khr_int64_base_atomics
:
enable
#
endif
/**
* Compute a value based on pair interactions.
*/
__kernel
void
computeN2Value
(
__global
const
real4*
restrict
posq,
__local
real4*
restrict
local_posq,
__global
const
unsigned
int*
restrict
exclusions,
__global
const
ushort2*
exclusionTiles,
KERNEL
void
computeN2Value
(
GLOBAL
const
real4
*
RESTRICT
posq
,
GLOBAL
const
unsigned
int
*
RESTRICT
exclusions
,
GLOBAL
const
ushort2
*
exclusionTiles
,
#ifdef SUPPORTS_64_BIT_ATOMICS
__global
long*
restrict
global_value,
GLOBAL
mm_ulong
*
RESTRICT
global_value
,
#else
__global
real*
restrict
global_value,
GLOBAL
real
*
RESTRICT
global_value
,
#endif
__local
real*
restrict
local_value,
#ifdef USE_CUTOFF
__global
const
int*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
real4
periodicBoxSize,
real4
invPeriodicBoxSize,
real4
periodicBoxVecX,
real4
periodicBoxVecY,
real4
periodicBoxVecZ,
unsigned
int
maxTiles,
__global
const
real4*
restrict
blockCenter,
__global
const
real4*
restrict
blockSize,
__global
const
int*
restrict
interactingAtoms
GLOBAL
const
int
*
RESTRICT
tiles
,
GLOBAL
const
unsigned
int
*
RESTRICT
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
unsigned
int
maxTiles
,
GLOBAL
const
real4
*
RESTRICT
blockCenter
,
GLOBAL
const
real4
*
RESTRICT
blockSize
,
GLOBAL
const
int
*
RESTRICT
interactingAtoms
#else
unsigned
int
numTiles
#endif
PARAMETER_ARGUMENTS
)
{
const
unsigned
int
totalWarps
=
get_global_size
(
0
)
/TILE_SIZE
;
const
unsigned
int
warp
=
get_global_id
(
0
)
/TILE_SIZE
;
const
unsigned
int
tgx
=
get_local_id
(
0
)
&
(
TILE_SIZE-1
)
;
const
unsigned
int
tbx
=
get_local_id
(
0
)
-
tgx
;
const
unsigned
int
totalWarps
=
GLOBAL_SIZE
/
TILE_SIZE
;
const
unsigned
int
warp
=
GLOBAL_ID
/
TILE_SIZE
;
const
unsigned
int
tgx
=
LOCAL_ID
&
(
TILE_SIZE
-
1
);
const
unsigned
int
tbx
=
LOCAL_ID
-
tgx
;
LOCAL
real3
local_pos
[
LOCAL_BUFFER_SIZE
];
LOCAL
real
local_value
[
LOCAL_BUFFER_SIZE
];
ATOM_PARAMETER_DATA
// First loop: process tiles that contain exclusions.
const
unsigned
int
firstExclusionTile
=
FIRST_EXCLUSION_TILE+warp*
(
LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE
)
/totalWarps
;
const
unsigned
int
lastExclusionTile
=
FIRST_EXCLUSION_TILE+
(
warp+1
)
*
(
LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE
)
/totalWarps
;
const
int
firstExclusionTile
=
FIRST_EXCLUSION_TILE
+
warp
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
totalWarps
;
const
int
lastExclusionTile
=
FIRST_EXCLUSION_TILE
+
(
warp
+
1
)
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
totalWarps
;
for
(
int
pos
=
firstExclusionTile
;
pos
<
lastExclusionTile
;
pos
++
)
{
const
ushort2
tileIndices
=
exclusionTiles
[
pos
];
const
unsigned
int
x
=
tileIndices
.
x
;
const
unsigned
int
y
=
tileIndices
.
y
;
real
value
=
0
;
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
real
4
pos
q
1
=
posq[atom1]
;
real
3
pos1
=
trimTo3
(
posq
[
atom1
]
)
;
LOAD_ATOM1_PARAMETERS
#ifdef USE_EXCLUSIONS
unsigned
int
excl
=
exclusions
[
pos
*
TILE_SIZE
+
tgx
];
...
...
@@ -44,14 +42,14 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
if
(
x
==
y
)
{
// This tile is on the diagonal.
const
unsigned
int
localAtomIndex
=
get_local_id
(
0
)
;
local_pos
q
[localAtomIndex]
=
pos
q
1
;
const
unsigned
int
localAtomIndex
=
LOCAL_ID
;
local_pos
[
localAtomIndex
]
=
pos1
;
LOAD_LOCAL_PARAMETERS_FROM_1
SYNC_WARPS
;
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
tbx
+
j
;
real
4
pos
q
2
=
local_pos
q
[atom2]
;
real
4
delta
=
(
real
4
)
(
pos
q
2.x
yz
-
pos
q
1.
xyz,
0
)
;
real
3
pos2
=
local_pos
[
atom2
];
real
3
delta
=
make_
real
3
(
pos2
.
x
-
pos1
.
x
,
pos2
.
y
-
pos1
.
y
,
pos2
.
z
-
pos1
.
z
);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
delta
)
#endif
...
...
@@ -87,9 +85,9 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
else
{
// This is an off-diagonal tile.
const unsigned int localAtomIndex =
get_local_id(0)
;
const
unsigned
int
localAtomIndex
=
LOCAL_ID
;
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
local_pos
q
[localAtomIndex] = posq[j];
local_pos
[
localAtomIndex
]
=
trimTo3
(
posq
[
j
]
)
;
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
local_value
[
localAtomIndex
]
=
0
;
SYNC_WARPS
;
...
...
@@ -99,8 +97,8 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
tbx
+
tj
;
real
4
pos
q
2
=
local_pos
q
[atom2]
;
real
4
delta
=
(
real
4
)
(
pos
q
2.x
yz
-
pos
q
1.
xyz,
0
)
;
real
3
pos2
=
local_pos
[
atom2
];
real
3
delta
=
make_
real
3
(
pos2
.
x
-
pos1
.
x
,
pos2
.
y
-
pos1
.
y
,
pos2
.
z
-
pos1
.
z
);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
delta
)
#endif
...
...
@@ -141,11 +139,11 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned
int
offset1
=
x
*
TILE_SIZE
+
tgx
;
atom_add
(&global_value[offset1], (long) (value*0x100000000));
ATOMIC_ADD
(
&
global_value
[
offset1
],
(
mm_ulong
)
((
mm_
long
)
(
value
*
0x100000000
))
)
;
STORE_PARAM_DERIVS1
if
(
x
!=
y
)
{
unsigned
int
offset2
=
y
*
TILE_SIZE
+
tgx
;
atom_add
(&global_value[offset2], (long) (local_value[
get_local_id(0)
]*0x100000000));
ATOMIC_ADD
(
&
global_value
[
offset2
],
(
mm_ulong
)
((
mm_
long
)
(
local_value
[
LOCAL_ID
]
*
0x100000000
))
)
;
STORE_PARAM_DERIVS2
}
#else
...
...
@@ -154,7 +152,7 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
global_value
[
offset1
]
+=
value
;
STORE_PARAM_DERIVS1
if
(
x
!=
y
)
{
global_value[offset2] += local_value[
get_local_id(0)
];
global_value
[
offset2
]
+=
local_value
[
LOCAL_ID
];
STORE_PARAM_DERIVS2
}
#endif
...
...
@@ -167,17 +165,17 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
unsigned
int
numTiles
=
interactionCount
[
0
];
if
(
numTiles
>
maxTiles
)
return
;
// There wasn't enough memory for the neighbor list.
int pos = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
int
pos
=
(
int
)
(
warp
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
((
mm_
long
)
NUM_BLOCKS
+
1
)
/
2
:
(
mm_
long
)
numTiles
)
/
totalWarps
);
int
end
=
(
int
)
((
warp
+
1
)
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
((
mm_
long
)
NUM_BLOCKS
+
1
)
/
2
:
(
mm_
long
)
numTiles
)
/
totalWarps
);
#else
int pos = (int) (warp*(long)numTiles/totalWarps);
int end = (int) ((warp+1)*(long)numTiles/totalWarps);
int
pos
=
(
int
)
(
warp
*
(
mm_
long
)
numTiles
/
totalWarps
);
int
end
=
(
int
)
((
warp
+
1
)
*
(
mm_
long
)
numTiles
/
totalWarps
);
#endif
int
skipBase
=
0
;
int
currentSkipIndex
=
tbx
;
__local
int atomIndices[
FORCE_WORK_GROUP
_SIZE];
__local
volatile int skipTiles[
FORCE_WORK_GROUP
_SIZE];
skipTiles[
get_local_id(0)
] = -1;
LOCAL
int
atomIndices
[
LOCAL_BUFFER
_SIZE
];
LOCAL
volatile
int
skipTiles
[
LOCAL_BUFFER
_SIZE
];
skipTiles
[
LOCAL_ID
]
=
-
1
;
while
(
pos
<
end
)
{
real
value
=
0
;
...
...
@@ -208,10 +206,10 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
SYNC_WARPS
;
if
(
skipBase
+
tgx
<
NUM_TILES_WITH_EXCLUSIONS
)
{
ushort2
tile
=
exclusionTiles
[
skipBase
+
tgx
];
skipTiles[
get_local_id
(
0
)
]
=
tile.x
+
tile.y*NUM_BLOCKS
-
tile.y*
(
tile.y+1
)
/2
;
skipTiles
[
LOCAL_ID
]
=
tile
.
x
+
tile
.
y
*
NUM_BLOCKS
-
tile
.
y
*
(
tile
.
y
+
1
)
/
2
;
}
else
skipTiles[
get_local_id
(
0
)
]
=
end
;
skipTiles
[
LOCAL_ID
]
=
end
;
skipBase
+=
TILE_SIZE
;
currentSkipIndex
=
tbx
;
SYNC_WARPS
;
...
...
@@ -225,17 +223,17 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
// Load atom data for this tile.
real
4
pos
q
1
=
posq[atom1]
;
real
3
pos1
=
trimTo3
(
posq
[
atom1
]
)
;
LOAD_ATOM1_PARAMETERS
const
unsigned
int
localAtomIndex
=
get_local_id
(
0
)
;
const
unsigned
int
localAtomIndex
=
LOCAL_ID
;
#ifdef USE_CUTOFF
unsigned
int
j
=
interactingAtoms
[
pos
*
TILE_SIZE
+
tgx
];
#else
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
#endif
atomIndices[
get_local_id
(
0
)
]
=
j
;
atomIndices
[
LOCAL_ID
]
=
j
;
if
(
j
<
PADDED_NUM_ATOMS
)
{
local_pos
q
[localAtomIndex]
=
posq[j]
;
local_pos
[
localAtomIndex
]
=
trimTo3
(
posq
[
j
]
)
;
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
local_value
[
localAtomIndex
]
=
0
;
}
...
...
@@ -246,14 +244,14 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
// box, then skip having to apply periodic boundary conditions later.
real4
blockCenterX
=
blockCenter
[
x
];
APPLY_PERIODIC_TO_POS_WITH_CENTER
(
pos
q
1,
blockCenterX
)
APPLY_PERIODIC_TO_POS_WITH_CENTER
(
local_pos
q[get_local_id
(
0
)
],
blockCenterX
)
APPLY_PERIODIC_TO_POS_WITH_CENTER
(
pos1
,
blockCenterX
)
APPLY_PERIODIC_TO_POS_WITH_CENTER
(
local_pos
[
LOCAL_ID
],
blockCenterX
)
SYNC_WARPS
;
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
tbx
+
tj
;
real
4
pos
q
2
=
local_pos
q
[atom2]
;
real
4
delta
=
(
real
4
)
(
pos
q
2.x
yz
-
pos
q
1.
xyz,
0
)
;
real
3
pos2
=
local_pos
[
atom2
];
real
3
delta
=
make_
real
3
(
pos2
.
x
-
pos1
.
x
,
pos2
.
y
-
pos1
.
y
,
pos2
.
z
-
pos1
.
z
);
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
if
(
r2
<
CUTOFF_SQUARED
)
{
real
invR
=
RSQRT
(
r2
);
...
...
@@ -278,12 +276,12 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
#endif
{
// We need to apply periodic boundary conditions separately for each interaction.
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
tbx
+
tj
;
real
4
pos
q
2
=
local_pos
q
[atom2]
;
real
4
delta
=
(
real
4
)
(
pos
q
2.x
yz
-
pos
q
1.
xyz,
0
)
;
real
3
pos2
=
local_pos
[
atom2
];
real
3
delta
=
make_
real
3
(
pos2
.
x
-
pos1
.
x
,
pos2
.
y
-
pos1
.
y
,
pos2
.
z
-
pos1
.
z
);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
delta
)
#endif
...
...
@@ -313,19 +311,19 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
}
// Write results.
#ifdef USE_CUTOFF
unsigned
int
atom2
=
atomIndices[
get_local_id
(
0
)
]
;
unsigned
int
atom2
=
atomIndices
[
LOCAL_ID
];
#else
unsigned
int
atom2
=
y
*
TILE_SIZE
+
tgx
;
#endif
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned
int
offset1
=
atom1
;
atom_add
(
&global_value[offset1],
(
long
)
(
value*0x100000000
))
;
ATOMIC_ADD
(
&
global_value
[
offset1
],
(
mm_ulong
)
((
mm_
long
)
(
value
*
0x100000000
))
)
;
STORE_PARAM_DERIVS1
if
(
atom2
<
PADDED_NUM_ATOMS
)
{
unsigned
int
offset2
=
atom2
;
atom_add
(
&global_value[offset2],
(
long
)
(
local_value[
get_local_id
(
0
)
]*0x100000000
))
;
ATOMIC_ADD
(
&
global_value
[
offset2
],
(
mm_ulong
)
((
mm_
long
)
(
local_value
[
LOCAL_ID
]
*
0x100000000
))
)
;
STORE_PARAM_DERIVS2
}
#else
...
...
@@ -334,7 +332,7 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
STORE_PARAM_DERIVS1
if
(
atom2
<
PADDED_NUM_ATOMS
)
{
unsigned
int
offset2
=
atom2
+
warp
*
PADDED_NUM_ATOMS
;
global_value[offset2]
+=
local_value[
get_local_id
(
0
)
]
;
global_value
[
offset2
]
+=
local_value
[
LOCAL_ID
];
STORE_PARAM_DERIVS2
}
#endif
...
...
platforms/
opencl
/src/kernels/customGBValueN2_cpu.c
l
→
platforms/
common
/src/kernels/customGBValueN2_cpu.c
c
View file @
5a06df78
#
ifdef
SUPPORTS_64_BIT_ATOMICS
#
pragma
OPENCL
EXTENSION
cl_khr_int64_base_atomics
:
enable
#
endif
/**
* Compute a value based on pair interactions.
*/
__kernel
void
computeN2Value
(
__global
const
real4*
restrict
posq,
__local
real4*
restrict
local_posq,
__global
const
unsigned
int*
restrict
exclusions,
__global
const
ushort2*
exclusionTiles,
KERNEL
void
computeN2Value
(
GLOBAL
const
real4
*
RESTRICT
posq
,
GLOBAL
const
unsigned
int
*
RESTRICT
exclusions
,
GLOBAL
const
ushort2
*
exclusionTiles
,
#ifdef SUPPORTS_64_BIT_ATOMICS
__global
long*
restrict
global_value,
GLOBAL
mm_ulong
*
RESTRICT
global_value
,
#else
__global
real*
restrict
global_value,
GLOBAL
real
*
RESTRICT
global_value
,
#endif
__local
real*
restrict
local_value,
#ifdef USE_CUTOFF
__global
const
int*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
real4
periodicBoxSize,
real4
invPeriodicBoxSize,
real4
periodicBoxVecX,
real4
periodicBoxVecY,
real4
periodicBoxVecZ,
unsigned
int
maxTiles,
__global
const
real4*
restrict
blockCenter,
__global
const
real4*
restrict
blockSize,
__global
const
int*
restrict
interactingAtoms
GLOBAL
const
int
*
RESTRICT
tiles
,
GLOBAL
const
unsigned
int
*
RESTRICT
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
unsigned
int
maxTiles
,
GLOBAL
const
real4
*
RESTRICT
blockCenter
,
GLOBAL
const
real4
*
RESTRICT
blockSize
,
GLOBAL
const
int
*
RESTRICT
interactingAtoms
#else
unsigned
int
numTiles
#endif
PARAMETER_ARGUMENTS
)
{
LOCAL
real3
local_pos
[
LOCAL_BUFFER_SIZE
];
LOCAL
real
local_value
[
LOCAL_BUFFER_SIZE
];
ATOM_PARAMETER_DATA
// First loop: process tiles that contain exclusions.
const
unsigned
int
firstExclusionTile
=
FIRST_EXCLUSION_TILE+get_group_id
(
0
)
*
(
LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE
)
/get_num_groups
(
0
)
;
const
unsigned
int
lastExclusionTile
=
FIRST_EXCLUSION_TILE+
(
get_group_id
(
0
)
+1
)
*
(
LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE
)
/get_num_groups
(
0
)
;
const
int
firstExclusionTile
=
FIRST_EXCLUSION_TILE
+
get_group_id
(
0
)
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
get_num_groups
(
0
);
const
int
lastExclusionTile
=
FIRST_EXCLUSION_TILE
+
(
get_group_id
(
0
)
+
1
)
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
get_num_groups
(
0
);
for
(
int
pos
=
firstExclusionTile
;
pos
<
lastExclusionTile
;
pos
++
)
{
const
ushort2
tileIndices
=
exclusionTiles
[
pos
];
const
unsigned
int
x
=
tileIndices
.
x
;
...
...
@@ -35,7 +33,7 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
for
(
int
localAtomIndex
=
0
;
localAtomIndex
<
TILE_SIZE
;
localAtomIndex
++
)
{
unsigned
int
j
=
y
*
TILE_SIZE
+
localAtomIndex
;
local_pos
q
[localAtomIndex]
=
posq[j]
;
local_pos
[
localAtomIndex
]
=
trimTo3
(
posq
[
j
]
)
;
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
if
(
x
==
y
)
{
...
...
@@ -47,11 +45,11 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
#endif
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
real
value
=
0
;
real
4
pos
q
1
=
posq[atom1]
;
real
3
pos1
=
trimTo3
(
posq
[
atom1
]
)
;
LOAD_ATOM1_PARAMETERS
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
real
4
pos
q
2
=
local_pos
q
[j]
;
real
4
delta
=
(
real
4
)
(
pos
q
2.x
yz
-
pos
q
1.
xyz,
0
)
;
real
3
pos2
=
local_pos
[
j
];
real
3
delta
=
make_
real
3
(
pos2
.
x
-
pos1
.
x
,
pos2
.
y
-
pos1
.
y
,
pos2
.
z
-
pos1
.
z
);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
delta
)
#endif
...
...
@@ -88,7 +86,7 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned
int
offset1
=
atom1
;
atom_add
(&global_value[offset1], (long) (value*0x100000000));
ATOMIC_ADD
(
&
global_value
[
offset1
],
(
mm_ulong
)
((
mm_
long
)
(
value
*
0x100000000
))
)
;
#else
unsigned
int
offset1
=
atom1
+
get_group_id
(
0
)
*
PADDED_NUM_ATOMS
;
global_value
[
offset1
]
+=
value
;
...
...
@@ -107,11 +105,11 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
#endif
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
real
value
=
0
;
real
4
pos
q
1 = posq[atom1];
real
3
pos1
=
trimTo3
(
posq
[
atom1
]
)
;
LOAD_ATOM1_PARAMETERS
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
real
4
pos
q
2 = local_pos
q
[j];
real
4
delta =
(
real
4)
(pos
q
2.x
yz -
pos
q
1.
xyz, 0
);
real
3
pos2
=
local_pos
[
j
];
real
3
delta
=
make_
real
3
(
pos2
.
x
-
pos1
.
x
,
pos2
.
y
-
pos1
.
y
,
pos2
.
z
-
pos1
.
z
);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
delta
)
#endif
...
...
@@ -150,7 +148,7 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned
int
offset1
=
atom1
;
atom_add
(&global_value[offset1], (long) (value*0x100000000));
ATOMIC_ADD
(
&
global_value
[
offset1
],
(
mm_ulong
)
((
mm_
long
)
(
value
*
0x100000000
))
)
;
#else
unsigned
int
offset1
=
atom1
+
get_group_id
(
0
)
*
PADDED_NUM_ATOMS
;
global_value
[
offset1
]
+=
value
;
...
...
@@ -163,7 +161,7 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
for
(
int
tgx
=
0
;
tgx
<
TILE_SIZE
;
tgx
++
)
{
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned
int
offset2
=
y
*
TILE_SIZE
+
tgx
;
atom_add
(&global_value[offset2], (long) (local_value[tgx]*0x100000000));
ATOMIC_ADD
(
&
global_value
[
offset2
],
(
mm_ulong
)
((
mm_
long
)
(
local_value
[
tgx
]
*
0x100000000
))
)
;
#else
unsigned
int
offset2
=
y
*
TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*
PADDED_NUM_ATOMS
;
global_value
[
offset2
]
+=
local_value
[
tgx
];
...
...
@@ -180,15 +178,15 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
const
unsigned
int
numTiles
=
interactionCount
[
0
];
if
(
numTiles
>
maxTiles
)
return
;
// There wasn't enough memory for the neighbor list.
int pos = (int) (get_group_id(0)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0));
int end = (int) ((get_group_id(0)+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0));
int
pos
=
(
int
)
(
get_group_id
(
0
)
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
((
mm_
long
)
NUM_BLOCKS
+
1
)
/
2
:
numTiles
)
/
get_num_groups
(
0
));
int
end
=
(
int
)
((
get_group_id
(
0
)
+
1
)
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
((
mm_
long
)
NUM_BLOCKS
+
1
)
/
2
:
numTiles
)
/
get_num_groups
(
0
));
#else
int pos = (int) (get_group_id(0)*(long)numTiles/get_num_groups(0));
int end = (int) ((get_group_id(0)+1)*(long)numTiles/get_num_groups(0));
int
pos
=
(
int
)
(
get_group_id
(
0
)
*
(
mm_
long
)
numTiles
/
get_num_groups
(
0
));
int
end
=
(
int
)
((
get_group_id
(
0
)
+
1
)
*
(
mm_
long
)
numTiles
/
get_num_groups
(
0
));
#endif
int
nextToSkip
=
-
1
;
int
currentSkipIndex
=
0
;
__local
int atomIndices[TILE_SIZE];
LOCAL
int
atomIndices
[
TILE_SIZE
];
while
(
pos
<
end
)
{
bool
includeTile
=
true
;
...
...
@@ -234,7 +232,7 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
#endif
atomIndices
[
localAtomIndex
]
=
j
;
if
(
j
<
PADDED_NUM_ATOMS
)
{
local_pos
q
[localAtomIndex]
=
posq[j]
;
local_pos
[
localAtomIndex
]
=
trimTo3
(
posq
[
j
]
)
;
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
local_value
[
localAtomIndex
]
=
0
;
}
...
...
@@ -246,16 +244,16 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
real4
blockCenterX
=
blockCenter
[
x
];
for
(
unsigned
int
tgx
=
0
;
tgx
<
TILE_SIZE
;
tgx
++
)
APPLY_PERIODIC_TO_POS_WITH_CENTER
(
local_pos
q
[tgx],
blockCenterX
)
APPLY_PERIODIC_TO_POS_WITH_CENTER
(
local_pos
[
tgx
],
blockCenterX
)
for
(
unsigned
int
tgx
=
0
;
tgx
<
TILE_SIZE
;
tgx
++
)
{
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
real
value
=
0
;
real
4
pos
q
1
=
posq[atom1]
;
APPLY_PERIODIC_TO_POS_WITH_CENTER
(
pos
q
1,
blockCenterX
)
real
3
pos1
=
trimTo3
(
posq
[
atom1
]
)
;
APPLY_PERIODIC_TO_POS_WITH_CENTER
(
pos1
,
blockCenterX
)
LOAD_ATOM1_PARAMETERS
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
real
4
pos
q
2
=
local_pos
q
[j]
;
real
4
delta
=
(
real
4
)
(
pos
q
2.x
yz
-
pos
q
1.
xyz,
0
)
;
real
3
pos2
=
local_pos
[
j
];
real
3
delta
=
make_
real
3
(
pos2
.
x
-
pos1
.
x
,
pos2
.
y
-
pos1
.
y
,
pos2
.
z
-
pos1
.
z
);
real
r2
=
dot
(
delta
.
xyz
,
delta
.
xyz
);
if
(
atom1
<
NUM_ATOMS
&&
atomIndices
[
j
]
<
NUM_ATOMS
&&
r2
<
CUTOFF_SQUARED
)
{
real
invR
=
RSQRT
(
r2
);
...
...
@@ -277,7 +275,7 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned
int
offset1
=
atom1
;
atom_add
(
&global_value[offset1],
(
long
)
(
value*0x100000000
))
;
ATOMIC_ADD
(
&
global_value
[
offset1
],
(
mm_ulong
)
((
mm_
long
)
(
value
*
0x100000000
))
)
;
#else
unsigned
int
offset1
=
atom1
+
get_group_id
(
0
)
*
PADDED_NUM_ATOMS
;
global_value
[
offset1
]
+=
value
;
...
...
@@ -293,11 +291,11 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
for
(
unsigned
int
tgx
=
0
;
tgx
<
TILE_SIZE
;
tgx
++
)
{
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
real
value
=
0
;
real
4
pos
q
1
=
posq[atom1]
;
real
3
pos1
=
trimTo3
(
posq
[
atom1
]
)
;
LOAD_ATOM1_PARAMETERS
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
real
4
pos
q
2
=
local_pos
q
[j]
;
real
4
delta
=
(
real
4
)
(
pos
q
2.x
yz
-
pos
q
1.
xyz,
0
)
;
real
3
pos2
=
local_pos
[
j
];
real
3
delta
=
make_
real
3
(
pos2
.
x
-
pos1
.
x
,
pos2
.
y
-
pos1
.
y
,
pos2
.
z
-
pos1
.
z
);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
delta
)
#endif
...
...
@@ -326,7 +324,7 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned
int
offset1
=
atom1
;
atom_add
(
&global_value[offset1],
(
long
)
(
value*0x100000000
))
;
ATOMIC_ADD
(
&
global_value
[
offset1
],
(
mm_ulong
)
((
mm_
long
)
(
value
*
0x100000000
))
)
;
#else
unsigned
int
offset1
=
atom1
+
get_group_id
(
0
)
*
PADDED_NUM_ATOMS
;
global_value
[
offset1
]
+=
value
;
...
...
@@ -346,7 +344,7 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
if
(
atom2
<
PADDED_NUM_ATOMS
)
{
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned
int
offset2
=
atom2
;
atom_add
(
&global_value[offset2],
(
long
)
(
local_value[tgx]*0x100000000
))
;
ATOMIC_ADD
(
&
global_value
[
offset2
],
(
mm_ulong
)
((
mm_
long
)
(
local_value
[
tgx
]
*
0x100000000
))
)
;
#else
unsigned
int
offset2
=
atom2
+
get_group_id
(
0
)
*
PADDED_NUM_ATOMS
;
global_value
[
offset2
]
+=
local_value
[
tgx
];
...
...
platforms/
opencl
/src/kernels/customGBValuePerParticle.c
l
→
platforms/
common
/src/kernels/customGBValuePerParticle.c
c
View file @
5a06df78
...
...
@@ -2,19 +2,18 @@
* Reduce a pairwise computed value, and compute per-particle values.
*/
__kernel
void
computePerParticleValues
(
int
bufferSize,
int
numBuffers,
__global
real4*
posq,
KERNEL
void
computePerParticleValues
(
GLOBAL
real4
*
posq
,
#ifdef SUPPORTS_64_BIT_ATOMICS
__global
long*
valueBuffers
GLOBAL
mm_
long
*
valueBuffers
#else
__global
real*
valueBuffers
GLOBAL
real
*
valueBuffers
,
int
bufferSize
,
int
numBuffers
#endif
PARAMETER_ARGUMENTS
)
{
unsigned
int
index
=
get_global_id
(
0
)
;
while
(
index
<
NUM_ATOMS
)
{
for
(
int
index
=
GLOBAL_ID
;
index
<
NUM_ATOMS
;
index
+=
GLOBAL_SIZE
)
{
// Reduce the pairwise value
#ifdef SUPPORTS_64_BIT_ATOMICS
real
sum
=
(
1.0f/0x100000000
)
*
valueBuffers[index]
;
real
sum
=
valueBuffers
[
index
]
/
(
real
)
0x100000000
;
#else
int
totalSize
=
bufferSize
*
numBuffers
;
real
sum
=
valueBuffers
[
index
];
...
...
@@ -27,6 +26,5 @@ __kernel void computePerParticleValues(int bufferSize, int numBuffers, __global
real4
pos
=
posq
[
index
];
COMPUTE_VALUES
index
+=
get_global_size
(
0
)
;
}
}
platforms/
opencl
/src/kernels/customHbondForce.c
l
→
platforms/
common
/src/kernels/customHbondForce.c
c
View file @
5a06df78
...
...
@@ -2,8 +2,8 @@
* Compute the difference between two vectors, optionally taking periodic boundary conditions into account
* and setting the fourth component to the squared magnitude.
*/
real4
delta
(
real4
vec1,
real4
vec2,
real4
periodicBoxSize,
real4
invPeriodicBoxSize,
real4
periodicBoxVecX,
real4
periodicBoxVecY,
real4
periodicBoxVecZ
)
{
real4
result
=
(
real4
)
(
vec1.x-vec2.x,
vec1.y-vec2.y,
vec1.z-vec2.z,
0
)
;
inline
DEVICE
real4
delta
(
real4
vec1
,
real4
vec2
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
)
{
real4
result
=
make_
real4
(
vec1
.
x
-
vec2
.
x
,
vec1
.
y
-
vec2
.
y
,
vec1
.
z
-
vec2
.
z
,
0
);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
result
)
#endif
...
...
@@ -14,73 +14,79 @@ real4 delta(real4 vec1, real4 vec2, real4 periodicBoxSize, real4 invPeriodicBoxS
/**
* Compute the angle between two vectors. The w component of each vector should contain the squared magnitude.
*/
real
computeAngle
(
real4
vec1,
real4
vec2
)
{
inline
DEVICE
real
computeAngle
(
real4
vec1
,
real4
vec2
)
{
real
dotProduct
=
vec1
.
x
*
vec2
.
x
+
vec1
.
y
*
vec2
.
y
+
vec1
.
z
*
vec2
.
z
;
real
cosine
=
dotProduct
*
RSQRT
(
vec1
.
w
*
vec2
.
w
);
real
angle
;
if
(
cosine
>
0.99
f
||
cosine
<
-
0.99
f
)
{
// We're close to the singularity in acos(), so take the cross product and use asin() instead.
real
4
crossProduct = cross(vec1, vec2);
real
3
crossProduct
=
cross
(
trimTo3
(
vec1
)
,
trimTo3
(
vec2
)
)
;
real
scale
=
vec1
.
w
*
vec2
.
w
;
angle =
asin
(SQRT(dot(crossProduct, crossProduct)/scale));
if (cosine < 0
.0f
)
angle = PI-angle;
angle
=
ASIN
(
SQRT
(
dot
(
crossProduct
,
crossProduct
)
/
scale
));
if
(
cosine
<
0
)
angle
=
M_
PI
-
angle
;
}
else
angle =
acos
(cosine);
angle
=
ACOS
(
cosine
);
return
angle
;
}
/**
* Compute the cross product of two vectors, setting the fourth component to the squared magnitude.
*/
real4 computeCross(real4 vec1, real4 vec2) {
real4 result = cross(vec1, vec2);
result.w = result.x*result.x + result.y*result.y + result.z*result.z;
return result;
inline
DEVICE
real4
computeCross
(
real4
vec1
,
real4
vec2
)
{
real3
cp
=
cross
(
trimTo3
(
vec1
),
trimTo3
(
vec2
));
return
make_real4
(
cp
.
x
,
cp
.
y
,
cp
.
z
,
cp
.
x
*
cp
.
x
+
cp
.
y
*
cp
.
y
+
cp
.
z
*
cp
.
z
);
}
/**
* Compute forces on donors.
*/
__kernel void computeDonorForces(__global real4* restrict forceBuffers, __global mixed* restrict energyBuffer, __global const real4* restrict posq, __global const int4* restrict exclusions,
__global const int4* restrict donorAtoms, __global const int4* restrict acceptorAtoms, __global const int4* restrict donorBufferIndices, __local real4* posBuffer, real4 periodicBoxSize, real4 invPeriodicBoxSize,
KERNEL
void
computeDonorForces
(
#ifdef SUPPORTS_64_BIT_ATOMICS
GLOBAL
mm_ulong
*
RESTRICT
force
,
#else
GLOBAL
real4
*
RESTRICT
forceBuffers
,
GLOBAL
const
int4
*
RESTRICT
donorBufferIndices
,
#endif
GLOBAL
mixed
*
RESTRICT
energyBuffer
,
GLOBAL
const
real4
*
RESTRICT
posq
,
GLOBAL
const
int4
*
RESTRICT
exclusions
,
GLOBAL
const
int4
*
RESTRICT
donorAtoms
,
GLOBAL
const
int4
*
RESTRICT
acceptorAtoms
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
PARAMETER_ARGUMENTS
)
{
LOCAL
real4
posBuffer
[
3
*
THREAD_BLOCK_SIZE
];
mixed
energy
=
0
;
real
4
f1 =
(real4) 0
;
real
4
f2 =
(real4) 0
;
real
4
f3 =
(real4) 0
;
for (int donorStart = 0; donorStart < NUM_DONORS; donorStart +=
get_global_size(0)
) {
real
3
f1
=
make_real3
(
0
)
;
real
3
f2
=
make_real3
(
0
)
;
real
3
f3
=
make_real3
(
0
)
;
for
(
int
donorStart
=
0
;
donorStart
<
NUM_DONORS
;
donorStart
+=
GLOBAL_SIZE
)
{
// Load information about the donor this thread will compute forces on.
int donorIndex = donorStart+
get_global_id(0)
;
int
donorIndex
=
donorStart
+
GLOBAL_ID
;
int4
atoms
,
exclusionIndices
;
real4
d1
,
d2
,
d3
;
if
(
donorIndex
<
NUM_DONORS
)
{
atoms
=
donorAtoms
[
donorIndex
];
d1 = (atoms.x > -1 ? posq[atoms.x] :
(
real4
) 0
);
d2 = (atoms.y > -1 ? posq[atoms.y] :
(
real4
) 0
);
d3 = (atoms.z > -1 ? posq[atoms.z] :
(
real4
) 0
);
d1
=
(
atoms
.
x
>
-
1
?
posq
[
atoms
.
x
]
:
make_
real4
(
0
)
);
d2
=
(
atoms
.
y
>
-
1
?
posq
[
atoms
.
y
]
:
make_
real4
(
0
)
);
d3
=
(
atoms
.
z
>
-
1
?
posq
[
atoms
.
z
]
:
make_
real4
(
0
)
);
#ifdef USE_EXCLUSIONS
exclusionIndices
=
exclusions
[
donorIndex
];
#endif
}
else
atoms =
(
int4
)
(-1, -1, -1, -1);
for (int acceptorStart = 0; acceptorStart < NUM_ACCEPTORS; acceptorStart +=
get_local_size(0)
) {
atoms
=
make_
int4
(
-
1
,
-
1
,
-
1
,
-
1
);
for
(
int
acceptorStart
=
0
;
acceptorStart
<
NUM_ACCEPTORS
;
acceptorStart
+=
LOCAL_SIZE
)
{
// Load the next block of acceptors into local memory.
barrier(CLK_LOCAL_MEM_FENCE)
;
int blockSize = min((int)
get_local_size(0)
, NUM_ACCEPTORS-acceptorStart);
if (
get_local_id(0)
< blockSize) {
int4 atoms2 = acceptorAtoms[acceptorStart+
get_local_id(0)
];
posBuffer[3*
get_local_id(0)
] = (atoms2.x > -1 ? posq[atoms2.x] :
(
real4
) 0
);
posBuffer[3*
get_local_id(0)
+1] = (atoms2.y > -1 ? posq[atoms2.y] :
(
real4
) 0
);
posBuffer[3*
get_local_id(0)
+2] = (atoms2.z > -1 ? posq[atoms2.z] :
(
real4
) 0
);
}
barrier(CLK_LOCAL_MEM_FENCE)
;
SYNC_THREADS
;
int
blockSize
=
min
((
int
)
LOCAL_SIZE
,
NUM_ACCEPTORS
-
acceptorStart
);
if
(
LOCAL_ID
<
blockSize
)
{
int4
atoms2
=
acceptorAtoms
[
acceptorStart
+
LOCAL_ID
];
posBuffer
[
3
*
LOCAL_ID
]
=
(
atoms2
.
x
>
-
1
?
posq
[
atoms2
.
x
]
:
make_
real4
(
0
)
);
posBuffer
[
3
*
LOCAL_ID
+
1
]
=
(
atoms2
.
y
>
-
1
?
posq
[
atoms2
.
y
]
:
make_
real4
(
0
)
);
posBuffer
[
3
*
LOCAL_ID
+
2
]
=
(
atoms2
.
z
>
-
1
?
posq
[
atoms2
.
z
]
:
make_
real4
(
0
)
);
}
SYNC_THREADS
;
if
(
donorIndex
<
NUM_DONORS
)
{
for
(
int
index
=
0
;
index
<
blockSize
;
index
++
)
{
int
acceptorIndex
=
acceptorStart
+
index
;
...
...
@@ -108,6 +114,26 @@ __kernel void computeDonorForces(__global real4* restrict forceBuffers, __global
// Write results
if
(
donorIndex
<
NUM_DONORS
)
{
#ifdef SUPPORTS_64_BIT_ATOMICS
if
(
atoms
.
x
>
-
1
)
{
ATOMIC_ADD
(
&
force
[
atoms
.
x
],
(
mm_ulong
)
((
mm_long
)
(
f1
.
x
*
0x100000000
)));
ATOMIC_ADD
(
&
force
[
atoms
.
x
+
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_long
)
(
f1
.
y
*
0x100000000
)));
ATOMIC_ADD
(
&
force
[
atoms
.
x
+
2
*
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_long
)
(
f1
.
z
*
0x100000000
)));
MEM_FENCE
;
}
if
(
atoms
.
y
>
-
1
)
{
ATOMIC_ADD
(
&
force
[
atoms
.
y
],
(
mm_ulong
)
((
mm_long
)
(
f2
.
x
*
0x100000000
)));
ATOMIC_ADD
(
&
force
[
atoms
.
y
+
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_long
)
(
f2
.
y
*
0x100000000
)));
ATOMIC_ADD
(
&
force
[
atoms
.
y
+
2
*
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_long
)
(
f2
.
z
*
0x100000000
)));
MEM_FENCE
;
}
if
(
atoms
.
z
>
-
1
)
{
ATOMIC_ADD
(
&
force
[
atoms
.
z
],
(
mm_ulong
)
((
mm_long
)
(
f3
.
x
*
0x100000000
)));
ATOMIC_ADD
(
&
force
[
atoms
.
z
+
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_long
)
(
f3
.
y
*
0x100000000
)));
ATOMIC_ADD
(
&
force
[
atoms
.
z
+
2
*
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_long
)
(
f3
.
z
*
0x100000000
)));
MEM_FENCE
;
}
#else
int4
bufferIndices
=
donorBufferIndices
[
donorIndex
];
if
(
atoms
.
x
>
-
1
)
{
unsigned
int
offset
=
atoms
.
x
+
bufferIndices
.
x
*
PADDED_NUM_ATOMS
;
...
...
@@ -127,49 +153,57 @@ __kernel void computeDonorForces(__global real4* restrict forceBuffers, __global
force
.
xyz
+=
f3
.
xyz
;
forceBuffers
[
offset
]
=
force
;
}
#endif
}
}
energyBuffer[
get_global_id(0)
] += energy;
energyBuffer
[
GLOBAL_ID
]
+=
energy
;
}
/**
* Compute forces on acceptors.
*/
__kernel void computeAcceptorForces(__global real4* restrict forceBuffers, __global mixed* restrict energyBuffer, __global const real4* restrict posq, __global const int4* restrict exclusions,
__global const int4* restrict donorAtoms, __global const int4* restrict acceptorAtoms, __global const int4* restrict acceptorBufferIndices, __local real4* restrict posBuffer, real4 periodicBoxSize, real4 invPeriodicBoxSize,
KERNEL
void
computeAcceptorForces
(
#ifdef SUPPORTS_64_BIT_ATOMICS
GLOBAL
mm_ulong
*
RESTRICT
force
,
#else
GLOBAL
real4
*
RESTRICT
forceBuffers
,
GLOBAL
const
int4
*
RESTRICT
acceptorBufferIndices
,
#endif
GLOBAL
mixed
*
RESTRICT
energyBuffer
,
GLOBAL
const
real4
*
RESTRICT
posq
,
GLOBAL
const
int4
*
RESTRICT
exclusions
,
GLOBAL
const
int4
*
RESTRICT
donorAtoms
,
GLOBAL
const
int4
*
RESTRICT
acceptorAtoms
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
PARAMETER_ARGUMENTS
)
{
real4 f1 = (real4) 0;
real4 f2 = (real4) 0;
real4 f3 = (real4) 0;
for (int acceptorStart = 0; acceptorStart < NUM_ACCEPTORS; acceptorStart += get_global_size(0)) {
LOCAL
real4
posBuffer
[
3
*
THREAD_BLOCK_SIZE
];
real3
f1
=
make_real3
(
0
);
real3
f2
=
make_real3
(
0
);
real3
f3
=
make_real3
(
0
);
for
(
int
acceptorStart
=
0
;
acceptorStart
<
NUM_ACCEPTORS
;
acceptorStart
+=
GLOBAL_SIZE
)
{
// Load information about the acceptor this thread will compute forces on.
int acceptorIndex = acceptorStart+
get_global_id(0)
;
int
acceptorIndex
=
acceptorStart
+
GLOBAL_ID
;
int4
atoms
,
exclusionIndices
;
real4
a1
,
a2
,
a3
;
if
(
acceptorIndex
<
NUM_ACCEPTORS
)
{
atoms
=
acceptorAtoms
[
acceptorIndex
];
a1 = (atoms.x > -1 ? posq[atoms.x] :
(
real4
) 0
);
a2 = (atoms.y > -1 ? posq[atoms.y] :
(
real4
) 0
);
a3 = (atoms.z > -1 ? posq[atoms.z] :
(
real4
) 0
);
a1
=
(
atoms
.
x
>
-
1
?
posq
[
atoms
.
x
]
:
make_
real4
(
0
)
);
a2
=
(
atoms
.
y
>
-
1
?
posq
[
atoms
.
y
]
:
make_
real4
(
0
)
);
a3
=
(
atoms
.
z
>
-
1
?
posq
[
atoms
.
z
]
:
make_
real4
(
0
)
);
#ifdef USE_EXCLUSIONS
exclusionIndices
=
exclusions
[
acceptorIndex
];
#endif
}
else
atoms =
(
int4
)
(-1, -1, -1, -1);
for (int donorStart = 0; donorStart < NUM_DONORS; donorStart +=
get_local_size(0)
) {
atoms
=
make_
int4
(
-
1
,
-
1
,
-
1
,
-
1
);
for
(
int
donorStart
=
0
;
donorStart
<
NUM_DONORS
;
donorStart
+=
LOCAL_SIZE
)
{
// Load the next block of donors into local memory.
barrier(CLK_LOCAL_MEM_FENCE)
;
int blockSize = min((int)
get_local_size(0)
, NUM_DONORS-donorStart);
if (
get_local_id(0)
< blockSize) {
int4 atoms2 = donorAtoms[donorStart+
get_local_id(0)
];
posBuffer[3*
get_local_id(0)
] = (atoms2.x > -1 ? posq[atoms2.x] :
(
real4
) 0
);
posBuffer[3*
get_local_id(0)
+1] = (atoms2.y > -1 ? posq[atoms2.y] :
(
real4
) 0
);
posBuffer[3*
get_local_id(0)
+2] = (atoms2.z > -1 ? posq[atoms2.z] :
(
real4
) 0
);
}
barrier(CLK_LOCAL_MEM_FENCE)
;
SYNC_THREADS
;
int
blockSize
=
min
((
int
)
LOCAL_SIZE
,
NUM_DONORS
-
donorStart
);
if
(
LOCAL_ID
<
blockSize
)
{
int4
atoms2
=
donorAtoms
[
donorStart
+
LOCAL_ID
];
posBuffer
[
3
*
LOCAL_ID
]
=
(
atoms2
.
x
>
-
1
?
posq
[
atoms2
.
x
]
:
make_
real4
(
0
)
);
posBuffer
[
3
*
LOCAL_ID
+
1
]
=
(
atoms2
.
y
>
-
1
?
posq
[
atoms2
.
y
]
:
make_
real4
(
0
)
);
posBuffer
[
3
*
LOCAL_ID
+
2
]
=
(
atoms2
.
z
>
-
1
?
posq
[
atoms2
.
z
]
:
make_
real4
(
0
)
);
}
SYNC_THREADS
;
if
(
acceptorIndex
<
NUM_ACCEPTORS
)
{
for
(
int
index
=
0
;
index
<
blockSize
;
index
++
)
{
int
donorIndex
=
donorStart
+
index
;
...
...
@@ -197,6 +231,26 @@ __kernel void computeAcceptorForces(__global real4* restrict forceBuffers, __glo
// Write results
if
(
acceptorIndex
<
NUM_ACCEPTORS
)
{
#ifdef SUPPORTS_64_BIT_ATOMICS
if
(
atoms
.
x
>
-
1
)
{
ATOMIC_ADD
(
&
force
[
atoms
.
x
],
(
mm_ulong
)
((
mm_long
)
(
f1
.
x
*
0x100000000
)));
ATOMIC_ADD
(
&
force
[
atoms
.
x
+
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_long
)
(
f1
.
y
*
0x100000000
)));
ATOMIC_ADD
(
&
force
[
atoms
.
x
+
2
*
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_long
)
(
f1
.
z
*
0x100000000
)));
MEM_FENCE
;
}
if
(
atoms
.
y
>
-
1
)
{
ATOMIC_ADD
(
&
force
[
atoms
.
y
],
(
mm_ulong
)
((
mm_long
)
(
f2
.
x
*
0x100000000
)));
ATOMIC_ADD
(
&
force
[
atoms
.
y
+
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_long
)
(
f2
.
y
*
0x100000000
)));
ATOMIC_ADD
(
&
force
[
atoms
.
y
+
2
*
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_long
)
(
f2
.
z
*
0x100000000
)));
MEM_FENCE
;
}
if
(
atoms
.
z
>
-
1
)
{
ATOMIC_ADD
(
&
force
[
atoms
.
z
],
(
mm_ulong
)
((
mm_long
)
(
f3
.
x
*
0x100000000
)));
ATOMIC_ADD
(
&
force
[
atoms
.
z
+
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_long
)
(
f3
.
y
*
0x100000000
)));
ATOMIC_ADD
(
&
force
[
atoms
.
z
+
2
*
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_long
)
(
f3
.
z
*
0x100000000
)));
MEM_FENCE
;
}
#else
int4
bufferIndices
=
acceptorBufferIndices
[
acceptorIndex
];
if
(
atoms
.
x
>
-
1
)
{
unsigned
int
offset
=
atoms
.
x
+
bufferIndices
.
x
*
PADDED_NUM_ATOMS
;
...
...
@@ -216,6 +270,7 @@ __kernel void computeAcceptorForces(__global real4* restrict forceBuffers, __glo
force
.
xyz
+=
f3
.
xyz
;
forceBuffers
[
offset
]
=
force
;
}
#endif
}
}
}
platforms/c
uda
/src/kernels/customIntegrator.c
u
→
platforms/c
ommon
/src/kernels/customIntegrator.c
c
View file @
5a06df78
extern
"C"
__global__
void
computeFloatSum
(
const
float
*
__restrict__
sumBuffer
,
float
*
result
)
{
__shared__
float
tempBuffer
[
WORK_GROUP_SIZE
];
const
unsigned
int
thread
=
threadIdx
.
x
;
KERNEL
void
computeFloatSum
(
GLOBAL
const
float
*
RESTRICT
sumBuffer
,
GLOBAL
float
*
result
,
int
bufferSize
)
{
LOCAL
float
tempBuffer
[
WORK_GROUP_SIZE
];
const
unsigned
int
thread
=
LOCAL_ID
;
float
sum
=
0
;
for
(
unsigned
int
index
=
thread
;
index
<
SUM_BUFFER_SIZE
;
index
+=
blockDim
.
x
)
for
(
unsigned
int
index
=
thread
;
index
<
bufferSize
;
index
+=
LOCAL_SIZE
)
sum
+=
sumBuffer
[
index
];
tempBuffer
[
thread
]
=
sum
;
for
(
int
i
=
1
;
i
<
WORK_GROUP_SIZE
;
i
*=
2
)
{
__syncthreads
()
;
SYNC_THREADS
;
if
(
thread
%
(
i
*
2
)
==
0
&&
thread
+
i
<
WORK_GROUP_SIZE
)
tempBuffer
[
thread
]
+=
tempBuffer
[
thread
+
i
];
}
...
...
@@ -14,24 +14,26 @@ extern "C" __global__ void computeFloatSum(const float* __restrict__ sumBuffer,
*
result
=
tempBuffer
[
0
];
}
extern
"C"
__global__
void
computeDoubleSum
(
const
double
*
__restrict__
sumBuffer
,
double
*
result
)
{
__shared__
double
tempBuffer
[
WORK_GROUP_SIZE
];
const
unsigned
int
thread
=
threadIdx
.
x
;
#ifdef SUPPORTS_DOUBLE_PRECISION
KERNEL
void
computeDoubleSum
(
GLOBAL
const
double
*
RESTRICT
sumBuffer
,
GLOBAL
double
*
result
,
int
bufferSize
)
{
LOCAL
double
tempBuffer
[
WORK_GROUP_SIZE
];
const
unsigned
int
thread
=
LOCAL_ID
;
double
sum
=
0
;
for
(
unsigned
int
index
=
thread
;
index
<
SUM_BUFFER_SIZE
;
index
+=
blockDim
.
x
)
for
(
unsigned
int
index
=
thread
;
index
<
bufferSize
;
index
+=
LOCAL_SIZE
)
sum
+=
sumBuffer
[
index
];
tempBuffer
[
thread
]
=
sum
;
for
(
int
i
=
1
;
i
<
WORK_GROUP_SIZE
;
i
*=
2
)
{
__syncthreads
()
;
SYNC_THREADS
;
if
(
thread
%
(
i
*
2
)
==
0
&&
thread
+
i
<
WORK_GROUP_SIZE
)
tempBuffer
[
thread
]
+=
tempBuffer
[
thread
+
i
];
}
if
(
thread
==
0
)
*
result
=
tempBuffer
[
0
];
}
#endif
extern
"C"
__global__
void
applyPositionDeltas
(
real4
*
__restrict__
posq
,
real4
*
__restrict__
posqCorrection
,
mixed4
*
__restrict__
posDelta
)
{
for
(
unsigned
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
NUM_ATOMS
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
KERNEL
void
applyPositionDeltas
(
GLOBAL
real4
*
RESTRICT
posq
,
GLOBAL
real4
*
RESTRICT
posqCorrection
,
GLOBAL
mixed4
*
RESTRICT
posDelta
)
{
for
(
unsigned
int
index
=
GLOBAL_ID
;
index
<
NUM_ATOMS
;
index
+=
GLOBAL_SIZE
)
{
#ifdef USE_MIXED_PRECISION
real4
pos1
=
posq
[
index
];
real4
pos2
=
posqCorrection
[
index
];
...
...
@@ -48,14 +50,14 @@ extern "C" __global__ void applyPositionDeltas(real4* __restrict__ posq, real4*
#else
posq
[
index
]
=
pos
;
#endif
posDelta
[
index
]
=
make_mixed4
(
0
,
0
,
0
,
0
);
posDelta
[
index
]
=
make_mixed4
(
0
);
}
}
extern
"C"
__global__
void
generateRandomNumbers
(
int
numValues
,
float4
*
__restrict__
random
,
uint4
*
__restrict__
seed
)
{
uint4
state
=
seed
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
];
KERNEL
void
generateRandomNumbers
(
int
numValues
,
GLOBAL
float4
*
RESTRICT
random
,
GLOBAL
uint4
*
RESTRICT
seed
)
{
uint4
state
=
seed
[
GLOBAL_ID
];
unsigned
int
carry
=
0
;
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
numValues
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
for
(
int
index
=
GLOBAL_ID
;
index
<
numValues
;
index
+=
GLOBAL_SIZE
)
{
// Generate three uniform random numbers.
state
.
x
=
state
.
x
*
69069
+
1
;
...
...
@@ -93,5 +95,5 @@ extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restri
random
[
index
]
=
make_float4
(
x1
,
x2
,
x3
,
0.0
f
);
}
seed
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
=
state
;
seed
[
GLOBAL_ID
]
=
state
;
}
platforms/common/src/kernels/customIntegratorPerDof.cc
0 → 100644
View file @
5a06df78
// Working-precision aliases used by the per-DOF integrator kernel below.
// When SUPPORTS_DOUBLE_PRECISION is defined the aliases map onto the double
// vector family; otherwise they map onto the float family.
//
// Note: the convertToTempType3/4 macros expand their argument once per
// component, so the argument expression is evaluated several times.
#ifdef SUPPORTS_DOUBLE_PRECISION

typedef double TempType;
typedef double3 TempType3;
typedef double4 TempType4;

#define make_TempType3(a...) make_double3(a)
#define make_TempType4(a...) make_double4(a)
#define convertToTempType3(a) make_double3((a).x, (a).y, (a).z)
#define convertToTempType4(a) make_double4((a).x, (a).y, (a).z, (a).w)

/**
 * Convert a double4 into a mixed4, component by component.
 */
inline DEVICE mixed4 convertFromDouble4(double4 a) {
    mixed4 converted = make_mixed4(a.x, a.y, a.z, a.w);
    return converted;
}

#else

typedef float TempType;
typedef float3 TempType3;
typedef float4 TempType4;

#define make_TempType3(a...) make_float3(a)
#define make_TempType4(a...) make_float4(a)
#define convertToTempType3(a) make_float3((a).x, (a).y, (a).z)
#define convertToTempType4(a) make_float4((a).x, (a).y, (a).z, (a).w)

#endif
/**
 * Read one particle's position record in working precision.
 *
 * With USE_MIXED_PRECISION defined, the x, y and z components are the sum of
 * the posq entry and the matching posqCorrection entry (the correction is cast
 * to mixed before the addition), and w is taken from posq alone.  Without it,
 * the posq entry is converted directly and posqCorrection is not read.
 *
 * @param posq            array of position records
 * @param posqCorrection  array of correction terms (only read in mixed precision mode)
 * @param index           element to load
 * @return the position as a TempType4
 */
inline DEVICE TempType4 loadPos(GLOBAL const real4* RESTRICT posq, GLOBAL const real4* RESTRICT posqCorrection, int index) {
#ifdef USE_MIXED_PRECISION
    const real4 base = posq[index];
    const real4 correction = posqCorrection[index];
    return make_TempType4(base.x+(mixed) correction.x,
                          base.y+(mixed) correction.y,
                          base.z+(mixed) correction.z,
                          base.w);
#else
    // Fetch the element once; the conversion macro expands its argument per component.
    const real4 base = posq[index];
    return convertToTempType4(base);
#endif
}
/**
 * Write one particle's position record back to global memory.
 *
 * With USE_MIXED_PRECISION defined, posq receives each component cast to real,
 * and posqCorrection receives, for x, y and z, the difference between the
 * working-precision value and that cast value (its w component is set to 0).
 * Without it, only posq is written.
 *
 * @param posq            array of position records to update
 * @param posqCorrection  array of correction terms (only written in mixed precision mode)
 * @param index           element to store
 * @param pos             the position to store
 */
inline DEVICE void storePos(GLOBAL real4* RESTRICT posq, GLOBAL real4* RESTRICT posqCorrection, int index, TempType4 pos) {
#ifdef USE_MIXED_PRECISION
    // Name the rounded components once so the same values feed both stores.
    const real roundedX = (real) pos.x;
    const real roundedY = (real) pos.y;
    const real roundedZ = (real) pos.z;
    posq[index] = make_real4(roundedX, roundedY, roundedZ, (real) pos.w);
    posqCorrection[index] = make_real4(pos.x-roundedX, pos.y-roundedY, pos.z-roundedZ, 0);
#else
    posq[index] = make_real4(pos.x, pos.y, pos.z, pos.w);
#endif
}
/**
 * Apply one per-degree-of-freedom step to every particle.
 *
 * Each work-item starts at GLOBAL_ID and advances by GLOBAL_SIZE until the
 * index reaches NUM_ATOMS.  For each particle it loads the position (plus
 * posDelta when LOAD_POS_AS_DELTA is defined), the velm entry and the three
 * force components, then runs COMPUTE_STEP.  Particles whose velm.w is zero
 * are skipped.
 *
 * The force buffer is read as three consecutive planes (x, y, z), each
 * PADDED_NUM_ATOMS long.  The values are assumed to be 64-bit fixed point with
 * 32 fractional bits, since the writers scale by 0x100000000; they are
 * therefore converted back with a factor of 1/2^32.
 *
 * NOTE(review): COMPUTE_STEP and PARAMETER_ARGUMENTS are defined outside this
 * block.  COMPUTE_STEP presumably refers to the locals declared here
 * (position, velocity, f, mass, stepSize, gaussianIndex, uniformIndex) and to
 * the kernel arguments, so their names must not change -- confirm against the
 * code that generates it.
 */
KERNEL void computePerDof(GLOBAL real4* RESTRICT posq, GLOBAL real4* RESTRICT posqCorrection, GLOBAL mixed4* RESTRICT posDelta,
        GLOBAL mixed4* RESTRICT velm, GLOBAL const mm_long* RESTRICT force, GLOBAL const mixed2* RESTRICT dt, GLOBAL const mixed* RESTRICT globals,
        GLOBAL mixed* RESTRICT sum, GLOBAL const float4* RESTRICT gaussianValues, unsigned int gaussianBaseIndex, GLOBAL const float4* RESTRICT uniformValues,
        const mixed energy, GLOBAL mixed* RESTRICT energyParamDerivs
        PARAMETER_ARGUMENTS) {
    TempType3 stepSize = make_TempType3(dt[0].y);
    int index = GLOBAL_ID;
    // Inverse of the fixed-point force scale: 2^32, not 2^32-1.
    const TempType forceScale = ((TempType) 1)/(TempType) 0x100000000;
    while (index < NUM_ATOMS) {
#ifdef LOAD_POS_AS_DELTA
        TempType4 position = loadPos(posq, posqCorrection, index)+convertToTempType4(posDelta[index]);
#else
        TempType4 position = loadPos(posq, posqCorrection, index);
#endif
        TempType4 velocity = convertToTempType4(velm[index]);
        TempType3 f = make_TempType3(forceScale*force[index], forceScale*force[index+PADDED_NUM_ATOMS], forceScale*force[index+PADDED_NUM_ATOMS*2]);
        // Reciprocal of velocity.w (presumably w holds the inverse mass -- confirm).
        // It is evaluated even when w is zero, but only used inside the guard below.
        TempType3 mass = make_TempType3(RECIP(velocity.w));
        if (velocity.w != 0.0) {
            int gaussianIndex = gaussianBaseIndex;
            int uniformIndex = 0;
            COMPUTE_STEP
        }
        index += GLOBAL_SIZE;
    }
}
platforms/opencl/src/kernels/customManyParticle.cl → platforms/common/src/kernels/customManyParticle.cc
View file @
5a06df78
#
pragma
OPENCL
EXTENSION
cl_khr_global_int32_base_atomics
:
enable
#
pragma
OPENCL
EXTENSION
cl_khr_int64_base_atomics
:
enable
/**
* Record the force on an atom to global memory.
*/
inline
void
storeForce
(
int
atom,
real
4
force,
__global
long*
restrict
forceBuffers
)
{
atom_add
(
&forceBuffers[atom],
(
long
)
(
force.x*0x100000000
))
;
atom_add
(
&forceBuffers[atom+PADDED_NUM_ATOMS],
(
long
)
(
force.y*0x100000000
))
;
atom_add
(
&forceBuffers[atom+2*PADDED_NUM_ATOMS],
(
long
)
(
force.z*0x100000000
))
;
inline
DEVICE
void
storeForce
(
int
atom
,
real
3
force
,
GLOBAL
mm_ulong
*
RESTRICT
forceBuffers
)
{
ATOMIC_ADD
(
&
forceBuffers
[
atom
],
(
mm_ulong
)
((
mm_
long
)
(
force
.
x
*
0x100000000
))
)
;
ATOMIC_ADD
(
&
forceBuffers
[
atom
+
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_
long
)
(
force
.
y
*
0x100000000
))
)
;
ATOMIC_ADD
(
&
forceBuffers
[
atom
+
2
*
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_
long
)
(
force
.
z
*
0x100000000
))
)
;
}
/**
* Compute the difference between two vectors, taking periodic boundary conditions into account
* and setting the fourth component to the squared magnitude.
*/
inline
real4
delta
(
real
4
vec1,
real
4
vec2,
real4
periodicBoxSize,
real4
invPeriodicBoxSize,
real4
periodicBoxVecX,
real4
periodicBoxVecY,
real4
periodicBoxVecZ
)
{
real4
result
=
(
real4
)
(
vec1.x-vec2.x,
vec1.y-vec2.y,
vec1.z-vec2.z,
0.0f
)
;
inline
DEVICE
real4
delta
(
real
3
vec1
,
real
3
vec2
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
)
{
real4
result
=
make_
real4
(
vec1
.
x
-
vec2
.
x
,
vec1
.
y
-
vec2
.
y
,
vec1
.
z
-
vec2
.
z
,
0.0
f
);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
result
)
#endif
...
...
@@ -26,36 +23,36 @@ inline real4 delta(real4 vec1, real4 vec2, real4 periodicBoxSize, real4 invPerio
/**
* Compute the angle between two vectors. The w component of each vector should contain the squared magnitude.
*/
real
computeAngle
(
real4
vec1,
real4
vec2
)
{
DEVICE
real
computeAngle
(
real4
vec1
,
real4
vec2
)
{
real
dotProduct
=
vec1
.
x
*
vec2
.
x
+
vec1
.
y
*
vec2
.
y
+
vec1
.
z
*
vec2
.
z
;
real
cosine
=
dotProduct
*
RSQRT
(
vec1
.
w
*
vec2
.
w
);
real
angle
;
if
(
cosine
>
0.99
f
||
cosine
<
-
0.99
f
)
{
// We're close to the singularity in acos(), so take the cross product and use asin() instead.
real
4
crossProduct
=
cross
(
vec1,
vec2
)
;
real
3
crossProduct
=
trimTo3
(
cross
(
vec1
,
vec2
)
)
;
real
scale
=
vec1
.
w
*
vec2
.
w
;
angle
=
asin
(
SQRT
(
dot
(
crossProduct,
crossProduct
)
/scale
))
;
angle
=
ASIN
(
SQRT
(
dot
(
crossProduct
,
crossProduct
)
/
scale
));
if
(
cosine
<
0.0
f
)
angle
=
M_PI
-
angle
;
}
else
angle
=
acos
(
cosine
)
;
angle
=
ACOS
(
cosine
);
return
angle
;
}
/**
* Compute the cross product of two vectors, setting the fourth component to the squared magnitude.
*/
inline
real4
computeCross
(
real4
vec1,
real4
vec2
)
{
real
4
cp
=
cross
(
vec1,
vec2
)
;
return
(
real4
)
(
cp.x,
cp.y,
cp.z,
cp.x*cp.x+cp.y*cp.y+cp.z*cp.z
)
;
inline
DEVICE
real4
computeCross
(
real4
vec1
,
real4
vec2
)
{
real
3
cp
=
trimTo3
(
cross
(
vec1
,
vec2
)
)
;
return
make_
real4
(
cp
.
x
,
cp
.
y
,
cp
.
z
,
cp
.
x
*
cp
.
x
+
cp
.
y
*
cp
.
y
+
cp
.
z
*
cp
.
z
);
}
/**
* Determine whether a particular interaction is in the list of exclusions.
*/
inline
bool
isInteractionExcluded
(
int
atom1,
int
atom2,
__global
const
int*
restrict
exclusions,
__global
const
int*
restrict
exclusionStartIndex
)
{
inline
DEVICE
bool
isInteractionExcluded
(
int
atom1
,
int
atom2
,
GLOBAL
const
int
*
RESTRICT
exclusions
,
GLOBAL
const
int
*
RESTRICT
exclusionStartIndex
)
{
if
(
atom1
>
atom2
)
{
int
temp
=
atom1
;
atom1
=
atom2
;
...
...
@@ -76,24 +73,24 @@ inline bool isInteractionExcluded(int atom1, int atom2, __global const int* rest
/**
* Compute the interaction.
*/
__kernel
void
computeInteraction
(
__global
long*
restrict
forceBuffers,
__global
mixed*
restrict
energyBuffer,
__global
const
real4*
restrict
posq,
KERNEL
void
computeInteraction
(
GLOBAL
mm_ulong
*
RESTRICT
forceBuffers
,
GLOBAL
mixed
*
RESTRICT
energyBuffer
,
GLOBAL
const
real4
*
RESTRICT
posq
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
#ifdef USE_CUTOFF
,
__global
const
int*
restrict
neighbors,
__global
const
int*
restrict
neighborStartIndex
,
GLOBAL
const
int
*
RESTRICT
neighbors
,
GLOBAL
const
int
*
RESTRICT
neighborStartIndex
#endif
#ifdef USE_FILTERS
,
__global
int*
restrict
particleTypes,
__global
int*
restrict
orderIndex,
__global
int*
restrict
particleOrder
,
GLOBAL
int
*
RESTRICT
particleTypes
,
GLOBAL
int
*
RESTRICT
orderIndex
,
GLOBAL
int
*
RESTRICT
particleOrder
#endif
#ifdef USE_EXCLUSIONS
,
__global
int*
restrict
exclusions,
__global
int*
restrict
exclusionStartIndex
,
GLOBAL
int
*
RESTRICT
exclusions
,
GLOBAL
int
*
RESTRICT
exclusionStartIndex
#endif
PARAMETER_ARGUMENTS
)
{
mixed
energy
=
0
;
// Loop over particles to be the first one in the set.
for
(
int
p1
=
get_group_id
(
0
)
; p1 < NUM_ATOMS; p1 +=
get_num_groups(0)
) {
for
(
int
p1
=
GROUP_ID
;
p1
<
NUM_ATOMS
;
p1
+=
NUM_GROUPS
)
{
#ifdef USE_CENTRAL_PARTICLE
const
int
a1
=
p1
;
#else
...
...
@@ -110,7 +107,7 @@ __kernel void computeInteraction(
#endif
#endif
int
numCombinations
=
NUM_CANDIDATE_COMBINATIONS
;
for
(
int
index
=
get_local_id
(
0
)
; index < numCombinations; index +=
get_local_size(0)
) {
for
(
int
index
=
LOCAL_ID
;
index
<
numCombinations
;
index
+=
LOCAL_SIZE
)
{
FIND_ATOMS_FOR_COMBINATION_INDEX
;
bool
includeInteraction
=
IS_VALID_COMBINATION
;
#ifdef USE_CUTOFF
...
...
@@ -135,15 +132,15 @@ __kernel void computeInteraction(
}
}
}
energyBuffer[
get_global_id
(
0
)
]
+=
energy
;
energyBuffer
[
GLOBAL_ID
]
+=
energy
;
}
/**
* Find a bounding box for the atoms in each block.
*/
__kernel
void
findBlockBounds
(
real4
periodicBoxSize,
real4
invPeriodicBoxSize,
real4
periodicBoxVecX,
real4
periodicBoxVecY,
real4
periodicBoxVecZ,
__global
const
real4*
restrict
posq,
__global
real4*
restrict
blockCenter,
__global
real4*
restrict
blockBoundingBox,
__global
int*
restrict
numNeighborPairs
)
{
int
index
=
get_global_id
(
0
)
;
KERNEL
void
findBlockBounds
(
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
GLOBAL
const
real4
*
RESTRICT
posq
,
GLOBAL
real4
*
RESTRICT
blockCenter
,
GLOBAL
real4
*
RESTRICT
blockBoundingBox
,
GLOBAL
int
*
RESTRICT
numNeighborPairs
)
{
int
index
=
GLOBAL_ID
;
int
base
=
index
*
TILE_SIZE
;
while
(
base
<
NUM_ATOMS
)
{
real4
pos
=
posq
[
base
];
...
...
@@ -159,37 +156,39 @@ __kernel void findBlockBounds(real4 periodicBoxSize, real4 invPeriodicBoxSize, r
real4
center
=
0.5
f
*
(
maxPos
+
minPos
);
APPLY_PERIODIC_TO_POS_WITH_CENTER
(
pos
,
center
)
#endif
minPos
=
(
real4
)
(
min
(
minPos.x,pos.x
)
,
min
(
minPos.y,pos.y
)
,
min
(
minPos.z,pos.z
)
,
0
)
;
maxPos
=
(
real4
)
(
max
(
maxPos.x,pos.x
)
,
max
(
maxPos.y,pos.y
)
,
max
(
maxPos.z,pos.z
)
,
0
)
;
minPos
=
make_
real4
(
min
(
minPos
.
x
,
pos
.
x
),
min
(
minPos
.
y
,
pos
.
y
),
min
(
minPos
.
z
,
pos
.
z
),
0
);
maxPos
=
make_
real4
(
max
(
maxPos
.
x
,
pos
.
x
),
max
(
maxPos
.
y
,
pos
.
y
),
max
(
maxPos
.
z
,
pos
.
z
),
0
);
}
real4
blockSize
=
0.5
f
*
(
maxPos
-
minPos
);
blockBoundingBox
[
index
]
=
blockSize
;
blockCenter
[
index
]
=
0.5
f
*
(
maxPos
+
minPos
);
index
+=
get_global_size
(
0
)
;
index
+=
GLOBAL_SIZE
;
base
=
index
*
TILE_SIZE
;
}
if
(
get_group_id
(
0
)
==
0
&&
get_local_id
(
0
)
==
0
)
if
(
GROUP_ID
==
0
&&
LOCAL_ID
==
0
)
*
numNeighborPairs
=
0
;
}
/**
* Find a list of neighbors for each atom.
*/
__kernel
void
findNeighbors
(
real4
periodicBoxSize,
real4
invPeriodicBoxSize,
real4
periodicBoxVecX,
real4
periodicBoxVecY,
real4
periodicBoxVecZ,
__global
const
real4*
restrict
posq,
__global
const
real4*
restrict
blockCenter,
__global
const
real4*
restrict
blockBoundingBox,
__global
int2*
restrict
neighborPairs,
__global
int*
restrict
numNeighborPairs,
__global
int*
restrict
numNeighborsForAtom,
int
maxNeighborPairs
KERNEL
void
findNeighbors
(
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
GLOBAL
const
real4
*
RESTRICT
posq
,
GLOBAL
const
real4
*
RESTRICT
blockCenter
,
GLOBAL
const
real4
*
RESTRICT
blockBoundingBox
,
GLOBAL
int2
*
RESTRICT
neighborPairs
,
GLOBAL
int
*
RESTRICT
numNeighborPairs
,
GLOBAL
int
*
RESTRICT
numNeighborsForAtom
,
int
maxNeighborPairs
#ifdef USE_EXCLUSIONS
,
__global
const
int*
restrict
exclusions,
__global
const
int*
restrict
exclusionStartIndex
,
GLOBAL
const
int
*
RESTRICT
exclusions
,
GLOBAL
const
int
*
RESTRICT
exclusionStartIndex
#endif
)
{
__local
real4
positionCache[FIND_NEIGHBORS_WORKGROUP_SIZE]
;
__local
bool
includeBlockFlags[FIND_NEIGHBORS_WORKGROUP_SIZE]
;
int
indexInWarp
=
get_local_id
(
0
)
%32
;
int
warpStart
=
get_local_id
(
0
)
-indexInWarp
;
for
(
int
atom1
=
get_global_id
(
0
)
; atom1 < PADDED_NUM_ATOMS; atom1 += get_global_size(0)) {
LOCAL
real3
positionCache
[
FIND_NEIGHBORS_WORKGROUP_SIZE
];
int
indexInWarp
=
LOCAL_ID
%
32
;
#ifndef __CUDA_ARCH__
LOCAL
bool
includeBlockFlags
[
FIND_NEIGHBORS_WORKGROUP_SIZE
];
int
warpStart
=
LOCAL_ID
-
indexInWarp
;
#endif
for
(
int
atom1
=
GLOBAL_ID
;
atom1
<
PADDED_NUM_ATOMS
;
atom1
+=
GLOBAL_SIZE
)
{
// Load data for this atom. Note that all threads in a warp are processing atoms from the same block.
real
4
pos1
=
posq[atom1]
;
real
3
pos1
=
trimTo3
(
posq
[
atom1
]
)
;
int
block1
=
atom1
/
TILE_SIZE
;
real4
blockCenter1
=
blockCenter
[
block1
];
real4
blockSize1
=
blockBoundingBox
[
block1
];
...
...
@@ -221,10 +220,18 @@ __kernel void findNeighbors(real4 periodicBoxSize, real4 invPeriodicBoxSize, rea
// Loop over any blocks we identified as potentially containing neighbors.
includeBlockFlags[get_local_id
(
0
)
]
=
includeBlock2
;
#ifdef __CUDA_ARCH__
int
includeBlockFlags
=
BALLOT
(
includeBlock2
);
while
(
includeBlockFlags
!=
0
)
{
int
i
=
__ffs
(
includeBlockFlags
)
-
1
;
includeBlockFlags
&=
includeBlockFlags
-
1
;
{
#else
includeBlockFlags
[
LOCAL_ID
]
=
includeBlock2
;
SYNC_WARPS
;
for
(
int
i
=
0
;
i
<
TILE_SIZE
;
i
++
)
{
if
(
includeBlockFlags
[
warpStart
+
i
])
{
#endif
int
block2
=
block2Base
+
i
;
// Loop over atoms in this block.
...
...
@@ -233,12 +240,12 @@ __kernel void findNeighbors(real4 periodicBoxSize, real4 invPeriodicBoxSize, rea
int
included
[
TILE_SIZE
];
int
numIncluded
=
0
;
SYNC_WARPS
;
positionCache[
get_local_id
(
0
)
]
=
posq[start+indexInWarp]
;
positionCache
[
LOCAL_ID
]
=
trimTo3
(
posq
[
start
+
indexInWarp
]
)
;
SYNC_WARPS
;
if
(
atom1
<
NUM_ATOMS
)
{
for
(
int
j
=
0
;
j
<
32
;
j
++
)
{
int
atom2
=
start
+
j
;
real
4
pos2
=
positionCache[
get_local_id
(
0
)
-indexInWarp+j]
;
real
3
pos2
=
positionCache
[
LOCAL_ID
-
indexInWarp
+
j
];
// Decide whether to include this atom pair in the neighbor list.
...
...
@@ -260,10 +267,10 @@ __kernel void findNeighbors(real4 periodicBoxSize, real4 invPeriodicBoxSize, rea
// If we found any neighbors, store them to the neighbor list.
if
(
numIncluded
>
0
)
{
int
baseIndex
=
atom_add
(
numNeighborPairs,
numIncluded
)
;
int
baseIndex
=
ATOMIC_ADD
(
numNeighborPairs
,
numIncluded
);
if
(
baseIndex
+
numIncluded
<=
maxNeighborPairs
)
for
(
int
j
=
0
;
j
<
numIncluded
;
j
++
)
neighborPairs[baseIndex+j]
=
(
int2
)
(
atom1,
included[j]
)
;
neighborPairs
[
baseIndex
+
j
]
=
make_
int2
(
atom1
,
included
[
j
]);
totalNeighborsForAtom1
+=
numIncluded
;
}
}
...
...
@@ -279,59 +286,59 @@ __kernel void findNeighbors(real4 periodicBoxSize, real4 invPeriodicBoxSize, rea
* Sum the neighbor counts to compute the start position of each atom. This kernel
* is executed as a single work group.
*/
__kernel
void
computeNeighborStartIndices
(
__global
int*
restrict
numNeighborsForAtom,
__global
int*
restrict
neighborStartIndex,
__global
int*
restrict
numNeighborPairs,
int
maxNeighborPairs
)
{
__local
unsigned
int
posBuffer[256]
;
KERNEL
void
computeNeighborStartIndices
(
GLOBAL
int
*
RESTRICT
numNeighborsForAtom
,
GLOBAL
int
*
RESTRICT
neighborStartIndex
,
GLOBAL
int
*
RESTRICT
numNeighborPairs
,
int
maxNeighborPairs
)
{
LOCAL
unsigned
int
posBuffer
[
256
];
if
(
*
numNeighborPairs
>
maxNeighborPairs
)
{
// There wasn't enough memory for the neighbor list, so we'll need to rebuild it. Set the neighbor start
// indices to indicate no neighbors for any atom.
for
(
int
i
=
get_local_id
(
0
)
; i <= NUM_ATOMS; i +=
get_local_size(0)
)
for
(
int
i
=
LOCAL_ID
;
i
<=
NUM_ATOMS
;
i
+=
LOCAL_SIZE
)
neighborStartIndex
[
i
]
=
0
;
return
;
}
unsigned
int
globalOffset
=
0
;
for
(
unsigned
int
startAtom
=
0
; startAtom < NUM_ATOMS; startAtom +=
get_local_size(0)
) {
for
(
unsigned
int
startAtom
=
0
;
startAtom
<
NUM_ATOMS
;
startAtom
+=
LOCAL_SIZE
)
{
// Load the neighbor counts into local memory.
unsigned
int
globalIndex
=
startAtom+
get_local_id
(
0
)
;
posBuffer[
get_local_id
(
0
)
]
=
(
globalIndex
<
NUM_ATOMS
?
numNeighborsForAtom[globalIndex]
:
0
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
unsigned
int
globalIndex
=
startAtom
+
LOCAL_ID
;
posBuffer
[
LOCAL_ID
]
=
(
globalIndex
<
NUM_ATOMS
?
numNeighborsForAtom
[
globalIndex
]
:
0
);
SYNC_THREADS
;
// Perform a parallel prefix sum.
for
(
unsigned
int
step
=
1
; step <
get_local_size(0)
; step *= 2) {
unsigned
int
add
=
(
get_local_id
(
0
)
>=
step
?
posBuffer[
get_local_id
(
0
)
-step]
:
0
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
posBuffer[
get_local_id
(
0
)
]
+=
add
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
for
(
unsigned
int
step
=
1
;
step
<
LOCAL_SIZE
;
step
*=
2
)
{
unsigned
int
add
=
(
LOCAL_ID
>=
step
?
posBuffer
[
LOCAL_ID
-
step
]
:
0
);
SYNC_THREADS
;
posBuffer
[
LOCAL_ID
]
+=
add
;
SYNC_THREADS
;
}
// Write the results back to global memory.
if
(
globalIndex
<
NUM_ATOMS
)
{
neighborStartIndex[globalIndex+1]
=
posBuffer[
get_local_id
(
0
)
]+globalOffset
;
neighborStartIndex
[
globalIndex
+
1
]
=
posBuffer
[
LOCAL_ID
]
+
globalOffset
;
numNeighborsForAtom
[
globalIndex
]
=
0
;
// Clear this so the next kernel can use it as a counter
}
globalOffset
+=
posBuffer[
get_local_size
(
0
)
-1]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
globalOffset
+=
posBuffer
[
LOCAL_SIZE
-
1
];
SYNC_THREADS
;
}
if
(
get_local_id
(
0
)
==
0
)
if
(
LOCAL_ID
==
0
)
neighborStartIndex
[
0
]
=
0
;
}
/**
* Assemble the final neighbor list.
*/
__kernel
void
copyPairsToNeighborList
(
__global
const
int2*
restrict
neighborPairs,
__global
int*
restrict
neighbors,
__global
int*
restrict
numNeighborPairs,
int
maxNeighborPairs,
__global
int*
restrict
numNeighborsForAtom,
__global
const
int*
restrict
neighborStartIndex
)
{
KERNEL
void
copyPairsToNeighborList
(
GLOBAL
const
int2
*
RESTRICT
neighborPairs
,
GLOBAL
int
*
RESTRICT
neighbors
,
GLOBAL
int
*
RESTRICT
numNeighborPairs
,
int
maxNeighborPairs
,
GLOBAL
int
*
RESTRICT
numNeighborsForAtom
,
GLOBAL
const
int
*
RESTRICT
neighborStartIndex
)
{
int
actualPairs
=
*
numNeighborPairs
;
if
(
actualPairs
>
maxNeighborPairs
)
return
;
// There wasn't enough memory for the neighbor list, so we'll need to rebuild it.
for
(
unsigned
int
index
=
get_global_id
(
0
)
; index < actualPairs; index +=
get_global_size(0)
) {
for
(
unsigned
int
index
=
GLOBAL_ID
;
index
<
actualPairs
;
index
+=
GLOBAL_SIZE
)
{
int2
pair
=
neighborPairs
[
index
];
int
startIndex
=
neighborStartIndex
[
pair
.
x
];
int
offset
=
atom_add
(
numNeighborsForAtom+pair.x,
1
)
;
int
offset
=
ATOMIC_ADD
(
numNeighborsForAtom
+
pair
.
x
,
1
);
neighbors
[
startIndex
+
offset
]
=
pair
.
y
;
}
}
Prev
1
2
3
4
5
6
7
8
…
17
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment