Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
5a06df78
Commit
5a06df78
authored
Mar 04, 2020
by
tic20
Browse files
Merge
https://github.com/openmm/openmm
parents
8dd60914
a9223eea
Changes
335
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
938 additions
and
644 deletions
+938
-644
platforms/common/src/kernels/customNonbonded.cc
platforms/common/src/kernels/customNonbonded.cc
+0
-0
platforms/common/src/kernels/customNonbondedGroups.cc
platforms/common/src/kernels/customNonbondedGroups.cc
+73
-65
platforms/common/src/kernels/gayBerne.cc
platforms/common/src/kernels/gayBerne.cc
+92
-92
platforms/common/src/kernels/gbsaObc.cc
platforms/common/src/kernels/gbsaObc.cc
+160
-156
platforms/common/src/kernels/gbsaObc2.cc
platforms/common/src/kernels/gbsaObc2.cc
+8
-3
platforms/common/src/kernels/gbsaObcReductions.cc
platforms/common/src/kernels/gbsaObcReductions.cc
+23
-21
platforms/common/src/kernels/gbsaObc_cpu.cc
platforms/common/src/kernels/gbsaObc_cpu.cc
+122
-118
platforms/common/src/kernels/harmonicAngleForce.cc
platforms/common/src/kernels/harmonicAngleForce.cc
+0
-0
platforms/common/src/kernels/harmonicBondForce.cc
platforms/common/src/kernels/harmonicBondForce.cc
+0
-0
platforms/common/src/kernels/integrationUtilities.cc
platforms/common/src/kernels/integrationUtilities.cc
+149
-88
platforms/common/src/kernels/langevin.cc
platforms/common/src/kernels/langevin.cc
+29
-27
platforms/common/src/kernels/langevinMiddle.cc
platforms/common/src/kernels/langevinMiddle.cc
+90
-0
platforms/common/src/kernels/periodicTorsionForce.cc
platforms/common/src/kernels/periodicTorsionForce.cc
+0
-0
platforms/common/src/kernels/rbTorsionForce.cc
platforms/common/src/kernels/rbTorsionForce.cc
+0
-0
platforms/common/src/kernels/removeCM.cc
platforms/common/src/kernels/removeCM.cc
+80
-0
platforms/common/src/kernels/rmsd.cc
platforms/common/src/kernels/rmsd.cc
+22
-22
platforms/common/src/kernels/torsionForce.cc
platforms/common/src/kernels/torsionForce.cc
+37
-0
platforms/common/src/kernels/verlet.cc
platforms/common/src/kernels/verlet.cc
+30
-21
platforms/cpu/include/CpuKernels.h
platforms/cpu/include/CpuKernels.h
+14
-18
platforms/cpu/include/CpuLangevinMiddleDynamics.h
platforms/cpu/include/CpuLangevinMiddleDynamics.h
+9
-13
No files found.
platforms/c
uda
/src/kernels/customNonbonded.c
u
→
platforms/c
ommon
/src/kernels/customNonbonded.c
c
View file @
5a06df78
File moved
platforms/
opencl
/src/kernels/customNonbondedGroups.c
l
→
platforms/
common
/src/kernels/customNonbondedGroups.c
c
View file @
5a06df78
#
ifdef
SUPPORTS_64_BIT_ATOMICS
#
pragma
OPENCL
EXTENSION
cl_khr_int64_base_atomics
:
enable
#
endif
typedef
struct
{
real
x
,
y
,
z
;
real
q
;
...
...
@@ -16,60 +12,69 @@ typedef struct {
* Find the maximum of a value across all threads in a warp, and return that to
* every thread.
*/
int
reduceMax
(
int
val,
__local
int*
temp
)
{
int
indexInWarp
=
get_local_id
(
0
)
%32
;
temp[get_local_id
(
0
)
]
=
val
;
DEVICE
int
reduceMax
(
int
val
,
LOCAL_ARG
int
*
temp
)
{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
// CUDA lets us do this slightly more efficiently by using shuffle operations.
for
(
int
mask
=
16
;
mask
>
0
;
mask
/=
2
)
val
=
max
(
val
,
__shfl_xor_sync
(
0xffffffff
,
val
,
mask
));
return
val
;
#else
int
indexInWarp
=
LOCAL_ID
%
32
;
temp
[
LOCAL_ID
]
=
val
;
SYNC_WARPS
;
for
(
int
offset
=
16
;
offset
>
0
;
offset
/=
2
)
{
if
(
offset
<
indexInWarp
)
temp[
get_local_id
(
0
)
]
=
max
(
temp[get_local_id
(
0
)
],
temp[get_local_id
(
0
)
+offset]
)
;
if
(
indexInWarp
<
offset
)
temp
[
LOCAL_ID
]
=
max
(
temp
[
LOCAL_ID
],
temp
[
LOCAL_ID
+
offset
]);
SYNC_WARPS
;
}
return
temp[get_local_id
(
0
)
-indexInWarp]
;
return
temp
[
LOCAL_ID
-
indexInWarp
];
#endif
}
#ifndef SUPPORTS_64_BIT_ATOMICS
/**
* This function is used on devices that don't support 64 bit atomics. Multiple threads within
* a single tile might have computed forces on the same atom. This loops over them and makes sure
* that only one thread updates the force on any given atom.
*/
void
writeForces
(
__global
real4*
forceBuffers,
__local
AtomData*
localData,
int
atomIndex
)
{
localData[
get_local_id
(
0
)
].x
=
atomIndex
;
void
writeForces
(
GLOBAL
real4
*
forceBuffers
,
LOCAL
AtomData
*
localData
,
int
atomIndex
)
{
localData
[
LOCAL_ID
].
x
=
atomIndex
;
SYNC_WARPS
;
real4
forceSum
=
(
real4
)
0
;
int
start
=
(
get_local_id
(
0
)
/TILE_SIZE
)
*TILE_SIZE
;
real4
forceSum
=
make_
real4
(
0
)
;
int
start
=
(
LOCAL_ID
/
TILE_SIZE
)
*
TILE_SIZE
;
int
end
=
start
+
32
;
bool
isFirst
=
true
;
for
(
int
i
=
start
;
i
<
end
;
i
++
)
if
(
localData
[
i
].
x
==
atomIndex
)
{
forceSum
+=
(
real4
)
(
localData
[
i
].
fx
,
localData
[
i
].
fy
,
localData
[
i
].
fz
,
0
);
isFirst
&=
(
i
>=
get_local_id
(
0
)
)
;
isFirst
&=
(
i
>=
LOCAL_ID
);
}
const
unsigned
int
warp
=
get_global_id
(
0
)
/TILE_SIZE
;
const
unsigned
int
warp
=
GLOBAL_ID
/
TILE_SIZE
;
unsigned
int
offset
=
atomIndex
+
warp
*
PADDED_NUM_ATOMS
;
if
(
isFirst
)
forceBuffers
[
offset
]
+=
forceSum
;
SYNC_WARPS
;
}
#endif
__kernel
void
computeInteractionGroups
(
KERNEL
void
computeInteractionGroups
(
#ifdef SUPPORTS_64_BIT_ATOMICS
__global
long*
restrict
forceBuffers,
GLOBAL
mm_ulong
*
RESTRICT
forceBuffers
,
#else
__global
real4*
restrict
forceBuffers,
GLOBAL
real4
*
RESTRICT
forceBuffers
,
#endif
__global
mixed*
restrict
energyBuffer,
__global
const
real4*
restrict
posq,
__global
const
int4*
restrict
groupData,
__global
int*
restrict
numGroupTiles,
int
useNeighborList,
GLOBAL
mixed
*
RESTRICT
energyBuffer
,
GLOBAL
const
real4
*
RESTRICT
posq
,
GLOBAL
const
int4
*
RESTRICT
groupData
,
GLOBAL
const
int
*
RESTRICT
numGroupTiles
,
int
useNeighborList
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
PARAMETER_ARGUMENTS
)
{
const
unsigned
int
totalWarps
=
get_global_size
(
0
)
/TILE_SIZE
;
const
unsigned
int
warp
=
get_global_id
(
0
)
/TILE_SIZE
; // global warpIndex
const
unsigned
int
tgx
=
get_local_id
(
0
)
&
(
TILE_SIZE-1
)
; // index within the warp
const
unsigned
int
tbx
=
get_local_id
(
0
)
-
tgx
; // block warpIndex
const
unsigned
int
totalWarps
=
GLOBAL_SIZE
/
TILE_SIZE
;
const
unsigned
int
warp
=
GLOBAL_ID
/
TILE_SIZE
;
// global warpIndex
const
unsigned
int
tgx
=
LOCAL_ID
&
(
TILE_SIZE
-
1
);
// index within the warp
const
unsigned
int
tbx
=
LOCAL_ID
-
tgx
;
// block warpIndex
mixed
energy
=
0
;
INIT_DERIVATIVES
__local
AtomData
localData[LOCAL_MEMORY_SIZE]
;
__local
int
reductionBuffer[LOCAL_MEMORY_SIZE]
;
LOCAL
AtomData
localData
[
LOCAL_MEMORY_SIZE
];
LOCAL
int
reductionBuffer
[
LOCAL_MEMORY_SIZE
];
const
unsigned
int
startTile
=
(
useNeighborList
?
warp
*
numGroupTiles
[
0
]
/
totalWarps
:
FIRST_TILE
+
warp
*
(
LAST_TILE
-
FIRST_TILE
)
/
totalWarps
);
const
unsigned
int
endTile
=
(
useNeighborList
?
(
warp
+
1
)
*
numGroupTiles
[
0
]
/
totalWarps
:
FIRST_TILE
+
(
warp
+
1
)
*
(
LAST_TILE
-
FIRST_TILE
)
/
totalWarps
);
...
...
@@ -82,16 +87,16 @@ __kernel void computeInteractionGroups(
const
int
exclusions
=
atomData
.
w
;
real4
posq1
=
posq
[
atom1
];
LOAD_ATOM1_PARAMETERS
real
4
force
=
(
real
4
)
(
0
)
;
real
3
force
=
make_
real
3
(
0
);
real4
posq2
=
posq
[
atom2
];
localData[
get_local_id
(
0
)
].x
=
posq2.x
;
localData[
get_local_id
(
0
)
].y
=
posq2.y
;
localData[
get_local_id
(
0
)
].z
=
posq2.z
;
localData[
get_local_id
(
0
)
].q
=
posq2.w
;
localData
[
LOCAL_ID
].
x
=
posq2
.
x
;
localData
[
LOCAL_ID
].
y
=
posq2
.
y
;
localData
[
LOCAL_ID
].
z
=
posq2
.
z
;
localData
[
LOCAL_ID
].
q
=
posq2
.
w
;
LOAD_LOCAL_PARAMETERS
localData[
get_local_id
(
0
)
].fx
=
0.0f
;
localData[
get_local_id
(
0
)
].fy
=
0.0f
;
localData[
get_local_id
(
0
)
].fz
=
0.0f
;
localData
[
LOCAL_ID
].
fx
=
0.0
f
;
localData
[
LOCAL_ID
].
fy
=
0.0
f
;
localData
[
LOCAL_ID
].
fz
=
0.0
f
;
int
tj
=
tgx
;
int
rangeStop
=
rangeStart
+
reduceMax
(
rangeEnd
-
rangeStart
,
reductionBuffer
);
SYNC_WARPS
;
...
...
@@ -99,8 +104,8 @@ __kernel void computeInteractionGroups(
if
(
j
<
rangeEnd
)
{
bool
isExcluded
=
(((
exclusions
>>
tj
)
&
1
)
==
0
);
int
localIndex
=
tbx
+
tj
;
posq2
=
(
real4
)
(
localData[localIndex].x,
localData[localIndex].y,
localData[localIndex].z,
localData[localIndex].q
)
;
real
4
delta
=
(
real
4
)
(
posq2.x
yz
-
posq1.xyz,
0
)
;
posq2
=
make_
real4
(
localData
[
localIndex
].
x
,
localData
[
localIndex
].
y
,
localData
[
localIndex
].
z
,
localData
[
localIndex
].
q
);
real
3
delta
=
make_
real
3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
delta
)
#endif
...
...
@@ -117,35 +122,38 @@ __kernel void computeInteractionGroups(
COMPUTE_INTERACTION
energy
+=
tempEnergy
;
delta
*=
dEdR
;
force.xyz
-=
delta.xyz
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
z
-=
delta
.
z
;
localData
[
localIndex
].
fx
+=
delta
.
x
;
localData
[
localIndex
].
fy
+=
delta
.
y
;
localData
[
localIndex
].
fz
+=
delta
.
z
;
#ifdef USE_CUTOFF
}
#endif
tj
=
(
tj
==
rangeEnd
-
1
?
rangeStart
:
tj
+
1
);
}
tj
=
(
tj
==
rangeEnd-1
?
rangeStart
:
tj+1
)
;
SYNC_WARPS
;
}
#ifdef SUPPORTS_64_BIT_ATOMICS
if
(
exclusions
!=
0
)
{
atom_add
(
&forceBuffers[atom1],
(
long
)
(
force.x*0x100000000
))
;
atom_add
(
&forceBuffers[atom1+PADDED_NUM_ATOMS],
(
long
)
(
force.y*0x100000000
))
;
atom_add
(
&forceBuffers[atom1+2*PADDED_NUM_ATOMS],
(
long
)
(
force.z*0x100000000
))
;
ATOMIC_ADD
(
&
forceBuffers
[
atom1
],
(
mm_ulong
)
((
mm_
long
)
(
force
.
x
*
0x100000000
))
)
;
ATOMIC_ADD
(
&
forceBuffers
[
atom1
+
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_
long
)
(
force
.
y
*
0x100000000
))
)
;
ATOMIC_ADD
(
&
forceBuffers
[
atom1
+
2
*
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_
long
)
(
force
.
z
*
0x100000000
))
)
;
}
atom_add
(
&forceBuffers[atom2],
(
long
)
(
localData[get_local_id
(
0
)
].fx*0x100000000
))
;
atom_add
(
&forceBuffers[atom2+PADDED_NUM_ATOMS],
(
long
)
(
localData[get_local_id
(
0
)
].fy*0x100000000
))
;
atom_add
(
&forceBuffers[atom2+2*PADDED_NUM_ATOMS],
(
long
)
(
localData[get_local_id
(
0
)
].fz*0x100000000
))
;
ATOMIC_ADD
(
&
forceBuffers
[
atom2
],
(
mm_ulong
)
((
mm_long
)
(
localData
[
LOCAL_ID
].
fx
*
0x100000000
)));
ATOMIC_ADD
(
&
forceBuffers
[
atom2
+
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_long
)
(
localData
[
LOCAL_ID
].
fy
*
0x100000000
)));
ATOMIC_ADD
(
&
forceBuffers
[
atom2
+
2
*
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_long
)
(
localData
[
LOCAL_ID
].
fz
*
0x100000000
)));
SYNC_WARPS
;
#else
writeForces
(
forceBuffers
,
localData
,
atom2
);
localData[
get_local_id
(
0
)
].fx
=
force.x
;
localData[
get_local_id
(
0
)
].fy
=
force.y
;
localData[
get_local_id
(
0
)
].fz
=
force.z
;
localData
[
LOCAL_ID
].
fx
=
force
.
x
;
localData
[
LOCAL_ID
].
fy
=
force
.
y
;
localData
[
LOCAL_ID
].
fz
=
force
.
z
;
writeForces
(
forceBuffers
,
localData
,
atom1
);
#endif
}
energyBuffer[
get_global_id
(
0
)
]
+=
energy
;
energyBuffer
[
GLOBAL_ID
]
+=
energy
;
SAVE_DERIVATIVES
}
...
...
@@ -153,7 +161,7 @@ __kernel void computeInteractionGroups(
* If the neighbor list needs to be rebuilt, reset the number of tiles to 0. This is
* executed by a single thread.
*/
__kernel
void
prepareToBuildNeighborList
(
__global
int*
restrict
rebuildNeighborList,
__global
int*
restrict
numGroupTiles
)
{
KERNEL
void
prepareToBuildNeighborList
(
GLOBAL
int
*
RESTRICT
rebuildNeighborList
,
GLOBAL
int
*
RESTRICT
numGroupTiles
)
{
if
(
rebuildNeighborList
[
0
]
==
1
)
numGroupTiles
[
0
]
=
0
;
}
...
...
@@ -162,8 +170,8 @@ __kernel void prepareToBuildNeighborList(__global int* restrict rebuildNeighborL
* Filter the list of tiles to include only ones that have interactions within the
* padded cutoff.
*/
__kernel
void
buildNeighborList
(
__global
int*
restrict
rebuildNeighborList,
__global
int*
restrict
numGroupTiles,
__global
const
real4*
restrict
posq,
__global
const
int4*
restrict
groupData,
__global
int4*
restrict
filteredGroupData,
KERNEL
void
buildNeighborList
(
GLOBAL
int
*
RESTRICT
rebuildNeighborList
,
GLOBAL
int
*
RESTRICT
numGroupTiles
,
GLOBAL
const
real4
*
RESTRICT
posq
,
GLOBAL
const
int4
*
RESTRICT
groupData
,
GLOBAL
int4
*
RESTRICT
filteredGroupData
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
)
{
// If the neighbor list doesn't need to be rebuilt on this step, return immediately.
...
...
@@ -171,15 +179,15 @@ __kernel void buildNeighborList(__global int* restrict rebuildNeighborList, __gl
if
(
rebuildNeighborList
[
0
]
==
0
)
return
;
const
unsigned
int
totalWarps
=
get_global_size
(
0
)
/TILE_SIZE
;
const
unsigned
int
warp
=
get_global_id
(
0
)
/TILE_SIZE
; // global warpIndex
const
unsigned
int
local_warp
=
get_local_id
(
0
)
/TILE_SIZE
; // local warpIndex
const
unsigned
int
tgx
=
get_local_id
(
0
)
&
(
TILE_SIZE-1
)
; // index within the warp
const
unsigned
int
tbx
=
get_local_id
(
0
)
-
tgx
; // block warpIndex
__local
real4
localPos[LOCAL_MEMORY_SIZE]
;
__local
volatile
bool
anyInteraction[WARPS_IN_BLOCK]
;
__local
volatile
int
tileIndex[WARPS_IN_BLOCK]
;
__local
int
reductionBuffer[LOCAL_MEMORY_SIZE]
;
const
unsigned
int
totalWarps
=
GLOBAL_SIZE
/
TILE_SIZE
;
const
unsigned
int
warp
=
GLOBAL_ID
/
TILE_SIZE
;
// global warpIndex
const
unsigned
int
local_warp
=
LOCAL_ID
/
TILE_SIZE
;
// local warpIndex
const
unsigned
int
tgx
=
LOCAL_ID
&
(
TILE_SIZE
-
1
);
// index within the warp
const
unsigned
int
tbx
=
LOCAL_ID
-
tgx
;
// block warpIndex
LOCAL
real4
localPos
[
LOCAL_MEMORY_SIZE
];
LOCAL
volatile
bool
anyInteraction
[
WARPS_IN_BLOCK
];
LOCAL
volatile
int
tileIndex
[
WARPS_IN_BLOCK
];
LOCAL
int
reductionBuffer
[
LOCAL_MEMORY_SIZE
];
const
unsigned
int
startTile
=
warp
*
NUM_TILES
/
totalWarps
;
const
unsigned
int
endTile
=
(
warp
+
1
)
*
NUM_TILES
/
totalWarps
;
...
...
@@ -191,7 +199,7 @@ __kernel void buildNeighborList(__global int* restrict rebuildNeighborList, __gl
const
int
rangeEnd
=
(
atomData
.
z
>>
16
)
&
0xFFFF
;
const
int
exclusions
=
atomData
.
w
;
real4
posq1
=
posq
[
atom1
];
localPos[
get_local_id
(
0
)
]
=
posq[atom2]
;
localPos
[
LOCAL_ID
]
=
posq
[
atom2
];
if
(
tgx
==
0
)
anyInteraction
[
local_warp
]
=
false
;
int
tj
=
tgx
;
...
...
@@ -199,10 +207,10 @@ __kernel void buildNeighborList(__global int* restrict rebuildNeighborList, __gl
SYNC_WARPS
;
for
(
int
j
=
rangeStart
;
j
<
rangeStop
&&
!
anyInteraction
[
local_warp
];
j
++
)
{
SYNC_WARPS
;
if
(
j
<
rangeEnd
)
{
if
(
j
<
rangeEnd
&&
tj
<
rangeEnd
)
{
bool
isExcluded
=
(((
exclusions
>>
tj
)
&
1
)
==
0
);
int
localIndex
=
tbx
+
tj
;
real
4
delta
=
(
real
4
)
(
localPos[localIndex].x
yz
-
posq1.xyz,
0
)
;
real
3
delta
=
make_
real
3
(
localPos
[
localIndex
].
x
-
posq1
.
x
,
localPos
[
localIndex
].
y
-
posq1
.
y
,
localPos
[
localIndex
].
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
delta
)
#endif
...
...
@@ -216,7 +224,7 @@ __kernel void buildNeighborList(__global int* restrict rebuildNeighborList, __gl
if
(
anyInteraction
[
local_warp
])
{
SYNC_WARPS
;
if
(
tgx
==
0
)
tileIndex[local_warp]
=
atomic_add
(
numGroupTiles,
1
)
;
tileIndex
[
local_warp
]
=
ATOMIC_ADD
(
numGroupTiles
,
1
);
SYNC_WARPS
;
filteredGroupData
[
TILE_SIZE
*
tileIndex
[
local_warp
]
+
tgx
]
=
atomData
;
}
...
...
platforms/c
uda
/src/kernels/gayBerne.c
u
→
platforms/c
ommon
/src/kernels/gayBerne.c
c
View file @
5a06df78
...
...
@@ -4,10 +4,10 @@
/**
* Calculate the ellipsoid coordinate frames and associated matrices.
*/
extern
"C"
__global__
void
computeEllipsoidFrames
(
int
numParticles
,
const
real4
*
__restrict__
posq
,
int2
*
const
__restrict__
axisParticleIndices
,
const
float4
*
__restrict__
sigParams
,
const
float4
*
__restrict__
scale
,
real
*
__restrict__
aMatrix
,
real
*
__restrict__
bMatrix
,
real
*
__restrict__
gMatrix
,
const
int
*
sortedParticles
)
{
for
(
int
sortedIndex
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
sortedIndex
<
numParticles
;
sortedIndex
+=
blockDim
.
x
*
gridDim
.
x
)
{
KERNEL
void
computeEllipsoidFrames
(
int
numParticles
,
GLOBAL
const
real4
*
RESTRICT
posq
,
GLOBAL
int2
*
const
RESTRICT
axisParticleIndices
,
GLOBAL
const
float4
*
RESTRICT
sigParams
,
GLOBAL
const
float4
*
RESTRICT
scale
,
GLOBAL
real
*
RESTRICT
aMatrix
,
GLOBAL
real
*
RESTRICT
bMatrix
,
GLOBAL
real
*
RESTRICT
gMatrix
,
GLOBAL
const
int
*
sortedParticles
)
{
for
(
int
sortedIndex
=
GLOBAL_ID
;
sortedIndex
<
numParticles
;
sortedIndex
+=
GLOBAL_SIZE
)
{
// Compute the local coordinate system of the ellipsoid;
int
originalIndex
=
sortedParticles
[
sortedIndex
];
...
...
@@ -36,9 +36,9 @@ extern "C" __global__ void computeEllipsoidFrames(int numParticles, const real4*
// Compute matrices we will need later.
real
(
*
a
)[
3
]
=
(
real
(
*
)[
3
])
(
aMatrix
+
sortedIndex
*
9
);
real
(
*
b
)[
3
]
=
(
real
(
*
)[
3
])
(
bMatrix
+
sortedIndex
*
9
);
real
(
*
g
)[
3
]
=
(
real
(
*
)[
3
])
(
gMatrix
+
sortedIndex
*
9
);
GLOBAL
real
(
*
a
)[
3
]
=
(
GLOBAL
real
(
*
)[
3
])
(
aMatrix
+
sortedIndex
*
9
);
GLOBAL
real
(
*
b
)[
3
]
=
(
GLOBAL
real
(
*
)[
3
])
(
bMatrix
+
sortedIndex
*
9
);
GLOBAL
real
(
*
g
)[
3
]
=
(
GLOBAL
real
(
*
)[
3
])
(
gMatrix
+
sortedIndex
*
9
);
a
[
0
][
0
]
=
xdir
.
x
;
a
[
0
][
1
]
=
xdir
.
y
;
a
[
0
][
2
]
=
xdir
.
z
;
...
...
@@ -62,10 +62,10 @@ extern "C" __global__ void computeEllipsoidFrames(int numParticles, const real4*
/**
* Find a bounding box for the atoms in each block.
*/
extern
"C"
__global__
void
findBlockBounds
(
int
numAtoms
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
const
int
*
sortedAtoms
,
const
real4
*
__restrict__
posq
,
real4
*
__restrict__
sortedPos
,
real4
*
__restrict__
blockCenter
,
real4
*
__restrict__
blockBoundingBox
,
int
*
__restrict__
neighborBlockCount
)
{
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
KERNEL
void
findBlockBounds
(
int
numAtoms
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
GLOBAL
const
int
*
sortedAtoms
,
GLOBAL
const
real4
*
RESTRICT
posq
,
GLOBAL
real4
*
RESTRICT
sortedPos
,
GLOBAL
real4
*
RESTRICT
blockCenter
,
GLOBAL
real4
*
RESTRICT
blockBoundingBox
,
GLOBAL
int
*
RESTRICT
neighborBlockCount
)
{
int
index
=
GLOBAL_ID
;
int
base
=
index
*
TILE_SIZE
;
while
(
base
<
numAtoms
)
{
real4
pos
=
posq
[
sortedAtoms
[
base
]];
...
...
@@ -89,19 +89,19 @@ extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize,
real4
blockSize
=
0.5
f
*
(
maxPos
-
minPos
);
blockBoundingBox
[
index
]
=
blockSize
;
blockCenter
[
index
]
=
0.5
f
*
(
maxPos
+
minPos
);
index
+=
blockDim
.
x
*
gridDim
.
x
;
index
+=
GLOBAL_SIZE
;
base
=
index
*
TILE_SIZE
;
}
if
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
==
0
)
if
(
GLOBAL_ID
==
0
)
*
neighborBlockCount
=
0
;
}
/**
* This is called by findNeighbors() to write a block to the neighbor list.
*/
__device__
void
storeNeighbors
(
int
atom1
,
int
*
neighborBuffer
,
int
numAtomsInBuffer
,
int
maxNeighborBlocks
,
int
*
__restrict__
neighbors
,
int
*
__restrict__
neighborIndex
,
int
*
__restrict__
neighborBlockCount
)
{
int
blockIndex
=
atomicAdd
(
neighborBlockCount
,
1
);
DEVICE
void
storeNeighbors
(
int
atom1
,
int
*
neighborBuffer
,
int
numAtomsInBuffer
,
int
maxNeighborBlocks
,
GLOBAL
int
*
RESTRICT
neighbors
,
GLOBAL
int
*
RESTRICT
neighborIndex
,
GLOBAL
int
*
RESTRICT
neighborBlockCount
)
{
int
blockIndex
=
ATOMIC_ADD
(
neighborBlockCount
,
1
);
if
(
blockIndex
>=
maxNeighborBlocks
)
return
;
// We don't have enough room for the neighbor list.
neighborIndex
[
blockIndex
]
=
atom1
;
...
...
@@ -115,12 +115,12 @@ __device__ void storeNeighbors(int atom1, int* neighborBuffer, int numAtomsInBuf
/**
* Build a list of neighbors for each atom.
*/
extern
"C"
__global__
void
findNeighbors
(
int
numAtoms
,
int
maxNeighborBlocks
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
real4
*
__restrict__
sortedPos
,
real4
*
__restrict__
blockCenter
,
real4
*
__restrict__
blockBoundingBox
,
int
*
__restrict__
neighbors
,
int
*
__restrict__
neighborIndex
,
int
*
__restrict__
neighborBlockCount
,
const
int
*
__restrict__
exclusions
,
const
int
*
__restrict__
exclusionStartIndex
)
{
KERNEL
void
findNeighbors
(
int
numAtoms
,
int
maxNeighborBlocks
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
GLOBAL
real4
*
RESTRICT
sortedPos
,
GLOBAL
real4
*
RESTRICT
blockCenter
,
GLOBAL
real4
*
RESTRICT
blockBoundingBox
,
GLOBAL
int
*
RESTRICT
neighbors
,
GLOBAL
int
*
RESTRICT
neighborIndex
,
GLOBAL
int
*
RESTRICT
neighborBlockCount
,
GLOBAL
const
int
*
RESTRICT
exclusions
,
GLOBAL
const
int
*
RESTRICT
exclusionStartIndex
)
{
const
int
numBlocks
=
(
numAtoms
+
TILE_SIZE
-
1
)
/
TILE_SIZE
;
int
neighborBuffer
[
NEIGHBOR_BLOCK_SIZE
];
for
(
int
atom1
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
atom1
<
numAtoms
;
atom1
+=
blockDim
.
x
*
gridDim
.
x
)
{
for
(
int
atom1
=
GLOBAL_ID
;
atom1
<
numAtoms
;
atom1
+=
GLOBAL_SIZE
)
{
int
nextExclusion
=
exclusionStartIndex
[
atom1
];
int
lastExclusion
=
exclusionStartIndex
[
atom1
+
1
];
real4
pos
=
sortedPos
[
atom1
];
...
...
@@ -178,8 +178,8 @@ typedef struct {
real
a
[
3
][
3
],
b
[
3
][
3
],
g
[
3
][
3
];
}
AtomData
;
__device__
void
loadAtomData
(
AtomData
*
data
,
int
sortedIndex
,
int
originalIndex
,
const
real4
*
__restrict__
pos
,
const
float4
*
__restrict__
sigParams
,
const
float2
*
__restrict__
epsParams
,
const
real
*
__restrict__
aMatrix
,
const
real
*
__restrict__
bMatrix
,
const
real
*
__restrict__
gMatrix
)
{
DEVICE
void
loadAtomData
(
AtomData
*
data
,
int
sortedIndex
,
int
originalIndex
,
GLOBAL
const
real4
*
RESTRICT
pos
,
GLOBAL
const
float4
*
RESTRICT
sigParams
,
GLOBAL
const
float2
*
RESTRICT
epsParams
,
GLOBAL
const
real
*
RESTRICT
aMatrix
,
GLOBAL
const
real
*
RESTRICT
bMatrix
,
GLOBAL
const
real
*
RESTRICT
gMatrix
)
{
data
->
sig
=
sigParams
[
originalIndex
];
data
->
eps
=
epsParams
[
originalIndex
];
data
->
pos
=
trimTo3
(
pos
[
sortedIndex
]);
...
...
@@ -192,19 +192,19 @@ __device__ void loadAtomData(AtomData* data, int sortedIndex, int originalIndex,
}
}
inline
__device__
real3
matrixVectorProduct
(
real
(
*
m
)[
3
],
real3
v
)
{
inline
DEVICE
real3
matrixVectorProduct
(
real
(
*
m
)[
3
],
real3
v
)
{
return
make_real3
(
m
[
0
][
0
]
*
v
.
x
+
m
[
0
][
1
]
*
v
.
y
+
m
[
0
][
2
]
*
v
.
z
,
m
[
1
][
0
]
*
v
.
x
+
m
[
1
][
1
]
*
v
.
y
+
m
[
1
][
2
]
*
v
.
z
,
m
[
2
][
0
]
*
v
.
x
+
m
[
2
][
1
]
*
v
.
y
+
m
[
2
][
2
]
*
v
.
z
);
}
inline
__device__
real3
vectorMatrixProduct
(
real3
v
,
real
(
*
m
)[
3
])
{
inline
DEVICE
real3
vectorMatrixProduct
(
real3
v
,
real
(
*
m
)[
3
])
{
return
make_real3
(
m
[
0
][
0
]
*
v
.
x
+
m
[
1
][
0
]
*
v
.
y
+
m
[
2
][
0
]
*
v
.
z
,
m
[
0
][
1
]
*
v
.
x
+
m
[
1
][
1
]
*
v
.
y
+
m
[
2
][
1
]
*
v
.
z
,
m
[
0
][
2
]
*
v
.
x
+
m
[
1
][
2
]
*
v
.
y
+
m
[
2
][
2
]
*
v
.
z
);
}
inline
__device__
void
matrixSum
(
real
(
*
result
)[
3
],
real
(
*
a
)[
3
],
real
(
*
b
)[
3
])
{
inline
DEVICE
void
matrixSum
(
real
(
*
result
)[
3
],
real
(
*
a
)[
3
],
real
(
*
b
)[
3
])
{
result
[
0
][
0
]
=
a
[
0
][
0
]
+
b
[
0
][
0
];
result
[
0
][
1
]
=
a
[
0
][
1
]
+
b
[
0
][
1
];
result
[
0
][
2
]
=
a
[
0
][
2
]
+
b
[
0
][
2
];
...
...
@@ -216,12 +216,12 @@ inline __device__ void matrixSum(real (*result)[3], real (*a)[3], real (*b)[3])
result
[
2
][
2
]
=
a
[
2
][
2
]
+
b
[
2
][
2
];
}
inline
__device__
real
determinant
(
real
(
*
m
)[
3
])
{
inline
DEVICE
real
determinant
(
real
(
*
m
)[
3
])
{
return
(
m
[
0
][
0
]
*
m
[
1
][
1
]
*
m
[
2
][
2
]
+
m
[
0
][
1
]
*
m
[
1
][
2
]
*
m
[
2
][
0
]
+
m
[
0
][
2
]
*
m
[
1
][
0
]
*
m
[
2
][
1
]
-
m
[
0
][
0
]
*
m
[
1
][
2
]
*
m
[
2
][
1
]
-
m
[
0
][
1
]
*
m
[
1
][
0
]
*
m
[
2
][
2
]
-
m
[
0
][
2
]
*
m
[
1
][
1
]
*
m
[
2
][
0
]);
}
inline
__device__
void
matrixInverse
(
real
(
*
result
)[
3
],
real
(
*
m
)[
3
])
{
inline
DEVICE
void
matrixInverse
(
real
(
*
result
)[
3
],
real
(
*
m
)[
3
])
{
real
invDet
=
RECIP
(
determinant
(
m
));
result
[
0
][
0
]
=
invDet
*
(
m
[
1
][
1
]
*
m
[
2
][
2
]
-
m
[
1
][
2
]
*
m
[
2
][
1
]);
result
[
1
][
0
]
=
-
invDet
*
(
m
[
1
][
0
]
*
m
[
2
][
2
]
-
m
[
1
][
2
]
*
m
[
2
][
0
]);
...
...
@@ -234,7 +234,7 @@ inline __device__ void matrixInverse(real (*result)[3], real (*m)[3]) {
result
[
2
][
2
]
=
invDet
*
(
m
[
0
][
0
]
*
m
[
1
][
1
]
-
m
[
0
][
1
]
*
m
[
1
][
0
]);
}
__device__
void
computeOneInteraction
(
AtomData
*
data1
,
AtomData
*
data2
,
real
sigma
,
real
epsilon
,
real3
dr
,
real
r2
,
real3
*
force1
,
real3
*
force2
,
real3
*
torque1
,
real3
*
torque2
,
mixed
*
totalEnergy
)
{
DEVICE
void
computeOneInteraction
(
AtomData
*
data1
,
AtomData
*
data2
,
real
sigma
,
real
epsilon
,
real3
dr
,
real
r2
,
real3
*
force1
,
real3
*
force2
,
real3
*
torque1
,
real3
*
torque2
,
mixed
*
totalEnergy
)
{
real
rInv
=
RSQRT
(
r2
);
real
r
=
r2
*
rInv
;
real3
drUnit
=
dr
*
rInv
;
...
...
@@ -335,25 +335,25 @@ __device__ void computeOneInteraction(AtomData* data1, AtomData* data2, real sig
/**
* Compute the interactions.
*/
extern
"C"
__global__
void
computeForce
(
unsigned
long
long
*
__restrict__
forceBuffers
,
unsigned
long
long
*
__restrict__
torqueBuffers
,
int
numAtoms
,
int
numExceptions
,
mixed
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
pos
,
const
float4
*
__restrict__
sigParams
,
const
float2
*
__restrict__
epsParams
,
const
int
*
__restrict__
sortedAtoms
,
const
real
*
__restrict__
aMatrix
,
const
real
*
__restrict__
bMatrix
,
const
real
*
__restrict__
gMatrix
,
const
int
*
__restrict__
exclusions
,
const
int
*
__restrict__
exclusionStartIndex
,
const
int4
*
__restrict__
exceptionParticles
,
const
float2
*
__restrict__
exceptionParams
KERNEL
void
computeForce
(
GLOBAL
mm_ulong
*
RESTRICT
forceBuffers
,
GLOBAL
mm_ulong
*
RESTRICT
torqueBuffers
,
int
numAtoms
,
int
numExceptions
,
GLOBAL
mixed
*
RESTRICT
energyBuffer
,
GLOBAL
const
real4
*
RESTRICT
pos
,
GLOBAL
const
float4
*
RESTRICT
sigParams
,
GLOBAL
const
float2
*
RESTRICT
epsParams
,
GLOBAL
const
int
*
RESTRICT
sortedAtoms
,
GLOBAL
const
real
*
RESTRICT
aMatrix
,
GLOBAL
const
real
*
RESTRICT
bMatrix
,
GLOBAL
const
real
*
RESTRICT
gMatrix
,
GLOBAL
const
int
*
RESTRICT
exclusions
,
GLOBAL
const
int
*
RESTRICT
exclusionStartIndex
,
GLOBAL
const
int4
*
RESTRICT
exceptionParticles
,
GLOBAL
const
float2
*
RESTRICT
exceptionParams
#ifdef USE_CUTOFF
,
int
maxNeighborBlocks
,
int
*
__restrict__
neighbors
,
int
*
__restrict__
neighborIndex
,
int
*
__restrict__
neighborBlockCount
,
,
int
maxNeighborBlocks
,
GLOBAL
int
*
RESTRICT
neighbors
,
GLOBAL
int
*
RESTRICT
neighborIndex
,
GLOBAL
int
*
RESTRICT
neighborBlockCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
#endif
)
{
const
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
TILE_SIZE
;
const
unsigned
int
warp
=
GLOBAL_ID
/
TILE_SIZE
;
mixed
energy
=
0
;
#ifdef USE_CUTOFF
const
int
numBlocks
=
*
neighborBlockCount
;
if
(
numBlocks
>
maxNeighborBlocks
)
return
;
// There wasn't enough memory for the neighbor list.
for
(
int
block
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
block
<
numBlocks
;
block
+=
blockDim
.
x
*
gridDim
.
x
)
{
for
(
int
block
=
GLOBAL_ID
;
block
<
numBlocks
;
block
+=
GLOBAL_SIZE
)
{
// Load parameters for atom1.
int
atom1
=
neighborIndex
[
block
];
...
...
@@ -384,22 +384,22 @@ extern "C" __global__ void computeForce(
real
sigma
=
data1
.
sig
.
x
+
data2
.
sig
.
x
;
real
epsilon
=
data1
.
eps
.
x
*
data2
.
eps
.
x
;
computeOneInteraction
(
&
data1
,
&
data2
,
sigma
,
epsilon
,
delta
,
r2
,
&
force1
,
&
force2
,
&
torque1
,
&
torque2
,
&
energy
);
atomicAdd
(
&
forceBuffers
[
index2
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force2
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
index2
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force2
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
index2
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force2
.
z
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
index2
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
torque2
.
x
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
index2
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
torque2
.
y
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
index2
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
torque2
.
z
*
0x100000000
)));
ATOMIC_ADD
(
&
forceBuffers
[
index2
],
(
mm_ulong
)
((
mm_
long
)
(
force2
.
x
*
0x100000000
)));
ATOMIC_ADD
(
&
forceBuffers
[
index2
+
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_
long
)
(
force2
.
y
*
0x100000000
)));
ATOMIC_ADD
(
&
forceBuffers
[
index2
+
2
*
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_
long
)
(
force2
.
z
*
0x100000000
)));
ATOMIC_ADD
(
&
torqueBuffers
[
index2
],
(
mm_ulong
)
((
mm_
long
)
(
torque2
.
x
*
0x100000000
)));
ATOMIC_ADD
(
&
torqueBuffers
[
index2
+
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_
long
)
(
torque2
.
y
*
0x100000000
)));
ATOMIC_ADD
(
&
torqueBuffers
[
index2
+
2
*
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_
long
)
(
torque2
.
z
*
0x100000000
)));
}
atomicAdd
(
&
forceBuffers
[
index1
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force1
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
index1
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force1
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
index1
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force1
.
z
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
index1
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
torque1
.
x
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
index1
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
torque1
.
y
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
index1
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
torque1
.
z
*
0x100000000
)));
ATOMIC_ADD
(
&
forceBuffers
[
index1
],
(
mm_ulong
)
((
mm_
long
)
(
force1
.
x
*
0x100000000
)));
ATOMIC_ADD
(
&
forceBuffers
[
index1
+
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_
long
)
(
force1
.
y
*
0x100000000
)));
ATOMIC_ADD
(
&
forceBuffers
[
index1
+
2
*
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_
long
)
(
force1
.
z
*
0x100000000
)));
ATOMIC_ADD
(
&
torqueBuffers
[
index1
],
(
mm_ulong
)
((
mm_
long
)
(
torque1
.
x
*
0x100000000
)));
ATOMIC_ADD
(
&
torqueBuffers
[
index1
+
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_
long
)
(
torque1
.
y
*
0x100000000
)));
ATOMIC_ADD
(
&
torqueBuffers
[
index1
+
2
*
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_
long
)
(
torque1
.
z
*
0x100000000
)));
}
#else
for
(
int
atom1
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
atom1
<
numAtoms
;
atom1
+=
blockDim
.
x
*
gridDim
.
x
)
{
for
(
int
atom1
=
GLOBAL_ID
;
atom1
<
numAtoms
;
atom1
+=
GLOBAL_SIZE
)
{
// Load parameters for atom1.
int
index1
=
sortedAtoms
[
atom1
];
...
...
@@ -432,25 +432,25 @@ extern "C" __global__ void computeForce(
real
sigma
=
data1
.
sig
.
x
+
data2
.
sig
.
x
;
real
epsilon
=
data1
.
eps
.
x
*
data2
.
eps
.
x
;
computeOneInteraction
(
&
data1
,
&
data2
,
sigma
,
epsilon
,
delta
,
r2
,
&
force1
,
&
force2
,
&
torque1
,
&
torque2
,
&
energy
);
atomicAdd
(
&
forceBuffers
[
index2
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force2
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
index2
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force2
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
index2
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force2
.
z
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
index2
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
torque2
.
x
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
index2
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
torque2
.
y
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
index2
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
torque2
.
z
*
0x100000000
)));
ATOMIC_ADD
(
&
forceBuffers
[
index2
],
(
mm_ulong
)
((
mm_
long
)
(
force2
.
x
*
0x100000000
)));
ATOMIC_ADD
(
&
forceBuffers
[
index2
+
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_
long
)
(
force2
.
y
*
0x100000000
)));
ATOMIC_ADD
(
&
forceBuffers
[
index2
+
2
*
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_
long
)
(
force2
.
z
*
0x100000000
)));
ATOMIC_ADD
(
&
torqueBuffers
[
index2
],
(
mm_ulong
)
((
mm_
long
)
(
torque2
.
x
*
0x100000000
)));
ATOMIC_ADD
(
&
torqueBuffers
[
index2
+
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_
long
)
(
torque2
.
y
*
0x100000000
)));
ATOMIC_ADD
(
&
torqueBuffers
[
index2
+
2
*
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_
long
)
(
torque2
.
z
*
0x100000000
)));
}
atomicAdd
(
&
forceBuffers
[
index1
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force1
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
index1
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force1
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
index1
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force1
.
z
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
index1
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
torque1
.
x
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
index1
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
torque1
.
y
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
index1
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
torque1
.
z
*
0x100000000
)));
ATOMIC_ADD
(
&
forceBuffers
[
index1
],
(
mm_ulong
)
((
mm_
long
)
(
force1
.
x
*
0x100000000
)));
ATOMIC_ADD
(
&
forceBuffers
[
index1
+
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_
long
)
(
force1
.
y
*
0x100000000
)));
ATOMIC_ADD
(
&
forceBuffers
[
index1
+
2
*
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_
long
)
(
force1
.
z
*
0x100000000
)));
ATOMIC_ADD
(
&
torqueBuffers
[
index1
],
(
mm_ulong
)
((
mm_
long
)
(
torque1
.
x
*
0x100000000
)));
ATOMIC_ADD
(
&
torqueBuffers
[
index1
+
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_
long
)
(
torque1
.
y
*
0x100000000
)));
ATOMIC_ADD
(
&
torqueBuffers
[
index1
+
2
*
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_
long
)
(
torque1
.
z
*
0x100000000
)));
}
#endif
// Now compute exceptions.
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
numExceptions
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
for
(
int
index
=
GLOBAL_ID
;
index
<
numExceptions
;
index
+=
GLOBAL_SIZE
)
{
int4
atomIndices
=
exceptionParticles
[
index
];
float2
params
=
exceptionParams
[
index
];
int
index1
=
atomIndices
.
x
,
index2
=
atomIndices
.
y
;
...
...
@@ -466,34 +466,34 @@ extern "C" __global__ void computeForce(
if
(
r2
<
CUTOFF_SQUARED
)
{
#endif
computeOneInteraction
(
&
data1
,
&
data2
,
params
.
x
,
params
.
y
,
delta
,
r2
,
&
force1
,
&
force2
,
&
torque1
,
&
torque2
,
&
energy
);
atomicAdd
(
&
forceBuffers
[
index1
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force1
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
index1
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force1
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
index1
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force1
.
z
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
index2
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force2
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
index2
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force2
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
index2
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force2
.
z
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
index1
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
torque1
.
x
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
index1
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
torque1
.
y
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
index1
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
torque1
.
z
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
index2
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
torque2
.
x
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
index2
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
torque2
.
y
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
index2
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
torque2
.
z
*
0x100000000
)));
ATOMIC_ADD
(
&
forceBuffers
[
index1
],
(
mm_ulong
)
((
mm_
long
)
(
force1
.
x
*
0x100000000
)));
ATOMIC_ADD
(
&
forceBuffers
[
index1
+
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_
long
)
(
force1
.
y
*
0x100000000
)));
ATOMIC_ADD
(
&
forceBuffers
[
index1
+
2
*
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_
long
)
(
force1
.
z
*
0x100000000
)));
ATOMIC_ADD
(
&
forceBuffers
[
index2
],
(
mm_ulong
)
((
mm_
long
)
(
force2
.
x
*
0x100000000
)));
ATOMIC_ADD
(
&
forceBuffers
[
index2
+
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_
long
)
(
force2
.
y
*
0x100000000
)));
ATOMIC_ADD
(
&
forceBuffers
[
index2
+
2
*
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_
long
)
(
force2
.
z
*
0x100000000
)));
ATOMIC_ADD
(
&
torqueBuffers
[
index1
],
(
mm_ulong
)
((
mm_
long
)
(
torque1
.
x
*
0x100000000
)));
ATOMIC_ADD
(
&
torqueBuffers
[
index1
+
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_
long
)
(
torque1
.
y
*
0x100000000
)));
ATOMIC_ADD
(
&
torqueBuffers
[
index1
+
2
*
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_
long
)
(
torque1
.
z
*
0x100000000
)));
ATOMIC_ADD
(
&
torqueBuffers
[
index2
],
(
mm_ulong
)
((
mm_
long
)
(
torque2
.
x
*
0x100000000
)));
ATOMIC_ADD
(
&
torqueBuffers
[
index2
+
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_
long
)
(
torque2
.
y
*
0x100000000
)));
ATOMIC_ADD
(
&
torqueBuffers
[
index2
+
2
*
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_
long
)
(
torque2
.
z
*
0x100000000
)));
#ifdef USE_CUTOFF
}
#endif
}
energyBuffer
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
energy
;
energyBuffer
[
GLOBAL_ID
]
+=
energy
;
}
/**
* Convert the torques to forces on the connected particles.
*/
extern
"C"
__global__
void
applyTorques
(
unsigned
long
long
*
__restrict__
forceBuffers
,
long
long
*
__restrict__
torqueBuffers
,
int
numParticles
,
const
real4
*
__restrict__
posq
,
int2
*
const
__restrict__
axisParticleIndices
,
const
int
*
sortedParticles
)
{
const
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
TILE_SIZE
;
for
(
int
sortedIndex
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
sortedIndex
<
numParticles
;
sortedIndex
+=
blockDim
.
x
*
gridDim
.
x
)
{
KERNEL
void
applyTorques
(
GLOBAL
mm_ulong
*
RESTRICT
forceBuffers
,
GLOBAL
const
mm_long
*
RESTRICT
torqueBuffers
,
int
numParticles
,
GLOBAL
const
real4
*
RESTRICT
posq
,
GLOBAL
int2
*
const
RESTRICT
axisParticleIndices
,
GLOBAL
const
int
*
sortedParticles
)
{
const
unsigned
int
warp
=
GLOBAL_ID
/
TILE_SIZE
;
for
(
int
sortedIndex
=
GLOBAL_ID
;
sortedIndex
<
numParticles
;
sortedIndex
+=
GLOBAL_SIZE
)
{
int
originalIndex
=
sortedParticles
[
sortedIndex
];
real3
pos
=
trimTo3
(
posq
[
originalIndex
]);
int2
axisParticles
=
axisParticleIndices
[
originalIndex
];
...
...
@@ -522,16 +522,16 @@ extern "C" __global__ void applyTorques(
yforce
+=
f
;
force
-=
f
;
}
atomicAdd
(
&
forceBuffers
[
originalIndex
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
originalIndex
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
originalIndex
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
z
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
axisParticles
.
x
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
xforce
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
axisParticles
.
x
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
xforce
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
axisParticles
.
x
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
xforce
.
z
*
0x100000000
)));
ATOMIC_ADD
(
&
forceBuffers
[
originalIndex
],
(
mm_ulong
)
((
mm_
long
)
(
force
.
x
*
0x100000000
)));
ATOMIC_ADD
(
&
forceBuffers
[
originalIndex
+
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_
long
)
(
force
.
y
*
0x100000000
)));
ATOMIC_ADD
(
&
forceBuffers
[
originalIndex
+
2
*
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_
long
)
(
force
.
z
*
0x100000000
)));
ATOMIC_ADD
(
&
forceBuffers
[
axisParticles
.
x
],
(
mm_ulong
)
((
mm_
long
)
(
xforce
.
x
*
0x100000000
)));
ATOMIC_ADD
(
&
forceBuffers
[
axisParticles
.
x
+
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_
long
)
(
xforce
.
y
*
0x100000000
)));
ATOMIC_ADD
(
&
forceBuffers
[
axisParticles
.
x
+
2
*
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_
long
)
(
xforce
.
z
*
0x100000000
)));
if
(
axisParticles
.
y
!=
-
1
)
{
atomicAdd
(
&
forceBuffers
[
axisParticles
.
y
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
yforce
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
axisParticles
.
y
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
yforce
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
axisParticles
.
y
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
yforce
.
z
*
0x100000000
)));
ATOMIC_ADD
(
&
forceBuffers
[
axisParticles
.
y
],
(
mm_ulong
)
((
mm_
long
)
(
yforce
.
x
*
0x100000000
)));
ATOMIC_ADD
(
&
forceBuffers
[
axisParticles
.
y
+
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_
long
)
(
yforce
.
y
*
0x100000000
)));
ATOMIC_ADD
(
&
forceBuffers
[
axisParticles
.
y
+
2
*
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_
long
)
(
yforce
.
z
*
0x100000000
)));
}
}
}
...
...
platforms/
opencl
/src/kernels/gbsaObc.c
l
→
platforms/
common
/src/kernels/gbsaObc.c
c
View file @
5a06df78
#
ifdef
SUPPORTS_64_BIT_ATOMICS
#
pragma
OPENCL
EXTENSION
cl_khr_int64_base_atomics
:
enable
#
endif
#define WARPS_PER_GROUP (FORCE_WORK_GROUP_SIZE/TILE_SIZE)
typedef
struct
{
...
...
@@ -13,26 +10,26 @@ typedef struct {
/**
* Compute the Born sum.
*/
__kernel
void
computeBornSum
(
KERNEL
void
computeBornSum
(
#ifdef SUPPORTS_64_BIT_ATOMICS
__global
long*
restrict
global_bornSum,
GLOBAL
mm_ulong
*
RESTRICT
global_bornSum
,
#else
__global
real*
restrict
global_bornSum,
GLOBAL
real
*
RESTRICT
global_bornSum
,
#endif
__global
const
real4*
restrict
posq,
__global
const
real*
restrict
charge,
__global
const
float2*
restrict
global_params,
GLOBAL
const
real4
*
RESTRICT
posq
,
GLOBAL
const
real
*
RESTRICT
charge
,
GLOBAL
const
float2
*
RESTRICT
global_params
,
#ifdef USE_CUTOFF
__global
const
int*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
real4
periodicBoxSize,
real4
invPeriodicBoxSize,
real4
periodicBoxVecX,
real4
periodicBoxVecY,
real4
periodicBoxVecZ,
unsigned
int
maxTiles,
__global
const
real4*
restrict
blockCenter,
__global
const
real4*
restrict
blockSize,
__global
const
int*
restrict
interactingAtoms,
GLOBAL
const
int
*
RESTRICT
tiles
,
GLOBAL
const
unsigned
int
*
RESTRICT
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
unsigned
int
maxTiles
,
GLOBAL
const
real4
*
RESTRICT
blockCenter
,
GLOBAL
const
real4
*
RESTRICT
blockSize
,
GLOBAL
const
int
*
RESTRICT
interactingAtoms
,
#else
unsigned
int
numTiles
,
#endif
__global
const
ushort2*
exclusionTiles
)
{
const
unsigned
int
totalWarps
=
get_global_size
(
0
)
/TILE_SIZE
;
const
unsigned
int
warp
=
get_global_id
(
0
)
/TILE_SIZE
;
const
unsigned
int
tgx
=
get_local_id
(
0
)
&
(
TILE_SIZE-1
)
;
const
unsigned
int
tbx
=
get_local_id
(
0
)
-
tgx
;
__local
AtomData1
localData[FORCE_WORK_GROUP_SIZE]
;
GLOBAL
const
ushort2
*
RESTRICT
exclusionTiles
)
{
const
unsigned
int
totalWarps
=
GLOBAL_SIZE
/
TILE_SIZE
;
const
unsigned
int
warp
=
GLOBAL_ID
/
TILE_SIZE
;
const
unsigned
int
tgx
=
LOCAL_ID
&
(
TILE_SIZE
-
1
);
const
unsigned
int
tbx
=
LOCAL_ID
-
tgx
;
LOCAL
AtomData1
localData
[
FORCE_WORK_GROUP_SIZE
];
// First loop: process tiles that contain exclusions.
...
...
@@ -42,7 +39,7 @@ __kernel void computeBornSum(
const
ushort2
tileIndices
=
exclusionTiles
[
pos
];
const
unsigned
int
x
=
tileIndices
.
x
;
const
unsigned
int
y
=
tileIndices
.
y
;
real
bornSum
=
0
.0f
;
real
bornSum
=
0
;
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
real4
posq1
=
posq
[
atom1
];
real
charge1
=
charge
[
atom1
];
...
...
@@ -50,15 +47,15 @@ __kernel void computeBornSum(
if
(
x
==
y
)
{
// This tile is on the diagonal.
localData[
get_local_id
(
0
)
].x
=
posq1.x
;
localData[
get_local_id
(
0
)
].y
=
posq1.y
;
localData[
get_local_id
(
0
)
].z
=
posq1.z
;
localData[
get_local_id
(
0
)
].q
=
charge1
;
localData[
get_local_id
(
0
)
].radius
=
params1.x
;
localData[
get_local_id
(
0
)
].scaledRadius
=
params1.y
;
localData
[
LOCAL_ID
].
x
=
posq1
.
x
;
localData
[
LOCAL_ID
].
y
=
posq1
.
y
;
localData
[
LOCAL_ID
].
z
=
posq1
.
z
;
localData
[
LOCAL_ID
].
q
=
charge1
;
localData
[
LOCAL_ID
].
radius
=
params1
.
x
;
localData
[
LOCAL_ID
].
scaledRadius
=
params1
.
y
;
SYNC_WARPS
;
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
real
4
delta
=
(
real
4
)
(
localData[tbx+j].x-posq1.x,
localData[tbx+j].y-posq1.y,
localData[tbx+j].z-posq1.z
,
0
)
;
real
3
delta
=
make_
real
3
(
localData
[
tbx
+
j
].
x
-
posq1
.
x
,
localData
[
tbx
+
j
].
y
-
posq1
.
y
,
localData
[
tbx
+
j
].
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
delta
)
#endif
...
...
@@ -70,7 +67,7 @@ __kernel void computeBornSum(
#endif
real
invR
=
RSQRT
(
r2
);
real
r
=
r2
*
invR
;
float2
params2
=
(
float2
)
(
localData[tbx+j].radius,
localData[tbx+j].scaledRadius
)
;
float2
params2
=
make_
float2
(
localData
[
tbx
+
j
].
radius
,
localData
[
tbx
+
j
].
scaledRadius
);
real
rScaledRadiusJ
=
r
+
params2
.
y
;
if
((
j
!=
tgx
)
&&
(
params1
.
x
<
rScaledRadiusJ
))
{
real
l_ij
=
RECIP
(
max
((
real
)
params1
.
x
,
fabs
(
r
-
params2
.
y
)));
...
...
@@ -91,21 +88,21 @@ __kernel void computeBornSum(
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
real4
tempPosq
=
posq
[
j
];
localData[
get_local_id
(
0
)
].x
=
tempPosq.x
;
localData[
get_local_id
(
0
)
].y
=
tempPosq.y
;
localData[
get_local_id
(
0
)
].z
=
tempPosq.z
;
localData[
get_local_id
(
0
)
].q
=
charge[j]
;
localData
[
LOCAL_ID
].
x
=
tempPosq
.
x
;
localData
[
LOCAL_ID
].
y
=
tempPosq
.
y
;
localData
[
LOCAL_ID
].
z
=
tempPosq
.
z
;
localData
[
LOCAL_ID
].
q
=
charge
[
j
];
float2
tempParams
=
global_params
[
j
];
localData[
get_local_id
(
0
)
].radius
=
tempParams.x
;
localData[
get_local_id
(
0
)
].scaledRadius
=
tempParams.y
;
localData[
get_local_id
(
0
)
].bornSum
=
0.0f
;
localData
[
LOCAL_ID
].
radius
=
tempParams
.
x
;
localData
[
LOCAL_ID
].
scaledRadius
=
tempParams
.
y
;
localData
[
LOCAL_ID
].
bornSum
=
0.0
f
;
SYNC_WARPS
;
// Compute the full set of interactions in this tile.
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
real
4
delta
=
(
real
4
)
(
localData[tbx+tj].x-posq1.x,
localData[tbx+tj].y-posq1.y,
localData[tbx+tj].z-posq1.z
,
0
)
;
real
3
delta
=
make_
real
3
(
localData
[
tbx
+
tj
].
x
-
posq1
.
x
,
localData
[
tbx
+
tj
].
y
-
posq1
.
y
,
localData
[
tbx
+
tj
].
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
delta
)
#endif
...
...
@@ -117,7 +114,7 @@ __kernel void computeBornSum(
#endif
real
invR
=
RSQRT
(
r2
);
real
r
=
r2
*
invR
;
float2
params2
=
(
float2
)
(
localData[tbx+tj].radius,
localData[tbx+tj].scaledRadius
)
;
float2
params2
=
make_
float2
(
localData
[
tbx
+
tj
].
radius
,
localData
[
tbx
+
tj
].
scaledRadius
);
real
rScaledRadiusJ
=
r
+
params2
.
y
;
if
(
params1
.
x
<
rScaledRadiusJ
)
{
real
l_ij
=
RECIP
(
max
((
real
)
params1
.
x
,
fabs
(
r
-
params2
.
y
)));
...
...
@@ -151,17 +148,17 @@ __kernel void computeBornSum(
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
atom_add
(
&global_bornSum[offset],
(
long
)
(
bornSum*0x100000000
))
;
ATOMIC_ADD
(
&
global_bornSum
[
offset
],
(
mm_ulong
)
((
mm_
long
)
(
bornSum
*
0x100000000
))
)
;
if
(
x
!=
y
)
{
offset
=
y
*
TILE_SIZE
+
tgx
;
atom_add
(
&global_bornSum[offset],
(
long
)
(
localData[
get_local_id
(
0
)
].bornSum*0x100000000
))
;
ATOMIC_ADD
(
&
global_bornSum
[
offset
],
(
mm_ulong
)
((
mm_
long
)
(
localData
[
LOCAL_ID
].
bornSum
*
0x100000000
))
)
;
}
#else
unsigned
int
offset1
=
x
*
TILE_SIZE
+
tgx
+
warp
*
PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
*
TILE_SIZE
+
tgx
+
warp
*
PADDED_NUM_ATOMS
;
global_bornSum
[
offset1
]
+=
bornSum
;
if
(
x
!=
y
)
global_bornSum[offset2]
+=
localData[
get_local_id
(
0
)
].bornSum
;
global_bornSum
[
offset2
]
+=
localData
[
LOCAL_ID
].
bornSum
;
#endif
}
...
...
@@ -172,17 +169,17 @@ __kernel void computeBornSum(
unsigned
int
numTiles
=
interactionCount
[
0
];
if
(
numTiles
>
maxTiles
)
return
;
// There wasn't enough memory for the neighbor list.
int
pos
=
(
int
)
(
warp*
(
numTiles
>
maxTiles
?
NUM_BLOCKS*
((
long
)
NUM_BLOCKS+1
)
/2
:
(
long
)
numTiles
)
/totalWarps
)
;
int
end
=
(
int
)
((
warp+1
)
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS*
((
long
)
NUM_BLOCKS+1
)
/2
:
(
long
)
numTiles
)
/totalWarps
)
;
int
pos
=
(
int
)
(
warp
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
((
mm_
long
)
NUM_BLOCKS
+
1
)
/
2
:
(
mm_
long
)
numTiles
)
/
totalWarps
);
int
end
=
(
int
)
((
warp
+
1
)
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
((
mm_
long
)
NUM_BLOCKS
+
1
)
/
2
:
(
mm_
long
)
numTiles
)
/
totalWarps
);
#else
int
pos
=
(
int
)
(
warp*
(
long
)
numTiles/totalWarps
)
;
int
end
=
(
int
)
((
warp+1
)
*
(
long
)
numTiles/totalWarps
)
;
int
pos
=
(
int
)
(
warp
*
(
mm_
long
)
numTiles
/
totalWarps
);
int
end
=
(
int
)
((
warp
+
1
)
*
(
mm_
long
)
numTiles
/
totalWarps
);
#endif
int
skipBase
=
0
;
int
currentSkipIndex
=
tbx
;
__local
int
atomIndices[FORCE_WORK_GROUP_SIZE]
;
__local
volatile
int
skipTiles[FORCE_WORK_GROUP_SIZE]
;
skipTiles[
get_local_id
(
0
)
]
=
-1
;
LOCAL
int
atomIndices
[
FORCE_WORK_GROUP_SIZE
];
LOCAL
volatile
int
skipTiles
[
FORCE_WORK_GROUP_SIZE
];
skipTiles
[
LOCAL_ID
]
=
-
1
;
while
(
pos
<
end
)
{
real
bornSum
=
0
;
...
...
@@ -213,10 +210,10 @@ __kernel void computeBornSum(
SYNC_WARPS
;
if
(
skipBase
+
tgx
<
NUM_TILES_WITH_EXCLUSIONS
)
{
ushort2
tile
=
exclusionTiles
[
skipBase
+
tgx
];
skipTiles[
get_local_id(0)
] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
skipTiles
[
LOCAL_ID
]
=
tile
.
x
+
tile
.
y
*
NUM_BLOCKS
-
tile
.
y
*
(
tile
.
y
+
1
)
/
2
;
}
else
skipTiles[
get_local_id(0)
] = end;
skipTiles
[
LOCAL_ID
]
=
end
;
skipBase
+=
TILE_SIZE
;
currentSkipIndex
=
tbx
;
SYNC_WARPS
;
...
...
@@ -238,17 +235,17 @@ __kernel void computeBornSum(
#else
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
#endif
atomIndices[
get_local_id(0)
] = j;
atomIndices
[
LOCAL_ID
]
=
j
;
if
(
j
<
PADDED_NUM_ATOMS
)
{
real4
tempPosq
=
posq
[
j
];
localData[
get_local_id(0)
].x = tempPosq.x;
localData[
get_local_id(0)
].y = tempPosq.y;
localData[
get_local_id(0)
].z = tempPosq.z;
localData[
get_local_id(0)
].q = charge[j];
localData
[
LOCAL_ID
].
x
=
tempPosq
.
x
;
localData
[
LOCAL_ID
].
y
=
tempPosq
.
y
;
localData
[
LOCAL_ID
].
z
=
tempPosq
.
z
;
localData
[
LOCAL_ID
].
q
=
charge
[
j
];
float2
tempParams
=
global_params
[
j
];
localData[
get_local_id(0)
].radius = tempParams.x;
localData[
get_local_id(0)
].scaledRadius = tempParams.y;
localData[
get_local_id(0)
].bornSum = 0.0f;
localData
[
LOCAL_ID
].
radius
=
tempParams
.
x
;
localData
[
LOCAL_ID
].
scaledRadius
=
tempParams
.
y
;
localData
[
LOCAL_ID
].
bornSum
=
0.0
f
;
}
SYNC_WARPS
;
#ifdef USE_PERIODIC
...
...
@@ -258,17 +255,17 @@ __kernel void computeBornSum(
real4
blockCenterX
=
blockCenter
[
x
];
APPLY_PERIODIC_TO_POS_WITH_CENTER
(
posq1
,
blockCenterX
)
APPLY_PERIODIC_TO_POS_WITH_CENTER(localData[
get_local_id(0)
], blockCenterX)
APPLY_PERIODIC_TO_POS_WITH_CENTER
(
localData
[
LOCAL_ID
],
blockCenterX
)
SYNC_WARPS
;
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
real
4
delta =
(
real
4)
(localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z
, 0
);
real
3
delta
=
make_
real
3
(
localData
[
tbx
+
tj
].
x
-
posq1
.
x
,
localData
[
tbx
+
tj
].
y
-
posq1
.
y
,
localData
[
tbx
+
tj
].
z
-
posq1
.
z
);
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
int
atom2
=
atomIndices
[
tbx
+
tj
];
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
&&
r2
<
CUTOFF_SQUARED
)
{
real
invR
=
RSQRT
(
r2
);
real
r
=
r2
*
invR
;
float2 params2 =
(
float2
)
(localData[tbx+tj].radius, localData[tbx+tj].scaledRadius);
float2
params2
=
make_
float2
(
localData
[
tbx
+
tj
].
radius
,
localData
[
tbx
+
tj
].
scaledRadius
);
real
rScaledRadiusJ
=
r
+
params2
.
y
;
if
(
params1
.
x
<
rScaledRadiusJ
)
{
real
l_ij
=
RECIP
(
max
((
real
)
params1
.
x
,
fabs
(
r
-
params2
.
y
)));
...
...
@@ -304,7 +301,7 @@ __kernel void computeBornSum(
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
real
4
delta =
(
real
4)
(localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z
, 0
);
real
3
delta
=
make_
real
3
(
localData
[
tbx
+
tj
].
x
-
posq1
.
x
,
localData
[
tbx
+
tj
].
y
-
posq1
.
y
,
localData
[
tbx
+
tj
].
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
delta
)
#endif
...
...
@@ -317,7 +314,7 @@ __kernel void computeBornSum(
#endif
real
invR
=
RSQRT
(
r2
);
real
r
=
r2
*
invR
;
float2 params2 =
(
float2
)
(localData[tbx+tj].radius, localData[tbx+tj].scaledRadius);
float2
params2
=
make_
float2
(
localData
[
tbx
+
tj
].
radius
,
localData
[
tbx
+
tj
].
scaledRadius
);
real
rScaledRadiusJ
=
r
+
params2
.
y
;
if
(
params1
.
x
<
rScaledRadiusJ
)
{
real
l_ij
=
RECIP
(
max
((
real
)
params1
.
x
,
fabs
(
r
-
params2
.
y
)));
...
...
@@ -350,20 +347,20 @@ __kernel void computeBornSum(
// Write results.
#ifdef USE_CUTOFF
unsigned int atom2 = atomIndices[
get_local_id(0)
];
unsigned
int
atom2
=
atomIndices
[
LOCAL_ID
];
#else
unsigned
int
atom2
=
y
*
TILE_SIZE
+
tgx
;
#endif
#ifdef SUPPORTS_64_BIT_ATOMICS
atom_add
(&global_bornSum[atom1], (long) (bornSum*0x100000000));
ATOMIC_ADD
(
&
global_bornSum
[
atom1
],
(
mm_ulong
)
((
mm_
long
)
(
bornSum
*
0x100000000
))
)
;
if
(
atom2
<
PADDED_NUM_ATOMS
)
atom_add
(&global_bornSum[atom2], (long) (localData[
get_local_id(0)
].bornSum*0x100000000));
ATOMIC_ADD
(
&
global_bornSum
[
atom2
],
(
mm_ulong
)
((
mm_
long
)
(
localData
[
LOCAL_ID
].
bornSum
*
0x100000000
))
)
;
#else
unsigned
int
offset1
=
atom1
+
warp
*
PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
atom2
+
warp
*
PADDED_NUM_ATOMS
;
global_bornSum
[
offset1
]
+=
bornSum
;
if
(
atom2
<
PADDED_NUM_ATOMS
)
global_bornSum[offset2] += localData[
get_local_id(0)
].bornSum;
global_bornSum
[
offset2
]
+=
localData
[
LOCAL_ID
].
bornSum
;
#endif
}
pos
++
;
...
...
@@ -381,28 +378,28 @@ typedef struct {
* First part of computing the GBSA interaction.
*/
__kernel
void computeGBSAForce1(
KERNEL
void
computeGBSAForce1
(
#ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict forceBuffers, __global long* restrict
global_bornForce,
GLOBAL
mm_ulong
*
RESTRICT
forceBuffers
,
GLOBAL
mm_ulong
*
RESTRICT
global_bornForce
,
#else
__global real4* restrict forceBuffers, __global real* restrict
global_bornForce,
GLOBAL
real4
*
RESTRICT
forceBuffers
,
GLOBAL
real
*
RESTRICT
global_bornForce
,
#endif
__global mixed* restrict
energyBuffer,
__global
const real4*
restrict posq, __global
const real*
restrict
charge,
__global
const real*
restrict
global_bornRadii, int needEnergy,
GLOBAL
mixed
*
RESTRICT
energyBuffer
,
GLOBAL
const
real4
*
RESTRICT
posq
,
GLOBAL
const
real
*
RESTRICT
charge
,
GLOBAL
const
real
*
RESTRICT
global_bornRadii
,
int
needEnergy
,
#ifdef USE_CUTOFF
__global
const int*
restrict tiles, __global
const unsigned int*
restrict
interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles,
__global
const real4*
restrict
blockCenter,
__global
const real4*
restrict
blockSize,
__global
const int*
restrict
interactingAtoms,
GLOBAL
const
int
*
RESTRICT
tiles
,
GLOBAL
const
unsigned
int
*
RESTRICT
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
unsigned
int
maxTiles
,
GLOBAL
const
real4
*
RESTRICT
blockCenter
,
GLOBAL
const
real4
*
RESTRICT
blockSize
,
GLOBAL
const
int
*
RESTRICT
interactingAtoms
,
#else
unsigned
int
numTiles
,
#endif
__global
const ushort2* exclusionTiles) {
const unsigned int totalWarps =
get_global_size(0)
/TILE_SIZE;
const unsigned int warp =
get_global_id(0)
/TILE_SIZE;
const unsigned int tgx =
get_local_id(0)
& (TILE_SIZE-1);
const unsigned int tbx =
get_local_id(0)
- tgx;
GLOBAL
const
ushort2
*
RESTRICT
exclusionTiles
)
{
const
unsigned
int
totalWarps
=
GLOBAL_SIZE
/
TILE_SIZE
;
const
unsigned
int
warp
=
GLOBAL_ID
/
TILE_SIZE
;
const
unsigned
int
tgx
=
LOCAL_ID
&
(
TILE_SIZE
-
1
);
const
unsigned
int
tbx
=
LOCAL_ID
-
tgx
;
mixed
energy
=
0
;
__local
AtomData2 localData[FORCE_WORK_GROUP_SIZE];
LOCAL
AtomData2
localData
[
FORCE_WORK_GROUP_SIZE
];
// First loop: process tiles that contain exclusions.
...
...
@@ -412,7 +409,7 @@ __kernel void computeGBSAForce1(
const
ushort2
tileIndices
=
exclusionTiles
[
pos
];
const
unsigned
int
x
=
tileIndices
.
x
;
const
unsigned
int
y
=
tileIndices
.
y
;
real4 force =
0.0f
;
real4
force
=
make_real4
(
0
)
;
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
real4
posq1
=
posq
[
atom1
];
real
charge1
=
charge
[
atom1
];
...
...
@@ -420,18 +417,17 @@ __kernel void computeGBSAForce1(
if
(
x
==
y
)
{
// This tile is on the diagonal.
const unsigned int localAtomIndex = get_local_id(0);
localData[localAtomIndex].x = posq1.x;
localData[localAtomIndex].y = posq1.y;
localData[localAtomIndex].z = posq1.z;
localData[localAtomIndex].q = charge1;
localData[get_local_id(0)].bornRadius = bornRadius1;
localData
[
LOCAL_ID
].
x
=
posq1
.
x
;
localData
[
LOCAL_ID
].
y
=
posq1
.
y
;
localData
[
LOCAL_ID
].
z
=
posq1
.
z
;
localData
[
LOCAL_ID
].
q
=
charge1
;
localData
[
LOCAL_ID
].
bornRadius
=
bornRadius1
;
SYNC_WARPS
;
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
if
(
atom1
<
NUM_ATOMS
&&
y
*
TILE_SIZE
+
j
<
NUM_ATOMS
)
{
real3 pos2 =
(
real3
)
(localData[tbx+j].x, localData[tbx+j].y, localData[tbx+j].z);
real3
pos2
=
make_
real3
(
localData
[
tbx
+
j
].
x
,
localData
[
tbx
+
j
].
y
,
localData
[
tbx
+
j
].
z
);
real
charge2
=
localData
[
tbx
+
j
].
q
;
real
4
delta =
(
real
4)
(pos2
- posq1.xyz, 0
);
real
3
delta
=
make_
real
3
(
pos2
.
x
-
posq1
.
x
,
pos2
.
y
-
posq1
.
y
,
pos2
.
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
delta
)
#endif
...
...
@@ -459,8 +455,10 @@ __kernel void computeGBSAForce1(
#endif
if
(
needEnergy
)
energy
+=
0.5
f
*
tempEnergy
;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
delta
*=
dEdR
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
z
-=
delta
.
z
;
#ifdef USE_CUTOFF
}
#endif
...
...
@@ -473,22 +471,22 @@ __kernel void computeGBSAForce1(
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
real4
tempPosq
=
posq
[
j
];
localData[
get_local_id(0)
].x = tempPosq.x;
localData[
get_local_id(0)
].y = tempPosq.y;
localData[
get_local_id(0)
].z = tempPosq.z;
localData[
get_local_id(0)
].q = charge[j];
localData[
get_local_id(0)
].bornRadius = global_bornRadii[j];
localData[
get_local_id(0)
].fx = 0.0f;
localData[
get_local_id(0)
].fy = 0.0f;
localData[
get_local_id(0)
].fz = 0.0f;
localData[
get_local_id(0)
].fw = 0.0f;
localData
[
LOCAL_ID
].
x
=
tempPosq
.
x
;
localData
[
LOCAL_ID
].
y
=
tempPosq
.
y
;
localData
[
LOCAL_ID
].
z
=
tempPosq
.
z
;
localData
[
LOCAL_ID
].
q
=
charge
[
j
];
localData
[
LOCAL_ID
].
bornRadius
=
global_bornRadii
[
j
];
localData
[
LOCAL_ID
].
fx
=
0.0
f
;
localData
[
LOCAL_ID
].
fy
=
0.0
f
;
localData
[
LOCAL_ID
].
fz
=
0.0
f
;
localData
[
LOCAL_ID
].
fw
=
0.0
f
;
SYNC_WARPS
;
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
if
(
atom1
<
NUM_ATOMS
&&
y
*
TILE_SIZE
+
tj
<
NUM_ATOMS
)
{
real3 pos2 =
(
real3
)
(localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z);
real3
pos2
=
make_
real3
(
localData
[
tbx
+
tj
].
x
,
localData
[
tbx
+
tj
].
y
,
localData
[
tbx
+
tj
].
z
);
real
charge2
=
localData
[
tbx
+
tj
].
q
;
real
4
delta =
(
real
4)
(pos2
- posq1.xyz, 0
);
real
3
delta
=
make_
real
3
(
pos2
.
x
-
posq1
.
x
,
pos2
.
y
-
posq1
.
y
,
pos2
.
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
delta
)
#endif
...
...
@@ -515,8 +513,10 @@ __kernel void computeGBSAForce1(
#endif
if
(
needEnergy
)
energy
+=
tempEnergy
;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
delta
*=
dEdR
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
z
-=
delta
.
z
;
localData
[
tbx
+
tj
].
fx
+=
delta
.
x
;
localData
[
tbx
+
tj
].
fy
+=
delta
.
y
;
localData
[
tbx
+
tj
].
fz
+=
delta
.
z
;
...
...
@@ -534,25 +534,25 @@ __kernel void computeGBSAForce1(
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
atom_add
(&forceBuffers[offset], (long) (force.x*0x100000000));
atom_add
(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
atom_add
(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
atom_add
(&global_bornForce[offset], (long) (force.w*0x100000000));
ATOMIC_ADD
(
&
forceBuffers
[
offset
],
(
mm_ulong
)
((
mm_
long
)
(
force
.
x
*
0x100000000
))
)
;
ATOMIC_ADD
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_
long
)
(
force
.
y
*
0x100000000
))
)
;
ATOMIC_ADD
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_
long
)
(
force
.
z
*
0x100000000
))
)
;
ATOMIC_ADD
(
&
global_bornForce
[
offset
],
(
mm_ulong
)
((
mm_
long
)
(
force
.
w
*
0x100000000
))
)
;
if
(
x
!=
y
)
{
offset
=
y
*
TILE_SIZE
+
tgx
;
atom_add
(&forceBuffers[offset], (long) (localData[
get_local_id(0)
].fx*0x100000000));
atom_add
(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (localData[
get_local_id(0)
].fy*0x100000000));
atom_add
(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (localData[
get_local_id(0)
].fz*0x100000000));
atom_add
(&global_bornForce[offset], (long) (localData[
get_local_id(0)
].fw*0x100000000));
ATOMIC_ADD
(
&
forceBuffers
[
offset
],
(
mm_ulong
)
((
mm_
long
)
(
localData
[
LOCAL_ID
].
fx
*
0x100000000
))
)
;
ATOMIC_ADD
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_
long
)
(
localData
[
LOCAL_ID
].
fy
*
0x100000000
))
)
;
ATOMIC_ADD
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_
long
)
(
localData
[
LOCAL_ID
].
fz
*
0x100000000
))
)
;
ATOMIC_ADD
(
&
global_bornForce
[
offset
],
(
mm_ulong
)
((
mm_
long
)
(
localData
[
LOCAL_ID
].
fw
*
0x100000000
))
)
;
}
#else
unsigned
int
offset1
=
x
*
TILE_SIZE
+
tgx
+
warp
*
PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y
*
TILE_SIZE
+
tgx
+
warp
*
PADDED_NUM_ATOMS
;
forceBuffers[offset1]
.xyz += force.xyz
;
forceBuffers
[
offset1
]
+=
make_real4
(
force
.
x
,
force
.
y
,
force
.
z
,
0
)
;
global_bornForce
[
offset1
]
+=
force
.
w
;
if
(
x
!=
y
)
{
forceBuffers[offset2] += (real4) (localData[
get_local_id(0)
].fx, localData[
get_local_id(0)
].fy, localData[
get_local_id(0)
].fz, 0.0f);
global_bornForce[offset2] += localData[
get_local_id(0)
].fw;
forceBuffers
[
offset2
]
+=
(
real4
)
(
localData
[
LOCAL_ID
].
fx
,
localData
[
LOCAL_ID
].
fy
,
localData
[
LOCAL_ID
].
fz
,
0.0
f
);
global_bornForce
[
offset2
]
+=
localData
[
LOCAL_ID
].
fw
;
}
#endif
}
...
...
@@ -564,20 +564,20 @@ __kernel void computeGBSAForce1(
unsigned
int
numTiles
=
interactionCount
[
0
];
if
(
numTiles
>
maxTiles
)
return
;
// There wasn't enough memory for the neighbor list.
int pos = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
int
pos
=
(
int
)
(
warp
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
((
mm_
long
)
NUM_BLOCKS
+
1
)
/
2
:
(
mm_
long
)
numTiles
)
/
totalWarps
);
int
end
=
(
int
)
((
warp
+
1
)
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
((
mm_
long
)
NUM_BLOCKS
+
1
)
/
2
:
(
mm_
long
)
numTiles
)
/
totalWarps
);
#else
int pos = (int) (warp*(long)numTiles/totalWarps);
int end = (int) ((warp+1)*(long)numTiles/totalWarps);
int
pos
=
(
int
)
(
warp
*
(
mm_
long
)
numTiles
/
totalWarps
);
int
end
=
(
int
)
((
warp
+
1
)
*
(
mm_
long
)
numTiles
/
totalWarps
);
#endif
int
skipBase
=
0
;
int
currentSkipIndex
=
tbx
;
__local
int atomIndices[FORCE_WORK_GROUP_SIZE];
__local
volatile int skipTiles[FORCE_WORK_GROUP_SIZE];
skipTiles[
get_local_id(0)
] = -1;
LOCAL
int
atomIndices
[
FORCE_WORK_GROUP_SIZE
];
LOCAL
volatile
int
skipTiles
[
FORCE_WORK_GROUP_SIZE
];
skipTiles
[
LOCAL_ID
]
=
-
1
;
while
(
pos
<
end
)
{
real4 force =
0
;
real4
force
=
make_real4
(
0
)
;
bool
includeTile
=
true
;
// Extract the coordinates of this tile.
...
...
@@ -605,10 +605,10 @@ __kernel void computeGBSAForce1(
SYNC_WARPS
;
if
(
skipBase
+
tgx
<
NUM_TILES_WITH_EXCLUSIONS
)
{
ushort2
tile
=
exclusionTiles
[
skipBase
+
tgx
];
skipTiles[
get_local_id
(
0
)
]
=
tile.x
+
tile.y*NUM_BLOCKS
-
tile.y*
(
tile.y+1
)
/2
;
skipTiles
[
LOCAL_ID
]
=
tile
.
x
+
tile
.
y
*
NUM_BLOCKS
-
tile
.
y
*
(
tile
.
y
+
1
)
/
2
;
}
else
skipTiles[
get_local_id
(
0
)
]
=
end
;
skipTiles
[
LOCAL_ID
]
=
end
;
skipBase
+=
TILE_SIZE
;
currentSkipIndex
=
tbx
;
SYNC_WARPS
;
...
...
@@ -630,18 +630,18 @@ __kernel void computeGBSAForce1(
#else
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
#endif
atomIndices[
get_local_id
(
0
)
]
=
j
;
atomIndices
[
LOCAL_ID
]
=
j
;
if
(
j
<
PADDED_NUM_ATOMS
)
{
real4
tempPosq
=
posq
[
j
];
localData[
get_local_id
(
0
)
].x
=
tempPosq.x
;
localData[
get_local_id
(
0
)
].y
=
tempPosq.y
;
localData[
get_local_id
(
0
)
].z
=
tempPosq.z
;
localData[
get_local_id
(
0
)
].q
=
charge[j]
;
localData[
get_local_id
(
0
)
].bornRadius
=
global_bornRadii[j]
;
localData[
get_local_id
(
0
)
].fx
=
0.0f
;
localData[
get_local_id
(
0
)
].fy
=
0.0f
;
localData[
get_local_id
(
0
)
].fz
=
0.0f
;
localData[
get_local_id
(
0
)
].fw
=
0.0f
;
localData
[
LOCAL_ID
].
x
=
tempPosq
.
x
;
localData
[
LOCAL_ID
].
y
=
tempPosq
.
y
;
localData
[
LOCAL_ID
].
z
=
tempPosq
.
z
;
localData
[
LOCAL_ID
].
q
=
charge
[
j
];
localData
[
LOCAL_ID
].
bornRadius
=
global_bornRadii
[
j
];
localData
[
LOCAL_ID
].
fx
=
0.0
f
;
localData
[
LOCAL_ID
].
fy
=
0.0
f
;
localData
[
LOCAL_ID
].
fz
=
0.0
f
;
localData
[
LOCAL_ID
].
fw
=
0.0
f
;
}
SYNC_WARPS
;
#ifdef USE_PERIODIC
...
...
@@ -651,15 +651,15 @@ __kernel void computeGBSAForce1(
real4
blockCenterX
=
blockCenter
[
x
];
APPLY_PERIODIC_TO_POS_WITH_CENTER
(
posq1
,
blockCenterX
)
APPLY_PERIODIC_TO_POS_WITH_CENTER
(
localData[
get_local_id
(
0
)
],
blockCenterX
)
APPLY_PERIODIC_TO_POS_WITH_CENTER
(
localData
[
LOCAL_ID
],
blockCenterX
)
SYNC_WARPS
;
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
atomIndices
[
tbx
+
tj
];
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
real3
pos2
=
(
real3
)
(
localData[tbx+tj].x,
localData[tbx+tj].y,
localData[tbx+tj].z
)
;
real3
pos2
=
make_
real3
(
localData
[
tbx
+
tj
].
x
,
localData
[
tbx
+
tj
].
y
,
localData
[
tbx
+
tj
].
z
);
real
charge2
=
localData
[
tbx
+
tj
].
q
;
real
4
delta
=
(
real
4
)
(
pos2
-
posq1.xyz,
0
)
;
real
3
delta
=
make_
real
3
(
pos2
.
x
-
posq1
.
x
,
pos2
.
y
-
posq1
.
y
,
pos2
.
z
-
posq1
.
z
);
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
if
(
r2
<
CUTOFF_SQUARED
)
{
real
invR
=
RSQRT
(
r2
);
...
...
@@ -681,8 +681,10 @@ __kernel void computeGBSAForce1(
#endif
if
(
needEnergy
)
energy
+=
tempEnergy
;
delta.xyz
*=
dEdR
;
force.xyz
-=
delta.xyz
;
delta
*=
dEdR
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
z
-=
delta
.
z
;
localData
[
tbx
+
tj
].
fx
+=
delta
.
x
;
localData
[
tbx
+
tj
].
fy
+=
delta
.
y
;
localData
[
tbx
+
tj
].
fz
+=
delta
.
z
;
...
...
@@ -702,9 +704,9 @@ __kernel void computeGBSAForce1(
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
atomIndices
[
tbx
+
tj
];
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
real3
pos2
=
(
real3
)
(
localData[tbx+tj].x,
localData[tbx+tj].y,
localData[tbx+tj].z
)
;
real3
pos2
=
make_
real3
(
localData
[
tbx
+
tj
].
x
,
localData
[
tbx
+
tj
].
y
,
localData
[
tbx
+
tj
].
z
);
real
charge2
=
localData
[
tbx
+
tj
].
q
;
real
4
delta
=
(
real
4
)
(
pos2
-
posq1.xyz,
0
)
;
real
3
delta
=
make_
real
3
(
pos2
.
x
-
posq1
.
x
,
pos2
.
y
-
posq1
.
y
,
pos2
.
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
delta
)
#endif
...
...
@@ -731,8 +733,10 @@ __kernel void computeGBSAForce1(
#endif
if
(
needEnergy
)
energy
+=
tempEnergy
;
delta.xyz
*=
dEdR
;
force.xyz
-=
delta.xyz
;
delta
*=
dEdR
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
z
-=
delta
.
z
;
localData
[
tbx
+
tj
].
fx
+=
delta
.
x
;
localData
[
tbx
+
tj
].
fy
+=
delta
.
y
;
localData
[
tbx
+
tj
].
fz
+=
delta
.
z
;
...
...
@@ -745,37 +749,37 @@ __kernel void computeGBSAForce1(
SYNC_WARPS
;
}
}
// Write results.
#ifdef USE_CUTOFF
unsigned
int
atom2
=
atomIndices[
get_local_id
(
0
)
]
;
unsigned
int
atom2
=
atomIndices
[
LOCAL_ID
];
#else
unsigned
int
atom2
=
y
*
TILE_SIZE
+
tgx
;
#endif
#ifdef SUPPORTS_64_BIT_ATOMICS
atom_add
(
&forceBuffers[atom1],
(
long
)
(
force.x*0x100000000
))
;
atom_add
(
&forceBuffers[atom1+PADDED_NUM_ATOMS],
(
long
)
(
force.y*0x100000000
))
;
atom_add
(
&forceBuffers[atom1+2*PADDED_NUM_ATOMS],
(
long
)
(
force.z*0x100000000
))
;
atom_add
(
&global_bornForce[atom1],
(
long
)
(
force.w*0x100000000
))
;
ATOMIC_ADD
(
&
forceBuffers
[
atom1
],
(
mm_ulong
)
((
mm_
long
)
(
force
.
x
*
0x100000000
))
)
;
ATOMIC_ADD
(
&
forceBuffers
[
atom1
+
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_
long
)
(
force
.
y
*
0x100000000
))
)
;
ATOMIC_ADD
(
&
forceBuffers
[
atom1
+
2
*
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_
long
)
(
force
.
z
*
0x100000000
))
)
;
ATOMIC_ADD
(
&
global_bornForce
[
atom1
],
(
mm_ulong
)
((
mm_
long
)
(
force
.
w
*
0x100000000
))
)
;
if
(
atom2
<
PADDED_NUM_ATOMS
)
{
atom_add
(
&forceBuffers[atom2],
(
long
)
(
localData[
get_local_id
(
0
)
].fx*0x100000000
))
;
atom_add
(
&forceBuffers[atom2+PADDED_NUM_ATOMS],
(
long
)
(
localData[
get_local_id
(
0
)
].fy*0x100000000
))
;
atom_add
(
&forceBuffers[atom2+2*PADDED_NUM_ATOMS],
(
long
)
(
localData[
get_local_id
(
0
)
].fz*0x100000000
))
;
atom_add
(
&global_bornForce[atom2],
(
long
)
(
localData[
get_local_id
(
0
)
].fw*0x100000000
))
;
ATOMIC_ADD
(
&
forceBuffers
[
atom2
],
(
mm_ulong
)
((
mm_
long
)
(
localData
[
LOCAL_ID
].
fx
*
0x100000000
))
)
;
ATOMIC_ADD
(
&
forceBuffers
[
atom2
+
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_
long
)
(
localData
[
LOCAL_ID
].
fy
*
0x100000000
))
)
;
ATOMIC_ADD
(
&
forceBuffers
[
atom2
+
2
*
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_
long
)
(
localData
[
LOCAL_ID
].
fz
*
0x100000000
))
)
;
ATOMIC_ADD
(
&
global_bornForce
[
atom2
],
(
mm_ulong
)
((
mm_
long
)
(
localData
[
LOCAL_ID
].
fw
*
0x100000000
))
)
;
}
#else
unsigned
int
offset1
=
atom1
+
warp
*
PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
atom2
+
warp
*
PADDED_NUM_ATOMS
;
forceBuffers[offset1]
.xyz
+=
force.xyz
;
forceBuffers
[
offset1
]
+=
make_real4
(
force
.
x
,
force
.
y
,
force
.
z
,
0
)
;
global_bornForce
[
offset1
]
+=
force
.
w
;
if
(
atom2
<
PADDED_NUM_ATOMS
)
{
forceBuffers[offset2]
+=
(
real4
)
(
localData[
get_local_id
(
0
)
].fx,
localData[
get_local_id
(
0
)
].fy,
localData[
get_local_id
(
0
)
].fz,
0.0f
)
;
global_bornForce[offset2]
+=
localData[
get_local_id
(
0
)
].fw
;
forceBuffers
[
offset2
]
+=
(
real4
)
(
localData
[
LOCAL_ID
].
fx
,
localData
[
LOCAL_ID
].
fy
,
localData
[
LOCAL_ID
].
fz
,
0.0
f
);
global_bornForce
[
offset2
]
+=
localData
[
LOCAL_ID
].
fw
;
}
#endif
}
pos
++
;
}
energyBuffer[
get_global_id
(
0
)
]
+=
energy
;
energyBuffer
[
GLOBAL_ID
]
+=
energy
;
}
platforms/c
uda
/src/kernels/gbsaObc2.c
u
→
platforms/c
ommon
/src/kernels/gbsaObc2.c
c
View file @
5a06df78
...
...
@@ -2,8 +2,8 @@
real
invRSquaredOver4
=
0.25
f
*
invR
*
invR
;
real
rScaledRadiusJ
=
r
+
OBC_PARAMS2
.
y
;
real
rScaledRadiusI
=
r
+
OBC_PARAMS1
.
y
;
real
l_ijJ
=
RECIP
(
max
(
OBC_PARAMS1
.
x
,
fabs
(
r
-
OBC_PARAMS2
.
y
)));
real
l_ijI
=
RECIP
(
max
(
OBC_PARAMS2
.
x
,
fabs
(
r
-
OBC_PARAMS1
.
y
)));
real
l_ijJ
=
RECIP
(
max
(
(
real
)
OBC_PARAMS1
.
x
,
fabs
(
r
-
OBC_PARAMS2
.
y
)));
real
l_ijI
=
RECIP
(
max
(
(
real
)
OBC_PARAMS2
.
x
,
fabs
(
r
-
OBC_PARAMS1
.
y
)));
real
u_ijJ
=
RECIP
(
rScaledRadiusJ
);
real
u_ijI
=
RECIP
(
rScaledRadiusI
);
real
l_ij2J
=
l_ijJ
*
l_ijJ
;
...
...
@@ -16,12 +16,17 @@
real
t2I
=
(
l_ij2I
-
u_ij2I
);
real
term1
=
(
0.5
f
*
(
0.25
f
+
OBC_PARAMS2
.
y
*
OBC_PARAMS2
.
y
*
invRSquaredOver4
)
*
t2J
+
t1J
*
invRSquaredOver4
)
*
invR
;
real
term2
=
(
0.5
f
*
(
0.25
f
+
OBC_PARAMS1
.
y
*
OBC_PARAMS1
.
y
*
invRSquaredOver4
)
*
t2I
+
t1I
*
invRSquaredOver4
)
*
invR
;
#ifdef SUPPORTS_64_BIT_ATOMICS
real
tempdEdR
=
(
OBC_PARAMS1
.
x
<
rScaledRadiusJ
?
BORN_FORCE1
*
term1
/
0x100000000
:
0
);
tempdEdR
+=
(
OBC_PARAMS2
.
x
<
rScaledRadiusI
?
BORN_FORCE2
*
term2
/
0x100000000
:
0
);
#else
real
tempdEdR
=
(
OBC_PARAMS1
.
x
<
rScaledRadiusJ
?
BORN_FORCE1
*
term1
:
(
real
)
0
);
tempdEdR
+=
(
OBC_PARAMS2
.
x
<
rScaledRadiusI
?
BORN_FORCE2
*
term2
:
(
real
)
0
);
#endif
#ifdef USE_CUTOFF
unsigned
int
includeInteraction
=
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
&&
atom1
!=
atom2
&&
r2
<
CUTOFF_SQUARED
);
#else
unsigned
int
includeInteraction
=
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
&&
atom1
!=
atom2
);
#endif
dEdR
+=
(
includeInteraction
?
tempdEdR
:
0
);
dEdR
+=
(
includeInteraction
?
tempdEdR
:
(
real
)
0
);
}
platforms/
opencl
/src/kernels/gbsaObcReductions.c
l
→
platforms/
common
/src/kernels/gbsaObcReductions.c
c
View file @
5a06df78
...
...
@@ -5,22 +5,21 @@
* Reduce the Born sums to compute the Born radii.
*/
__kernel
void
reduceBornSum
(
int
bufferSize,
int
numBuffers,
float
alpha,
float
beta,
float
gamma,
KERNEL
void
reduceBornSum
(
float
alpha
,
float
beta
,
float
gamma
,
#ifdef SUPPORTS_64_BIT_ATOMICS
__global
const
long*
restrict
bornSum,
GLOBAL
const
mm_
long
*
RESTRICT
bornSum
,
#else
__global
const
real*
restrict
bornSum
,
GLOBAL
const
real
*
RESTRICT
bornSum
,
int
bufferSize
,
int
numBuffers
,
#endif
__global
const
float2*
restrict
params,
__global
real*
restrict
bornRadii,
__global
real*
restrict
obcChain
)
{
unsigned
int
index
=
get_global_id
(
0
)
;
while
(
index
<
NUM_ATOMS
)
{
GLOBAL
const
float2
*
RESTRICT
params
,
GLOBAL
real
*
RESTRICT
bornRadii
,
GLOBAL
real
*
RESTRICT
obcChain
)
{
for
(
unsigned
int
index
=
GLOBAL_ID
;
index
<
NUM_ATOMS
;
index
+=
GLOBAL_SIZE
)
{
// Get summed Born data
int
totalSize
=
bufferSize*numBuffers
;
#ifdef SUPPORTS_64_BIT_ATOMICS
real
sum
=
(
1/
(
real
)
0x100000000
)
*bornSum[index]
;
real
sum
=
RECIP
(
(
real
)
0x100000000
)
*
bornSum
[
index
];
#else
real
sum
=
bornSum
[
index
];
int
totalSize
=
bufferSize
*
numBuffers
;
for
(
int
i
=
index
+
bufferSize
;
i
<
totalSize
;
i
+=
bufferSize
)
sum
+=
bornSum
[
i
];
#endif
...
...
@@ -33,12 +32,11 @@ __kernel void reduceBornSum(int bufferSize, int numBuffers, float alpha, float b
real
sum3
=
sum
*
sum2
;
real
tanhSum
=
tanh
(
alpha
*
sum
-
beta
*
sum2
+
gamma
*
sum3
);
real
nonOffsetRadius
=
offsetRadius
+
DIELECTRIC_OFFSET
;
real
radius
=
1/
(
1/
offsetRadius
-
tanhSum/nonOffsetRadius
)
;
real
radius
=
RECIP
(
RECIP
(
offsetRadius
)
-
tanhSum
/
nonOffsetRadius
);
real
chain
=
offsetRadius
*
(
alpha
-
2
*
beta
*
sum
+
3
*
gamma
*
sum2
);
chain
=
(
1
-
tanhSum
*
tanhSum
)
*
chain
/
nonOffsetRadius
;
bornRadii
[
index
]
=
radius
;
obcChain
[
index
]
=
chain
;
index
+=
get_global_size
(
0
)
;
}
}
...
...
@@ -46,21 +44,22 @@ __kernel void reduceBornSum(int bufferSize, int numBuffers, float alpha, float b
* Reduce the Born force.
*/
__kernel
void
reduceBornForce
(
int
bufferSize,
int
numBuffers,
__global
real*
bornForce,
KERNEL
void
reduceBornForce
(
#ifdef SUPPORTS_64_BIT_ATOMICS
__global
const
long*
restrict
bornForceIn,
GLOBAL
mm_long
*
RESTRICT
bornForce
,
#else
GLOBAL
real
*
bornForce
,
int
bufferSize
,
int
numBuffers
,
#endif
__global
mixed*
restrict
energyBuffer,
__global
const
float2*
restrict
params,
__global
const
real*
restrict
bornRadii,
__global
const
real*
restrict
obcChain
)
{
GLOBAL
mixed
*
RESTRICT
energyBuffer
,
GLOBAL
const
float2
*
RESTRICT
params
,
GLOBAL
const
real
*
RESTRICT
bornRadii
,
GLOBAL
const
real
*
RESTRICT
obcChain
)
{
mixed
energy
=
0
;
unsigned
int
index
=
get_global_id
(
0
)
;
while
(
index
<
NUM_ATOMS
)
{
//
Sum
the
Born
force
for
(
unsigned
int
index
=
GLOBAL_ID
;
index
<
NUM_ATOMS
;
index
+=
GLOBAL_SIZE
)
{
// Get summed Born force
int
totalSize
=
bufferSize*numBuffers
;
#ifdef SUPPORTS_64_BIT_ATOMICS
real
force
=
(
1/
(
real
)
0x100000000
)
*bornForce
In
[index]
;
real
force
=
RECIP
(
(
real
)
0x100000000
)
*
bornForce
[
index
];
#else
real
force
=
bornForce
[
index
];
int
totalSize
=
bufferSize
*
numBuffers
;
for
(
int
i
=
index
+
bufferSize
;
i
<
totalSize
;
i
+=
bufferSize
)
force
+=
bornForce
[
i
];
#endif
...
...
@@ -69,13 +68,16 @@ __kernel void reduceBornForce(int bufferSize, int numBuffers, __global real* bor
float
offsetRadius
=
params
[
index
].
x
;
real
bornRadius
=
bornRadii
[
index
];
real
r
=
offsetRadius
+
DIELECTRIC_OFFSET
+
PROBE_RADIUS
;
real
ratio6
=
pow
((
offsetRadius+DIELECTRIC_OFFSET
)
/bornRadius,
(
real
)
6
)
;
real
ratio6
=
POW
((
offsetRadius
+
DIELECTRIC_OFFSET
)
/
bornRadius
,
(
real
)
6
);
real
saTerm
=
SURFACE_AREA_FACTOR
*
r
*
r
*
ratio6
;
force
+=
saTerm
/
bornRadius
;
energy
+=
saTerm
;
force
*=
bornRadius
*
bornRadius
*
obcChain
[
index
];
#ifdef SUPPORTS_64_BIT_ATOMICS
bornForce
[
index
]
=
(
mm_long
)
(
force
*
0x100000000
);
#else
bornForce
[
index
]
=
force
;
index
+=
get_global_size
(
0
)
;
#endif
}
energyBuffer[
get_global_id
(
0
)
]
+=
energy/-6
.0f
;
energyBuffer
[
GLOBAL_ID
]
+=
energy
/-
6
;
}
\ No newline at end of file
platforms/
opencl
/src/kernels/gbsaObc_cpu.c
l
→
platforms/
common
/src/kernels/gbsaObc_cpu.c
c
View file @
5a06df78
#
ifdef
SUPPORTS_64_BIT_ATOMICS
#
pragma
OPENCL
EXTENSION
cl_khr_int64_base_atomics
:
enable
#
endif
typedef
struct
{
real
x
,
y
,
z
;
real
q
;
...
...
@@ -12,27 +8,27 @@ typedef struct {
/**
* Compute the Born sum.
*/
__kernel
void
computeBornSum
(
KERNEL
void
computeBornSum
(
#ifdef SUPPORTS_64_BIT_ATOMICS
__global
long*
restrict
global_bornSum,
GLOBAL
mm_long
*
RESTRICT
global_bornSum
,
#else
__global
real*
restrict
global_bornSum,
GLOBAL
real
*
RESTRICT
global_bornSum
,
#endif
__global
const
real4*
restrict
posq,
__global
const
real*
restrict
charge,
__global
const
float2*
restrict
global_params,
GLOBAL
const
real4
*
RESTRICT
posq
,
GLOBAL
const
real
*
RESTRICT
charge
,
GLOBAL
const
float2
*
RESTRICT
global_params
,
#ifdef USE_CUTOFF
__global
const
int*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
real4
periodicBoxSize,
real4
invPeriodicBoxSize,
real4
periodicBoxVecX,
real4
periodicBoxVecY,
real4
periodicBoxVecZ,
unsigned
int
maxTiles,
__global
const
real4*
restrict
blockCenter,
__global
const
real4*
restrict
blockSize,
__global
const
int*
restrict
interactingAtoms,
GLOBAL
const
int
*
RESTRICT
tiles
,
GLOBAL
const
unsigned
int
*
RESTRICT
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
unsigned
int
maxTiles
,
GLOBAL
const
real4
*
RESTRICT
blockCenter
,
GLOBAL
const
real4
*
RESTRICT
blockSize
,
GLOBAL
const
int
*
RESTRICT
interactingAtoms
,
#else
unsigned
int
numTiles
,
#endif
__global
const
ushort2*
exclusionTiles
)
{
__local
AtomData1
localData[TILE_SIZE]
;
GLOBAL
const
ushort2
*
exclusionTiles
)
{
LOCAL
AtomData1
localData
[
TILE_SIZE
];
// First loop: process tiles that contain exclusions.
const
unsigned
int
firstExclusionTile
=
FIRST_EXCLUSION_TILE+
get_group_id
(
0
)
*
(
LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE
)
/
get_num_groups
(
0
)
;
const
unsigned
int
lastExclusionTile
=
FIRST_EXCLUSION_TILE+
(
get_group_id
(
0
)
+1
)
*
(
LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE
)
/
get_num_groups
(
0
)
;
const
unsigned
int
firstExclusionTile
=
FIRST_EXCLUSION_TILE
+
GROUP_ID
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
NUM_GROUPS
;
const
unsigned
int
lastExclusionTile
=
FIRST_EXCLUSION_TILE
+
(
GROUP_ID
+
1
)
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
NUM_GROUPS
;
for
(
int
pos
=
firstExclusionTile
;
pos
<
lastExclusionTile
;
pos
++
)
{
const
ushort2
tileIndices
=
exclusionTiles
[
pos
];
const
unsigned
int
x
=
tileIndices
.
x
;
...
...
@@ -56,17 +52,17 @@ __kernel void computeBornSum(
for
(
unsigned
int
tgx
=
0
;
tgx
<
TILE_SIZE
;
tgx
++
)
{
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
real
bornSum
=
0
.0f
;
real
bornSum
=
0
;
real4
posq1
=
posq
[
atom1
];
float2
params1
=
global_params
[
atom1
];
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
real3
pos2
=
(
real3
)
(
localData[j].x,
localData[j].y,
localData[j].z
)
;
real3
pos2
=
make_
real3
(
localData
[
j
].
x
,
localData
[
j
].
y
,
localData
[
j
].
z
);
real
charge2
=
localData
[
j
].
q
;
real
4
delta
=
(
real
4
)
(
pos2
-
posq1.xyz,
0
)
;
real
3
delta
=
make_
real
3
(
pos2
.
x
-
posq1
.
x
,
pos2
.
y
-
posq1
.
y
,
pos2
.
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
delta
)
#endif
real
r2
=
dot
(
delta.xyz,
delta
.xyz
)
;
real
r2
=
dot
(
trimTo3
(
delta
),
trimTo3
(
delta
)
);
#ifdef USE_CUTOFF
if
(
atom1
<
NUM_ATOMS
&&
y
*
TILE_SIZE
+
j
<
NUM_ATOMS
&&
r2
<
CUTOFF_SQUARED
)
{
#else
...
...
@@ -74,7 +70,7 @@ __kernel void computeBornSum(
#endif
real
invR
=
RSQRT
(
r2
);
real
r
=
r2
*
invR
;
float2
params2
=
(
float2
)
(
localData[j].radius,
localData[j].scaledRadius
)
;
float2
params2
=
make_
float2
(
localData
[
j
].
radius
,
localData
[
j
].
scaledRadius
);
real
rScaledRadiusJ
=
r
+
params2
.
y
;
if
((
j
!=
tgx
)
&&
(
params1
.
x
<
rScaledRadiusJ
))
{
real
l_ij
=
RECIP
(
max
((
real
)
params1
.
x
,
fabs
(
r
-
params2
.
y
)));
...
...
@@ -92,9 +88,9 @@ __kernel void computeBornSum(
// Write results.
#ifdef SUPPORTS_64_BIT_ATOMICS
atom_add
(
&global_bornSum[atom1],
(
long
)
(
bornSum*0x100000000
))
;
ATOMIC_ADD
(
&
global_bornSum
[
atom1
],
(
mm_
long
)
(
bornSum
*
0x100000000
));
#else
unsigned
int
offset
=
atom1
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset
=
atom1
+
GROUP_ID
*
PADDED_NUM_ATOMS
;
global_bornSum
[
offset
]
+=
bornSum
;
#endif
}
...
...
@@ -110,9 +106,9 @@ __kernel void computeBornSum(
real4
posq1
=
posq
[
atom1
];
float2
params1
=
global_params
[
atom1
];
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
real3
pos2
=
(
real3
)
(
localData[j].x,
localData[j].y,
localData[j].z
)
;
real3
pos2
=
make_
real3
(
localData
[
j
].
x
,
localData
[
j
].
y
,
localData
[
j
].
z
);
real
charge2
=
localData
[
j
].
q
;
real
4
delta
=
(
real
4
)
(
pos2
-
posq1.xyz,
0
)
;
real
3
delta
=
make_
real
3
(
pos2
.
x
-
posq1
.
x
,
pos2
.
y
-
posq1
.
y
,
pos2
.
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
delta
)
#endif
...
...
@@ -124,7 +120,7 @@ __kernel void computeBornSum(
#endif
real
invR
=
RSQRT
(
r2
);
real
r
=
r2
*
invR
;
float2
params2
=
(
float2
)
(
localData[j].radius,
localData[j].scaledRadius
)
;
float2
params2
=
make_
float2
(
localData
[
j
].
radius
,
localData
[
j
].
scaledRadius
);
real
rScaledRadiusJ
=
r
+
params2
.
y
;
if
(
params1
.
x
<
rScaledRadiusJ
)
{
real
l_ij
=
RECIP
(
max
((
real
)
params1
.
x
,
fabs
(
r
-
params2
.
y
)));
...
...
@@ -154,9 +150,9 @@ __kernel void computeBornSum(
// Write results for atom1.
#ifdef SUPPORTS_64_BIT_ATOMICS
atom_add
(
&global_bornSum[atom1],
(
long
)
(
bornSum*0x100000000
))
;
ATOMIC_ADD
(
&
global_bornSum
[
atom1
],
(
mm_
long
)
(
bornSum
*
0x100000000
));
#else
unsigned
int
offset
=
atom1
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset
=
atom1
+
GROUP_ID
*
PADDED_NUM_ATOMS
;
global_bornSum
[
offset
]
+=
bornSum
;
#endif
}
...
...
@@ -166,9 +162,9 @@ __kernel void computeBornSum(
for
(
int
tgx
=
0
;
tgx
<
TILE_SIZE
;
tgx
++
)
{
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned
int
offset
=
y
*
TILE_SIZE
+
tgx
;
atom_add
(
&global_bornSum[offset],
(
long
)
(
localData[tgx].bornSum*0x100000000
))
;
ATOMIC_ADD
(
&
global_bornSum
[
offset
],
(
mm_
long
)
(
localData
[
tgx
].
bornSum
*
0x100000000
));
#else
unsigned
int
offset
=
y*TILE_SIZE+tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset
=
y
*
TILE_SIZE
+
tgx
+
GROUP_ID
*
PADDED_NUM_ATOMS
;
global_bornSum
[
offset
]
+=
localData
[
tgx
].
bornSum
;
#endif
}
...
...
@@ -182,15 +178,15 @@ __kernel void computeBornSum(
unsigned
int
numTiles
=
interactionCount
[
0
];
if
(
numTiles
>
maxTiles
)
return
;
// There wasn't enough memory for the neighbor list.
int
pos
=
(
int
)
(
get_group_id
(
0
)
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS*
((
long
)
NUM_BLOCKS+1
)
/2
:
numTiles
)
/
get_num_groups
(
0
)
)
;
int
end
=
(
int
)
((
get_group_id
(
0
)
+1
)
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS*
((
long
)
NUM_BLOCKS+1
)
/2
:
numTiles
)
/
get_num_groups
(
0
)
)
;
int
pos
=
(
int
)
(
GROUP_ID
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
((
mm_
long
)
NUM_BLOCKS
+
1
)
/
2
:
numTiles
)
/
NUM_GROUPS
);
int
end
=
(
int
)
((
GROUP_ID
+
1
)
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
((
mm_
long
)
NUM_BLOCKS
+
1
)
/
2
:
numTiles
)
/
NUM_GROUPS
);
#else
int
pos
=
(
int
)
(
get_group_id
(
0
)
*
(
long
)
numTiles/get_num_groups
(
0
)
)
;
int
end
=
(
int
)
((
get_group_id
(
0
)
+1
)
*
(
long
)
numTiles/
get_num_groups
(
0
)
)
;
int
pos
=
(
int
)
(
GROUP_ID
*
(
mm_long
)
numTiles
/
NUM_GROUPS
);
int
end
=
(
int
)
((
GROUP_ID
+
1
)
*
(
mm_
long
)
numTiles
/
NUM_GROUPS
);
#endif
int
nextToSkip
=
-
1
;
int
currentSkipIndex
=
0
;
__local
int
atomIndices[TILE_SIZE]
;
LOCAL
int
atomIndices
[
TILE_SIZE
];
while
(
pos
<
end
)
{
bool
includeTile
=
true
;
...
...
@@ -263,15 +259,15 @@ __kernel void computeBornSum(
APPLY_PERIODIC_TO_POS_WITH_CENTER
(
posq1
,
blockCenterX
)
float2
params1
=
global_params
[
atom1
];
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
real3 pos2 =
(
real3
)
(localData[j].x, localData[j].y, localData[j].z);
real3
pos2
=
make_
real3
(
localData
[
j
].
x
,
localData
[
j
].
y
,
localData
[
j
].
z
);
real
charge2
=
localData
[
j
].
q
;
real
4
delta =
(
real
4)
(pos2
- posq1.xyz, 0
);
real
3
delta
=
make_
real
3
(
pos2
.
x
-
posq1
.
x
,
pos2
.
y
-
posq1
.
y
,
pos2
.
z
-
posq1
.
z
);
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
int
atom2
=
atomIndices
[
j
];
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
&&
r2
<
CUTOFF_SQUARED
)
{
real
invR
=
RSQRT
(
r2
);
real
r
=
r2
*
invR
;
float2 params2 =
(
float2
)
(localData[j].radius, localData[j].scaledRadius);
float2
params2
=
make_
float2
(
localData
[
j
].
radius
,
localData
[
j
].
scaledRadius
);
real
rScaledRadiusJ
=
r
+
params2
.
y
;
if
(
params1
.
x
<
rScaledRadiusJ
)
{
real
l_ij
=
RECIP
(
max
((
real
)
params1
.
x
,
fabs
(
r
-
params2
.
y
)));
...
...
@@ -301,9 +297,9 @@ __kernel void computeBornSum(
// Write results for atom1.
#ifdef SUPPORTS_64_BIT_ATOMICS
atom_add
(&global_bornSum[atom1], (long) (bornSum*0x100000000));
ATOMIC_ADD
(
&
global_bornSum
[
atom1
],
(
mm_
long
)
(
bornSum
*
0x100000000
));
#else
unsigned int offset = atom1 +
get_group_id(0)
*PADDED_NUM_ATOMS;
unsigned
int
offset
=
atom1
+
GROUP_ID
*
PADDED_NUM_ATOMS
;
global_bornSum
[
offset
]
+=
bornSum
;
#endif
}
...
...
@@ -319,9 +315,9 @@ __kernel void computeBornSum(
real4
posq1
=
posq
[
atom1
];
float2
params1
=
global_params
[
atom1
];
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
real3 pos2 =
(
real3
)
(localData[j].x, localData[j].y, localData[j].z);
real3
pos2
=
make_
real3
(
localData
[
j
].
x
,
localData
[
j
].
y
,
localData
[
j
].
z
);
real
charge2
=
localData
[
j
].
q
;
real
4
delta =
(
real
4)
(pos2
- posq1.xyz, 0
);
real
3
delta
=
make_
real
3
(
pos2
.
x
-
posq1
.
x
,
pos2
.
y
-
posq1
.
y
,
pos2
.
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
delta
)
#endif
...
...
@@ -334,7 +330,7 @@ __kernel void computeBornSum(
#endif
real
invR
=
RSQRT
(
r2
);
real
r
=
r2
*
invR
;
float2 params2 =
(
float2
)
(localData[j].radius, localData[j].scaledRadius);
float2
params2
=
make_
float2
(
localData
[
j
].
radius
,
localData
[
j
].
scaledRadius
);
real
rScaledRadiusJ
=
r
+
params2
.
y
;
if
(
params1
.
x
<
rScaledRadiusJ
)
{
real
l_ij
=
RECIP
(
max
((
real
)
params1
.
x
,
fabs
(
r
-
params2
.
y
)));
...
...
@@ -364,9 +360,9 @@ __kernel void computeBornSum(
// Write results for atom1.
#ifdef SUPPORTS_64_BIT_ATOMICS
atom_add
(&global_bornSum[atom1], (long) (bornSum*0x100000000));
ATOMIC_ADD
(
&
global_bornSum
[
atom1
],
(
mm_
long
)
(
bornSum
*
0x100000000
));
#else
unsigned int offset = atom1 +
get_group_id(0)
*PADDED_NUM_ATOMS;
unsigned
int
offset
=
atom1
+
GROUP_ID
*
PADDED_NUM_ATOMS
;
global_bornSum
[
offset
]
+=
bornSum
;
#endif
}
...
...
@@ -382,9 +378,9 @@ __kernel void computeBornSum(
#endif
if
(
atom2
<
PADDED_NUM_ATOMS
)
{
#ifdef SUPPORTS_64_BIT_ATOMICS
atom_add
(&global_bornSum[atom2], (long) (localData[tgx].bornSum*0x100000000));
ATOMIC_ADD
(
&
global_bornSum
[
atom2
],
(
mm_
long
)
(
localData
[
tgx
].
bornSum
*
0x100000000
));
#else
unsigned int offset = atom2 +
get_group_id(0)
*PADDED_NUM_ATOMS;
unsigned
int
offset
=
atom2
+
GROUP_ID
*
PADDED_NUM_ATOMS
;
global_bornSum
[
offset
]
+=
localData
[
tgx
].
bornSum
;
#endif
}
...
...
@@ -405,29 +401,29 @@ typedef struct {
* First part of computing the GBSA interaction.
*/
__kernel
void computeGBSAForce1(
KERNEL
void
computeGBSAForce1
(
#ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict forceBuffers, __global long* restrict
global_bornForce,
GLOBAL
mm_long
*
RESTRICT
forceBuffers
,
GLOBAL
mm_long
*
RESTRICT
global_bornForce
,
#else
__global real4* restrict forceBuffers, __global real* restrict
global_bornForce,
GLOBAL
real4
*
RESTRICT
forceBuffers
,
GLOBAL
real
*
RESTRICT
global_bornForce
,
#endif
__global mixed* restrict
energyBuffer,
__global
const real4*
restrict posq, __global
const real*
restrict
charge,
__global
const real*
restrict
global_bornRadii, int needEnergy,
GLOBAL
mixed
*
RESTRICT
energyBuffer
,
GLOBAL
const
real4
*
RESTRICT
posq
,
GLOBAL
const
real
*
RESTRICT
charge
,
GLOBAL
const
real
*
RESTRICT
global_bornRadii
,
int
needEnergy
,
#ifdef USE_CUTOFF
__global
const int*
restrict tiles, __global
const unsigned int*
restrict
interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles,
__global
const real4*
restrict
blockCenter,
__global
const real4*
restrict
blockSize,
__global
const int*
restrict
interactingAtoms,
GLOBAL
const
int
*
RESTRICT
tiles
,
GLOBAL
const
unsigned
int
*
RESTRICT
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
unsigned
int
maxTiles
,
GLOBAL
const
real4
*
RESTRICT
blockCenter
,
GLOBAL
const
real4
*
RESTRICT
blockSize
,
GLOBAL
const
int
*
RESTRICT
interactingAtoms
,
#else
unsigned
int
numTiles
,
#endif
__global
const ushort2* exclusionTiles) {
GLOBAL
const
ushort2
*
exclusionTiles
)
{
mixed
energy
=
0
;
__local
AtomData2 localData[TILE_SIZE];
LOCAL
AtomData2
localData
[
TILE_SIZE
];
// First loop: process tiles that contain exclusions.
const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+
get_group_id(0)
*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/
get_num_groups(0)
;
const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(
get_group_id(0)
+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/
get_num_groups(0)
;
const
unsigned
int
firstExclusionTile
=
FIRST_EXCLUSION_TILE
+
GROUP_ID
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
NUM_GROUPS
;
const
unsigned
int
lastExclusionTile
=
FIRST_EXCLUSION_TILE
+
(
GROUP_ID
+
1
)
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
NUM_GROUPS
;
for
(
int
pos
=
firstExclusionTile
;
pos
<
lastExclusionTile
;
pos
++
)
{
const
ushort2
tileIndices
=
exclusionTiles
[
pos
];
const
unsigned
int
x
=
tileIndices
.
x
;
...
...
@@ -449,14 +445,14 @@ __kernel void computeGBSAForce1(
for
(
unsigned
int
tgx
=
0
;
tgx
<
TILE_SIZE
;
tgx
++
)
{
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
real4 force =
0
;
real4
force
=
make_real4
(
0
)
;
real4
posq1
=
posq
[
atom1
];
real
charge1
=
charge
[
atom1
];
real
bornRadius1
=
global_bornRadii
[
atom1
];
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
real3 pos2 =
(
real3
)
(localData[j].x, localData[j].y, localData[j].z);
real3
pos2
=
make_
real3
(
localData
[
j
].
x
,
localData
[
j
].
y
,
localData
[
j
].
z
);
real
charge2
=
localData
[
j
].
q
;
real
4
delta =
(
real
4)
(pos2
- posq1.xyz, 0
);
real
3
delta
=
make_
real
3
(
pos2
.
x
-
posq1
.
x
,
pos2
.
y
-
posq1
.
y
,
pos2
.
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
delta
)
#endif
...
...
@@ -485,21 +481,23 @@ __kernel void computeGBSAForce1(
tempEnergy
-=
scaledChargeProduct
/
CUTOFF
;
#endif
energy
+=
0.5
f
*
tempEnergy
;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
delta
*=
dEdR
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
z
-=
delta
.
z
;
}
}
// Write results.
#ifdef SUPPORTS_64_BIT_ATOMICS
atom_add
(&forceBuffers[atom1], (long) (force.x*0x100000000));
atom_add
(&forceBuffers[atom1+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
atom_add
(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
atom_add
(&global_bornForce[atom1], (long) (force.w*0x100000000));
ATOMIC_ADD
(
&
forceBuffers
[
atom1
],
(
mm_
long
)
(
force
.
x
*
0x100000000
));
ATOMIC_ADD
(
&
forceBuffers
[
atom1
+
PADDED_NUM_ATOMS
],
(
mm_
long
)
(
force
.
y
*
0x100000000
));
ATOMIC_ADD
(
&
forceBuffers
[
atom1
+
2
*
PADDED_NUM_ATOMS
],
(
mm_
long
)
(
force
.
z
*
0x100000000
));
ATOMIC_ADD
(
&
global_bornForce
[
atom1
],
(
mm_
long
)
(
force
.
w
*
0x100000000
));
#else
unsigned int offset = atom1 +
get_group_id(0)
*PADDED_NUM_ATOMS;
forceBuffers[offset]
.xyz = forceBuffers[offset].xyz+
force.
xyz
;
unsigned
int
offset
=
atom1
+
GROUP_ID
*
PADDED_NUM_ATOMS
;
forceBuffers
[
offset
]
+=
make_real4
(
force
.
x
,
force
.
y
,
force
.
z
,
0
)
;
global_bornForce
[
offset
]
+=
force
.
w
;
#endif
}
...
...
@@ -515,14 +513,14 @@ __kernel void computeGBSAForce1(
}
for
(
unsigned
int
tgx
=
0
;
tgx
<
TILE_SIZE
;
tgx
++
)
{
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
real4 force =
0
;
real4
force
=
make_real4
(
0
)
;
real4
posq1
=
posq
[
atom1
];
real
charge1
=
charge
[
atom1
];
real
bornRadius1
=
global_bornRadii
[
atom1
];
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
real3 pos2 =
(
real3
)
(localData[j].x, localData[j].y, localData[j].z);
real3
pos2
=
make_
real3
(
localData
[
j
].
x
,
localData
[
j
].
y
,
localData
[
j
].
z
);
real
charge2
=
localData
[
j
].
q
;
real
4
delta =
(
real
4)
(pos2
- posq1.xyz, 0
);
real
3
delta
=
make_
real
3
(
pos2
.
x
-
posq1
.
x
,
pos2
.
y
-
posq1
.
y
,
pos2
.
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
delta
)
#endif
...
...
@@ -550,8 +548,10 @@ __kernel void computeGBSAForce1(
tempEnergy
-=
scaledChargeProduct
/
CUTOFF
;
#endif
energy
+=
tempEnergy
;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
delta
*=
dEdR
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
z
-=
delta
.
z
;
localData
[
j
].
fx
+=
delta
.
x
;
localData
[
j
].
fy
+=
delta
.
y
;
localData
[
j
].
fz
+=
delta
.
z
;
...
...
@@ -562,13 +562,13 @@ __kernel void computeGBSAForce1(
// Write results for atom1.
#ifdef SUPPORTS_64_BIT_ATOMICS
atom_add
(&forceBuffers[atom1], (long) (force.x*0x100000000));
atom_add
(&forceBuffers[atom1+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
atom_add
(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
atom_add
(&global_bornForce[atom1], (long) (force.w*0x100000000));
ATOMIC_ADD
(
&
forceBuffers
[
atom1
],
(
mm_
long
)
(
force
.
x
*
0x100000000
));
ATOMIC_ADD
(
&
forceBuffers
[
atom1
+
PADDED_NUM_ATOMS
],
(
mm_
long
)
(
force
.
y
*
0x100000000
));
ATOMIC_ADD
(
&
forceBuffers
[
atom1
+
2
*
PADDED_NUM_ATOMS
],
(
mm_
long
)
(
force
.
z
*
0x100000000
));
ATOMIC_ADD
(
&
global_bornForce
[
atom1
],
(
mm_
long
)
(
force
.
w
*
0x100000000
));
#else
unsigned int offset = atom1 +
get_group_id(0)
*PADDED_NUM_ATOMS;
forceBuffers[offset]
.xyz = forceBuffers[offset].xyz+
force.
xyz
;
unsigned
int
offset
=
atom1
+
GROUP_ID
*
PADDED_NUM_ATOMS
;
forceBuffers
[
offset
]
+=
make_real4
(
force
.
x
,
force
.
y
,
force
.
z
,
0
)
;
global_bornForce
[
offset
]
+=
force
.
w
;
#endif
}
...
...
@@ -578,12 +578,12 @@ __kernel void computeGBSAForce1(
for
(
int
tgx
=
0
;
tgx
<
TILE_SIZE
;
tgx
++
)
{
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned
int
offset
=
y
*
TILE_SIZE
+
tgx
;
atom_add
(&forceBuffers[offset], (long) (localData[tgx].fx*0x100000000));
atom_add
(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (localData[tgx].fy*0x100000000));
atom_add
(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (localData[tgx].fz*0x100000000));
atom_add
(&global_bornForce[offset], (long) (localData[tgx].fw*0x100000000));
ATOMIC_ADD
(
&
forceBuffers
[
offset
],
(
mm_
long
)
(
localData
[
tgx
].
fx
*
0x100000000
));
ATOMIC_ADD
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
(
mm_
long
)
(
localData
[
tgx
].
fy
*
0x100000000
));
ATOMIC_ADD
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
(
mm_
long
)
(
localData
[
tgx
].
fz
*
0x100000000
));
ATOMIC_ADD
(
&
global_bornForce
[
offset
],
(
mm_
long
)
(
localData
[
tgx
].
fw
*
0x100000000
));
#else
unsigned int offset = y*TILE_SIZE+tgx +
get_group_id(0)
*PADDED_NUM_ATOMS;
unsigned
int
offset
=
y
*
TILE_SIZE
+
tgx
+
GROUP_ID
*
PADDED_NUM_ATOMS
;
real4
f
=
forceBuffers
[
offset
];
f
.
x
+=
localData
[
tgx
].
fx
;
f
.
y
+=
localData
[
tgx
].
fy
;
...
...
@@ -602,15 +602,15 @@ __kernel void computeGBSAForce1(
unsigned
int
numTiles
=
interactionCount
[
0
];
if
(
numTiles
>
maxTiles
)
return
;
// There wasn't enough memory for the neighbor list.
int pos = (int) (
get_group_id(0)
*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : numTiles)/
get_num_groups(0)
);
int end = (int) ((
get_group_id(0)
+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : numTiles)/
get_num_groups(0)
);
int
pos
=
(
int
)
(
GROUP_ID
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
((
mm_
long
)
NUM_BLOCKS
+
1
)
/
2
:
numTiles
)
/
NUM_GROUPS
);
int
end
=
(
int
)
((
GROUP_ID
+
1
)
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
((
mm_
long
)
NUM_BLOCKS
+
1
)
/
2
:
numTiles
)
/
NUM_GROUPS
);
#else
int pos = (int) (
get_group_id(0)*(long)numTiles/get_num_groups(0)
);
int end = (int) ((
get_group_id(0)
+1)*(long)numTiles/
get_num_groups(0)
);
int
pos
=
(
int
)
(
GROUP_ID
*
(
mm_long
)
numTiles
/
NUM_GROUPS
);
int
end
=
(
int
)
((
GROUP_ID
+
1
)
*
(
mm_
long
)
numTiles
/
NUM_GROUPS
);
#endif
int
nextToSkip
=
-
1
;
int
currentSkipIndex
=
0
;
__local
int atomIndices[TILE_SIZE];
LOCAL
int
atomIndices
[
TILE_SIZE
];
while
(
pos
<
end
)
{
bool
includeTile
=
true
;
...
...
@@ -679,15 +679,15 @@ __kernel void computeGBSAForce1(
}
for
(
unsigned
int
tgx
=
0
;
tgx
<
TILE_SIZE
;
tgx
++
)
{
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
real4
force
=
0
;
real4
force
=
make_real4
(
0
)
;
real4
posq1
=
posq
[
atom1
];
real
charge1
=
charge
[
atom1
];
APPLY_PERIODIC_TO_POS_WITH_CENTER
(
posq1
,
blockCenterX
)
float
bornRadius1
=
global_bornRadii
[
atom1
];
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
real3
pos2
=
(
real3
)
(
localData[j].x,
localData[j].y,
localData[j].z
)
;
real3
pos2
=
make_
real3
(
localData
[
j
].
x
,
localData
[
j
].
y
,
localData
[
j
].
z
);
real
charge2
=
localData
[
j
].
q
;
real
4
delta
=
(
real
4
)
(
pos2
-
posq1.xyz,
0
)
;
real
3
delta
=
make_
real
3
(
pos2
.
x
-
posq1
.
x
,
pos2
.
y
-
posq1
.
y
,
pos2
.
z
-
posq1
.
z
);
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
int
atom2
=
atomIndices
[
j
];
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
&&
r2
<
CUTOFF_SQUARED
)
{
...
...
@@ -709,8 +709,10 @@ __kernel void computeGBSAForce1(
tempEnergy
-=
scaledChargeProduct
/
CUTOFF
;
#endif
energy
+=
tempEnergy
;
delta.xyz
*=
dEdR
;
force.xyz
-=
delta.xyz
;
delta
*=
dEdR
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
z
-=
delta
.
z
;
localData
[
j
].
fx
+=
delta
.
x
;
localData
[
j
].
fy
+=
delta
.
y
;
localData
[
j
].
fz
+=
delta
.
z
;
...
...
@@ -721,13 +723,13 @@ __kernel void computeGBSAForce1(
// Write results for atom1.
#ifdef SUPPORTS_64_BIT_ATOMICS
atom_add
(
&forceBuffers[atom1],
(
long
)
(
force.x*0x100000000
))
;
atom_add
(
&forceBuffers[atom1+PADDED_NUM_ATOMS],
(
long
)
(
force.y*0x100000000
))
;
atom_add
(
&forceBuffers[atom1+2*PADDED_NUM_ATOMS],
(
long
)
(
force.z*0x100000000
))
;
atom_add
(
&global_bornForce[atom1],
(
long
)
(
force.w*0x100000000
))
;
ATOMIC_ADD
(
&
forceBuffers
[
atom1
],
(
mm_
long
)
(
force
.
x
*
0x100000000
));
ATOMIC_ADD
(
&
forceBuffers
[
atom1
+
PADDED_NUM_ATOMS
],
(
mm_
long
)
(
force
.
y
*
0x100000000
));
ATOMIC_ADD
(
&
forceBuffers
[
atom1
+
2
*
PADDED_NUM_ATOMS
],
(
mm_
long
)
(
force
.
z
*
0x100000000
));
ATOMIC_ADD
(
&
global_bornForce
[
atom1
],
(
mm_
long
)
(
force
.
w
*
0x100000000
));
#else
unsigned
int
offset
=
atom1
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
forceBuffers[offset]
.xyz
=
forceBuffers[offset].xyz+
force.
xyz
;
unsigned
int
offset
=
atom1
+
GROUP_ID
*
PADDED_NUM_ATOMS
;
forceBuffers
[
offset
]
+=
make_real4
(
force
.
x
,
force
.
y
,
force
.
z
,
0
)
;
global_bornForce
[
offset
]
+=
force
.
w
;
#endif
}
...
...
@@ -739,14 +741,14 @@ __kernel void computeGBSAForce1(
for
(
unsigned
int
tgx
=
0
;
tgx
<
TILE_SIZE
;
tgx
++
)
{
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
real4
force
=
0
;
real4
force
=
make_real4
(
0
)
;
real4
posq1
=
posq
[
atom1
];
real
charge1
=
charge
[
atom1
];
float
bornRadius1
=
global_bornRadii
[
atom1
];
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
real3
pos2
=
(
real3
)
(
localData[j].x,
localData[j].y,
localData[j].z
)
;
real3
pos2
=
make_
real3
(
localData
[
j
].
x
,
localData
[
j
].
y
,
localData
[
j
].
z
);
real
charge2
=
localData
[
j
].
q
;
real
4
delta
=
(
real
4
)
(
pos2
-
posq1.xyz,
0
)
;
real
3
delta
=
make_
real
3
(
pos2
.
x
-
posq1
.
x
,
pos2
.
y
-
posq1
.
y
,
pos2
.
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
delta
)
#endif
...
...
@@ -775,8 +777,10 @@ __kernel void computeGBSAForce1(
tempEnergy
-=
scaledChargeProduct
/
CUTOFF
;
#endif
energy
+=
tempEnergy
;
delta.xyz
*=
dEdR
;
force.xyz
-=
delta.xyz
;
delta
*=
dEdR
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
z
-=
delta
.
z
;
localData
[
j
].
fx
+=
delta
.
x
;
localData
[
j
].
fy
+=
delta
.
y
;
localData
[
j
].
fz
+=
delta
.
z
;
...
...
@@ -787,13 +791,13 @@ __kernel void computeGBSAForce1(
// Write results for atom1.
#ifdef SUPPORTS_64_BIT_ATOMICS
atom_add
(
&forceBuffers[atom1],
(
long
)
(
force.x*0x100000000
))
;
atom_add
(
&forceBuffers[atom1+PADDED_NUM_ATOMS],
(
long
)
(
force.y*0x100000000
))
;
atom_add
(
&forceBuffers[atom1+2*PADDED_NUM_ATOMS],
(
long
)
(
force.z*0x100000000
))
;
atom_add
(
&global_bornForce[atom1],
(
long
)
(
force.w*0x100000000
))
;
ATOMIC_ADD
(
&
forceBuffers
[
atom1
],
(
mm_
long
)
(
force
.
x
*
0x100000000
));
ATOMIC_ADD
(
&
forceBuffers
[
atom1
+
PADDED_NUM_ATOMS
],
(
mm_
long
)
(
force
.
y
*
0x100000000
));
ATOMIC_ADD
(
&
forceBuffers
[
atom1
+
2
*
PADDED_NUM_ATOMS
],
(
mm_
long
)
(
force
.
z
*
0x100000000
));
ATOMIC_ADD
(
&
global_bornForce
[
atom1
],
(
mm_
long
)
(
force
.
w
*
0x100000000
));
#else
unsigned
int
offset
=
atom1
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
forceBuffers[offset]
.xyz
=
forceBuffers[offset].xyz+
force.
xyz
;
unsigned
int
offset
=
atom1
+
GROUP_ID
*
PADDED_NUM_ATOMS
;
forceBuffers
[
offset
]
+=
make_real4
(
force
.
x
,
force
.
y
,
force
.
z
,
0
)
;
global_bornForce
[
offset
]
+=
force
.
w
;
#endif
}
...
...
@@ -809,12 +813,12 @@ __kernel void computeGBSAForce1(
#endif
if
(
atom2
<
PADDED_NUM_ATOMS
)
{
#ifdef SUPPORTS_64_BIT_ATOMICS
atom_add
(
&forceBuffers[atom2],
(
long
)
(
localData[tgx].fx*0x100000000
))
;
atom_add
(
&forceBuffers[atom2+PADDED_NUM_ATOMS],
(
long
)
(
localData[tgx].fy*0x100000000
))
;
atom_add
(
&forceBuffers[atom2+2*PADDED_NUM_ATOMS],
(
long
)
(
localData[tgx].fz*0x100000000
))
;
atom_add
(
&global_bornForce[atom2],
(
long
)
(
localData[tgx].fw*0x100000000
))
;
ATOMIC_ADD
(
&
forceBuffers
[
atom2
],
(
mm_
long
)
(
localData
[
tgx
].
fx
*
0x100000000
));
ATOMIC_ADD
(
&
forceBuffers
[
atom2
+
PADDED_NUM_ATOMS
],
(
mm_
long
)
(
localData
[
tgx
].
fy
*
0x100000000
));
ATOMIC_ADD
(
&
forceBuffers
[
atom2
+
2
*
PADDED_NUM_ATOMS
],
(
mm_
long
)
(
localData
[
tgx
].
fz
*
0x100000000
));
ATOMIC_ADD
(
&
global_bornForce
[
atom2
],
(
mm_
long
)
(
localData
[
tgx
].
fw
*
0x100000000
));
#else
unsigned
int
offset
=
atom2
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset
=
atom2
+
GROUP_ID
*
PADDED_NUM_ATOMS
;
real4
f
=
forceBuffers
[
offset
];
f
.
x
+=
localData
[
tgx
].
fx
;
f
.
y
+=
localData
[
tgx
].
fy
;
...
...
@@ -827,5 +831,5 @@ __kernel void computeGBSAForce1(
}
pos
++
;
}
energyBuffer[
get_global_id
(
0
)
]
+=
energy
;
energyBuffer
[
GLOBAL_ID
]
+=
energy
;
}
platforms/c
uda
/src/kernels/harmonicAngleForce.c
u
→
platforms/c
ommon
/src/kernels/harmonicAngleForce.c
c
View file @
5a06df78
File moved
platforms/c
uda
/src/kernels/harmonicBondForce.c
u
→
platforms/c
ommon
/src/kernels/harmonicBondForce.c
c
View file @
5a06df78
File moved
platforms/c
uda
/src/kernels/integrationUtilities.c
u
→
platforms/c
ommon
/src/kernels/integrationUtilities.c
c
View file @
5a06df78
/**
* Generate random numbers
*/
extern
"C"
__global__
void
generateRandomNumbers
(
int
numValues
,
float4
*
__restrict__
random
,
uint4
*
__restrict__
seed
)
{
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
KERNEL
void
generateRandomNumbers
(
int
numValues
,
GLOBAL
float4
*
RESTRICT
random
,
GLOBAL
uint4
*
RESTRICT
seed
)
{
int
index
=
GLOBAL_ID
;
uint4
state
=
seed
[
index
];
unsigned
int
carry
=
0
;
while
(
index
<
numValues
)
{
...
...
@@ -63,15 +63,15 @@ extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restri
// Record the values.
random
[
index
]
=
value
;
index
+=
blockDim
.
x
*
gridDim
.
x
;
index
+=
GLOBAL_SIZE
;
}
seed
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
=
state
;
seed
[
GLOBAL_ID
]
=
state
;
}
/**
* Load the position of a particle.
*/
inline
__device__
mixed4
loadPos
(
const
real4
*
__restrict__
posq
,
const
real4
*
__restrict__
posqCorrection
,
int
index
)
{
inline
DEVICE
mixed4
loadPos
(
GLOBAL
const
real4
*
RESTRICT
posq
,
GLOBAL
const
real4
*
RESTRICT
posqCorrection
,
int
index
)
{
#ifdef USE_MIXED_PRECISION
real4
pos1
=
posq
[
index
];
real4
pos2
=
posqCorrection
[
index
];
...
...
@@ -84,7 +84,7 @@ inline __device__ mixed4 loadPos(const real4* __restrict__ posq, const real4* __
/**
* Store the position of a particle.
*/
inline
__device__
void
storePos
(
real4
*
__restrict__
posq
,
real4
*
__restrict__
posqCorrection
,
int
index
,
mixed4
pos
)
{
inline
DEVICE
void
storePos
(
GLOBAL
real4
*
RESTRICT
posq
,
GLOBAL
real4
*
RESTRICT
posqCorrection
,
int
index
,
mixed4
pos
)
{
#ifdef USE_MIXED_PRECISION
posq
[
index
]
=
make_real4
((
real
)
pos
.
x
,
(
real
)
pos
.
y
,
(
real
)
pos
.
z
,
(
real
)
pos
.
w
);
posqCorrection
[
index
]
=
make_real4
(
pos
.
x
-
(
real
)
pos
.
x
,
pos
.
y
-
(
real
)
pos
.
y
,
pos
.
z
-
(
real
)
pos
.
z
,
0
);
...
...
@@ -96,16 +96,24 @@ inline __device__ void storePos(real4* __restrict__ posq, real4* __restrict__ po
/**
* Enforce constraints on SHAKE clusters
*/
extern
"C"
__global__
void
applyShakeToPositions
(
int
numClusters
,
mixed
tol
,
const
real4
*
__restrict__
oldPos
,
real4
*
__restrict__
posCorrection
,
mixed4
*
__restrict__
posDelta
,
const
int4
*
__restrict__
clusterAtoms
,
const
float4
*
__restrict__
clusterParams
)
{
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
KERNEL
void
applyShakeToPositions
(
int
numClusters
,
mixed
tol
,
GLOBAL
const
real4
*
RESTRICT
oldPos
,
GLOBAL
mixed4
*
RESTRICT
posDelta
,
GLOBAL
const
int4
*
RESTRICT
clusterAtoms
,
GLOBAL
const
float4
*
RESTRICT
clusterParams
#ifdef USE_MIXED_PRECISION
,
GLOBAL
const
real4
*
RESTRICT
posqCorrection
#endif
)
{
#ifndef USE_MIXED_PRECISION
GLOBAL
real4
*
posqCorrection
=
0
;
#endif
int
index
=
GLOBAL_ID
;
while
(
index
<
numClusters
)
{
// Load the data for this cluster.
int4
atoms
=
clusterAtoms
[
index
];
float4
params
=
clusterParams
[
index
];
mixed4
pos
=
loadPos
(
oldPos
,
posCorrection
,
atoms
.
x
);
mixed4
pos
=
loadPos
(
oldPos
,
pos
q
Correction
,
atoms
.
x
);
mixed4
xpi
=
posDelta
[
atoms
.
x
];
mixed4
pos1
=
loadPos
(
oldPos
,
posCorrection
,
atoms
.
y
);
mixed4
pos1
=
loadPos
(
oldPos
,
pos
q
Correction
,
atoms
.
y
);
mixed4
xpj1
=
posDelta
[
atoms
.
y
];
mixed4
pos2
=
make_mixed4
(
0
);
mixed4
xpj2
=
make_mixed4
(
0
);
...
...
@@ -114,13 +122,13 @@ extern "C" __global__ void applyShakeToPositions(int numClusters, mixed tol, con
float
d2
=
params
.
z
;
float
invMassPeripheral
=
params
.
w
;
if
(
atoms
.
z
!=
-
1
)
{
pos2
=
loadPos
(
oldPos
,
posCorrection
,
atoms
.
z
);
pos2
=
loadPos
(
oldPos
,
pos
q
Correction
,
atoms
.
z
);
xpj2
=
posDelta
[
atoms
.
z
];
}
mixed4
pos3
=
make_mixed4
(
0
);
mixed4
xpj3
=
make_mixed4
(
0
);
if
(
atoms
.
w
!=
-
1
)
{
pos3
=
loadPos
(
oldPos
,
posCorrection
,
atoms
.
w
);
pos3
=
loadPos
(
oldPos
,
pos
q
Correction
,
atoms
.
w
);
xpj3
=
posDelta
[
atoms
.
w
];
}
...
...
@@ -202,23 +210,31 @@ extern "C" __global__ void applyShakeToPositions(int numClusters, mixed tol, con
posDelta
[
atoms
.
z
]
=
xpj2
;
if
(
atoms
.
w
!=
-
1
)
posDelta
[
atoms
.
w
]
=
xpj3
;
index
+=
blockDim
.
x
*
gridDim
.
x
;
index
+=
GLOBAL_SIZE
;
}
}
/**
* Enforce velocity constraints on SHAKE clusters
*/
extern
"C"
__global__
void
applyShakeToVelocities
(
int
numClusters
,
mixed
tol
,
const
real4
*
__restrict__
oldPos
,
real4
*
__restrict__
posCorrection
,
mixed4
*
__restrict__
posDelta
,
const
int4
*
__restrict__
clusterAtoms
,
const
float4
*
__restrict__
clusterParams
)
{
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
KERNEL
void
applyShakeToVelocities
(
int
numClusters
,
mixed
tol
,
GLOBAL
const
real4
*
RESTRICT
oldPos
,
GLOBAL
mixed4
*
RESTRICT
posDelta
,
GLOBAL
const
int4
*
RESTRICT
clusterAtoms
,
GLOBAL
const
float4
*
RESTRICT
clusterParams
#ifdef USE_MIXED_PRECISION
,
GLOBAL
const
real4
*
RESTRICT
posqCorrection
#endif
)
{
#ifndef USE_MIXED_PRECISION
GLOBAL
real4
*
posqCorrection
=
0
;
#endif
int
index
=
GLOBAL_ID
;
while
(
index
<
numClusters
)
{
// Load the data for this cluster.
int4
atoms
=
clusterAtoms
[
index
];
float4
params
=
clusterParams
[
index
];
mixed4
pos
=
loadPos
(
oldPos
,
posCorrection
,
atoms
.
x
);
mixed4
pos
=
loadPos
(
oldPos
,
pos
q
Correction
,
atoms
.
x
);
mixed4
xpi
=
posDelta
[
atoms
.
x
];
mixed4
pos1
=
loadPos
(
oldPos
,
posCorrection
,
atoms
.
y
);
mixed4
pos1
=
loadPos
(
oldPos
,
pos
q
Correction
,
atoms
.
y
);
mixed4
xpj1
=
posDelta
[
atoms
.
y
];
mixed4
pos2
=
make_mixed4
(
0
);
mixed4
xpj2
=
make_mixed4
(
0
);
...
...
@@ -226,13 +242,13 @@ extern "C" __global__ void applyShakeToVelocities(int numClusters, mixed tol, co
float
avgMass
=
params
.
y
;
float
invMassPeripheral
=
params
.
w
;
if
(
atoms
.
z
!=
-
1
)
{
pos2
=
loadPos
(
oldPos
,
posCorrection
,
atoms
.
z
);
pos2
=
loadPos
(
oldPos
,
pos
q
Correction
,
atoms
.
z
);
xpj2
=
posDelta
[
atoms
.
z
];
}
mixed4
pos3
=
make_mixed4
(
0
);
mixed4
xpj3
=
make_mixed4
(
0
);
if
(
atoms
.
w
!=
-
1
)
{
pos3
=
loadPos
(
oldPos
,
posCorrection
,
atoms
.
w
);
pos3
=
loadPos
(
oldPos
,
pos
q
Correction
,
atoms
.
w
);
xpj3
=
posDelta
[
atoms
.
w
];
}
...
...
@@ -302,25 +318,34 @@ extern "C" __global__ void applyShakeToVelocities(int numClusters, mixed tol, co
posDelta
[
atoms
.
z
]
=
xpj2
;
if
(
atoms
.
w
!=
-
1
)
posDelta
[
atoms
.
w
]
=
xpj3
;
index
+=
blockDim
.
x
*
gridDim
.
x
;
index
+=
GLOBAL_SIZE
;
}
}
/**
* Enforce constraints on SETTLE clusters
*/
extern
"C"
__global__
void
applySettleToPositions
(
int
numClusters
,
mixed
tol
,
const
real4
*
__restrict__
oldPos
,
real4
*
__restrict__
posCorrection
,
mixed4
*
__restrict__
posDelta
,
const
mixed4
*
__restrict__
velm
,
const
int4
*
__restrict__
clusterAtoms
,
const
float2
*
__restrict__
clusterParams
)
{
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
KERNEL
void
applySettleToPositions
(
int
numClusters
,
mixed
tol
,
GLOBAL
const
real4
*
RESTRICT
oldPos
,
GLOBAL
mixed4
*
RESTRICT
posDelta
,
GLOBAL
const
mixed4
*
RESTRICT
velm
,
GLOBAL
const
int4
*
RESTRICT
clusterAtoms
,
GLOBAL
const
float2
*
RESTRICT
clusterParams
#ifdef USE_MIXED_PRECISION
,
GLOBAL
const
real4
*
RESTRICT
posqCorrection
#endif
)
{
#ifndef USE_MIXED_PRECISION
GLOBAL
real4
*
posqCorrection
=
0
;
#endif
int
index
=
GLOBAL_ID
;
while
(
index
<
numClusters
)
{
// Load the data for this cluster.
int4
atoms
=
clusterAtoms
[
index
];
float2
params
=
clusterParams
[
index
];
mixed4
apos0
=
loadPos
(
oldPos
,
posCorrection
,
atoms
.
x
);
mixed4
apos0
=
loadPos
(
oldPos
,
pos
q
Correction
,
atoms
.
x
);
mixed4
xp0
=
posDelta
[
atoms
.
x
];
mixed4
apos1
=
loadPos
(
oldPos
,
posCorrection
,
atoms
.
y
);
mixed4
apos1
=
loadPos
(
oldPos
,
pos
q
Correction
,
atoms
.
y
);
mixed4
xp1
=
posDelta
[
atoms
.
y
];
mixed4
apos2
=
loadPos
(
oldPos
,
posCorrection
,
atoms
.
z
);
mixed4
apos2
=
loadPos
(
oldPos
,
pos
q
Correction
,
atoms
.
z
);
mixed4
xp2
=
posDelta
[
atoms
.
z
];
mixed
m0
=
1
/
velm
[
atoms
.
x
].
w
;
mixed
m1
=
1
/
velm
[
atoms
.
y
].
w
;
...
...
@@ -454,21 +479,30 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co
posDelta
[
atoms
.
x
]
=
xp0
;
posDelta
[
atoms
.
y
]
=
xp1
;
posDelta
[
atoms
.
z
]
=
xp2
;
index
+=
blockDim
.
x
*
gridDim
.
x
;
index
+=
GLOBAL_SIZE
;
}
}
/**
* Enforce velocity constraints on SETTLE clusters
*/
extern
"C"
__global__
void
applySettleToVelocities
(
int
numClusters
,
mixed
tol
,
const
real4
*
__restrict__
oldPos
,
real4
*
__restrict__
posCorrection
,
mixed4
*
__restrict__
posDelta
,
mixed4
*
__restrict__
velm
,
const
int4
*
__restrict__
clusterAtoms
,
const
float2
*
__restrict__
clusterParams
)
{
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
numClusters
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
KERNEL
void
applySettleToVelocities
(
int
numClusters
,
mixed
tol
,
GLOBAL
const
real4
*
RESTRICT
oldPos
,
GLOBAL
mixed4
*
RESTRICT
posDelta
,
GLOBAL
mixed4
*
RESTRICT
velm
,
GLOBAL
const
int4
*
RESTRICT
clusterAtoms
,
GLOBAL
const
float2
*
RESTRICT
clusterParams
#ifdef USE_MIXED_PRECISION
,
GLOBAL
const
real4
*
RESTRICT
posqCorrection
#endif
)
{
#ifndef USE_MIXED_PRECISION
GLOBAL
real4
*
posqCorrection
=
0
;
#endif
for
(
int
index
=
GLOBAL_ID
;
index
<
numClusters
;
index
+=
GLOBAL_SIZE
)
{
// Load the data for this cluster.
int4
atoms
=
clusterAtoms
[
index
];
mixed4
apos0
=
loadPos
(
oldPos
,
posCorrection
,
atoms
.
x
);
mixed4
apos1
=
loadPos
(
oldPos
,
posCorrection
,
atoms
.
y
);
mixed4
apos2
=
loadPos
(
oldPos
,
posCorrection
,
atoms
.
z
);
mixed4
apos0
=
loadPos
(
oldPos
,
pos
q
Correction
,
atoms
.
x
);
mixed4
apos1
=
loadPos
(
oldPos
,
pos
q
Correction
,
atoms
.
y
);
mixed4
apos2
=
loadPos
(
oldPos
,
pos
q
Correction
,
atoms
.
z
);
mixed4
v0
=
velm
[
atoms
.
x
];
mixed4
v1
=
velm
[
atoms
.
y
];
mixed4
v2
=
velm
[
atoms
.
z
];
...
...
@@ -522,9 +556,16 @@ extern "C" __global__ void applySettleToVelocities(int numClusters, mixed tol, c
/**
* Compute the direction each CCMA constraint is pointing in. This is called once at the beginning of constraint evaluation.
*/
extern
"C"
__global__
void
computeCCMAConstraintDirections
(
const
int2
*
__restrict__
constraintAtoms
,
mixed4
*
__restrict__
constraintDistance
,
const
real4
*
__restrict__
atomPositions
,
const
real4
*
__restrict__
posqCorrection
,
int
*
__restrict__
converged
)
{
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
NUM_CCMA_CONSTRAINTS
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
KERNEL
void
computeCCMAConstraintDirections
(
GLOBAL
const
int2
*
RESTRICT
constraintAtoms
,
GLOBAL
mixed4
*
RESTRICT
constraintDistance
,
GLOBAL
const
real4
*
RESTRICT
atomPositions
,
GLOBAL
int
*
RESTRICT
converged
#ifdef USE_MIXED_PRECISION
,
GLOBAL
const
real4
*
RESTRICT
posqCorrection
#endif
)
{
#ifndef USE_MIXED_PRECISION
GLOBAL
real4
*
posqCorrection
=
0
;
#endif
for
(
int
index
=
GLOBAL_ID
;
index
<
NUM_CCMA_CONSTRAINTS
;
index
+=
GLOBAL_SIZE
)
{
// Compute the direction for this constraint.
int2
atoms
=
constraintAtoms
[
index
];
...
...
@@ -536,7 +577,7 @@ extern "C" __global__ void computeCCMAConstraintDirections(const int2* __restric
dir
.
z
=
oldPos1
.
z
-
oldPos2
.
z
;
constraintDistance
[
index
]
=
dir
;
}
if
(
threadIdx
.
x
==
0
&&
blockIdx
.
x
==
0
)
{
if
(
GLOBAL_ID
==
0
)
{
converged
[
0
]
=
1
;
converged
[
1
]
=
0
;
}
...
...
@@ -545,23 +586,24 @@ extern "C" __global__ void computeCCMAConstraintDirections(const int2* __restric
/**
* Compute the force applied by each CCMA position constraint.
*/
extern
"C"
__global__
void
computeCCMAPositionConstraintForce
(
const
int2
*
__restrict__
constraintAtoms
,
const
mixed4
*
__restrict__
constraintDistance
,
const
mixed4
*
__restrict__
atomPositions
,
const
mixed
*
__restrict__
reducedMass
,
mixed
*
__restrict__
delta1
,
int
*
__restrict__
converged
,
int
*
__restrict__
hostConvergedFlag
,
mixed
tol
,
int
iteration
)
{
__shared__
int
groupConverged
;
KERNEL
void
computeCCMAPositionConstraintForce
(
GLOBAL
const
int2
*
RESTRICT
constraintAtoms
,
GLOBAL
const
mixed4
*
RESTRICT
constraintDistance
,
GLOBAL
const
mixed4
*
RESTRICT
atomPositions
,
GLOBAL
const
mixed
*
RESTRICT
reducedMass
,
GLOBAL
mixed
*
RESTRICT
delta1
,
GLOBAL
int
*
RESTRICT
converged
,
GLOBAL
int
*
RESTRICT
hostConvergedFlag
,
mixed
tol
,
int
iteration
)
{
LOCAL
int
groupConverged
;
if
(
converged
[
1
-
iteration
%
2
])
{
if
(
blockIdx
.
x
==
0
&&
threadIdx
.
x
==
0
)
{
if
(
GLOBAL_ID
==
0
)
{
converged
[
iteration
%
2
]
=
1
;
hostConvergedFlag
[
0
]
=
1
;
}
return
;
// The constraint iteration has already converged.
}
if
(
threadIdx
.
x
==
0
)
if
(
LOCAL_ID
==
0
)
groupConverged
=
1
;
__syncthreads
()
;
SYNC_THREADS
;
mixed
lowerTol
=
1
-
2
*
tol
+
tol
*
tol
;
mixed
upperTol
=
1
+
2
*
tol
+
tol
*
tol
;
bool
threadConverged
=
true
;
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
NUM_CCMA_CONSTRAINTS
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
for
(
int
index
=
GLOBAL_ID
;
index
<
NUM_CCMA_CONSTRAINTS
;
index
+=
GLOBAL_SIZE
)
{
// Compute the force due to this constraint.
int2
atoms
=
constraintAtoms
[
index
];
...
...
@@ -580,28 +622,29 @@ extern "C" __global__ void computeCCMAPositionConstraintForce(const int2* __rest
}
if
(
groupConverged
&&
!
threadConverged
)
groupConverged
=
0
;
__syncthreads
()
;
if
(
threadIdx
.
x
==
0
&&
!
groupConverged
)
SYNC_THREADS
;
if
(
LOCAL_ID
==
0
&&
!
groupConverged
)
converged
[
iteration
%
2
]
=
0
;
}
/**
* Compute the force applied by each CCMA velocity constraint.
*/
extern
"C"
__global__
void
computeCCMAVelocityConstraintForce
(
const
int2
*
__restrict__
constraintAtoms
,
const
mixed4
*
__restrict__
constraintDistance
,
const
mixed4
*
__restrict__
atomPositions
,
const
mixed
*
__restrict__
reducedMass
,
mixed
*
__restrict__
delta1
,
int
*
__restrict__
converged
,
int
*
__restrict__
hostConvergedFlag
,
mixed
tol
,
int
iteration
)
{
__shared__
int
groupConverged
;
KERNEL
void
computeCCMAVelocityConstraintForce
(
GLOBAL
const
int2
*
RESTRICT
constraintAtoms
,
GLOBAL
const
mixed4
*
RESTRICT
constraintDistance
,
GLOBAL
const
mixed4
*
RESTRICT
atomPositions
,
GLOBAL
const
mixed
*
RESTRICT
reducedMass
,
GLOBAL
mixed
*
RESTRICT
delta1
,
GLOBAL
int
*
RESTRICT
converged
,
GLOBAL
int
*
RESTRICT
hostConvergedFlag
,
mixed
tol
,
int
iteration
)
{
LOCAL
int
groupConverged
;
if
(
converged
[
1
-
iteration
%
2
])
{
if
(
blockIdx
.
x
==
0
&&
threadIdx
.
x
==
0
)
{
if
(
GROUP_ID
==
0
&&
LOCAL_ID
==
0
)
{
converged
[
iteration
%
2
]
=
1
;
hostConvergedFlag
[
0
]
=
1
;
}
return
;
// The constraint iteration has already converged.
}
if
(
threadIdx
.
x
==
0
)
if
(
LOCAL_ID
==
0
)
groupConverged
=
1
;
__syncthreads
()
;
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
NUM_CCMA_CONSTRAINTS
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
SYNC_THREADS
;
for
(
int
index
=
GLOBAL_ID
;
index
<
NUM_CCMA_CONSTRAINTS
;
index
+=
GLOBAL_SIZE
)
{
// Compute the force due to this constraint.
int2
atoms
=
constraintAtoms
[
index
];
...
...
@@ -623,14 +666,14 @@ extern "C" __global__ void computeCCMAVelocityConstraintForce(const int2* __rest
/**
* Multiply the vector of CCMA constraint forces by the constraint matrix.
*/
extern
"C"
__global__
void
multiplyByCCMAConstraintMatrix
(
const
mixed
*
__restrict__
delta1
,
mixed
*
__restrict__
delta2
,
const
int
*
__restrict__
constraintMatrixColumn
,
const
mixed
*
__restrict__
constraintMatrixValue
,
const
int
*
__restrict__
converged
,
int
iteration
)
{
KERNEL
void
multiplyByCCMAConstraintMatrix
(
GLOBAL
const
mixed
*
RESTRICT
delta1
,
GLOBAL
mixed
*
RESTRICT
delta2
,
GLOBAL
const
int
*
RESTRICT
constraintMatrixColumn
,
GLOBAL
const
mixed
*
RESTRICT
constraintMatrixValue
,
GLOBAL
const
int
*
RESTRICT
converged
,
int
iteration
)
{
if
(
converged
[
iteration
%
2
])
return
;
// The constraint iteration has already converged.
// Multiply by the inverse constraint matrix.
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
NUM_CCMA_CONSTRAINTS
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
for
(
int
index
=
GLOBAL_ID
;
index
<
NUM_CCMA_CONSTRAINTS
;
index
+=
GLOBAL_SIZE
)
{
mixed
sum
=
0
;
for
(
int
i
=
0
;
;
i
++
)
{
int
element
=
index
+
i
*
NUM_CCMA_CONSTRAINTS
;
...
...
@@ -646,14 +689,15 @@ extern "C" __global__ void multiplyByCCMAConstraintMatrix(const mixed* __restric
/**
* Update the atom positions based on CCMA constraint forces.
*/
extern
"C"
__global__
void
updateCCMAAtomPositions
(
const
int
*
__restrict__
numAtomConstraints
,
const
int
*
__restrict__
atomConstraints
,
const
mixed4
*
__restrict__
constraintDistance
,
mixed4
*
__restrict__
atomPositions
,
const
mixed4
*
__restrict__
velm
,
const
mixed
*
__restrict__
delta1
,
const
mixed
*
__restrict__
delta2
,
int
*
__restrict__
converged
,
int
iteration
)
{
if
(
blockIdx
.
x
==
0
&&
threadIdx
.
x
==
0
)
KERNEL
void
updateCCMAAtomPositions
(
GLOBAL
const
int
*
RESTRICT
numAtomConstraints
,
GLOBAL
const
int
*
RESTRICT
atomConstraints
,
GLOBAL
const
mixed4
*
RESTRICT
constraintDistance
,
GLOBAL
mixed4
*
RESTRICT
atomPositions
,
GLOBAL
const
mixed4
*
RESTRICT
velm
,
GLOBAL
const
mixed
*
RESTRICT
delta1
,
GLOBAL
const
mixed
*
RESTRICT
delta2
,
GLOBAL
int
*
RESTRICT
converged
,
int
iteration
)
{
if
(
GROUP_ID
==
0
&&
LOCAL_ID
==
0
)
converged
[
1
-
iteration
%
2
]
=
1
;
if
(
converged
[
iteration
%
2
])
return
;
// The constraint iteration has already converged.
mixed
damping
=
(
iteration
<
2
?
0.5
f
:
1.0
f
);
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
NUM_ATOMS
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
for
(
int
index
=
GLOBAL_ID
;
index
<
NUM_ATOMS
;
index
+=
GLOBAL_SIZE
)
{
// Compute the new position of this atom.
mixed4
atomPos
=
atomPositions
[
index
];
...
...
@@ -677,16 +721,17 @@ extern "C" __global__ void updateCCMAAtomPositions(const int* __restrict__ numAt
/**
* Compute the positions of virtual sites
*/
extern
"C"
__global__
void
computeVirtualSites
(
real4
*
__restrict__
posq
,
real4
*
__restrict__
posqCorrection
,
const
int4
*
__restrict__
avg2Atoms
,
const
real2
*
__restrict__
avg2Weights
,
const
int4
*
__restrict__
avg3Atoms
,
const
real4
*
__restrict__
avg3Weights
,
const
int4
*
__restrict__
outOfPlaneAtoms
,
const
real4
*
__restrict__
outOfPlaneWeights
,
const
int
*
__restrict__
localCoordsIndex
,
const
int
*
__restrict__
localCoordsAtoms
,
const
real
*
__restrict__
localCoordsWeights
,
const
real4
*
__restrict__
localCoordsPos
,
const
int
*
__restrict__
localCoordsStartIndex
)
{
KERNEL
void
computeVirtualSites
(
GLOBAL
real4
*
RESTRICT
posq
,
GLOBAL
real4
*
RESTRICT
posqCorrection
,
GLOBAL
const
int4
*
RESTRICT
avg2Atoms
,
GLOBAL
const
real2
*
RESTRICT
avg2Weights
,
GLOBAL
const
int4
*
RESTRICT
avg3Atoms
,
GLOBAL
const
real4
*
RESTRICT
avg3Weights
,
GLOBAL
const
int4
*
RESTRICT
outOfPlaneAtoms
,
GLOBAL
const
real4
*
RESTRICT
outOfPlaneWeights
,
GLOBAL
const
int
*
RESTRICT
localCoordsIndex
,
GLOBAL
const
int
*
RESTRICT
localCoordsAtoms
,
GLOBAL
const
real
*
RESTRICT
localCoordsWeights
,
GLOBAL
const
real4
*
RESTRICT
localCoordsPos
,
GLOBAL
const
int
*
RESTRICT
localCoordsStartIndex
)
{
// Two particle average sites.
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
NUM_2_AVERAGE
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
for
(
int
index
=
GLOBAL_ID
;
index
<
NUM_2_AVERAGE
;
index
+=
GLOBAL_SIZE
)
{
int4
atoms
=
avg2Atoms
[
index
];
real2
weights
=
avg2Weights
[
index
];
mixed4
pos
=
loadPos
(
posq
,
posqCorrection
,
atoms
.
x
);
...
...
@@ -700,7 +745,7 @@ extern "C" __global__ void computeVirtualSites(real4* __restrict__ posq, real4*
// Three particle average sites.
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
NUM_3_AVERAGE
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
for
(
int
index
=
GLOBAL_ID
;
index
<
NUM_3_AVERAGE
;
index
+=
GLOBAL_SIZE
)
{
int4
atoms
=
avg3Atoms
[
index
];
real4
weights
=
avg3Weights
[
index
];
mixed4
pos
=
loadPos
(
posq
,
posqCorrection
,
atoms
.
x
);
...
...
@@ -715,7 +760,7 @@ extern "C" __global__ void computeVirtualSites(real4* __restrict__ posq, real4*
// Out of plane sites.
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
NUM_OUT_OF_PLANE
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
for
(
int
index
=
GLOBAL_ID
;
index
<
NUM_OUT_OF_PLANE
;
index
+=
GLOBAL_SIZE
)
{
int4
atoms
=
outOfPlaneAtoms
[
index
];
real4
weights
=
outOfPlaneWeights
[
index
];
mixed4
pos
=
loadPos
(
posq
,
posqCorrection
,
atoms
.
x
);
...
...
@@ -724,7 +769,7 @@ extern "C" __global__ void computeVirtualSites(real4* __restrict__ posq, real4*
mixed4
pos3
=
loadPos
(
posq
,
posqCorrection
,
atoms
.
w
);
mixed4
v12
=
pos2
-
pos1
;
mixed4
v13
=
pos3
-
pos1
;
mixed
3
cr
=
cross
(
v12
,
v13
);
mixed
4
cr
=
cross
(
v12
,
v13
);
pos
.
x
=
pos1
.
x
+
v12
.
x
*
weights
.
x
+
v13
.
x
*
weights
.
y
+
cr
.
x
*
weights
.
z
;
pos
.
y
=
pos1
.
y
+
v12
.
y
*
weights
.
x
+
v13
.
y
*
weights
.
y
+
cr
.
y
*
weights
.
z
;
pos
.
z
=
pos1
.
z
+
v12
.
z
*
weights
.
x
+
v13
.
z
*
weights
.
y
+
cr
.
z
*
weights
.
z
;
...
...
@@ -733,7 +778,7 @@ extern "C" __global__ void computeVirtualSites(real4* __restrict__ posq, real4*
// Local coordinates sites.
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
NUM_LOCAL_COORDS
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
for
(
int
index
=
GLOBAL_ID
;
index
<
NUM_LOCAL_COORDS
;
index
+=
GLOBAL_SIZE
)
{
int
siteAtomIndex
=
localCoordsIndex
[
index
];
int
start
=
localCoordsStartIndex
[
index
];
int
end
=
localCoordsStartIndex
[
index
+
1
];
...
...
@@ -762,32 +807,38 @@ extern "C" __global__ void computeVirtualSites(real4* __restrict__ posq, real4*
}
}
inline
__device__
real3
loadForce
(
int
index
,
long
long
*
__restrict__
force
)
{
inline
DEVICE
real3
loadForce
(
int
index
,
GLOBAL
const
mm_long
*
RESTRICT
force
)
{
real
scale
=
1
/
((
real
)
0x100000000
);
return
make_real3
(
scale
*
force
[
index
],
scale
*
force
[
index
+
PADDED_NUM_ATOMS
],
scale
*
force
[
index
+
PADDED_NUM_ATOMS
*
2
]);
}
inline
__device__
void
addForce
(
int
index
,
long
long
*
__restrict__
force
,
real3
value
)
{
unsigned
long
long
*
f
=
(
unsigned
long
long
*
)
force
;
atomicAdd
(
&
f
[
index
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
value
.
x
*
0x100000000
)));
atomicAdd
(
&
f
[
index
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
value
.
y
*
0x100000000
)));
atomicAdd
(
&
f
[
index
+
PADDED_NUM_ATOMS
*
2
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
value
.
z
*
0x100000000
)));
inline
DEVICE
void
addForce
(
int
index
,
GLOBAL
mm_long
*
RESTRICT
force
,
real3
value
)
{
GLOBAL
mm_ulong
*
f
=
(
GLOBAL
mm_ulong
*
)
force
;
#ifdef HAS_OVERLAPPING_VSITES
ATOMIC_ADD
(
&
f
[
index
],
(
mm_ulong
)
((
mm_long
)
(
value
.
x
*
0x100000000
)));
ATOMIC_ADD
(
&
f
[
index
+
PADDED_NUM_ATOMS
],
(
mm_ulong
)
((
mm_long
)
(
value
.
y
*
0x100000000
)));
ATOMIC_ADD
(
&
f
[
index
+
PADDED_NUM_ATOMS
*
2
],
(
mm_ulong
)
((
mm_long
)
(
value
.
z
*
0x100000000
)));
#else
f
[
index
]
+=
(
mm_ulong
)
((
mm_long
)
(
value
.
x
*
0x100000000
));
f
[
index
+
PADDED_NUM_ATOMS
]
+=
(
mm_ulong
)
((
mm_long
)
(
value
.
y
*
0x100000000
));
f
[
index
+
PADDED_NUM_ATOMS
*
2
]
+=
(
mm_ulong
)
((
mm_long
)
(
value
.
z
*
0x100000000
));
#endif
}
/**
* Distribute forces from virtual sites to the atoms they are based on.
*/
extern
"C"
__global__
void
distributeVirtualSiteForces
(
const
real4
*
__restrict__
posq
,
const
real4
*
__restrict__
posqCorrection
,
long
long
*
__restrict__
force
,
const
int4
*
__restrict__
avg2Atoms
,
const
real2
*
__restrict__
avg2Weights
,
const
int4
*
__restrict__
avg3Atoms
,
const
real4
*
__restrict__
avg3Weights
,
const
int4
*
__restrict__
outOfPlaneAtoms
,
const
real4
*
__restrict__
outOfPlaneWeights
,
const
int
*
__restrict__
localCoordsIndex
,
const
int
*
__restrict__
localCoordsAtoms
,
const
real
*
__restrict__
localCoordsWeights
,
const
real4
*
__restrict__
localCoordsPos
,
const
int
*
__restrict__
localCoordsStartIndex
)
{
KERNEL
void
distributeVirtualSiteForces
(
GLOBAL
const
real4
*
RESTRICT
posq
,
GLOBAL
const
real4
*
RESTRICT
posqCorrection
,
GLOBAL
mm_long
*
RESTRICT
force
,
GLOBAL
const
int4
*
RESTRICT
avg2Atoms
,
GLOBAL
const
real2
*
RESTRICT
avg2Weights
,
GLOBAL
const
int4
*
RESTRICT
avg3Atoms
,
GLOBAL
const
real4
*
RESTRICT
avg3Weights
,
GLOBAL
const
int4
*
RESTRICT
outOfPlaneAtoms
,
GLOBAL
const
real4
*
RESTRICT
outOfPlaneWeights
,
GLOBAL
const
int
*
RESTRICT
localCoordsIndex
,
GLOBAL
const
int
*
RESTRICT
localCoordsAtoms
,
GLOBAL
const
real
*
RESTRICT
localCoordsWeights
,
GLOBAL
const
real4
*
RESTRICT
localCoordsPos
,
GLOBAL
const
int
*
RESTRICT
localCoordsStartIndex
)
{
// Two particle average sites.
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
NUM_2_AVERAGE
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
for
(
int
index
=
GLOBAL_ID
;
index
<
NUM_2_AVERAGE
;
index
+=
GLOBAL_SIZE
)
{
int4
atoms
=
avg2Atoms
[
index
];
real2
weights
=
avg2Weights
[
index
];
real3
f
=
loadForce
(
atoms
.
x
,
force
);
...
...
@@ -797,7 +848,7 @@ extern "C" __global__ void distributeVirtualSiteForces(const real4* __restrict__
// Three particle average sites.
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
NUM_3_AVERAGE
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
for
(
int
index
=
GLOBAL_ID
;
index
<
NUM_3_AVERAGE
;
index
+=
GLOBAL_SIZE
)
{
int4
atoms
=
avg3Atoms
[
index
];
real4
weights
=
avg3Weights
[
index
];
real3
f
=
loadForce
(
atoms
.
x
,
force
);
...
...
@@ -808,7 +859,7 @@ extern "C" __global__ void distributeVirtualSiteForces(const real4* __restrict__
// Out of plane sites.
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
NUM_OUT_OF_PLANE
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
for
(
int
index
=
GLOBAL_ID
;
index
<
NUM_OUT_OF_PLANE
;
index
+=
GLOBAL_SIZE
)
{
int4
atoms
=
outOfPlaneAtoms
[
index
];
real4
weights
=
outOfPlaneWeights
[
index
];
mixed4
pos1
=
loadPos
(
posq
,
posqCorrection
,
atoms
.
y
);
...
...
@@ -830,7 +881,7 @@ extern "C" __global__ void distributeVirtualSiteForces(const real4* __restrict__
// Local coordinates sites.
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
NUM_LOCAL_COORDS
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
for
(
int
index
=
GLOBAL_ID
;
index
<
NUM_LOCAL_COORDS
;
index
+=
GLOBAL_SIZE
)
{
int
siteAtomIndex
=
localCoordsIndex
[
index
];
int
start
=
localCoordsStartIndex
[
index
];
int
end
=
localCoordsStartIndex
[
index
+
1
];
...
...
@@ -884,12 +935,22 @@ extern "C" __global__ void distributeVirtualSiteForces(const real4* __restrict__
}
}
/**
* Copy the distributed forces from the long buffer back to the float buffer.
*/
KERNEL
void
saveDistributedForces
(
GLOBAL
const
mm_long
*
RESTRICT
longForces
,
GLOBAL
real4
*
RESTRICT
forces
)
{
for
(
int
index
=
GLOBAL_ID
;
index
<
NUM_ATOMS
;
index
+=
GLOBAL_SIZE
)
{
real3
f
=
loadForce
(
index
,
longForces
);
forces
[
index
]
=
make_real4
(
f
.
x
,
f
.
y
,
f
.
z
,
0
);
}
}
/**
* Apply a time shift to the velocities before computing kinetic energy.
*/
extern
"C"
__global__
void
timeShiftVelocities
(
mixed4
*
__restrict__
velm
,
const
long
long
*
__restrict__
force
,
real
timeShift
)
{
KERNEL
void
timeShiftVelocities
(
GLOBAL
mixed4
*
RESTRICT
velm
,
GLOBAL
const
mm_
long
*
RESTRICT
force
,
real
timeShift
)
{
const
mixed
scale
=
timeShift
/
(
mixed
)
0x100000000
;
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
NUM_ATOMS
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
for
(
int
index
=
GLOBAL_ID
;
index
<
NUM_ATOMS
;
index
+=
GLOBAL_SIZE
)
{
mixed4
velocity
=
velm
[
index
];
if
(
velocity
.
w
!=
0.0
)
{
velocity
.
x
+=
scale
*
force
[
index
]
*
velocity
.
w
;
...
...
platforms/c
uda
/src/kernels/langevin.c
u
→
platforms/c
ommon
/src/kernels/langevin.c
c
View file @
5a06df78
...
...
@@ -4,13 +4,13 @@ enum {VelScale, ForceScale, NoiseScale, MaxParams};
* Perform the first step of Langevin integration.
*/
extern
"C"
__global__
void
integrateLangevinPart1
(
int
numAtoms
,
int
paddedNumAtoms
,
mixed4
*
__restrict__
velm
,
const
long
long
*
__restrict__
force
,
mixed4
*
__restrict__
posDelta
,
const
mixed
*
__restrict__
paramBuffer
,
const
mixed2
*
__restrict__
dt
,
const
float4
*
__restrict__
random
,
unsigned
int
randomIndex
)
{
KERNEL
void
integrateLangevinPart1
(
int
numAtoms
,
int
paddedNumAtoms
,
GLOBAL
mixed4
*
RESTRICT
velm
,
GLOBAL
const
mm_
long
*
RESTRICT
force
,
GLOBAL
mixed4
*
RESTRICT
posDelta
,
GLOBAL
const
mixed
*
RESTRICT
paramBuffer
,
GLOBAL
const
mixed2
*
RESTRICT
dt
,
GLOBAL
const
float4
*
RESTRICT
random
,
unsigned
int
randomIndex
)
{
mixed
vscale
=
paramBuffer
[
VelScale
];
mixed
fscale
=
paramBuffer
[
ForceScale
]
/
(
mixed
)
0x100000000
;
mixed
noisescale
=
paramBuffer
[
NoiseScale
];
mixed
stepSize
=
dt
[
0
].
y
;
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
int
index
=
GLOBAL_ID
;
randomIndex
+=
index
;
while
(
index
<
numAtoms
)
{
mixed4
velocity
=
velm
[
index
];
...
...
@@ -22,8 +22,8 @@ extern "C" __global__ void integrateLangevinPart1(int numAtoms, int paddedNumAto
velm
[
index
]
=
velocity
;
posDelta
[
index
]
=
make_mixed4
(
stepSize
*
velocity
.
x
,
stepSize
*
velocity
.
y
,
stepSize
*
velocity
.
z
,
0
);
}
randomIndex
+=
blockDim
.
x
*
gridDim
.
x
;
index
+=
blockDim
.
x
*
gridDim
.
x
;
randomIndex
+=
GLOBAL_SIZE
;
index
+=
GLOBAL_SIZE
;
}
}
...
...
@@ -31,14 +31,18 @@ extern "C" __global__ void integrateLangevinPart1(int numAtoms, int paddedNumAto
* Perform the second step of Langevin integration.
*/
extern
"C"
__global__
void
integrateLangevinPart2
(
int
numAtoms
,
real4
*
__restrict__
posq
,
real4
*
__restrict__
posqCorrection
,
const
mixed4
*
__restrict__
posDelta
,
mixed4
*
__restrict__
velm
,
const
mixed2
*
__restrict__
dt
)
{
#if __CUDA_ARCH__ >= 130
KERNEL
void
integrateLangevinPart2
(
int
numAtoms
,
GLOBAL
real4
*
RESTRICT
posq
,
GLOBAL
const
mixed4
*
RESTRICT
posDelta
,
GLOBAL
mixed4
*
RESTRICT
velm
,
GLOBAL
const
mixed2
*
RESTRICT
dt
#ifdef USE_MIXED_PRECISION
,
GLOBAL
real4
*
RESTRICT
posqCorrection
#endif
)
{
#ifdef SUPPORTS_DOUBLE_PRECISION
double
invStepSize
=
1.0
/
dt
[
0
].
y
;
#else
float
invStepSize
=
1.0
f
/
dt
[
0
].
y
;
float
correction
=
(
1.0
f
-
invStepSize
*
dt
[
0
].
y
)
/
dt
[
0
].
y
;
#endif
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
int
index
=
GLOBAL_ID
;
while
(
index
<
numAtoms
)
{
mixed4
vel
=
velm
[
index
];
if
(
vel
.
w
!=
0
)
{
...
...
@@ -53,7 +57,7 @@ extern "C" __global__ void integrateLangevinPart2(int numAtoms, real4* __restric
pos
.
x
+=
delta
.
x
;
pos
.
y
+=
delta
.
y
;
pos
.
z
+=
delta
.
z
;
#if
__CUDA_ARCH__ >= 130
#if
def SUPPORTS_DOUBLE_PRECISION
vel
.
x
=
(
mixed
)
(
invStepSize
*
delta
.
x
);
vel
.
y
=
(
mixed
)
(
invStepSize
*
delta
.
y
);
vel
.
z
=
(
mixed
)
(
invStepSize
*
delta
.
z
);
...
...
@@ -70,7 +74,7 @@ extern "C" __global__ void integrateLangevinPart2(int numAtoms, real4* __restric
#endif
velm
[
index
]
=
vel
;
}
index
+=
blockDim
.
x
*
gridDim
.
x
;
index
+=
GLOBAL_SIZE
;
}
}
...
...
@@ -78,32 +82,30 @@ extern "C" __global__ void integrateLangevinPart2(int numAtoms, real4* __restric
* Select the step size to use for the next step.
*/
extern
"C"
__global__
void
selectLangevinStepSize
(
int
numAtoms
,
int
paddedNumAtoms
,
mixed
maxStepSize
,
mixed
errorTol
,
mixed
friction
,
mixed
kT
,
mixed2
*
__restrict__
dt
,
const
mixed4
*
__restrict__
velm
,
const
long
long
*
__restrict__
force
,
mixed
*
__restrict__
paramBuffer
)
{
KERNEL
void
selectLangevinStepSize
(
int
numAtoms
,
int
paddedNumAtoms
,
mixed
maxStepSize
,
mixed
errorTol
,
mixed
friction
,
mixed
kT
,
GLOBAL
mixed2
*
RESTRICT
dt
,
GLOBAL
const
mixed4
*
RESTRICT
velm
,
GLOBAL
const
mm_
long
*
RESTRICT
force
,
GLOBAL
mixed
*
RESTRICT
paramBuffer
)
{
// Calculate the error.
extern
__shared__
mixed
params
[
];
mixed
*
error
=
&
params
[
MaxParams
];
LOCAL
mixed
error
[
256
];
LOCAL
mixed
params
[
MaxParams
];
mixed
err
=
0
;
unsigned
int
index
=
threadIdx
.
x
;
const
mixed
scale
=
RECIP
((
mixed
)
0x100000000
);
while
(
index
<
numAtoms
)
{
for
(
int
index
=
LOCAL_ID
;
index
<
numAtoms
;
index
+=
LOCAL_SIZE
)
{
mixed3
f
=
make_mixed3
(
scale
*
force
[
index
],
scale
*
force
[
index
+
paddedNumAtoms
],
scale
*
force
[
index
+
paddedNumAtoms
*
2
]);
mixed
invMass
=
velm
[
index
].
w
;
err
+=
(
f
.
x
*
f
.
x
+
f
.
y
*
f
.
y
+
f
.
z
*
f
.
z
)
*
invMass
*
invMass
;
index
+=
blockDim
.
x
*
gridDim
.
x
;
}
error
[
threadIdx
.
x
]
=
err
;
__syncthreads
()
;
error
[
LOCAL_ID
]
=
err
;
SYNC_THREADS
;
// Sum the errors from all threads.
for
(
unsigned
int
offset
=
1
;
offset
<
blockDim
.
x
;
offset
*=
2
)
{
if
(
threadIdx
.
x
+
offset
<
blockDim
.
x
&&
(
threadIdx
.
x
&
(
2
*
offset
-
1
))
==
0
)
error
[
threadIdx
.
x
]
+=
error
[
threadIdx
.
x
+
offset
];
__syncthreads
()
;
for
(
unsigned
int
offset
=
1
;
offset
<
LOCAL_SIZE
;
offset
*=
2
)
{
if
(
LOCAL_ID
+
offset
<
LOCAL_SIZE
&&
(
LOCAL_ID
&
(
2
*
offset
-
1
))
==
0
)
error
[
LOCAL_ID
]
+=
error
[
LOCAL_ID
+
offset
];
SYNC_THREADS
;
}
if
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
==
0
)
{
if
(
GLOBAL_ID
==
0
)
{
// Select the new step size.
mixed
totalError
=
SQRT
(
error
[
0
]
/
(
numAtoms
*
3
));
...
...
@@ -126,7 +128,7 @@ extern "C" __global__ void selectLangevinStepSize(int numAtoms, int paddedNumAto
params
[
ForceScale
]
=
fscale
;
params
[
NoiseScale
]
=
noisescale
;
}
__syncthreads
()
;
if
(
threadIdx
.
x
<
MaxParams
)
paramBuffer
[
threadIdx
.
x
]
=
params
[
threadIdx
.
x
];
SYNC_THREADS
;
if
(
LOCAL_ID
<
MaxParams
)
paramBuffer
[
LOCAL_ID
]
=
params
[
LOCAL_ID
];
}
platforms/c
uda
/src/kernels/
baoab
.c
u
→
platforms/c
ommon
/src/kernels/
langevinMiddle
.c
c
View file @
5a06df78
enum
{
VelScale
,
NoiseScale
};
/**
* Perform the first part of
BAOAB
integration: velocity
half step, then position half
step.
* Perform the first part of integration: velocity step.
*/
extern
"C"
__global__
void
integrateBAOABPart1
(
int
numAtoms
,
int
paddedNumAtoms
,
mixed4
*
__restrict__
velm
,
const
long
long
*
__restrict__
force
,
mixed4
*
__restrict__
posDelta
,
mixed4
*
__restrict__
oldDelta
,
const
mixed2
*
__restrict__
dt
)
{
mixed
halfdt
=
0.5
*
dt
[
0
].
y
;
mixed
fscale
=
halfdt
/
(
mixed
)
0x100000000
;
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
numAtoms
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
KERNEL
void
integrateLangevinMiddlePart1
(
int
numAtoms
,
int
paddedNumAtoms
,
GLOBAL
mixed4
*
RESTRICT
velm
,
GLOBAL
const
mm_long
*
RESTRICT
force
,
GLOBAL
const
mixed2
*
RESTRICT
dt
)
{
mixed
fscale
=
dt
[
0
].
y
/
(
mixed
)
0x100000000
;
for
(
int
index
=
GLOBAL_ID
;
index
<
numAtoms
;
index
+=
GLOBAL_SIZE
)
{
mixed4
velocity
=
velm
[
index
];
if
(
velocity
.
w
!=
0.0
)
{
velocity
.
x
+=
fscale
*
velocity
.
w
*
force
[
index
];
velocity
.
y
+=
fscale
*
velocity
.
w
*
force
[
index
+
paddedNumAtoms
];
velocity
.
z
+=
fscale
*
velocity
.
w
*
force
[
index
+
paddedNumAtoms
*
2
];
velm
[
index
]
=
velocity
;
mixed4
delta
=
make_mixed4
(
halfdt
*
velocity
.
x
,
halfdt
*
velocity
.
y
,
halfdt
*
velocity
.
z
,
0
);
posDelta
[
index
]
=
delta
;
oldDelta
[
index
]
=
delta
;
}
}
}
/**
* Perform the second part of
BAOAB
integration:
apply constraint forces to velocities
, then interact with heat bath,
* then position half step.
* Perform the second part of integration:
position half step
, then interact with heat bath,
* then
another
position half step.
*/
extern
"C"
__global__
void
integrateBAOABPart2
(
int
numAtoms
,
real4
*
__restrict__
posq
,
real4
*
__restrict__
posqCorrection
,
mixed4
*
__restrict__
velm
,
mixed4
*
__restrict__
posDelta
,
mixed4
*
__restrict__
oldDelta
,
const
mixed
*
__restrict__
paramBuffer
,
const
mixed2
*
__restrict__
dt
,
const
float4
*
__restrict__
random
,
unsigned
int
randomIndex
)
{
KERNEL
void
integrateLangevinMiddlePart2
(
int
numAtoms
,
GLOBAL
mixed4
*
RESTRICT
velm
,
GLOBAL
mixed4
*
RESTRICT
posDelta
,
GLOBAL
mixed4
*
RESTRICT
oldDelta
,
GLOBAL
const
mixed
*
RESTRICT
paramBuffer
,
GLOBAL
const
mixed2
*
RESTRICT
dt
,
GLOBAL
const
float4
*
RESTRICT
random
,
unsigned
int
randomIndex
)
{
mixed
vscale
=
paramBuffer
[
VelScale
];
mixed
noisescale
=
paramBuffer
[
NoiseScale
];
mixed
halfdt
=
0.5
*
dt
[
0
].
y
;
mixed
invHalfdt
=
1
/
halfdt
;
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
int
index
=
GLOBAL_ID
;
randomIndex
+=
index
;
while
(
index
<
numAtoms
)
{
mixed4
velocity
=
velm
[
index
];
if
(
velocity
.
w
!=
0.0
)
{
mixed4
delta
=
posDelta
[
index
]
;
mixed4
delta
=
make_mixed4
(
halfdt
*
velocity
.
x
,
halfdt
*
velocity
.
y
,
halfdt
*
velocity
.
z
,
0
)
;
mixed
sqrtInvMass
=
SQRT
(
velocity
.
w
);
velocity
.
x
+=
(
delta
.
x
-
oldDelta
[
index
].
x
)
*
invHalfdt
;
velocity
.
y
+=
(
delta
.
y
-
oldDelta
[
index
].
y
)
*
invHalfdt
;
velocity
.
z
+=
(
delta
.
z
-
oldDelta
[
index
].
z
)
*
invHalfdt
;
velocity
.
x
=
vscale
*
velocity
.
x
+
noisescale
*
sqrtInvMass
*
random
[
randomIndex
].
x
;
velocity
.
y
=
vscale
*
velocity
.
y
+
noisescale
*
sqrtInvMass
*
random
[
randomIndex
].
y
;
velocity
.
z
=
vscale
*
velocity
.
z
+
noisescale
*
sqrtInvMass
*
random
[
randomIndex
].
z
;
velm
[
index
]
=
velocity
;
#ifdef USE_MIXED_PRECISION
real4
pos1
=
posq
[
index
];
real4
pos2
=
posqCorrection
[
index
];
mixed4
pos
=
make_mixed4
(
pos1
.
x
+
(
mixed
)
pos2
.
x
,
pos1
.
y
+
(
mixed
)
pos2
.
y
,
pos1
.
z
+
(
mixed
)
pos2
.
z
,
pos1
.
w
);
#else
real4
pos
=
posq
[
index
];
#endif
pos
.
x
+=
delta
.
x
;
pos
.
y
+=
delta
.
y
;
pos
.
z
+=
delta
.
z
;
#ifdef USE_MIXED_PRECISION
posq
[
index
]
=
make_real4
((
real
)
pos
.
x
,
(
real
)
pos
.
y
,
(
real
)
pos
.
z
,
(
real
)
pos
.
w
);
posqCorrection
[
index
]
=
make_real4
(
pos
.
x
-
(
real
)
pos
.
x
,
pos
.
y
-
(
real
)
pos
.
y
,
pos
.
z
-
(
real
)
pos
.
z
,
0
);
#else
posq
[
index
]
=
pos
;
#endif
delta
=
make_mixed4
(
halfdt
*
velocity
.
x
,
halfdt
*
velocity
.
y
,
halfdt
*
velocity
.
z
,
0
);
delta
+=
make_mixed4
(
halfdt
*
velocity
.
x
,
halfdt
*
velocity
.
y
,
halfdt
*
velocity
.
z
,
0
);
posDelta
[
index
]
=
delta
;
oldDelta
[
index
]
=
delta
;
}
randomIndex
+=
blockDim
.
x
*
gridDim
.
x
;
index
+=
blockDim
.
x
*
gridDim
.
x
;
randomIndex
+=
GLOBAL_SIZE
;
index
+=
GLOBAL_SIZE
;
}
}
/**
* Perform the third part of
BAOAB
integration: apply constraint forces to velocities, then record
* the constrained positions
in preparation for computing forces
.
* Perform the third part of integration: apply constraint forces to velocities, then record
* the constrained positions.
*/
extern
"C"
__global__
void
integrateBAOABPart3
(
int
numAtoms
,
real4
*
__restrict__
posq
,
real4
*
__restrict__
posqCorrection
,
mixed4
*
__restrict__
velm
,
mixed4
*
__restrict__
posDelta
,
mixed4
*
__restrict__
oldDelta
,
const
mixed2
*
__restrict__
dt
)
{
mixed
halfdt
=
0.5
*
dt
[
0
].
y
;
mixed
invHalfdt
=
1
/
halfdt
;
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
numAtoms
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
KERNEL
void
integrateLangevinMiddlePart3
(
int
numAtoms
,
GLOBAL
real4
*
RESTRICT
posq
,
GLOBAL
mixed4
*
RESTRICT
velm
,
GLOBAL
mixed4
*
RESTRICT
posDelta
,
GLOBAL
mixed4
*
RESTRICT
oldDelta
,
GLOBAL
const
mixed2
*
RESTRICT
dt
#ifdef USE_MIXED_PRECISION
,
GLOBAL
real4
*
RESTRICT
posqCorrection
#endif
)
{
mixed
invDt
=
1
/
dt
[
0
].
y
;
for
(
int
index
=
GLOBAL_ID
;
index
<
numAtoms
;
index
+=
GLOBAL_SIZE
)
{
mixed4
velocity
=
velm
[
index
];
if
(
velocity
.
w
!=
0.0
)
{
mixed4
delta
=
posDelta
[
index
];
velocity
.
x
+=
(
delta
.
x
-
oldDelta
[
index
].
x
)
*
inv
Halfd
t
;
velocity
.
y
+=
(
delta
.
y
-
oldDelta
[
index
].
y
)
*
inv
Halfd
t
;
velocity
.
z
+=
(
delta
.
z
-
oldDelta
[
index
].
z
)
*
inv
Halfd
t
;
velocity
.
x
+=
(
delta
.
x
-
oldDelta
[
index
].
x
)
*
inv
D
t
;
velocity
.
y
+=
(
delta
.
y
-
oldDelta
[
index
].
y
)
*
inv
D
t
;
velocity
.
z
+=
(
delta
.
z
-
oldDelta
[
index
].
z
)
*
inv
D
t
;
velm
[
index
]
=
velocity
;
#ifdef USE_MIXED_PRECISION
real4
pos1
=
posq
[
index
];
...
...
@@ -108,22 +88,3 @@ extern "C" __global__ void integrateBAOABPart3(int numAtoms, real4* __restrict__
}
}
}
/**
* Perform the fourth part of BAOAB integration: velocity half step.
*/
extern
"C"
__global__
void
integrateBAOABPart4
(
int
numAtoms
,
int
paddedNumAtoms
,
mixed4
*
__restrict__
velm
,
const
long
long
*
__restrict__
force
,
const
mixed2
*
__restrict__
dt
)
{
mixed
halfdt
=
0.5
*
dt
[
0
].
y
;
mixed
fscale
=
halfdt
/
(
mixed
)
0x100000000
;
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
numAtoms
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
mixed4
velocity
=
velm
[
index
];
if
(
velocity
.
w
!=
0.0
)
{
velocity
.
x
+=
fscale
*
velocity
.
w
*
force
[
index
];
velocity
.
y
+=
fscale
*
velocity
.
w
*
force
[
index
+
paddedNumAtoms
];
velocity
.
z
+=
fscale
*
velocity
.
w
*
force
[
index
+
paddedNumAtoms
*
2
];
velm
[
index
]
=
velocity
;
}
}
}
platforms/c
uda
/src/kernels/periodicTorsionForce.c
u
→
platforms/c
ommon
/src/kernels/periodicTorsionForce.c
c
View file @
5a06df78
File moved
platforms/c
uda
/src/kernels/rbTorsionForce.c
u
→
platforms/c
ommon
/src/kernels/rbTorsionForce.c
c
View file @
5a06df78
File moved
platforms/
opencl
/src/kernels/removeCM.c
l
→
platforms/
common
/src/kernels/removeCM.c
c
View file @
5a06df78
...
...
@@ -2,111 +2,79 @@
* Calculate the center of mass momentum.
*/
__kernel
void
calcCenterOfMassMomentum
(
int
numAtoms,
__global
const
mixed4*
restrict
velm,
__global
float
4
*
restrict
cmMomentum
,
__local
volatile
float4*
restrict
temp
)
{
int
index
=
get_global_id
(
0
)
;
float
4
cm
=
0.0f
;
while
(
index
<
numAtoms
)
{
KERNEL
void
calcCenterOfMassMomentum
(
int
numAtoms
,
GLOBAL
const
mixed4
*
RESTRICT
velm
,
GLOBAL
float
3
*
RESTRICT
cmMomentum
)
{
LOCAL
float3
temp
[
64
]
;
float
3
cm
=
make_float3
(
0
,
0
,
0
)
;
for
(
int
index
=
GLOBAL_ID
;
index
<
numAtoms
;
index
+=
GLOBAL_SIZE
)
{
mixed4
velocity
=
velm
[
index
];
if
(
velocity
.
w
!=
0
)
{
cm.x
+=
velocity.x/velocity.w
;
cm.y
+=
velocity.y/velocity.w
;
cm.z
+=
velocity.z/velocity.w
;
mixed
mass
=
RECIP
(
velocity
.
w
);
cm
.
x
+=
(
float
)
(
velocity
.
x
*
mass
);
cm
.
y
+=
(
float
)
(
velocity
.
y
*
mass
);
cm
.
z
+=
(
float
)
(
velocity
.
z
*
mass
);
}
index
+=
get_global_size
(
0
)
;
}
// Sum the threads in this group.
int
thread
=
get_local_id
(
0
)
;
int
thread
=
LOCAL_ID
;
temp
[
thread
]
=
cm
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
#
ifdef
WARPS_ARE_ATOMIC
if
(
thread
<
32
)
{
temp[thread]
+=
temp[thread+32]
;
if
(
thread
<
16
)
temp[thread]
+=
temp[thread+16]
;
if
(
thread
<
8
)
temp[thread]
+=
temp[thread+8]
;
if
(
thread
<
4
)
temp[thread]
+=
temp[thread+4]
;
if
(
thread
<
2
)
temp[thread]
+=
temp[thread+2]
;
}
#
else
SYNC_THREADS
;
if
(
thread
<
32
)
temp
[
thread
]
+=
temp
[
thread
+
32
];
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
SYNC_THREADS
;
if
(
thread
<
16
)
temp
[
thread
]
+=
temp
[
thread
+
16
];
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
SYNC_THREADS
;
if
(
thread
<
8
)
temp
[
thread
]
+=
temp
[
thread
+
8
];
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
SYNC_THREADS
;
if
(
thread
<
4
)
temp
[
thread
]
+=
temp
[
thread
+
4
];
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
SYNC_THREADS
;
if
(
thread
<
2
)
temp
[
thread
]
+=
temp
[
thread
+
2
];
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
#
endif
SYNC_THREADS
;
if
(
thread
==
0
)
cmMomentum[
get_group_id
(
0
)
]
=
temp[thread]+temp[thread+1]
;
cmMomentum
[
GROUP_ID
]
=
temp
[
thread
]
+
temp
[
thread
+
1
];
}
/**
* Remove center of mass motion.
*/
__kernel
void
removeCenterOfMassMomentum
(
unsigned
int
numAtoms,
__global
mixed4*
restrict
velm,
__global
const
float
4
*
restrict
cmMomentum
,
__local
volatile
float4*
restrict
temp
)
{
KERNEL
void
removeCenterOfMassMomentum
(
int
numAtoms
,
GLOBAL
mixed4
*
RESTRICT
velm
,
GLOBAL
const
float
3
*
RESTRICT
cmMomentum
)
{
// First sum all of the momenta that were calculated by individual groups.
unsigned
int
index
=
get_local_id
(
0
)
;
float
4
cm
=
0.0f
;
while
(
index
<
get_num_groups
(
0
))
{
LOCAL
float3
temp
[
64
]
;
float
3
cm
=
make_float3
(
0
,
0
,
0
)
;
for
(
int
index
=
LOCAL_ID
;
index
<
NUM_GROUPS
;
index
+=
LOCAL_SIZE
)
cm
+=
cmMomentum
[
index
];
index
+=
get_local_size
(
0
)
;
}
int
thread
=
get_local_id
(
0
)
;
int
thread
=
LOCAL_ID
;
temp
[
thread
]
=
cm
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
#
ifdef
WARPS_ARE_ATOMIC
if
(
thread
<
32
)
{
temp[thread]
+=
temp[thread+32]
;
if
(
thread
<
16
)
temp[thread]
+=
temp[thread+16]
;
if
(
thread
<
8
)
temp[thread]
+=
temp[thread+8]
;
if
(
thread
<
4
)
temp[thread]
+=
temp[thread+4]
;
if
(
thread
<
2
)
temp[thread]
+=
temp[thread+2]
;
}
#
else
SYNC_THREADS
;
if
(
thread
<
32
)
temp
[
thread
]
+=
temp
[
thread
+
32
];
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
SYNC_THREADS
;
if
(
thread
<
16
)
temp
[
thread
]
+=
temp
[
thread
+
16
];
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
SYNC_THREADS
;
if
(
thread
<
8
)
temp
[
thread
]
+=
temp
[
thread
+
8
];
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
SYNC_THREADS
;
if
(
thread
<
4
)
temp
[
thread
]
+=
temp
[
thread
+
4
];
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
SYNC_THREADS
;
if
(
thread
<
2
)
temp
[
thread
]
+=
temp
[
thread
+
2
];
#
endif
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
cm
=
(
float4
)
(
INVERSE_TOTAL_MASS*
(
temp[0].x+temp[1].x
)
,
INVERSE_TOTAL_MASS*
(
temp[0].y+temp[1].y
)
,
INVERSE_TOTAL_MASS*
(
temp[0].z+temp[1].z
)
,
0
)
;
SYNC_THREADS
;
cm
=
make_float3
(
INVERSE_TOTAL_MASS
*
(
temp
[
0
].
x
+
temp
[
1
].
x
),
INVERSE_TOTAL_MASS
*
(
temp
[
0
].
y
+
temp
[
1
].
y
),
INVERSE_TOTAL_MASS
*
(
temp
[
0
].
z
+
temp
[
1
].
z
));
// Now remove the center of mass velocity from each atom.
index
=
get_global_id
(
0
)
;
while
(
index
<
numAtoms
)
{
for
(
int
index
=
GLOBAL_ID
;
index
<
numAtoms
;
index
+=
GLOBAL_SIZE
)
{
velm
[
index
].
x
-=
cm
.
x
;
velm
[
index
].
y
-=
cm
.
y
;
velm
[
index
].
z
-=
cm
.
z
;
index
+=
get_global_size
(
0
)
;
}
}
platforms/c
uda
/src/kernels/rmsd.c
u
→
platforms/c
ommon
/src/kernels/rmsd.c
c
View file @
5a06df78
...
...
@@ -4,20 +4,20 @@
/**
* Sum a value over all threads.
*/
__device__
real
reduceValue
(
real
value
,
volatile
real
*
temp
)
{
const
int
thread
=
threadIdx
.
x
;
__syncthreads
()
;
DEVICE
real
reduceValue
(
real
value
,
LOCAL_ARG
volatile
real
*
temp
)
{
const
int
thread
=
LOCAL_ID
;
SYNC_THREADS
;
temp
[
thread
]
=
value
;
__syncthreads
()
;
for
(
unsigned
int
step
=
1
;
step
<
32
;
step
*=
2
)
{
if
(
thread
+
step
<
blockDim
.
x
&&
thread
%
(
2
*
step
)
==
0
)
SYNC_THREADS
;
for
(
int
step
=
1
;
step
<
32
;
step
*=
2
)
{
if
(
thread
+
step
<
LOCAL_SIZE
&&
thread
%
(
2
*
step
)
==
0
)
temp
[
thread
]
=
temp
[
thread
]
+
temp
[
thread
+
step
];
SYNC_WARPS
SYNC_WARPS
;
}
for
(
unsigned
int
step
=
32
;
step
<
blockDim
.
x
;
step
*=
2
)
{
if
(
thread
+
step
<
blockDim
.
x
&&
thread
%
(
2
*
step
)
==
0
)
for
(
int
step
=
32
;
step
<
LOCAL_SIZE
;
step
*=
2
)
{
if
(
thread
+
step
<
LOCAL_SIZE
&&
thread
%
(
2
*
step
)
==
0
)
temp
[
thread
]
=
temp
[
thread
]
+
temp
[
thread
+
step
];
__syncthreads
()
;
SYNC_THREADS
;
}
return
temp
[
0
];
}
...
...
@@ -25,14 +25,14 @@ __device__ real reduceValue(real value, volatile real* temp) {
/**
* Perform the first step of computing the RMSD. This is executed as a single work group.
*/
extern
"C"
__global__
void
computeRMSDPart1
(
int
numParticles
,
const
real4
*
__restrict__
posq
,
const
real4
*
__restrict__
referencePos
,
const
int
*
__restrict__
particles
,
real
*
buffer
)
{
extern
__shared__
volatile
real
temp
[];
KERNEL
void
computeRMSDPart1
(
int
numParticles
,
GLOBAL
const
real4
*
RESTRICT
posq
,
GLOBAL
const
real4
*
RESTRICT
referencePos
,
GLOBAL
const
int
*
RESTRICT
particles
,
GLOBAL
real
*
buffer
)
{
LOCAL
volatile
real
temp
[
THREAD_BLOCK_SIZE
];
// Compute the center of the particle positions.
real3
center
=
make_real3
(
0
);
for
(
int
i
=
threadIdx
.
x
;
i
<
numParticles
;
i
+=
blockDim
.
x
)
for
(
int
i
=
LOCAL_ID
;
i
<
numParticles
;
i
+=
LOCAL_SIZE
)
center
+=
trimTo3
(
posq
[
particles
[
i
]]);
center
.
x
=
reduceValue
(
center
.
x
,
temp
)
/
numParticles
;
center
.
y
=
reduceValue
(
center
.
y
,
temp
)
/
numParticles
;
...
...
@@ -42,7 +42,7 @@ extern "C" __global__ void computeRMSDPart1(int numParticles, const real4* __res
real
R
[
3
][
3
]
=
{{
0
,
0
,
0
},
{
0
,
0
,
0
},
{
0
,
0
,
0
}};
real
sum
=
0
;
for
(
int
i
=
threadIdx
.
x
;
i
<
numParticles
;
i
+=
blockDim
.
x
)
{
for
(
int
i
=
LOCAL_ID
;
i
<
numParticles
;
i
+=
LOCAL_SIZE
)
{
int
index
=
particles
[
i
];
real3
pos
=
trimTo3
(
posq
[
index
])
-
center
;
real3
refPos
=
trimTo3
(
referencePos
[
index
]);
...
...
@@ -64,7 +64,7 @@ extern "C" __global__ void computeRMSDPart1(int numParticles, const real4* __res
// Copy everything into the output buffer to send back to the host.
if
(
threadIdx
.
x
==
0
)
{
if
(
LOCAL_ID
==
0
)
{
for
(
int
i
=
0
;
i
<
3
;
i
++
)
for
(
int
j
=
0
;
j
<
3
;
j
++
)
buffer
[
3
*
i
+
j
]
=
R
[
i
][
j
];
...
...
@@ -78,11 +78,11 @@ extern "C" __global__ void computeRMSDPart1(int numParticles, const real4* __res
/**
* Apply forces based on the RMSD.
*/
extern
"C"
__global__
void
computeRMSDForces
(
int
numParticles
,
int
paddedNumAtoms
,
const
real4
*
__restrict__
posq
,
const
real4
*
__restrict__
referencePos
,
const
int
*
__restrict__
particles
,
const
real
*
buffer
,
unsigned
long
long
*
__restrict__
forceBuffers
)
{
KERNEL
void
computeRMSDForces
(
int
numParticles
,
int
paddedNumAtoms
,
GLOBAL
const
real4
*
RESTRICT
posq
,
GLOBAL
const
real4
*
RESTRICT
referencePos
,
GLOBAL
const
int
*
RESTRICT
particles
,
GLOBAL
const
real
*
buffer
,
GLOBAL
mm_long
*
RESTRICT
forceBuffers
)
{
real3
center
=
make_real3
(
buffer
[
10
],
buffer
[
11
],
buffer
[
12
]);
real
scale
=
1
/
(
real
)
(
buffer
[
9
]
*
numParticles
);
for
(
int
i
=
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
;
i
<
numParticles
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
for
(
int
i
=
GLOBAL_ID
;
i
<
numParticles
;
i
+=
GLOBAL_SIZE
)
{
int
index
=
particles
[
i
];
real3
pos
=
trimTo3
(
posq
[
index
])
-
center
;
real3
refPos
=
trimTo3
(
referencePos
[
index
]);
...
...
@@ -90,8 +90,8 @@ extern "C" __global__ void computeRMSDForces(int numParticles, int paddedNumAtom
buffer
[
1
]
*
refPos
.
x
+
buffer
[
4
]
*
refPos
.
y
+
buffer
[
7
]
*
refPos
.
z
,
buffer
[
2
]
*
refPos
.
x
+
buffer
[
5
]
*
refPos
.
y
+
buffer
[
8
]
*
refPos
.
z
);
real3
force
=
(
rotatedRef
-
pos
)
*
scale
;
atomicAdd
(
&
forceBuffers
[
index
]
,
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
x
*
0x100000000
)
))
;
atomicAdd
(
&
forceBuffers
[
index
+
paddedNumAtoms
]
,
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
y
*
0x100000000
)
))
;
atomicAdd
(
&
forceBuffers
[
index
+
2
*
paddedNumAtoms
]
,
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
z
*
0x100000000
)
))
;
forceBuffers
[
index
]
+=
(
mm_
long
)
(
force
.
x
*
0x100000000
);
forceBuffers
[
index
+
paddedNumAtoms
]
+=
(
mm_
long
)
(
force
.
y
*
0x100000000
);
forceBuffers
[
index
+
2
*
paddedNumAtoms
]
+=
(
mm_
long
)
(
force
.
z
*
0x100000000
);
}
}
platforms/common/src/kernels/torsionForce.cc
0 → 100644
View file @
5a06df78
const
real
PI
=
(
real
)
3.14159265358979323846
;
real3
v0
=
make_real3
(
pos1
.
x
-
pos2
.
x
,
pos1
.
y
-
pos2
.
y
,
pos1
.
z
-
pos2
.
z
);
real3
v1
=
make_real3
(
pos3
.
x
-
pos2
.
x
,
pos3
.
y
-
pos2
.
y
,
pos3
.
z
-
pos2
.
z
);
real3
v2
=
make_real3
(
pos3
.
x
-
pos4
.
x
,
pos3
.
y
-
pos4
.
y
,
pos3
.
z
-
pos4
.
z
);
#if APPLY_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
v0
)
APPLY_PERIODIC_TO_DELTA
(
v1
)
APPLY_PERIODIC_TO_DELTA
(
v2
)
#endif
real3
cp0
=
cross
(
v0
,
v1
);
real3
cp1
=
cross
(
v1
,
v2
);
real
cosangle
=
dot
(
normalize
(
cp0
),
normalize
(
cp1
));
real
theta
;
if
(
cosangle
>
0.99
f
||
cosangle
<
-
0.99
f
)
{
// We're close to the singularity in acos(), so take the cross product and use asin() instead.
real3
cross_prod
=
cross
(
cp0
,
cp1
);
real
scale
=
dot
(
cp0
,
cp0
)
*
dot
(
cp1
,
cp1
);
theta
=
ASIN
(
SQRT
(
dot
(
cross_prod
,
cross_prod
)
/
scale
));
if
(
cosangle
<
0
)
theta
=
PI
-
theta
;
}
else
theta
=
ACOS
(
cosangle
);
theta
=
(
dot
(
v0
,
cp1
)
>=
0
?
theta
:
-
theta
);
COMPUTE_FORCE
real
normCross1
=
dot
(
cp0
,
cp0
);
real
normSqrBC
=
dot
(
v1
,
v1
);
real
normBC
=
SQRT
(
normSqrBC
);
real
normCross2
=
dot
(
cp1
,
cp1
);
real
dp
=
RECIP
(
normSqrBC
);
real4
ff
=
make_real4
((
-
dEdAngle
*
normBC
)
/
normCross1
,
dot
(
v0
,
v1
)
*
dp
,
dot
(
v2
,
v1
)
*
dp
,
(
dEdAngle
*
normBC
)
/
normCross2
);
real3
force1
=
ff
.
x
*
cp0
;
real3
force4
=
ff
.
w
*
cp1
;
real3
s
=
ff
.
y
*
force1
-
ff
.
z
*
force4
;
real3
force2
=
s
-
force1
;
real3
force3
=
-
s
-
force4
;
platforms/c
uda
/src/kernels/verlet.c
u
→
platforms/c
ommon
/src/kernels/verlet.c
c
View file @
5a06df78
...
...
@@ -2,13 +2,17 @@
* Perform the first step of Verlet integration.
*/
extern
"C"
__global__
void
integrateVerletPart1
(
int
numAtoms
,
int
paddedNumAtoms
,
const
mixed2
*
__restrict__
dt
,
const
real4
*
__restrict__
posq
,
const
real4
*
__restrict__
posqCorrection
,
mixed4
*
__restrict__
velm
,
const
long
long
*
__restrict__
force
,
mixed4
*
__restrict__
posDelta
)
{
KERNEL
void
integrateVerletPart1
(
int
numAtoms
,
int
paddedNumAtoms
,
GLOBAL
const
mixed2
*
RESTRICT
dt
,
GLOBAL
const
real4
*
RESTRICT
posq
,
GLOBAL
mixed4
*
RESTRICT
velm
,
GLOBAL
const
mm_long
*
RESTRICT
force
,
GLOBAL
mixed4
*
RESTRICT
posDelta
#ifdef USE_MIXED_PRECISION
,
GLOBAL
const
real4
*
RESTRICT
posqCorrection
#endif
)
{
const
mixed2
stepSize
=
dt
[
0
];
const
mixed
dtPos
=
stepSize
.
y
;
const
mixed
dtVel
=
0.5
f
*
(
stepSize
.
x
+
stepSize
.
y
);
const
mixed
scale
=
dtVel
/
(
mixed
)
0x100000000
;
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
numAtoms
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
for
(
int
index
=
GLOBAL_ID
;
index
<
numAtoms
;
index
+=
GLOBAL_SIZE
)
{
mixed4
velocity
=
velm
[
index
];
if
(
velocity
.
w
!=
0.0
)
{
#ifdef USE_MIXED_PRECISION
...
...
@@ -34,19 +38,24 @@ extern "C" __global__ void integrateVerletPart1(int numAtoms, int paddedNumAtoms
* Perform the second step of Verlet integration.
*/
extern
"C"
__global__
void
integrateVerletPart2
(
int
numAtoms
,
mixed2
*
__restrict__
dt
,
real4
*
__restrict__
posq
,
real4
*
__restrict__
posqCorrection
,
mixed4
*
__restrict__
velm
,
const
mixed4
*
__restrict__
posDelta
)
{
KERNEL
void
integrateVerletPart2
(
int
numAtoms
,
GLOBAL
mixed2
*
RESTRICT
dt
,
GLOBAL
real4
*
RESTRICT
posq
,
GLOBAL
mixed4
*
RESTRICT
velm
,
GLOBAL
const
mixed4
*
RESTRICT
posDelta
#ifdef USE_MIXED_PRECISION
,
GLOBAL
real4
*
RESTRICT
posqCorrection
#endif
)
{
mixed2
stepSize
=
dt
[
0
];
#if
__CUDA_ARCH__ >= 130
#if
def SUPPORTS_DOUBLE_PRECISION
double
oneOverDt
=
1.0
/
stepSize
.
y
;
#else
float
oneOverDt
=
1.0
f
/
stepSize
.
y
;
float
correction
=
(
1.0
f
-
oneOverDt
*
stepSize
.
y
)
/
stepSize
.
y
;
#endif
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
if
(
index
==
0
)
if
(
GLOBAL_ID
==
0
)
dt
[
0
].
x
=
stepSize
.
y
;
for
(;
index
<
numAtoms
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
SYNC_THREADS
;
int
index
=
GLOBAL_ID
;
for
(;
index
<
numAtoms
;
index
+=
GLOBAL_SIZE
)
{
mixed4
velocity
=
velm
[
index
];
if
(
velocity
.
w
!=
0.0
)
{
#ifdef USE_MIXED_PRECISION
...
...
@@ -60,7 +69,7 @@ extern "C" __global__ void integrateVerletPart2(int numAtoms, mixed2* __restrict
pos
.
x
+=
delta
.
x
;
pos
.
y
+=
delta
.
y
;
pos
.
z
+=
delta
.
z
;
#if
__CUDA_ARCH__ >= 130
#if
def SUPPORTS_DOUBLE_PRECISION
velocity
=
make_mixed4
((
mixed
)
(
delta
.
x
*
oneOverDt
),
(
mixed
)
(
delta
.
y
*
oneOverDt
),
(
mixed
)
(
delta
.
z
*
oneOverDt
),
velocity
.
w
);
#else
velocity
=
make_mixed4
((
mixed
)
(
delta
.
x
*
oneOverDt
+
delta
.
x
*
correction
),
(
mixed
)
(
delta
.
y
*
oneOverDt
+
delta
.
y
*
correction
),
(
mixed
)
(
delta
.
z
*
oneOverDt
+
delta
.
z
*
correction
),
velocity
.
w
);
...
...
@@ -80,28 +89,28 @@ extern "C" __global__ void integrateVerletPart2(int numAtoms, mixed2* __restrict
* Select the step size to use for the next step.
*/
extern
"C"
__global__
void
selectVerletStepSize
(
int
numAtoms
,
int
paddedNumAtoms
,
mixed
maxStepSize
,
mixed
errorTol
,
mixed2
*
__restrict__
dt
,
const
mixed4
*
__restrict__
velm
,
const
long
long
*
__restrict__
force
)
{
KERNEL
void
selectVerletStepSize
(
int
numAtoms
,
int
paddedNumAtoms
,
mixed
maxStepSize
,
mixed
errorTol
,
GLOBAL
mixed2
*
RESTRICT
dt
,
GLOBAL
const
mixed4
*
RESTRICT
velm
,
GLOBAL
const
mm_
long
*
RESTRICT
force
)
{
// Calculate the error.
extern
__shared__
mixed
error
[];
mixed
err
=
0
.0
f
;
LOCAL
mixed
error
[
256
];
mixed
err
=
0
;
const
mixed
scale
=
RECIP
((
mixed
)
0x100000000
);
for
(
int
index
=
threadIdx
.
x
;
index
<
numAtoms
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
for
(
int
index
=
LOCAL_ID
;
index
<
numAtoms
;
index
+=
LOCAL_SIZE
)
{
mixed3
f
=
make_mixed3
(
scale
*
force
[
index
],
scale
*
force
[
index
+
paddedNumAtoms
],
scale
*
force
[
index
+
paddedNumAtoms
*
2
]);
mixed
invMass
=
velm
[
index
].
w
;
err
+=
(
f
.
x
*
f
.
x
+
f
.
y
*
f
.
y
+
f
.
z
*
f
.
z
)
*
invMass
*
invMass
;
}
error
[
threadIdx
.
x
]
=
err
;
__syncthreads
()
;
error
[
LOCAL_ID
]
=
err
;
SYNC_THREADS
;
// Sum the errors from all threads.
for
(
unsigned
int
offset
=
1
;
offset
<
blockDim
.
x
;
offset
*=
2
)
{
if
(
threadIdx
.
x
+
offset
<
blockDim
.
x
&&
(
threadIdx
.
x
&
(
2
*
offset
-
1
))
==
0
)
error
[
threadIdx
.
x
]
+=
error
[
threadIdx
.
x
+
offset
];
__syncthreads
()
;
for
(
unsigned
int
offset
=
1
;
offset
<
LOCAL_SIZE
;
offset
*=
2
)
{
if
(
LOCAL_ID
+
offset
<
LOCAL_SIZE
&&
(
LOCAL_ID
&
(
2
*
offset
-
1
))
==
0
)
error
[
LOCAL_ID
]
+=
error
[
LOCAL_ID
+
offset
];
SYNC_THREADS
;
}
if
(
threadIdx
.
x
==
0
)
{
if
(
LOCAL_ID
==
0
)
{
mixed
totalError
=
SQRT
(
error
[
0
]
/
(
numAtoms
*
3
));
mixed
newStepSize
=
SQRT
(
errorTol
/
totalError
);
mixed
oldStepSize
=
dt
[
0
].
y
;
...
...
platforms/cpu/include/CpuKernels.h
View file @
5a06df78
...
...
@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2013-20
19
Stanford University and the Authors. *
* Portions copyright (c) 2013-20
20
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
...
...
@@ -32,7 +32,6 @@
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include "CpuBAOABDynamics.h"
#include "CpuBondForce.h"
#include "CpuCustomGBForce.h"
#include "CpuCustomManyParticleForce.h"
...
...
@@ -40,6 +39,7 @@
#include "CpuGayBerneForce.h"
#include "CpuGBSAOBCForce.h"
#include "CpuLangevinDynamics.h"
#include "CpuLangevinMiddleDynamics.h"
#include "CpuNeighborList.h"
#include "CpuNonbondedForce.h"
#include "CpuPlatform.h"
...
...
@@ -274,7 +274,7 @@ private:
std
::
vector
<
std
::
vector
<
double
>
>
bonded14ParamArray
;
double
nonbondedCutoff
,
switchingDistance
,
rfDielectric
,
ewaldAlpha
,
ewaldDispersionAlpha
,
ewaldSelfEnergy
,
dispersionCoefficient
;
int
kmax
[
3
],
gridSize
[
3
],
dispersionGridSize
[
3
];
bool
useSwitchingFunction
,
useOptimizedPme
,
hasInitializedPme
,
hasInitializedDispersionPme
,
hasParticleOffsets
,
hasExceptionOffsets
;
bool
useSwitchingFunction
,
exceptionsArePeriodic
,
useOptimizedPme
,
hasInitializedPme
,
hasInitializedDispersionPme
,
hasParticleOffsets
,
hasExceptionOffsets
;
std
::
vector
<
std
::
set
<
int
>
>
exclusions
;
std
::
vector
<
std
::
pair
<
float
,
float
>
>
particleParams
;
std
::
vector
<
float
>
C6params
;
...
...
@@ -538,42 +538,38 @@ private:
};
/**
* This kernel is invoked by
BAOAB
LangevinIntegrator to take one time step.
* This kernel is invoked by Langevin
Middle
Integrator to take one time step.
*/
class
CpuIntegrate
BAOAB
StepKernel
:
public
Integrate
BAOAB
StepKernel
{
class
CpuIntegrate
LangevinMiddle
StepKernel
:
public
Integrate
LangevinMiddle
StepKernel
{
public:
CpuIntegrate
BAOAB
StepKernel
(
std
::
string
name
,
const
Platform
&
platform
,
CpuPlatform
::
PlatformData
&
data
)
:
Integrate
BAOAB
StepKernel
(
name
,
platform
),
CpuIntegrate
LangevinMiddle
StepKernel
(
std
::
string
name
,
const
Platform
&
platform
,
CpuPlatform
::
PlatformData
&
data
)
:
Integrate
LangevinMiddle
StepKernel
(
name
,
platform
),
data
(
data
),
dynamics
(
0
)
{
}
~
CpuIntegrate
BAOAB
StepKernel
();
~
CpuIntegrate
LangevinMiddle
StepKernel
();
/**
* Initialize the kernel, setting up the particle masses.
*
* @param system the System this kernel will be applied to
* @param integrator the
BAOAB
LangevinIntegrator this kernel will be used for
* @param integrator the Langevin
Middle
Integrator this kernel will be used for
*/
void
initialize
(
const
System
&
system
,
const
BAOAB
LangevinIntegrator
&
integrator
);
void
initialize
(
const
System
&
system
,
const
Langevin
Middle
Integrator
&
integrator
);
/**
* Execute the kernel.
*
* @param context the context in which to execute this kernel
* @param integrator the BAOABLangevinIntegrator this kernel is being used for
* @param forcesAreValid if the context has been modified since the last time step, this will be
* false to show that cached forces are invalid and must be recalculated.
* On exit, this should specify whether the cached forces are valid at the
* end of the step.
* @param integrator the LangevinMiddleIntegrator this kernel is being used for
*/
void
execute
(
ContextImpl
&
context
,
const
BAOAB
LangevinIntegrator
&
integrator
,
bool
&
forcesAreValid
);
void
execute
(
ContextImpl
&
context
,
const
Langevin
Middle
Integrator
&
integrator
);
/**
* Compute the kinetic energy.
*
* @param context the context in which to execute this kernel
* @param integrator the
BAOAB
LangevinIntegrator this kernel is being used for
* @param integrator the Langevin
Middle
Integrator this kernel is being used for
*/
double
computeKineticEnergy
(
ContextImpl
&
context
,
const
BAOAB
LangevinIntegrator
&
integrator
);
double
computeKineticEnergy
(
ContextImpl
&
context
,
const
Langevin
Middle
Integrator
&
integrator
);
private:
CpuPlatform
::
PlatformData
&
data
;
Cpu
BAOAB
Dynamics
*
dynamics
;
Cpu
LangevinMiddle
Dynamics
*
dynamics
;
std
::
vector
<
double
>
masses
;
double
prevTemp
,
prevFriction
,
prevStepSize
;
};
...
...
platforms/cpu/include/Cpu
BAOAB
Dynamics.h
→
platforms/cpu/include/Cpu
LangevinMiddle
Dynamics.h
View file @
5a06df78
/* Portions copyright (c) 2013-20
19
Stanford University and Simbios.
/* Portions copyright (c) 2013-20
20
Stanford University and Simbios.
* Authors: Peter Eastman
* Contributors:
*
...
...
@@ -23,17 +23,17 @@
* WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef __CPU_
BAOAB
_DYNAMICS_H__
#define __CPU_
BAOAB
_DYNAMICS_H__
#ifndef __CPU_
LANGEVIN_MIDDLE
_DYNAMICS_H__
#define __CPU_
LANGEVIN_MIDDLE
_DYNAMICS_H__
#include "Reference
BAOAB
Dynamics.h"
#include "Reference
LangevinMiddle
Dynamics.h"
#include "CpuRandom.h"
#include "openmm/internal/ThreadPool.h"
#include "sfmt/SFMT.h"
namespace
OpenMM
{
class
Cpu
BAOAB
Dynamics
:
public
Reference
BAOAB
Dynamics
{
class
Cpu
LangevinMiddle
Dynamics
:
public
Reference
LangevinMiddle
Dynamics
{
public:
/**
* Constructor.
...
...
@@ -45,25 +45,22 @@ public:
* @param threads thread pool for parallelizing computation
* @param random random number generator
*/
Cpu
BAOAB
Dynamics
(
int
numberOfAtoms
,
double
deltaT
,
double
friction
,
double
temperature
,
OpenMM
::
ThreadPool
&
threads
,
OpenMM
::
CpuRandom
&
random
);
Cpu
LangevinMiddle
Dynamics
(
int
numberOfAtoms
,
double
deltaT
,
double
friction
,
double
temperature
,
OpenMM
::
ThreadPool
&
threads
,
OpenMM
::
CpuRandom
&
random
);
/**
* Destructor.
*/
~
Cpu
BAOAB
Dynamics
();
~
Cpu
LangevinMiddle
Dynamics
();
/**
* First update step.
*
* @param numberOfAtoms number of atoms
* @param atomCoordinates atom coordinates
* @param velocities velocities
* @param forces forces
* @param inverseMasses inverse atom masses
* @param xPrime xPrime
*/
void
updatePart1
(
int
numberOfAtoms
,
std
::
vector
<
OpenMM
::
Vec3
>&
atomCoordinates
,
std
::
vector
<
OpenMM
::
Vec3
>&
velocities
,
std
::
vector
<
OpenMM
::
Vec3
>&
forces
,
std
::
vector
<
double
>&
inverseMasses
,
std
::
vector
<
OpenMM
::
Vec3
>&
xPrime
);
void
updatePart1
(
int
numberOfAtoms
,
std
::
vector
<
OpenMM
::
Vec3
>&
velocities
,
std
::
vector
<
OpenMM
::
Vec3
>&
forces
,
std
::
vector
<
double
>&
inverseMasses
);
/**
* Second update step.
...
...
@@ -94,7 +91,6 @@ private:
void
threadUpdate1
(
int
threadIndex
);
void
threadUpdate2
(
int
threadIndex
);
void
threadUpdate3
(
int
threadIndex
);
void
threadUpdate4
(
int
threadIndex
);
OpenMM
::
ThreadPool
&
threads
;
OpenMM
::
CpuRandom
&
random
;
std
::
vector
<
OpenMM_SFMT
::
SFMT
>
threadRandom
;
...
...
@@ -109,4 +105,4 @@ private:
}
// namespace OpenMM
#endif // __CPU_
BAOAB
_DYNAMICS_H__
#endif // __CPU_
LANGEVIN_MIDDLE
_DYNAMICS_H__
Prev
1
2
3
4
5
6
7
8
9
…
17
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment