Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
5a06df78
Commit
5a06df78
authored
Mar 04, 2020
by
tic20
Browse files
Merge
https://github.com/openmm/openmm
parents
8dd60914
a9223eea
Changes
335
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
615 additions
and
2568 deletions
+615
-2568
platforms/cuda/src/kernels/customCentroidBond.cu
platforms/cuda/src/kernels/customCentroidBond.cu
+0
-141
platforms/cuda/src/kernels/customGBEnergyN2.cu
platforms/cuda/src/kernels/customGBEnergyN2.cu
+0
-366
platforms/cuda/src/kernels/customGBEnergyPerParticle.cu
platforms/cuda/src/kernels/customGBEnergyPerParticle.cu
+0
-22
platforms/cuda/src/kernels/customGBValueN2.cu
platforms/cuda/src/kernels/customGBValueN2.cu
+0
-313
platforms/cuda/src/kernels/customGBValuePerParticle.cu
platforms/cuda/src/kernels/customGBValuePerParticle.cu
+0
-18
platforms/cuda/src/kernels/customHbondForce.cu
platforms/cuda/src/kernels/customHbondForce.cu
+0
-234
platforms/cuda/src/kernels/customIntegratorPerDof.cu
platforms/cuda/src/kernels/customIntegratorPerDof.cu
+0
-62
platforms/cuda/src/kernels/customManyParticle.cu
platforms/cuda/src/kernels/customManyParticle.cu
+0
-337
platforms/cuda/src/kernels/customNonbondedGroups.cu
platforms/cuda/src/kernels/customNonbondedGroups.cu
+0
-182
platforms/cuda/src/kernels/gbsaObc1.cu
platforms/cuda/src/kernels/gbsaObc1.cu
+0
-769
platforms/cuda/src/kernels/nonbondedExceptions.cu
platforms/cuda/src/kernels/nonbondedExceptions.cu
+3
-0
platforms/cuda/src/kernels/noseHooverChain.cu
platforms/cuda/src/kernels/noseHooverChain.cu
+172
-0
platforms/cuda/src/kernels/removeCM.cu
platforms/cuda/src/kernels/removeCM.cu
+0
-113
platforms/cuda/src/kernels/utilities.cu
platforms/cuda/src/kernels/utilities.cu
+1
-1
platforms/cuda/src/kernels/vectorOps.cu
platforms/cuda/src/kernels/vectorOps.cu
+12
-4
platforms/cuda/src/kernels/velocityVerlet.cu
platforms/cuda/src/kernels/velocityVerlet.cu
+349
-0
platforms/cuda/staticTarget/CMakeLists.txt
platforms/cuda/staticTarget/CMakeLists.txt
+5
-5
platforms/cuda/tests/TestCudaLangevinMiddleIntegrator.cpp
platforms/cuda/tests/TestCudaLangevinMiddleIntegrator.cpp
+1
-1
platforms/cuda/tests/TestCudaNoseHooverIntegrator.cpp
platforms/cuda/tests/TestCudaNoseHooverIntegrator.cpp
+36
-0
platforms/cuda/tests/TestCudaNoseHooverThermostat.cpp
platforms/cuda/tests/TestCudaNoseHooverThermostat.cpp
+36
-0
No files found.
platforms/cuda/src/kernels/customCentroidBond.cu
deleted
100644 → 0
View file @
8dd60914
/**
* Compute the center of each group.
*/
extern
"C"
__global__
void
computeGroupCenters
(
const
real4
*
__restrict__
posq
,
const
int
*
__restrict__
groupParticles
,
const
real
*
__restrict__
groupWeights
,
const
int
*
__restrict__
groupOffsets
,
real4
*
__restrict__
centerPositions
)
{
__shared__
volatile
real3
temp
[
64
];
for
(
int
group
=
blockIdx
.
x
;
group
<
NUM_GROUPS
;
group
+=
gridDim
.
x
)
{
// The threads in this block work together to compute the center one group.
int
firstIndex
=
groupOffsets
[
group
];
int
lastIndex
=
groupOffsets
[
group
+
1
];
real3
center
=
make_real3
(
0
,
0
,
0
);
for
(
int
index
=
threadIdx
.
x
;
index
<
lastIndex
-
firstIndex
;
index
+=
blockDim
.
x
)
{
int
atom
=
groupParticles
[
firstIndex
+
index
];
real
weight
=
groupWeights
[
firstIndex
+
index
];
real4
pos
=
posq
[
atom
];
center
.
x
+=
weight
*
pos
.
x
;
center
.
y
+=
weight
*
pos
.
y
;
center
.
z
+=
weight
*
pos
.
z
;
}
// Sum the values.
int
thread
=
threadIdx
.
x
;
temp
[
thread
].
x
=
center
.
x
;
temp
[
thread
].
y
=
center
.
y
;
temp
[
thread
].
z
=
center
.
z
;
__syncthreads
();
if
(
thread
<
32
)
{
temp
[
thread
].
x
+=
temp
[
thread
+
32
].
x
;
temp
[
thread
].
y
+=
temp
[
thread
+
32
].
y
;
temp
[
thread
].
z
+=
temp
[
thread
+
32
].
z
;
if
(
thread
<
16
)
{
temp
[
thread
].
x
+=
temp
[
thread
+
16
].
x
;
temp
[
thread
].
y
+=
temp
[
thread
+
16
].
y
;
temp
[
thread
].
z
+=
temp
[
thread
+
16
].
z
;
}
if
(
thread
<
8
)
{
temp
[
thread
].
x
+=
temp
[
thread
+
8
].
x
;
temp
[
thread
].
y
+=
temp
[
thread
+
8
].
y
;
temp
[
thread
].
z
+=
temp
[
thread
+
8
].
z
;
}
if
(
thread
<
4
)
{
temp
[
thread
].
x
+=
temp
[
thread
+
4
].
x
;
temp
[
thread
].
y
+=
temp
[
thread
+
4
].
y
;
temp
[
thread
].
z
+=
temp
[
thread
+
4
].
z
;
}
if
(
thread
<
2
)
{
temp
[
thread
].
x
+=
temp
[
thread
+
2
].
x
;
temp
[
thread
].
y
+=
temp
[
thread
+
2
].
y
;
temp
[
thread
].
z
+=
temp
[
thread
+
2
].
z
;
}
}
if
(
thread
==
0
)
centerPositions
[
group
]
=
make_real4
(
temp
[
0
].
x
+
temp
[
1
].
x
,
temp
[
0
].
y
+
temp
[
1
].
y
,
temp
[
0
].
z
+
temp
[
1
].
z
,
0
);
}
}
/**
* Convert a real4 to a real3 by removing its last element.
*/
inline
__device__
real3
trim
(
real4
v
)
{
return
make_real3
(
v
.
x
,
v
.
y
,
v
.
z
);
}
/**
* Compute the difference between two vectors, setting the fourth component to the squared magnitude.
*/
inline
__device__
real4
delta
(
real4
vec1
,
real4
vec2
,
bool
periodic
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
)
{
real4
result
=
make_real4
(
vec1
.
x
-
vec2
.
x
,
vec1
.
y
-
vec2
.
y
,
vec1
.
z
-
vec2
.
z
,
0
);
if
(
periodic
)
APPLY_PERIODIC_TO_DELTA
(
result
);
result
.
w
=
result
.
x
*
result
.
x
+
result
.
y
*
result
.
y
+
result
.
z
*
result
.
z
;
return
result
;
}
/**
* Compute the angle between two vectors. The w component of each vector should contain the squared magnitude.
*/
__device__
real
computeAngle
(
real4
vec1
,
real4
vec2
)
{
real
dotProduct
=
vec1
.
x
*
vec2
.
x
+
vec1
.
y
*
vec2
.
y
+
vec1
.
z
*
vec2
.
z
;
real
cosine
=
dotProduct
*
RSQRT
(
vec1
.
w
*
vec2
.
w
);
real
angle
;
if
(
cosine
>
0.99
f
||
cosine
<
-
0.99
f
)
{
// We're close to the singularity in acos(), so take the cross product and use asin() instead.
real3
crossProduct
=
cross
(
vec1
,
vec2
);
real
scale
=
vec1
.
w
*
vec2
.
w
;
angle
=
ASIN
(
SQRT
(
dot
(
crossProduct
,
crossProduct
)
/
scale
));
if
(
cosine
<
0.0
f
)
angle
=
M_PI
-
angle
;
}
else
angle
=
ACOS
(
cosine
);
return
angle
;
}
/**
* Compute the cross product of two vectors, setting the fourth component to the squared magnitude.
*/
inline
__device__
real4
computeCross
(
real4
vec1
,
real4
vec2
)
{
real3
cp
=
cross
(
vec1
,
vec2
);
return
make_real4
(
cp
.
x
,
cp
.
y
,
cp
.
z
,
cp
.
x
*
cp
.
x
+
cp
.
y
*
cp
.
y
+
cp
.
z
*
cp
.
z
);
}
/**
* Compute the forces on groups based on the bonds.
*/
extern
"C"
__global__
void
computeGroupForces
(
unsigned
long
long
*
__restrict__
groupForce
,
mixed
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
centerPositions
,
const
int
*
__restrict__
bondGroups
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
EXTRA_ARGS
)
{
mixed
energy
=
0
;
INIT_PARAM_DERIVS
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
NUM_BONDS
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
COMPUTE_FORCE
}
energyBuffer
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
energy
;
SAVE_PARAM_DERIVS
}
/**
* Apply the forces from the group centers to the individual atoms.
*/
extern
"C"
__global__
void
applyForcesToAtoms
(
const
int
*
__restrict__
groupParticles
,
const
real
*
__restrict__
groupWeights
,
const
int
*
__restrict__
groupOffsets
,
const
long
long
*
__restrict__
groupForce
,
unsigned
long
long
*
__restrict__
atomForce
)
{
for
(
int
group
=
blockIdx
.
x
;
group
<
NUM_GROUPS
;
group
+=
gridDim
.
x
)
{
long
long
fx
=
groupForce
[
group
];
long
long
fy
=
groupForce
[
group
+
NUM_GROUPS
];
long
long
fz
=
groupForce
[
group
+
NUM_GROUPS
*
2
];
int
firstIndex
=
groupOffsets
[
group
];
int
lastIndex
=
groupOffsets
[
group
+
1
];
for
(
int
index
=
threadIdx
.
x
;
index
<
lastIndex
-
firstIndex
;
index
+=
blockDim
.
x
)
{
int
atom
=
groupParticles
[
firstIndex
+
index
];
real
weight
=
groupWeights
[
firstIndex
+
index
];
atomicAdd
(
&
atomForce
[
atom
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
fx
*
weight
)));
atomicAdd
(
&
atomForce
[
atom
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
fy
*
weight
)));
atomicAdd
(
&
atomForce
[
atom
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
fz
*
weight
)));
}
}
}
platforms/cuda/src/kernels/customGBEnergyN2.cu
deleted
100644 → 0
View file @
8dd60914
#define STORE_DERIVATIVE_1(INDEX) atomicAdd(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (deriv##INDEX##_1*0x100000000)));
#define STORE_DERIVATIVE_2(INDEX) atomicAdd(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].deriv##INDEX*0x100000000)));
typedef
struct
{
real3
pos
;
real3
force
;
ATOM_PARAMETER_DATA
#ifdef NEED_PADDING
float
padding
;
#endif
}
AtomData
;
/**
* Compute a force based on pair interactions.
*/
extern
"C"
__global__
void
computeN2Energy
(
unsigned
long
long
*
__restrict__
forceBuffers
,
mixed
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
posq
,
const
unsigned
int
*
__restrict__
exclusions
,
const
ushort2
*
__restrict__
exclusionTiles
,
bool
needEnergy
,
#ifdef USE_CUTOFF
const
int
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
unsigned
int
maxTiles
,
const
real4
*
__restrict__
blockCenter
,
const
real4
*
__restrict__
blockSize
,
const
unsigned
int
*
__restrict__
interactingAtoms
#else
unsigned
int
numTiles
#endif
PARAMETER_ARGUMENTS
)
{
const
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
const
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
TILE_SIZE
;
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
mixed
energy
=
0
;
INIT_PARAM_DERIVS
__shared__
AtomData
localData
[
THREAD_BLOCK_SIZE
];
// First loop: process tiles that contain exclusions.
const
unsigned
int
firstExclusionTile
=
FIRST_EXCLUSION_TILE
+
warp
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
totalWarps
;
const
unsigned
int
lastExclusionTile
=
FIRST_EXCLUSION_TILE
+
(
warp
+
1
)
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
totalWarps
;
for
(
int
pos
=
firstExclusionTile
;
pos
<
lastExclusionTile
;
pos
++
)
{
const
ushort2
tileIndices
=
exclusionTiles
[
pos
];
const
unsigned
int
x
=
tileIndices
.
x
;
const
unsigned
int
y
=
tileIndices
.
y
;
real3
force
=
make_real3
(
0
);
DECLARE_ATOM1_DERIVATIVES
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
real4
pos1
=
posq
[
atom1
];
LOAD_ATOM1_PARAMETERS
#ifdef USE_EXCLUSIONS
unsigned
int
excl
=
exclusions
[
pos
*
TILE_SIZE
+
tgx
];
#endif
if
(
x
==
y
)
{
// This tile is on the diagonal.
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
localData
[
localAtomIndex
].
pos
=
make_real3
(
pos1
.
x
,
pos1
.
y
,
pos1
.
z
);
LOAD_LOCAL_PARAMETERS_FROM_1
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
tbx
+
j
;
real3
pos2
=
localData
[
atom2
].
pos
;
real3
delta
=
make_real3
(
pos2
.
x
-
pos1
.
x
,
pos2
.
y
-
pos1
.
y
,
pos2
.
z
-
pos1
.
z
);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
delta
)
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
#ifdef USE_CUTOFF
if
(
r2
<
CUTOFF_SQUARED
)
{
#endif
real
invR
=
RSQRT
(
r2
);
real
r
=
r2
*
invR
;
LOAD_ATOM2_PARAMETERS
atom2
=
y
*
TILE_SIZE
+
j
;
real
dEdR
=
0
;
real
tempEnergy
=
0
;
const
real
interactionScale
=
0.5
f
;
#ifdef USE_EXCLUSIONS
bool
isExcluded
=
!
(
excl
&
0x1
);
#endif
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
&&
atom1
!=
atom2
)
{
COMPUTE_INTERACTION
dEdR
/=
-
r
;
}
if
(
needEnergy
)
energy
+=
0.5
f
*
tempEnergy
;
delta
*=
dEdR
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
z
-=
delta
.
z
;
#ifdef USE_CUTOFF
}
#endif
#ifdef USE_EXCLUSIONS
excl
>>=
1
;
#endif
}
}
else
{
// This is an off-diagonal tile.
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
real4
tempPosq
=
posq
[
j
];
localData
[
localAtomIndex
].
pos
=
make_real3
(
tempPosq
.
x
,
tempPosq
.
y
,
tempPosq
.
z
);
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
localData
[
localAtomIndex
].
force
=
make_real3
(
0
);
CLEAR_LOCAL_DERIVATIVES
#ifdef USE_EXCLUSIONS
excl
=
(
excl
>>
tgx
)
|
(
excl
<<
(
TILE_SIZE
-
tgx
));
#endif
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
tbx
+
tj
;
real3
pos2
=
localData
[
atom2
].
pos
;
real3
delta
=
make_real3
(
pos2
.
x
-
pos1
.
x
,
pos2
.
y
-
pos1
.
y
,
pos2
.
z
-
pos1
.
z
);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
delta
)
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
#ifdef USE_CUTOFF
if
(
r2
<
CUTOFF_SQUARED
)
{
#endif
real
invR
=
RSQRT
(
r2
);
real
r
=
r2
*
invR
;
LOAD_ATOM2_PARAMETERS
atom2
=
y
*
TILE_SIZE
+
tj
;
real
dEdR
=
0
;
real
tempEnergy
=
0
;
const
real
interactionScale
=
1
;
#ifdef USE_EXCLUSIONS
bool
isExcluded
=
!
(
excl
&
0x1
);
#endif
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
COMPUTE_INTERACTION
dEdR
/=
-
r
;
}
if
(
needEnergy
)
energy
+=
tempEnergy
;
delta
*=
dEdR
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
z
-=
delta
.
z
;
atom2
=
tbx
+
tj
;
localData
[
atom2
].
force
.
x
+=
delta
.
x
;
localData
[
atom2
].
force
.
y
+=
delta
.
y
;
localData
[
atom2
].
force
.
z
+=
delta
.
z
;
RECORD_DERIVATIVE_2
#ifdef USE_CUTOFF
}
#endif
#ifdef USE_EXCLUSIONS
excl
>>=
1
;
#endif
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
// Write results.
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
z
*
0x100000000
)));
STORE_DERIVATIVES_1
if
(
x
!=
y
)
{
offset
=
y
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
z
*
0x100000000
)));
STORE_DERIVATIVES_2
}
}
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
// of them (no cutoff).
#ifdef USE_CUTOFF
unsigned
int
numTiles
=
interactionCount
[
0
];
if
(
numTiles
>
maxTiles
)
return
;
// There wasn't enough memory for the neighbor list.
int
pos
=
(
int
)
(
warp
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
((
long
long
)
NUM_BLOCKS
+
1
)
/
2
:
(
long
)
numTiles
)
/
totalWarps
);
int
end
=
(
int
)
((
warp
+
1
)
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
((
long
long
)
NUM_BLOCKS
+
1
)
/
2
:
(
long
)
numTiles
)
/
totalWarps
);
#else
int
pos
=
(
int
)
(
warp
*
(
long
long
)
numTiles
/
totalWarps
);
int
end
=
(
int
)
((
warp
+
1
)
*
(
long
long
)
numTiles
/
totalWarps
);
#endif
int
skipBase
=
0
;
int
currentSkipIndex
=
tbx
;
__shared__
int
atomIndices
[
THREAD_BLOCK_SIZE
];
__shared__
volatile
int
skipTiles
[
THREAD_BLOCK_SIZE
];
skipTiles
[
threadIdx
.
x
]
=
-
1
;
while
(
pos
<
end
)
{
const
bool
isExcluded
=
false
;
real3
force
=
make_real3
(
0
);
DECLARE_ATOM1_DERIVATIVES
bool
includeTile
=
true
;
// Extract the coordinates of this tile.
int
x
,
y
;
bool
singlePeriodicCopy
=
false
;
#ifdef USE_CUTOFF
x
=
tiles
[
pos
];
real4
blockSizeX
=
blockSize
[
x
];
singlePeriodicCopy
=
(
0.5
f
*
periodicBoxSize
.
x
-
blockSizeX
.
x
>=
CUTOFF
&&
0.5
f
*
periodicBoxSize
.
y
-
blockSizeX
.
y
>=
CUTOFF
&&
0.5
f
*
periodicBoxSize
.
z
-
blockSizeX
.
z
>=
CUTOFF
);
#else
y
=
(
int
)
floor
(
NUM_BLOCKS
+
0.5
f
-
SQRT
((
NUM_BLOCKS
+
0.5
f
)
*
(
NUM_BLOCKS
+
0.5
f
)
-
2
*
pos
));
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
if
(
x
<
y
||
x
>=
NUM_BLOCKS
)
{
// Occasionally happens due to roundoff error.
y
+=
(
x
<
y
?
-
1
:
1
);
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
}
// Skip over tiles that have exclusions, since they were already processed.
while
(
skipTiles
[
tbx
+
TILE_SIZE
-
1
]
<
pos
)
{
if
(
skipBase
+
tgx
<
NUM_TILES_WITH_EXCLUSIONS
)
{
ushort2
tile
=
exclusionTiles
[
skipBase
+
tgx
];
skipTiles
[
threadIdx
.
x
]
=
tile
.
x
+
tile
.
y
*
NUM_BLOCKS
-
tile
.
y
*
(
tile
.
y
+
1
)
/
2
;
}
else
skipTiles
[
threadIdx
.
x
]
=
end
;
skipBase
+=
TILE_SIZE
;
currentSkipIndex
=
tbx
;
}
while
(
skipTiles
[
currentSkipIndex
]
<
pos
)
currentSkipIndex
++
;
includeTile
=
(
skipTiles
[
currentSkipIndex
]
!=
pos
);
#endif
if
(
includeTile
)
{
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
// Load atom data for this tile.
real4
pos1
=
posq
[
atom1
];
LOAD_ATOM1_PARAMETERS
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
#ifdef USE_CUTOFF
unsigned
int
j
=
interactingAtoms
[
pos
*
TILE_SIZE
+
tgx
];
#else
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
#endif
atomIndices
[
threadIdx
.
x
]
=
j
;
if
(
j
<
PADDED_NUM_ATOMS
)
{
real4
tempPosq
=
posq
[
j
];
localData
[
localAtomIndex
].
pos
=
make_real3
(
tempPosq
.
x
,
tempPosq
.
y
,
tempPosq
.
z
);
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
localData
[
localAtomIndex
].
force
=
make_real3
(
0
);
CLEAR_LOCAL_DERIVATIVES
}
#ifdef USE_PERIODIC
if
(
singlePeriodicCopy
)
{
// The box is small enough that we can just translate all the atoms into a single periodic
// box, then skip having to apply periodic boundary conditions later.
real4
blockCenterX
=
blockCenter
[
x
];
APPLY_PERIODIC_TO_POS_WITH_CENTER
(
pos1
,
blockCenterX
)
APPLY_PERIODIC_TO_POS_WITH_CENTER
(
localData
[
threadIdx
.
x
].
pos
,
blockCenterX
)
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
tbx
+
tj
;
real3
pos2
=
localData
[
atom2
].
pos
;
real3
delta
=
make_real3
(
pos2
.
x
-
pos1
.
x
,
pos2
.
y
-
pos1
.
y
,
pos2
.
z
-
pos1
.
z
);
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
#ifdef USE_CUTOFF
if
(
r2
<
CUTOFF_SQUARED
)
{
#endif
real
invR
=
RSQRT
(
r2
);
real
r
=
r2
*
invR
;
LOAD_ATOM2_PARAMETERS
atom2
=
atomIndices
[
tbx
+
tj
];
real
dEdR
=
0
;
real
tempEnergy
=
0
;
const
real
interactionScale
=
1
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
COMPUTE_INTERACTION
dEdR
/=
-
r
;
}
if
(
needEnergy
)
energy
+=
tempEnergy
;
delta
*=
dEdR
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
z
-=
delta
.
z
;
atom2
=
tbx
+
tj
;
localData
[
atom2
].
force
.
x
+=
delta
.
x
;
localData
[
atom2
].
force
.
y
+=
delta
.
y
;
localData
[
atom2
].
force
.
z
+=
delta
.
z
;
RECORD_DERIVATIVE_2
#ifdef USE_CUTOFF
}
#endif
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
else
#endif
{
// We need to apply periodic boundary conditions separately for each interaction.
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
tbx
+
tj
;
real3
pos2
=
localData
[
atom2
].
pos
;
real3
delta
=
make_real3
(
pos2
.
x
-
pos1
.
x
,
pos2
.
y
-
pos1
.
y
,
pos2
.
z
-
pos1
.
z
);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
delta
)
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
#ifdef USE_CUTOFF
if
(
r2
<
CUTOFF_SQUARED
)
{
#endif
real
invR
=
RSQRT
(
r2
);
real
r
=
r2
*
invR
;
LOAD_ATOM2_PARAMETERS
atom2
=
atomIndices
[
tbx
+
tj
];
real
dEdR
=
0
;
real
tempEnergy
=
0
;
const
real
interactionScale
=
1
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
COMPUTE_INTERACTION
dEdR
/=
-
r
;
}
if
(
needEnergy
)
energy
+=
tempEnergy
;
delta
*=
dEdR
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
z
-=
delta
.
z
;
atom2
=
tbx
+
tj
;
localData
[
atom2
].
force
.
x
+=
delta
.
x
;
localData
[
atom2
].
force
.
y
+=
delta
.
y
;
localData
[
atom2
].
force
.
z
+=
delta
.
z
;
RECORD_DERIVATIVE_2
#ifdef USE_CUTOFF
}
#endif
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
// Write results.
atomicAdd
(
&
forceBuffers
[
atom1
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom1
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom1
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
z
*
0x100000000
)));
unsigned
int
offset
=
atom1
;
STORE_DERIVATIVES_1
#ifdef USE_CUTOFF
unsigned
int
atom2
=
atomIndices
[
threadIdx
.
x
];
#else
unsigned
int
atom2
=
y
*
TILE_SIZE
+
tgx
;
#endif
if
(
atom2
<
PADDED_NUM_ATOMS
)
{
atomicAdd
(
&
forceBuffers
[
atom2
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom2
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom2
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
z
*
0x100000000
)));
offset
=
atom2
;
STORE_DERIVATIVES_2
}
}
pos
++
;
}
energyBuffer
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
energy
;
SAVE_PARAM_DERIVS
}
platforms/cuda/src/kernels/customGBEnergyPerParticle.cu
deleted
100644 → 0
View file @
8dd60914
/**
* Reduce the derivatives computed in the N^2 energy kernel, and compute all per-particle energy terms.
*/
extern
"C"
__global__
void
computePerParticleEnergy
(
long
long
*
__restrict__
forceBuffers
,
mixed
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
posq
PARAMETER_ARGUMENTS
)
{
mixed
energy
=
0
;
INIT_PARAM_DERIVS
for
(
unsigned
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
NUM_ATOMS
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
// Load the derivatives
LOAD_DERIVATIVES
// Now calculate the per-particle energy terms.
real4
pos
=
posq
[
index
];
real3
force
=
make_real3
(
0
,
0
,
0
);
COMPUTE_ENERGY
}
energyBuffer
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
energy
;
SAVE_PARAM_DERIVS
}
platforms/cuda/src/kernels/customGBValueN2.cu
deleted
100644 → 0
View file @
8dd60914
typedef
struct
{
real3
pos
;
real
value
;
ATOM_PARAMETER_DATA
#ifdef NEED_PADDING
float
padding
;
#endif
}
AtomData
;
/**
* Compute a value based on pair interactions.
*/
extern
"C"
__global__
void
computeN2Value
(
const
real4
*
__restrict__
posq
,
const
unsigned
int
*
__restrict__
exclusions
,
const
ushort2
*
__restrict__
exclusionTiles
,
unsigned
long
long
*
__restrict__
global_value
,
#ifdef USE_CUTOFF
const
int
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
unsigned
int
maxTiles
,
const
real4
*
__restrict__
blockCenter
,
const
real4
*
__restrict__
blockSize
,
const
unsigned
int
*
__restrict__
interactingAtoms
#else
unsigned
int
numTiles
#endif
PARAMETER_ARGUMENTS
)
{
const
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
const
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
TILE_SIZE
;
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
__shared__
AtomData
localData
[
THREAD_BLOCK_SIZE
];
// First loop: process tiles that contain exclusions.
const
unsigned
int
firstExclusionTile
=
FIRST_EXCLUSION_TILE
+
warp
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
totalWarps
;
const
unsigned
int
lastExclusionTile
=
FIRST_EXCLUSION_TILE
+
(
warp
+
1
)
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
totalWarps
;
for
(
int
pos
=
firstExclusionTile
;
pos
<
lastExclusionTile
;
pos
++
)
{
const
ushort2
tileIndices
=
exclusionTiles
[
pos
];
const
unsigned
int
x
=
tileIndices
.
x
;
const
unsigned
int
y
=
tileIndices
.
y
;
real
value
=
0
;
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
real4
pos1
=
posq
[
atom1
];
LOAD_ATOM1_PARAMETERS
#ifdef USE_EXCLUSIONS
unsigned
int
excl
=
exclusions
[
pos
*
TILE_SIZE
+
tgx
];
#endif
if
(
x
==
y
)
{
// This tile is on the diagonal.
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
localData
[
localAtomIndex
].
pos
=
make_real3
(
pos1
.
x
,
pos1
.
y
,
pos1
.
z
);
LOAD_LOCAL_PARAMETERS_FROM_1
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
tbx
+
j
;
real3
pos2
=
localData
[
atom2
].
pos
;
real3
delta
=
make_real3
(
pos2
.
x
-
pos1
.
x
,
pos2
.
y
-
pos1
.
y
,
pos2
.
z
-
pos1
.
z
);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
delta
)
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
#ifdef USE_CUTOFF
if
(
r2
<
CUTOFF_SQUARED
)
{
#endif
real
invR
=
RSQRT
(
r2
);
real
r
=
r2
*
invR
;
LOAD_ATOM2_PARAMETERS
atom2
=
y
*
TILE_SIZE
+
j
;
real
tempValue1
=
0
;
real
tempValue2
=
0
;
#ifdef USE_EXCLUSIONS
bool
isExcluded
=
(
atom1
>=
NUM_ATOMS
||
atom2
>=
NUM_ATOMS
||
!
(
excl
&
0x1
));
if
(
!
isExcluded
&&
atom1
!=
atom2
)
{
#else
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
&&
atom1
!=
atom2
)
{
#endif
COMPUTE_VALUE
}
value
+=
tempValue1
;
ADD_TEMP_DERIVS1
#ifdef USE_CUTOFF
}
#endif
#ifdef USE_EXCLUSIONS
excl
>>=
1
;
#endif
}
}
else
{
// This is an off-diagonal tile.
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
real4
tempPosq
=
posq
[
j
];
localData
[
localAtomIndex
].
pos
=
make_real3
(
tempPosq
.
x
,
tempPosq
.
y
,
tempPosq
.
z
);
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
localData
[
localAtomIndex
].
value
=
0
;
#ifdef USE_EXCLUSIONS
excl
=
(
excl
>>
tgx
)
|
(
excl
<<
(
TILE_SIZE
-
tgx
));
#endif
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
tbx
+
tj
;
real3
pos2
=
localData
[
atom2
].
pos
;
real3
delta
=
make_real3
(
pos2
.
x
-
pos1
.
x
,
pos2
.
y
-
pos1
.
y
,
pos2
.
z
-
pos1
.
z
);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
delta
)
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
#ifdef USE_CUTOFF
if
(
r2
<
CUTOFF_SQUARED
)
{
#endif
real
invR
=
RSQRT
(
r2
);
real
r
=
r2
*
invR
;
LOAD_ATOM2_PARAMETERS
atom2
=
y
*
TILE_SIZE
+
tj
;
real
tempValue1
=
0
;
real
tempValue2
=
0
;
#ifdef USE_EXCLUSIONS
bool
isExcluded
=
(
atom1
>=
NUM_ATOMS
||
atom2
>=
NUM_ATOMS
||
!
(
excl
&
0x1
));
if
(
!
isExcluded
)
{
#else
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
#endif
COMPUTE_VALUE
}
value
+=
tempValue1
;
localData
[
tbx
+
tj
].
value
+=
tempValue2
;
ADD_TEMP_DERIVS1
ADD_TEMP_DERIVS2
#ifdef USE_CUTOFF
}
#endif
#ifdef USE_EXCLUSIONS
excl
>>=
1
;
#endif
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
// Write results.
unsigned
int
offset1
=
x
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
global_value
[
offset1
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
value
*
0x100000000
)));
STORE_PARAM_DERIVS1
if
(
x
!=
y
)
{
unsigned
int
offset2
=
y
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
global_value
[
offset2
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
value
*
0x100000000
)));
STORE_PARAM_DERIVS2
}
}
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
// of them (no cutoff).
#ifdef USE_CUTOFF
unsigned
int
numTiles
=
interactionCount
[
0
];
if
(
numTiles
>
maxTiles
)
return
;
// There wasn't enough memory for the neighbor list.
int
pos
=
(
int
)
(
warp
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
((
long
long
)
NUM_BLOCKS
+
1
)
/
2
:
(
long
)
numTiles
)
/
totalWarps
);
int
end
=
(
int
)
((
warp
+
1
)
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
((
long
long
)
NUM_BLOCKS
+
1
)
/
2
:
(
long
)
numTiles
)
/
totalWarps
);
#else
int
pos
=
(
int
)
(
warp
*
(
long
long
)
numTiles
/
totalWarps
);
int
end
=
(
int
)
((
warp
+
1
)
*
(
long
long
)
numTiles
/
totalWarps
);
#endif
int
skipBase
=
0
;
int
currentSkipIndex
=
tbx
;
__shared__
int
atomIndices
[
THREAD_BLOCK_SIZE
];
__shared__
volatile
int
skipTiles
[
THREAD_BLOCK_SIZE
];
skipTiles
[
threadIdx
.
x
]
=
-
1
;
while
(
pos
<
end
)
{
real
value
=
0
;
bool
includeTile
=
true
;
// Extract the coordinates of this tile.
int
x
,
y
;
bool
singlePeriodicCopy
=
false
;
#ifdef USE_CUTOFF
x
=
tiles
[
pos
];
real4
blockSizeX
=
blockSize
[
x
];
singlePeriodicCopy
=
(
0.5
f
*
periodicBoxSize
.
x
-
blockSizeX
.
x
>=
CUTOFF
&&
0.5
f
*
periodicBoxSize
.
y
-
blockSizeX
.
y
>=
CUTOFF
&&
0.5
f
*
periodicBoxSize
.
z
-
blockSizeX
.
z
>=
CUTOFF
);
#else
y
=
(
int
)
floor
(
NUM_BLOCKS
+
0.5
f
-
SQRT
((
NUM_BLOCKS
+
0.5
f
)
*
(
NUM_BLOCKS
+
0.5
f
)
-
2
*
pos
));
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
if
(
x
<
y
||
x
>=
NUM_BLOCKS
)
{
// Occasionally happens due to roundoff error.
y
+=
(
x
<
y
?
-
1
:
1
);
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
}
// Skip over tiles that have exclusions, since they were already processed.
while
(
skipTiles
[
tbx
+
TILE_SIZE
-
1
]
<
pos
)
{
if
(
skipBase
+
tgx
<
NUM_TILES_WITH_EXCLUSIONS
)
{
ushort2
tile
=
exclusionTiles
[
skipBase
+
tgx
];
skipTiles
[
threadIdx
.
x
]
=
tile
.
x
+
tile
.
y
*
NUM_BLOCKS
-
tile
.
y
*
(
tile
.
y
+
1
)
/
2
;
}
else
skipTiles
[
threadIdx
.
x
]
=
end
;
skipBase
+=
TILE_SIZE
;
currentSkipIndex
=
tbx
;
}
while
(
skipTiles
[
currentSkipIndex
]
<
pos
)
currentSkipIndex
++
;
includeTile
=
(
skipTiles
[
currentSkipIndex
]
!=
pos
);
#endif
if
(
includeTile
)
{
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
// Load atom data for this tile.
real4
pos1
=
posq
[
atom1
];
LOAD_ATOM1_PARAMETERS
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
#ifdef USE_CUTOFF
unsigned
int
j
=
interactingAtoms
[
pos
*
TILE_SIZE
+
tgx
];
#else
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
#endif
atomIndices
[
threadIdx
.
x
]
=
j
;
if
(
j
<
PADDED_NUM_ATOMS
)
{
real4
tempPosq
=
posq
[
j
];
localData
[
localAtomIndex
].
pos
=
make_real3
(
tempPosq
.
x
,
tempPosq
.
y
,
tempPosq
.
z
);
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
localData
[
localAtomIndex
].
value
=
0
;
}
#ifdef USE_PERIODIC
if
(
singlePeriodicCopy
)
{
// The box is small enough that we can just translate all the atoms into a single periodic
// box, then skip having to apply periodic boundary conditions later.
real4
blockCenterX
=
blockCenter
[
x
];
APPLY_PERIODIC_TO_POS_WITH_CENTER
(
pos1
,
blockCenterX
)
APPLY_PERIODIC_TO_POS_WITH_CENTER
(
localData
[
threadIdx
.
x
].
pos
,
blockCenterX
)
unsigned
int
tj
=
tgx
;
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
tbx
+
tj
;
real3
pos2
=
localData
[
atom2
].
pos
;
real3
delta
=
make_real3
(
pos2
.
x
-
pos1
.
x
,
pos2
.
y
-
pos1
.
y
,
pos2
.
z
-
pos1
.
z
);
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
if
(
r2
<
CUTOFF_SQUARED
)
{
real
invR
=
RSQRT
(
r2
);
real
r
=
r2
*
invR
;
LOAD_ATOM2_PARAMETERS
atom2
=
atomIndices
[
tbx
+
tj
];
real
tempValue1
=
0
;
real
tempValue2
=
0
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
COMPUTE_VALUE
}
value
+=
tempValue1
;
localData
[
tbx
+
tj
].
value
+=
tempValue2
;
ADD_TEMP_DERIVS1
ADD_TEMP_DERIVS2
}
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
else
#endif
{
// We need to apply periodic boundary conditions separately for each interaction.
unsigned
int
tj
=
tgx
;
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
tbx
+
tj
;
real3
pos2
=
localData
[
atom2
].
pos
;
real3
delta
=
make_real3
(
pos2
.
x
-
pos1
.
x
,
pos2
.
y
-
pos1
.
y
,
pos2
.
z
-
pos1
.
z
);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
delta
)
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
#ifdef USE_CUTOFF
if
(
r2
<
CUTOFF_SQUARED
)
{
#endif
real
invR
=
RSQRT
(
r2
);
real
r
=
r2
*
invR
;
LOAD_ATOM2_PARAMETERS
atom2
=
atomIndices
[
tbx
+
tj
];
real
tempValue1
=
0
;
real
tempValue2
=
0
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
COMPUTE_VALUE
}
value
+=
tempValue1
;
localData
[
tbx
+
tj
].
value
+=
tempValue2
;
ADD_TEMP_DERIVS1
ADD_TEMP_DERIVS2
#ifdef USE_CUTOFF
}
#endif
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
// Write results.
unsigned
int
offset1
=
atom1
;
atomicAdd
(
&
global_value
[
offset1
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
value
*
0x100000000
)));
STORE_PARAM_DERIVS1
#ifdef USE_CUTOFF
unsigned
int
atom2
=
atomIndices
[
threadIdx
.
x
];
#else
unsigned
int
atom2
=
y
*
TILE_SIZE
+
tgx
;
#endif
if
(
atom2
<
PADDED_NUM_ATOMS
)
{
unsigned
int
offset2
=
atom2
;
atomicAdd
(
&
global_value
[
offset2
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
value
*
0x100000000
)));
STORE_PARAM_DERIVS2
}
}
pos
++
;
}
}
platforms/cuda/src/kernels/customGBValuePerParticle.cu
deleted
100644 → 0
View file @
8dd60914
/**
* Reduce a pairwise computed value, and compute per-particle values.
*/
extern
"C"
__global__
void
computePerParticleValues
(
real4
*
posq
,
long
long
*
valueBuffers
PARAMETER_ARGUMENTS
)
{
for
(
unsigned
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
NUM_ATOMS
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
// Load the pairwise value
real
sum
=
valueBuffers
[
index
]
/
(
real
)
0x100000000
;
REDUCE_PARAM0_DERIV
// Now calculate other values
real4
pos
=
posq
[
index
];
COMPUTE_VALUES
}
}
platforms/cuda/src/kernels/customHbondForce.cu
deleted
100644 → 0
View file @
8dd60914
/**
* Convert a real4 to a real3 by removing its last element.
*/
inline
__device__
real3
trim
(
real4
v
)
{
return
make_real3
(
v
.
x
,
v
.
y
,
v
.
z
);
}
/**
* This does nothing, and just exists to simplify the code generation.
*/
inline
__device__
real3
trim
(
real3
v
)
{
return
v
;
}
/**
* Compute the difference between two vectors, optionally taking periodic boundary conditions into account
* and setting the fourth component to the squared magnitude.
*/
inline
__device__
real4
delta
(
real4
vec1
,
real4
vec2
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
)
{
real4
result
=
make_real4
(
vec1
.
x
-
vec2
.
x
,
vec1
.
y
-
vec2
.
y
,
vec1
.
z
-
vec2
.
z
,
0.0
f
);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
result
)
#endif
result
.
w
=
result
.
x
*
result
.
x
+
result
.
y
*
result
.
y
+
result
.
z
*
result
.
z
;
return
result
;
}
/**
* Compute the angle between two vectors. The w component of each vector should contain the squared magnitude.
*/
inline
__device__
real
computeAngle
(
real4
vec1
,
real4
vec2
)
{
real
dotProduct
=
vec1
.
x
*
vec2
.
x
+
vec1
.
y
*
vec2
.
y
+
vec1
.
z
*
vec2
.
z
;
real
cosine
=
dotProduct
*
RSQRT
(
vec1
.
w
*
vec2
.
w
);
real
angle
;
if
(
cosine
>
0.99
f
||
cosine
<
-
0.99
f
)
{
// We're close to the singularity in acos(), so take the cross product and use asin() instead.
real3
crossProduct
=
cross
(
vec1
,
vec2
);
real
scale
=
vec1
.
w
*
vec2
.
w
;
angle
=
ASIN
(
SQRT
(
dot
(
crossProduct
,
crossProduct
)
/
scale
));
if
(
cosine
<
0.0
f
)
angle
=
M_PI
-
angle
;
}
else
angle
=
ACOS
(
cosine
);
return
angle
;
}
/**
* Compute the cross product of two vectors, setting the fourth component to the squared magnitude.
*/
inline
__device__
real4
computeCross
(
real4
vec1
,
real4
vec2
)
{
real3
result
=
cross
(
vec1
,
vec2
);
return
make_real4
(
result
.
x
,
result
.
y
,
result
.
z
,
result
.
x
*
result
.
x
+
result
.
y
*
result
.
y
+
result
.
z
*
result
.
z
);
}
/**
* Compute forces on donors.
*/
extern
"C"
__global__
void
computeDonorForces
(
unsigned
long
long
*
__restrict__
force
,
mixed
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
posq
,
const
int4
*
__restrict__
exclusions
,
const
int4
*
__restrict__
donorAtoms
,
const
int4
*
__restrict__
acceptorAtoms
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
PARAMETER_ARGUMENTS
)
{
extern
__shared__
real4
posBuffer
[];
mixed
energy
=
0
;
real3
f1
=
make_real3
(
0
);
real3
f2
=
make_real3
(
0
);
real3
f3
=
make_real3
(
0
);
for
(
int
donorStart
=
0
;
donorStart
<
NUM_DONORS
;
donorStart
+=
blockDim
.
x
*
gridDim
.
x
)
{
// Load information about the donor this thread will compute forces on.
int
donorIndex
=
donorStart
+
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
int4
atoms
,
exclusionIndices
;
real4
d1
,
d2
,
d3
;
if
(
donorIndex
<
NUM_DONORS
)
{
atoms
=
donorAtoms
[
donorIndex
];
d1
=
(
atoms
.
x
>
-
1
?
posq
[
atoms
.
x
]
:
make_real4
(
0
));
d2
=
(
atoms
.
y
>
-
1
?
posq
[
atoms
.
y
]
:
make_real4
(
0
));
d3
=
(
atoms
.
z
>
-
1
?
posq
[
atoms
.
z
]
:
make_real4
(
0
));
#ifdef USE_EXCLUSIONS
exclusionIndices
=
exclusions
[
donorIndex
];
#endif
}
else
atoms
=
make_int4
(
-
1
,
-
1
,
-
1
,
-
1
);
for
(
int
acceptorStart
=
0
;
acceptorStart
<
NUM_ACCEPTORS
;
acceptorStart
+=
blockDim
.
x
)
{
// Load the next block of acceptors into local memory.
__syncthreads
();
int
blockSize
=
min
((
int
)
blockDim
.
x
,
NUM_ACCEPTORS
-
acceptorStart
);
if
(
threadIdx
.
x
<
blockSize
)
{
int4
atoms2
=
acceptorAtoms
[
acceptorStart
+
threadIdx
.
x
];
posBuffer
[
3
*
threadIdx
.
x
]
=
(
atoms2
.
x
>
-
1
?
posq
[
atoms2
.
x
]
:
make_real4
(
0
));
posBuffer
[
3
*
threadIdx
.
x
+
1
]
=
(
atoms2
.
y
>
-
1
?
posq
[
atoms2
.
y
]
:
make_real4
(
0
));
posBuffer
[
3
*
threadIdx
.
x
+
2
]
=
(
atoms2
.
z
>
-
1
?
posq
[
atoms2
.
z
]
:
make_real4
(
0
));
}
__syncthreads
();
if
(
donorIndex
<
NUM_DONORS
)
{
for
(
int
index
=
0
;
index
<
blockSize
;
index
++
)
{
int
acceptorIndex
=
acceptorStart
+
index
;
#ifdef USE_EXCLUSIONS
if
(
acceptorIndex
==
exclusionIndices
.
x
||
acceptorIndex
==
exclusionIndices
.
y
||
acceptorIndex
==
exclusionIndices
.
z
||
acceptorIndex
==
exclusionIndices
.
w
)
continue
;
#endif
// Compute the interaction between a donor and an acceptor.
real4
a1
=
posBuffer
[
3
*
index
];
real4
a2
=
posBuffer
[
3
*
index
+
1
];
real4
a3
=
posBuffer
[
3
*
index
+
2
];
real4
deltaD1A1
=
delta
(
d1
,
a1
,
periodicBoxSize
,
invPeriodicBoxSize
,
periodicBoxVecX
,
periodicBoxVecY
,
periodicBoxVecZ
);
#ifdef USE_CUTOFF
if
(
deltaD1A1
.
w
<
CUTOFF_SQUARED
)
{
#endif
COMPUTE_DONOR_FORCE
#ifdef USE_CUTOFF
}
#endif
}
}
}
// Write results
if
(
donorIndex
<
NUM_DONORS
)
{
if
(
atoms
.
x
>
-
1
)
{
atomicAdd
(
&
force
[
atoms
.
x
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
f1
.
x
*
0x100000000
)));
atomicAdd
(
&
force
[
atoms
.
x
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
f1
.
y
*
0x100000000
)));
atomicAdd
(
&
force
[
atoms
.
x
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
f1
.
z
*
0x100000000
)));
__threadfence_block
();
}
if
(
atoms
.
y
>
-
1
)
{
atomicAdd
(
&
force
[
atoms
.
y
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
f2
.
x
*
0x100000000
)));
atomicAdd
(
&
force
[
atoms
.
y
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
f2
.
y
*
0x100000000
)));
atomicAdd
(
&
force
[
atoms
.
y
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
f2
.
z
*
0x100000000
)));
__threadfence_block
();
}
if
(
atoms
.
z
>
-
1
)
{
atomicAdd
(
&
force
[
atoms
.
z
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
f3
.
x
*
0x100000000
)));
atomicAdd
(
&
force
[
atoms
.
z
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
f3
.
y
*
0x100000000
)));
atomicAdd
(
&
force
[
atoms
.
z
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
f3
.
z
*
0x100000000
)));
__threadfence_block
();
}
}
}
energyBuffer
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
energy
;
}
/**
* Compute forces on acceptors.
*/
extern
"C"
__global__
void
computeAcceptorForces
(
unsigned
long
long
*
__restrict__
force
,
mixed
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
posq
,
const
int4
*
__restrict__
exclusions
,
const
int4
*
__restrict__
donorAtoms
,
const
int4
*
__restrict__
acceptorAtoms
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
PARAMETER_ARGUMENTS
)
{
extern
__shared__
real4
posBuffer
[];
real3
f1
=
make_real3
(
0
);
real3
f2
=
make_real3
(
0
);
real3
f3
=
make_real3
(
0
);
for
(
int
acceptorStart
=
0
;
acceptorStart
<
NUM_ACCEPTORS
;
acceptorStart
+=
blockDim
.
x
*
gridDim
.
x
)
{
// Load information about the acceptor this thread will compute forces on.
int
acceptorIndex
=
acceptorStart
+
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
int4
atoms
,
exclusionIndices
;
real4
a1
,
a2
,
a3
;
if
(
acceptorIndex
<
NUM_ACCEPTORS
)
{
atoms
=
acceptorAtoms
[
acceptorIndex
];
a1
=
(
atoms
.
x
>
-
1
?
posq
[
atoms
.
x
]
:
make_real4
(
0
));
a2
=
(
atoms
.
y
>
-
1
?
posq
[
atoms
.
y
]
:
make_real4
(
0
));
a3
=
(
atoms
.
z
>
-
1
?
posq
[
atoms
.
z
]
:
make_real4
(
0
));
#ifdef USE_EXCLUSIONS
exclusionIndices
=
exclusions
[
acceptorIndex
];
#endif
}
else
atoms
=
make_int4
(
-
1
,
-
1
,
-
1
,
-
1
);
for
(
int
donorStart
=
0
;
donorStart
<
NUM_DONORS
;
donorStart
+=
blockDim
.
x
)
{
// Load the next block of donors into local memory.
__syncthreads
();
int
blockSize
=
min
((
int
)
blockDim
.
x
,
NUM_DONORS
-
donorStart
);
if
(
threadIdx
.
x
<
blockSize
)
{
int4
atoms2
=
donorAtoms
[
donorStart
+
threadIdx
.
x
];
posBuffer
[
3
*
threadIdx
.
x
]
=
(
atoms2
.
x
>
-
1
?
posq
[
atoms2
.
x
]
:
make_real4
(
0
));
posBuffer
[
3
*
threadIdx
.
x
+
1
]
=
(
atoms2
.
y
>
-
1
?
posq
[
atoms2
.
y
]
:
make_real4
(
0
));
posBuffer
[
3
*
threadIdx
.
x
+
2
]
=
(
atoms2
.
z
>
-
1
?
posq
[
atoms2
.
z
]
:
make_real4
(
0
));
}
__syncthreads
();
if
(
acceptorIndex
<
NUM_ACCEPTORS
)
{
for
(
int
index
=
0
;
index
<
blockSize
;
index
++
)
{
int
donorIndex
=
donorStart
+
index
;
#ifdef USE_EXCLUSIONS
if
(
donorIndex
==
exclusionIndices
.
x
||
donorIndex
==
exclusionIndices
.
y
||
donorIndex
==
exclusionIndices
.
z
||
donorIndex
==
exclusionIndices
.
w
)
continue
;
#endif
// Compute the interaction between a donor and an acceptor.
real4
d1
=
posBuffer
[
3
*
index
];
real4
d2
=
posBuffer
[
3
*
index
+
1
];
real4
d3
=
posBuffer
[
3
*
index
+
2
];
real4
deltaD1A1
=
delta
(
d1
,
a1
,
periodicBoxSize
,
invPeriodicBoxSize
,
periodicBoxVecX
,
periodicBoxVecY
,
periodicBoxVecZ
);
#ifdef USE_CUTOFF
if
(
deltaD1A1
.
w
<
CUTOFF_SQUARED
)
{
#endif
COMPUTE_ACCEPTOR_FORCE
#ifdef USE_CUTOFF
}
#endif
}
}
}
// Write results
if
(
acceptorIndex
<
NUM_ACCEPTORS
)
{
if
(
atoms
.
x
>
-
1
)
{
atomicAdd
(
&
force
[
atoms
.
x
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
f1
.
x
*
0x100000000
)));
atomicAdd
(
&
force
[
atoms
.
x
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
f1
.
y
*
0x100000000
)));
atomicAdd
(
&
force
[
atoms
.
x
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
f1
.
z
*
0x100000000
)));
__threadfence_block
();
}
if
(
atoms
.
y
>
-
1
)
{
atomicAdd
(
&
force
[
atoms
.
y
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
f2
.
x
*
0x100000000
)));
atomicAdd
(
&
force
[
atoms
.
y
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
f2
.
y
*
0x100000000
)));
atomicAdd
(
&
force
[
atoms
.
y
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
f2
.
z
*
0x100000000
)));
__threadfence_block
();
}
if
(
atoms
.
z
>
-
1
)
{
atomicAdd
(
&
force
[
atoms
.
z
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
f3
.
x
*
0x100000000
)));
atomicAdd
(
&
force
[
atoms
.
z
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
f3
.
y
*
0x100000000
)));
atomicAdd
(
&
force
[
atoms
.
z
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
f3
.
z
*
0x100000000
)));
__threadfence_block
();
}
}
}
}
platforms/cuda/src/kernels/customIntegratorPerDof.cu
deleted
100644 → 0
View file @
8dd60914
/**
* Load the position of a particle.
*/
inline
__device__
mixed4
loadPos
(
const
real4
*
__restrict__
posq
,
const
real4
*
__restrict__
posqCorrection
,
int
index
)
{
#ifdef USE_MIXED_PRECISION
real4
pos1
=
posq
[
index
];
real4
pos2
=
posqCorrection
[
index
];
return
make_mixed4
(
pos1
.
x
+
(
mixed
)
pos2
.
x
,
pos1
.
y
+
(
mixed
)
pos2
.
y
,
pos1
.
z
+
(
mixed
)
pos2
.
z
,
pos1
.
w
);
#else
return
posq
[
index
];
#endif
}
/**
* Store the position of a particle.
*/
inline
__device__
void
storePos
(
real4
*
__restrict__
posq
,
real4
*
__restrict__
posqCorrection
,
int
index
,
mixed4
pos
)
{
#ifdef USE_MIXED_PRECISION
posq
[
index
]
=
make_real4
((
real
)
pos
.
x
,
(
real
)
pos
.
y
,
(
real
)
pos
.
z
,
(
real
)
pos
.
w
);
posqCorrection
[
index
]
=
make_real4
(
pos
.
x
-
(
real
)
pos
.
x
,
pos
.
y
-
(
real
)
pos
.
y
,
pos
.
z
-
(
real
)
pos
.
z
,
0
);
#else
posq
[
index
]
=
pos
;
#endif
}
inline
__device__
double4
convertToDouble4
(
float4
a
)
{
return
make_double4
(
a
.
x
,
a
.
y
,
a
.
z
,
a
.
w
);
}
inline
__device__
double4
convertToDouble4
(
double4
a
)
{
return
a
;
}
inline
__device__
mixed4
convertFromDouble4
(
double4
a
)
{
return
make_mixed4
(
a
.
x
,
a
.
y
,
a
.
z
,
a
.
w
);
}
extern
"C"
__global__
void
computePerDof
(
real4
*
__restrict__
posq
,
real4
*
__restrict__
posqCorrection
,
mixed4
*
__restrict__
posDelta
,
mixed4
*
__restrict__
velm
,
const
long
long
*
__restrict__
force
,
const
mixed2
*
__restrict__
dt
,
const
mixed
*
__restrict__
globals
,
mixed
*
__restrict__
sum
,
const
float4
*
__restrict__
gaussianValues
,
unsigned
int
gaussianBaseIndex
,
const
float4
*
__restrict__
uniformValues
,
const
mixed
energy
,
mixed
*
__restrict__
energyParamDerivs
PARAMETER_ARGUMENTS
)
{
double3
stepSize
=
make_double3
(
dt
[
0
].
y
);
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
const
double
forceScale
=
1.0
/
0xFFFFFFFF
;
while
(
index
<
NUM_ATOMS
)
{
#ifdef LOAD_POS_AS_DELTA
double4
position
=
convertToDouble4
(
loadPos
(
posq
,
posqCorrection
,
index
)
+
posDelta
[
index
]);
#else
double4
position
=
convertToDouble4
(
loadPos
(
posq
,
posqCorrection
,
index
));
#endif
double4
velocity
=
convertToDouble4
(
velm
[
index
]);
double4
f
=
make_double4
(
forceScale
*
force
[
index
],
forceScale
*
force
[
index
+
PADDED_NUM_ATOMS
],
forceScale
*
force
[
index
+
PADDED_NUM_ATOMS
*
2
],
0.0
);
double3
mass
=
make_double3
(
1.0
/
velocity
.
w
);
if
(
velocity
.
w
!=
0.0
)
{
int
gaussianIndex
=
gaussianBaseIndex
;
int
uniformIndex
=
0
;
COMPUTE_STEP
}
index
+=
blockDim
.
x
*
gridDim
.
x
;
}
}
platforms/cuda/src/kernels/customManyParticle.cu
deleted
100644 → 0
View file @
8dd60914
/**
* Record the force on an atom to global memory.
*/
inline
__device__
void
storeForce
(
int
atom
,
real3
force
,
unsigned
long
long
*
__restrict__
forceBuffers
)
{
atomicAdd
(
&
forceBuffers
[
atom
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
z
*
0x100000000
)));
}
/**
* Convert a real4 to a real3 by removing its last element.
*/
inline
__device__
real3
trim
(
real4
v
)
{
return
make_real3
(
v
.
x
,
v
.
y
,
v
.
z
);
}
/**
* Compute the difference between two vectors, taking periodic boundary conditions into account
* and setting the fourth component to the squared magnitude.
*/
inline
__device__
real4
delta
(
real3
vec1
,
real3
vec2
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
)
{
real4
result
=
make_real4
(
vec1
.
x
-
vec2
.
x
,
vec1
.
y
-
vec2
.
y
,
vec1
.
z
-
vec2
.
z
,
0.0
f
);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
result
)
#endif
result
.
w
=
result
.
x
*
result
.
x
+
result
.
y
*
result
.
y
+
result
.
z
*
result
.
z
;
return
result
;
}
/**
* Compute the angle between two vectors. The w component of each vector should contain the squared magnitude.
*/
__device__
real
computeAngle
(
real4
vec1
,
real4
vec2
)
{
real
dotProduct
=
vec1
.
x
*
vec2
.
x
+
vec1
.
y
*
vec2
.
y
+
vec1
.
z
*
vec2
.
z
;
real
cosine
=
dotProduct
*
RSQRT
(
vec1
.
w
*
vec2
.
w
);
real
angle
;
if
(
cosine
>
0.99
f
||
cosine
<
-
0.99
f
)
{
// We're close to the singularity in acos(), so take the cross product and use asin() instead.
real3
crossProduct
=
cross
(
vec1
,
vec2
);
real
scale
=
vec1
.
w
*
vec2
.
w
;
angle
=
ASIN
(
SQRT
(
dot
(
crossProduct
,
crossProduct
)
/
scale
));
if
(
cosine
<
0.0
f
)
angle
=
M_PI
-
angle
;
}
else
angle
=
ACOS
(
cosine
);
return
angle
;
}
/**
* Compute the cross product of two vectors, setting the fourth component to the squared magnitude.
*/
inline
__device__
real4
computeCross
(
real4
vec1
,
real4
vec2
)
{
real3
cp
=
cross
(
vec1
,
vec2
);
return
make_real4
(
cp
.
x
,
cp
.
y
,
cp
.
z
,
cp
.
x
*
cp
.
x
+
cp
.
y
*
cp
.
y
+
cp
.
z
*
cp
.
z
);
}
/**
* Determine whether a particular interaction is in the list of exclusions.
*/
inline
__device__
bool
isInteractionExcluded
(
int
atom1
,
int
atom2
,
const
int
*
__restrict__
exclusions
,
const
int
*
__restrict__
exclusionStartIndex
)
{
if
(
atom1
>
atom2
)
{
int
temp
=
atom1
;
atom1
=
atom2
;
atom2
=
temp
;
}
int
first
=
exclusionStartIndex
[
atom1
];
int
last
=
exclusionStartIndex
[
atom1
+
1
];
for
(
int
i
=
last
-
1
;
i
>=
first
;
i
--
)
{
int
excluded
=
exclusions
[
i
];
if
(
excluded
==
atom2
)
return
true
;
if
(
excluded
<=
atom1
)
return
false
;
}
return
false
;
}
__constant__
float
globals
[
NUM_GLOBALS
];
/**
* Compute the interaction.
*/
extern
"C"
__global__
void
computeInteraction
(
unsigned
long
long
*
__restrict__
forceBuffers
,
mixed
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
posq
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
#ifdef USE_CUTOFF
,
const
int
*
__restrict__
neighbors
,
const
int
*
__restrict__
neighborStartIndex
#endif
#ifdef USE_FILTERS
,
int
*
__restrict__
particleTypes
,
int
*
__restrict__
orderIndex
,
int
*
__restrict__
particleOrder
#endif
#ifdef USE_EXCLUSIONS
,
int
*
__restrict__
exclusions
,
int
*
__restrict__
exclusionStartIndex
#endif
PARAMETER_ARGUMENTS
)
{
mixed
energy
=
0
;
// Loop over particles to be the first one in the set.
for
(
int
p1
=
blockIdx
.
x
;
p1
<
NUM_ATOMS
;
p1
+=
gridDim
.
x
)
{
#ifdef USE_CENTRAL_PARTICLE
const
int
a1
=
p1
;
#else
const
int
a1
=
0
;
#endif
#ifdef USE_CUTOFF
int
firstNeighbor
=
neighborStartIndex
[
p1
];
int
numNeighbors
=
neighborStartIndex
[
p1
+
1
]
-
firstNeighbor
;
#else
#ifdef USE_CENTRAL_PARTICLE
int
numNeighbors
=
NUM_ATOMS
;
#else
int
numNeighbors
=
NUM_ATOMS
-
p1
-
1
;
#endif
#endif
int
numCombinations
=
NUM_CANDIDATE_COMBINATIONS
;
for
(
int
index
=
threadIdx
.
x
;
index
<
numCombinations
;
index
+=
blockDim
.
x
)
{
FIND_ATOMS_FOR_COMBINATION_INDEX
;
bool
includeInteraction
=
IS_VALID_COMBINATION
;
#ifdef USE_CUTOFF
if
(
includeInteraction
)
{
VERIFY_CUTOFF
;
}
#endif
#ifdef USE_FILTERS
int
order
=
orderIndex
[
COMPUTE_TYPE_INDEX
];
if
(
order
==
-
1
)
includeInteraction
=
false
;
#endif
#ifdef USE_EXCLUSIONS
if
(
includeInteraction
)
{
VERIFY_EXCLUSIONS
;
}
#endif
if
(
includeInteraction
)
{
PERMUTE_ATOMS
;
LOAD_PARTICLE_DATA
;
COMPUTE_INTERACTION
;
}
}
}
energyBuffer
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
energy
;
}
/**
* Find a bounding box for the atoms in each block.
*/
extern
"C"
__global__
void
findBlockBounds
(
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
const
real4
*
__restrict__
posq
,
real4
*
__restrict__
blockCenter
,
real4
*
__restrict__
blockBoundingBox
,
int
*
__restrict__
numNeighborPairs
)
{
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
int
base
=
index
*
TILE_SIZE
;
while
(
base
<
NUM_ATOMS
)
{
real4
pos
=
posq
[
base
];
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_POS
(
pos
)
#endif
real4
minPos
=
pos
;
real4
maxPos
=
pos
;
int
last
=
min
(
base
+
TILE_SIZE
,
NUM_ATOMS
);
for
(
int
i
=
base
+
1
;
i
<
last
;
i
++
)
{
pos
=
posq
[
i
];
#ifdef USE_PERIODIC
real4
center
=
0.5
f
*
(
maxPos
+
minPos
);
APPLY_PERIODIC_TO_POS_WITH_CENTER
(
pos
,
center
)
#endif
minPos
=
make_real4
(
min
(
minPos
.
x
,
pos
.
x
),
min
(
minPos
.
y
,
pos
.
y
),
min
(
minPos
.
z
,
pos
.
z
),
0
);
maxPos
=
make_real4
(
max
(
maxPos
.
x
,
pos
.
x
),
max
(
maxPos
.
y
,
pos
.
y
),
max
(
maxPos
.
z
,
pos
.
z
),
0
);
}
real4
blockSize
=
0.5
f
*
(
maxPos
-
minPos
);
blockBoundingBox
[
index
]
=
blockSize
;
blockCenter
[
index
]
=
0.5
f
*
(
maxPos
+
minPos
);
index
+=
blockDim
.
x
*
gridDim
.
x
;
base
=
index
*
TILE_SIZE
;
}
if
(
blockIdx
.
x
==
0
&&
threadIdx
.
x
==
0
)
*
numNeighborPairs
=
0
;
}
/**
* Find a list of neighbors for each atom.
*/
extern
"C"
__global__
void
findNeighbors
(
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
const
real4
*
__restrict__
posq
,
const
real4
*
__restrict__
blockCenter
,
const
real4
*
__restrict__
blockBoundingBox
,
int2
*
__restrict__
neighborPairs
,
int
*
__restrict__
numNeighborPairs
,
int
*
__restrict__
numNeighborsForAtom
,
int
maxNeighborPairs
#ifdef USE_EXCLUSIONS
,
const
int
*
__restrict__
exclusions
,
const
int
*
__restrict__
exclusionStartIndex
#endif
)
{
__shared__
real3
positionCache
[
FIND_NEIGHBORS_WORKGROUP_SIZE
];
int
indexInWarp
=
threadIdx
.
x
%
32
;
for
(
int
atom1
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
atom1
<
PADDED_NUM_ATOMS
;
atom1
+=
blockDim
.
x
*
gridDim
.
x
)
{
// Load data for this atom. Note that all threads in a warp are processing atoms from the same block.
real3
pos1
=
trim
(
posq
[
atom1
]);
int
block1
=
atom1
/
TILE_SIZE
;
real4
blockCenter1
=
blockCenter
[
block1
];
real4
blockSize1
=
blockBoundingBox
[
block1
];
int
totalNeighborsForAtom1
=
0
;
// Loop over atom blocks to search for neighbors. The threads in a warp compare block1 against 32
// other blocks in parallel.
#ifdef USE_CENTRAL_PARTICLE
int
startBlock
=
0
;
#else
int
startBlock
=
block1
;
#endif
for
(
int
block2Base
=
startBlock
;
block2Base
<
NUM_BLOCKS
;
block2Base
+=
32
)
{
int
block2
=
block2Base
+
indexInWarp
;
bool
includeBlock2
=
(
block2
<
NUM_BLOCKS
);
if
(
includeBlock2
)
{
real4
blockCenter2
=
blockCenter
[
block2
];
real4
blockSize2
=
blockBoundingBox
[
block2
];
real4
blockDelta
=
blockCenter1
-
blockCenter2
;
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
blockDelta
)
#endif
blockDelta
.
x
=
max
(
0.0
f
,
fabs
(
blockDelta
.
x
)
-
blockSize1
.
x
-
blockSize2
.
x
);
blockDelta
.
y
=
max
(
0.0
f
,
fabs
(
blockDelta
.
y
)
-
blockSize1
.
y
-
blockSize2
.
y
);
blockDelta
.
z
=
max
(
0.0
f
,
fabs
(
blockDelta
.
z
)
-
blockSize1
.
z
-
blockSize2
.
z
);
includeBlock2
&=
(
blockDelta
.
x
*
blockDelta
.
x
+
blockDelta
.
y
*
blockDelta
.
y
+
blockDelta
.
z
*
blockDelta
.
z
<
CUTOFF_SQUARED
);
}
// Loop over any blocks we identified as potentially containing neighbors.
int
includeBlockFlags
=
BALLOT
(
includeBlock2
);
while
(
includeBlockFlags
!=
0
)
{
int
i
=
__ffs
(
includeBlockFlags
)
-
1
;
includeBlockFlags
&=
includeBlockFlags
-
1
;
int
block2
=
block2Base
+
i
;
// Loop over atoms in this block.
int
start
=
block2
*
TILE_SIZE
;
int
included
[
TILE_SIZE
];
int
numIncluded
=
0
;
positionCache
[
threadIdx
.
x
]
=
trim
(
posq
[
start
+
indexInWarp
]);
if
(
atom1
<
NUM_ATOMS
)
{
for
(
int
j
=
0
;
j
<
32
;
j
++
)
{
int
atom2
=
start
+
j
;
real3
pos2
=
positionCache
[
threadIdx
.
x
-
indexInWarp
+
j
];
// Decide whether to include this atom pair in the neighbor list.
real4
atomDelta
=
delta
(
pos1
,
pos2
,
periodicBoxSize
,
invPeriodicBoxSize
,
periodicBoxVecX
,
periodicBoxVecY
,
periodicBoxVecZ
);
#ifdef USE_CENTRAL_PARTICLE
bool
includeAtom
=
(
atom2
!=
atom1
&&
atom2
<
NUM_ATOMS
&&
atomDelta
.
w
<
CUTOFF_SQUARED
);
#else
bool
includeAtom
=
(
atom2
>
atom1
&&
atom2
<
NUM_ATOMS
&&
atomDelta
.
w
<
CUTOFF_SQUARED
);
#endif
#ifdef USE_EXCLUSIONS
if
(
includeAtom
)
includeAtom
&=
!
isInteractionExcluded
(
atom1
,
atom2
,
exclusions
,
exclusionStartIndex
);
#endif
if
(
includeAtom
)
included
[
numIncluded
++
]
=
atom2
;
}
}
// If we found any neighbors, store them to the neighbor list.
if
(
numIncluded
>
0
)
{
int
baseIndex
=
atomicAdd
(
numNeighborPairs
,
numIncluded
);
if
(
baseIndex
+
numIncluded
<=
maxNeighborPairs
)
for
(
int
j
=
0
;
j
<
numIncluded
;
j
++
)
neighborPairs
[
baseIndex
+
j
]
=
make_int2
(
atom1
,
included
[
j
]);
totalNeighborsForAtom1
+=
numIncluded
;
}
}
}
if
(
atom1
<
NUM_ATOMS
)
numNeighborsForAtom
[
atom1
]
=
totalNeighborsForAtom1
;
}
}
/**
* Sum the neighbor counts to compute the start position of each atom. This kernel
* is executed as a single work group.
*/
extern
"C"
__global__
void
computeNeighborStartIndices
(
int
*
__restrict__
numNeighborsForAtom
,
int
*
__restrict__
neighborStartIndex
,
int
*
__restrict__
numNeighborPairs
,
int
maxNeighborPairs
)
{
extern
__shared__
unsigned
int
posBuffer
[];
if
(
*
numNeighborPairs
>
maxNeighborPairs
)
{
// There wasn't enough memory for the neighbor list, so we'll need to rebuild it. Set the neighbor start
// indices to indicate no neighbors for any atom.
for
(
int
i
=
threadIdx
.
x
;
i
<=
NUM_ATOMS
;
i
+=
blockDim
.
x
)
neighborStartIndex
[
i
]
=
0
;
return
;
}
unsigned
int
globalOffset
=
0
;
for
(
unsigned
int
startAtom
=
0
;
startAtom
<
NUM_ATOMS
;
startAtom
+=
blockDim
.
x
)
{
// Load the neighbor counts into local memory.
unsigned
int
globalIndex
=
startAtom
+
threadIdx
.
x
;
posBuffer
[
threadIdx
.
x
]
=
(
globalIndex
<
NUM_ATOMS
?
numNeighborsForAtom
[
globalIndex
]
:
0
);
__syncthreads
();
// Perform a parallel prefix sum.
for
(
unsigned
int
step
=
1
;
step
<
blockDim
.
x
;
step
*=
2
)
{
unsigned
int
add
=
(
threadIdx
.
x
>=
step
?
posBuffer
[
threadIdx
.
x
-
step
]
:
0
);
__syncthreads
();
posBuffer
[
threadIdx
.
x
]
+=
add
;
__syncthreads
();
}
// Write the results back to global memory.
if
(
globalIndex
<
NUM_ATOMS
)
{
neighborStartIndex
[
globalIndex
+
1
]
=
posBuffer
[
threadIdx
.
x
]
+
globalOffset
;
numNeighborsForAtom
[
globalIndex
]
=
0
;
// Clear this so the next kernel can use it as a counter
}
globalOffset
+=
posBuffer
[
blockDim
.
x
-
1
];
__syncthreads
();
}
if
(
threadIdx
.
x
==
0
)
neighborStartIndex
[
0
]
=
0
;
}
/**
* Assemble the final neighbor list.
*/
extern
"C"
__global__
void
copyPairsToNeighborList
(
const
int2
*
__restrict__
neighborPairs
,
int
*
__restrict__
neighbors
,
int
*
__restrict__
numNeighborPairs
,
int
maxNeighborPairs
,
int
*
__restrict__
numNeighborsForAtom
,
const
int
*
__restrict__
neighborStartIndex
)
{
int
actualPairs
=
*
numNeighborPairs
;
if
(
actualPairs
>
maxNeighborPairs
)
return
;
// There wasn't enough memory for the neighbor list, so we'll need to rebuild it.
for
(
unsigned
int
index
=
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
;
index
<
actualPairs
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
int2
pair
=
neighborPairs
[
index
];
int
startIndex
=
neighborStartIndex
[
pair
.
x
];
int
offset
=
atomicAdd
(
numNeighborsForAtom
+
pair
.
x
,
1
);
neighbors
[
startIndex
+
offset
]
=
pair
.
y
;
}
}
platforms/cuda/src/kernels/customNonbondedGroups.cu
deleted
100644 → 0
View file @
8dd60914
typedef
struct
{
real
x
,
y
,
z
;
real
q
;
real
fx
,
fy
,
fz
;
ATOM_PARAMETER_DATA
#ifndef PARAMETER_SIZE_IS_EVEN
real
padding
;
#endif
}
AtomData
;
/**
* Find the maximum of a value across all threads in a warp, and return that to
* every thread. This is only needed on Volta and later. On earlier architectures,
* we can just return the value that was passed in.
*/
__device__
int
reduceMax
(
int
val
)
{
#if __CUDA_ARCH__ >= 700
for
(
int
mask
=
16
;
mask
>
0
;
mask
/=
2
)
val
=
max
(
val
,
__shfl_xor_sync
(
0xffffffff
,
val
,
mask
));
#endif
return
val
;
}
extern
"C"
__global__
void
computeInteractionGroups
(
unsigned
long
long
*
__restrict__
forceBuffers
,
mixed
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
posq
,
const
int4
*
__restrict__
groupData
,
const
int
*
__restrict__
numGroupTiles
,
bool
useNeighborList
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
PARAMETER_ARGUMENTS
)
{
const
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
const
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
TILE_SIZE
;
// global warpIndex
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
// index within the warp
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
// block warpIndex
mixed
energy
=
0
;
INIT_DERIVATIVES
__shared__
AtomData
localData
[
LOCAL_MEMORY_SIZE
];
const
unsigned
int
startTile
=
(
useNeighborList
?
warp
*
numGroupTiles
[
0
]
/
totalWarps
:
FIRST_TILE
+
warp
*
(
LAST_TILE
-
FIRST_TILE
)
/
totalWarps
);
const
unsigned
int
endTile
=
(
useNeighborList
?
(
warp
+
1
)
*
numGroupTiles
[
0
]
/
totalWarps
:
FIRST_TILE
+
(
warp
+
1
)
*
(
LAST_TILE
-
FIRST_TILE
)
/
totalWarps
);
for
(
int
tile
=
startTile
;
tile
<
endTile
;
tile
++
)
{
const
int4
atomData
=
groupData
[
TILE_SIZE
*
tile
+
tgx
];
const
int
atom1
=
atomData
.
x
;
const
int
atom2
=
atomData
.
y
;
const
int
rangeStart
=
atomData
.
z
&
0xFFFF
;
const
int
rangeEnd
=
(
atomData
.
z
>>
16
)
&
0xFFFF
;
const
int
exclusions
=
atomData
.
w
;
real4
posq1
=
posq
[
atom1
];
LOAD_ATOM1_PARAMETERS
real3
force
=
make_real3
(
0
);
real4
posq2
=
posq
[
atom2
];
localData
[
threadIdx
.
x
].
x
=
posq2
.
x
;
localData
[
threadIdx
.
x
].
y
=
posq2
.
y
;
localData
[
threadIdx
.
x
].
z
=
posq2
.
z
;
localData
[
threadIdx
.
x
].
q
=
posq2
.
w
;
LOAD_LOCAL_PARAMETERS
localData
[
threadIdx
.
x
].
fx
=
0.0
f
;
localData
[
threadIdx
.
x
].
fy
=
0.0
f
;
localData
[
threadIdx
.
x
].
fz
=
0.0
f
;
int
tj
=
tgx
;
int
rangeStop
=
rangeStart
+
reduceMax
(
rangeEnd
-
rangeStart
);
SYNC_WARPS
;
for
(
int
j
=
rangeStart
;
j
<
rangeStop
;
j
++
)
{
if
(
j
<
rangeEnd
)
{
bool
isExcluded
=
(((
exclusions
>>
tj
)
&
1
)
==
0
);
int
localIndex
=
tbx
+
tj
;
posq2
=
make_real4
(
localData
[
localIndex
].
x
,
localData
[
localIndex
].
y
,
localData
[
localIndex
].
z
,
localData
[
localIndex
].
q
);
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
delta
)
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
#ifdef USE_CUTOFF
if
(
!
isExcluded
&&
r2
<
CUTOFF_SQUARED
)
{
#endif
real
invR
=
RSQRT
(
r2
);
real
r
=
r2
*
invR
;
LOAD_ATOM2_PARAMETERS
real
dEdR
=
0.0
f
;
real
tempEnergy
=
0.0
f
;
const
real
interactionScale
=
1.0
f
;
COMPUTE_INTERACTION
energy
+=
tempEnergy
;
delta
*=
dEdR
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
z
-=
delta
.
z
;
localData
[
localIndex
].
fx
+=
delta
.
x
;
localData
[
localIndex
].
fy
+=
delta
.
y
;
localData
[
localIndex
].
fz
+=
delta
.
z
;
#ifdef USE_CUTOFF
}
#endif
tj
=
(
tj
==
rangeEnd
-
1
?
rangeStart
:
tj
+
1
);
}
SYNC_WARPS
;
}
if
(
exclusions
!=
0
)
{
atomicAdd
(
&
forceBuffers
[
atom1
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom1
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom1
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
z
*
0x100000000
)));
}
atomicAdd
(
&
forceBuffers
[
atom2
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fx
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom2
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fy
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom2
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fz
*
0x100000000
)));
SYNC_WARPS
;
}
energyBuffer
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
energy
;
SAVE_DERIVATIVES
}
/**
* If the neighbor list needs to be rebuilt, reset the number of tiles to 0. This is
* executed by a single thread.
*/
extern
"C"
__global__
void
prepareToBuildNeighborList
(
int
*
__restrict__
rebuildNeighborList
,
int
*
__restrict__
numGroupTiles
)
{
if
(
rebuildNeighborList
[
0
]
==
1
)
numGroupTiles
[
0
]
=
0
;
}
/**
* Filter the list of tiles to include only ones that have interactions within the
* padded cutoff.
*/
extern
"C"
__global__
void
buildNeighborList
(
int
*
__restrict__
rebuildNeighborList
,
int
*
__restrict__
numGroupTiles
,
const
real4
*
__restrict__
posq
,
const
int4
*
__restrict__
groupData
,
int4
*
__restrict__
filteredGroupData
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
)
{
// If the neighbor list doesn't need to be rebuilt on this step, return immediately.
if
(
rebuildNeighborList
[
0
]
==
0
)
return
;
const
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
const
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
TILE_SIZE
;
// global warpIndex
const
unsigned
int
local_warp
=
threadIdx
.
x
/
TILE_SIZE
;
// local warpIndex
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
// index within the warp
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
// block warpIndex
__shared__
real4
localPos
[
LOCAL_MEMORY_SIZE
];
__shared__
volatile
bool
anyInteraction
[
WARPS_IN_BLOCK
];
__shared__
volatile
int
tileIndex
[
WARPS_IN_BLOCK
];
const
unsigned
int
startTile
=
warp
*
NUM_TILES
/
totalWarps
;
const
unsigned
int
endTile
=
(
warp
+
1
)
*
NUM_TILES
/
totalWarps
;
for
(
int
tile
=
startTile
;
tile
<
endTile
;
tile
++
)
{
const
int4
atomData
=
groupData
[
TILE_SIZE
*
tile
+
tgx
];
const
int
atom1
=
atomData
.
x
;
const
int
atom2
=
atomData
.
y
;
const
int
rangeStart
=
atomData
.
z
&
0xFFFF
;
const
int
rangeEnd
=
(
atomData
.
z
>>
16
)
&
0xFFFF
;
const
int
exclusions
=
atomData
.
w
;
real4
posq1
=
posq
[
atom1
];
localPos
[
threadIdx
.
x
]
=
posq
[
atom2
];
if
(
tgx
==
0
)
anyInteraction
[
local_warp
]
=
false
;
int
tj
=
tgx
;
int
rangeStop
=
rangeStart
+
reduceMax
(
rangeEnd
-
rangeStart
);
SYNC_WARPS
;
for
(
int
j
=
rangeStart
;
j
<
rangeStop
&&
!
anyInteraction
[
local_warp
];
j
++
)
{
SYNC_WARPS
;
if
(
j
<
rangeEnd
&&
tj
<
rangeEnd
)
{
bool
isExcluded
=
(((
exclusions
>>
tj
)
&
1
)
==
0
);
int
localIndex
=
tbx
+
tj
;
real3
delta
=
make_real3
(
localPos
[
localIndex
].
x
-
posq1
.
x
,
localPos
[
localIndex
].
y
-
posq1
.
y
,
localPos
[
localIndex
].
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
delta
)
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
if
(
!
isExcluded
&&
r2
<
PADDED_CUTOFF_SQUARED
)
anyInteraction
[
local_warp
]
=
true
;
}
tj
=
(
tj
==
rangeEnd
-
1
?
rangeStart
:
tj
+
1
);
SYNC_WARPS
;
}
if
(
anyInteraction
[
local_warp
])
{
SYNC_WARPS
;
if
(
tgx
==
0
)
tileIndex
[
local_warp
]
=
atomicAdd
(
numGroupTiles
,
1
);
SYNC_WARPS
;
filteredGroupData
[
TILE_SIZE
*
tileIndex
[
local_warp
]
+
tgx
]
=
atomData
;
}
}
}
platforms/cuda/src/kernels/gbsaObc1.cu
deleted
100644 → 0
View file @
8dd60914
#define DIELECTRIC_OFFSET 0.009f
#define PROBE_RADIUS 0.14f
#define WARPS_PER_GROUP (FORCE_WORK_GROUP_SIZE/TILE_SIZE)
/**
* Reduce the Born sums to compute the Born radii.
*/
extern
"C"
__global__
void
reduceBornSum
(
float
alpha
,
float
beta
,
float
gamma
,
const
long
long
*
__restrict__
bornSum
,
const
float2
*
__restrict__
params
,
real
*
__restrict__
bornRadii
,
real
*
__restrict__
obcChain
)
{
for
(
unsigned
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
NUM_ATOMS
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
// Get summed Born data
real
sum
=
RECIP
(
0x100000000
)
*
bornSum
[
index
];
// Now calculate Born radius and OBC term.
float
offsetRadius
=
params
[
index
].
x
;
sum
*=
0.5
f
*
offsetRadius
;
real
sum2
=
sum
*
sum
;
real
sum3
=
sum
*
sum2
;
real
tanhSum
=
tanh
(
alpha
*
sum
-
beta
*
sum2
+
gamma
*
sum3
);
real
nonOffsetRadius
=
offsetRadius
+
DIELECTRIC_OFFSET
;
real
radius
=
RECIP
(
RECIP
(
offsetRadius
)
-
tanhSum
/
nonOffsetRadius
);
real
chain
=
offsetRadius
*
(
alpha
-
2.0
f
*
beta
*
sum
+
3.0
f
*
gamma
*
sum2
);
chain
=
(
1
-
tanhSum
*
tanhSum
)
*
chain
/
nonOffsetRadius
;
bornRadii
[
index
]
=
radius
;
obcChain
[
index
]
=
chain
;
}
}
/**
* Reduce the Born force.
*/
extern
"C"
__global__
void
reduceBornForce
(
long
long
*
__restrict__
bornForce
,
mixed
*
__restrict__
energyBuffer
,
const
float2
*
__restrict__
params
,
const
real
*
__restrict__
bornRadii
,
const
real
*
__restrict__
obcChain
)
{
mixed
energy
=
0
;
for
(
unsigned
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
NUM_ATOMS
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
// Get summed Born force
real
force
=
RECIP
(
0x100000000
)
*
bornForce
[
index
];
// Now calculate the actual force
float
offsetRadius
=
params
[
index
].
x
;
real
bornRadius
=
bornRadii
[
index
];
real
r
=
offsetRadius
+
DIELECTRIC_OFFSET
+
PROBE_RADIUS
;
real
ratio6
=
POW
((
offsetRadius
+
DIELECTRIC_OFFSET
)
/
bornRadius
,
6
);
real
saTerm
=
SURFACE_AREA_FACTOR
*
r
*
r
*
ratio6
;
force
+=
saTerm
/
bornRadius
;
energy
+=
saTerm
;
force
*=
bornRadius
*
bornRadius
*
obcChain
[
index
];
bornForce
[
index
]
=
(
long
long
)
(
force
*
0x100000000
);
}
energyBuffer
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
energy
/-
6
;
}
typedef
struct
{
real
x
,
y
,
z
;
real
q
;
float
radius
,
scaledRadius
;
real
bornSum
;
}
AtomData1
;
/**
* Compute the Born sum.
*/
extern
"C"
__global__
void
computeBornSum
(
unsigned
long
long
*
__restrict__
global_bornSum
,
const
real4
*
__restrict__
posq
,
const
real
*
__restrict__
charge
,
const
float2
*
__restrict__
global_params
,
#ifdef USE_CUTOFF
const
int
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
unsigned
int
maxTiles
,
const
real4
*
__restrict__
blockCenter
,
const
real4
*
__restrict__
blockSize
,
const
unsigned
int
*
__restrict__
interactingAtoms
,
#else
unsigned
int
numTiles
,
#endif
const
ushort2
*
__restrict__
exclusionTiles
)
{
const
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
const
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
TILE_SIZE
;
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
__shared__
AtomData1
localData
[
FORCE_WORK_GROUP_SIZE
];
// First loop: process tiles that contain exclusions.
const
unsigned
int
firstExclusionTile
=
FIRST_EXCLUSION_TILE
+
warp
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
totalWarps
;
const
unsigned
int
lastExclusionTile
=
FIRST_EXCLUSION_TILE
+
(
warp
+
1
)
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
totalWarps
;
for
(
int
pos
=
firstExclusionTile
;
pos
<
lastExclusionTile
;
pos
++
)
{
const
ushort2
tileIndices
=
exclusionTiles
[
pos
];
const
unsigned
int
x
=
tileIndices
.
x
;
const
unsigned
int
y
=
tileIndices
.
y
;
real
bornSum
=
0
;
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
real4
posq1
=
posq
[
atom1
];
real
charge1
=
charge
[
atom1
];
float2
params1
=
global_params
[
atom1
];
if
(
x
==
y
)
{
// This tile is on the diagonal.
localData
[
threadIdx
.
x
].
x
=
posq1
.
x
;
localData
[
threadIdx
.
x
].
y
=
posq1
.
y
;
localData
[
threadIdx
.
x
].
z
=
posq1
.
z
;
localData
[
threadIdx
.
x
].
q
=
charge1
;
localData
[
threadIdx
.
x
].
radius
=
params1
.
x
;
localData
[
threadIdx
.
x
].
scaledRadius
=
params1
.
y
;
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
real3
delta
=
make_real3
(
localData
[
tbx
+
j
].
x
-
posq1
.
x
,
localData
[
tbx
+
j
].
y
-
posq1
.
y
,
localData
[
tbx
+
j
].
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
delta
)
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
#ifdef USE_CUTOFF
if
(
atom1
<
NUM_ATOMS
&&
y
*
TILE_SIZE
+
j
<
NUM_ATOMS
&&
r2
<
CUTOFF_SQUARED
)
{
#else
if
(
atom1
<
NUM_ATOMS
&&
y
*
TILE_SIZE
+
j
<
NUM_ATOMS
)
{
#endif
real
invR
=
RSQRT
(
r2
);
real
r
=
r2
*
invR
;
float2
params2
=
make_float2
(
localData
[
tbx
+
j
].
radius
,
localData
[
tbx
+
j
].
scaledRadius
);
real
rScaledRadiusJ
=
r
+
params2
.
y
;
if
((
j
!=
tgx
)
&&
(
params1
.
x
<
rScaledRadiusJ
))
{
real
l_ij
=
RECIP
(
max
(
params1
.
x
,
fabs
(
r
-
params2
.
y
)));
real
u_ij
=
RECIP
(
rScaledRadiusJ
);
real
l_ij2
=
l_ij
*
l_ij
;
real
u_ij2
=
u_ij
*
u_ij
;
real
ratio
=
LOG
(
u_ij
*
RECIP
(
l_ij
));
bornSum
+=
l_ij
-
u_ij
+
(
0.50
f
*
invR
*
ratio
)
+
0.25
f
*
(
r
*
(
u_ij2
-
l_ij2
)
+
(
params2
.
y
*
params2
.
y
*
invR
)
*
(
l_ij2
-
u_ij2
));
bornSum
+=
(
params1
.
x
<
params2
.
y
-
r
?
2.0
f
*
(
RECIP
(
params1
.
x
)
-
l_ij
)
:
0
);
}
}
}
}
else
{
// This is an off-diagonal tile.
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
real4
tempPosq
=
posq
[
j
];
localData
[
threadIdx
.
x
].
x
=
tempPosq
.
x
;
localData
[
threadIdx
.
x
].
y
=
tempPosq
.
y
;
localData
[
threadIdx
.
x
].
z
=
tempPosq
.
z
;
localData
[
threadIdx
.
x
].
q
=
charge
[
j
];
float2
tempParams
=
global_params
[
j
];
localData
[
threadIdx
.
x
].
radius
=
tempParams
.
x
;
localData
[
threadIdx
.
x
].
scaledRadius
=
tempParams
.
y
;
localData
[
threadIdx
.
x
].
bornSum
=
0.0
f
;
// Compute the full set of interactions in this tile.
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
real3
delta
=
make_real3
(
localData
[
tbx
+
tj
].
x
-
posq1
.
x
,
localData
[
tbx
+
tj
].
y
-
posq1
.
y
,
localData
[
tbx
+
tj
].
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
delta
)
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
#ifdef USE_CUTOFF
if
(
atom1
<
NUM_ATOMS
&&
y
*
TILE_SIZE
+
tj
<
NUM_ATOMS
&&
r2
<
CUTOFF_SQUARED
)
{
#else
if
(
atom1
<
NUM_ATOMS
&&
y
*
TILE_SIZE
+
tj
<
NUM_ATOMS
)
{
#endif
real
invR
=
RSQRT
(
r2
);
real
r
=
r2
*
invR
;
float2
params2
=
make_float2
(
localData
[
tbx
+
tj
].
radius
,
localData
[
tbx
+
tj
].
scaledRadius
);
real
rScaledRadiusJ
=
r
+
params2
.
y
;
if
(
params1
.
x
<
rScaledRadiusJ
)
{
real
l_ij
=
RECIP
(
max
(
params1
.
x
,
fabs
(
r
-
params2
.
y
)));
real
u_ij
=
RECIP
(
rScaledRadiusJ
);
real
l_ij2
=
l_ij
*
l_ij
;
real
u_ij2
=
u_ij
*
u_ij
;
real
ratio
=
LOG
(
u_ij
*
RECIP
(
l_ij
));
bornSum
+=
l_ij
-
u_ij
+
(
0.50
f
*
invR
*
ratio
)
+
0.25
f
*
(
r
*
(
u_ij2
-
l_ij2
)
+
(
params2
.
y
*
params2
.
y
*
invR
)
*
(
l_ij2
-
u_ij2
));
bornSum
+=
(
params1
.
x
<
params2
.
y
-
r
?
2.0
f
*
(
RECIP
(
params1
.
x
)
-
l_ij
)
:
0
);
}
real
rScaledRadiusI
=
r
+
params1
.
y
;
if
(
params2
.
x
<
rScaledRadiusI
)
{
real
l_ij
=
RECIP
(
max
(
params2
.
x
,
fabs
(
r
-
params1
.
y
)));
real
u_ij
=
RECIP
(
rScaledRadiusI
);
real
l_ij2
=
l_ij
*
l_ij
;
real
u_ij2
=
u_ij
*
u_ij
;
real
ratio
=
LOG
(
u_ij
*
RECIP
(
l_ij
));
real
term
=
l_ij
-
u_ij
+
(
0.50
f
*
invR
*
ratio
)
+
0.25
f
*
(
r
*
(
u_ij2
-
l_ij2
)
+
(
params1
.
y
*
params1
.
y
*
invR
)
*
(
l_ij2
-
u_ij2
));
term
+=
(
params2
.
x
<
params1
.
y
-
r
?
2.0
f
*
(
RECIP
(
params2
.
x
)
-
l_ij
)
:
0
);
localData
[
tbx
+
tj
].
bornSum
+=
term
;
}
}
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
// Write results.
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
global_bornSum
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
bornSum
*
0x100000000
)));
if
(
x
!=
y
)
{
offset
=
y
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
global_bornSum
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
bornSum
*
0x100000000
)));
}
}
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
// of them (no cutoff).
#ifdef USE_CUTOFF
unsigned
int
numTiles
=
interactionCount
[
0
];
if
(
numTiles
>
maxTiles
)
return
;
// There wasn't enough memory for the neighbor list.
int
pos
=
(
int
)
(
warp
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
((
long
long
)
NUM_BLOCKS
+
1
)
/
2
:
(
long
)
numTiles
)
/
totalWarps
);
int
end
=
(
int
)
((
warp
+
1
)
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
((
long
long
)
NUM_BLOCKS
+
1
)
/
2
:
(
long
)
numTiles
)
/
totalWarps
);
#else
int
pos
=
(
int
)
(
warp
*
(
long
long
)
numTiles
/
totalWarps
);
int
end
=
(
int
)
((
warp
+
1
)
*
(
long
long
)
numTiles
/
totalWarps
);
#endif
int
skipBase
=
0
;
int
currentSkipIndex
=
tbx
;
__shared__
int
atomIndices
[
FORCE_WORK_GROUP_SIZE
];
__shared__
volatile
int
skipTiles
[
FORCE_WORK_GROUP_SIZE
];
skipTiles
[
threadIdx
.
x
]
=
-
1
;
while
(
pos
<
end
)
{
real
bornSum
=
0
;
bool
includeTile
=
true
;
// Extract the coordinates of this tile.
int
x
,
y
;
bool
singlePeriodicCopy
=
false
;
#ifdef USE_CUTOFF
x
=
tiles
[
pos
];
real4
blockSizeX
=
blockSize
[
x
];
singlePeriodicCopy
=
(
0.5
f
*
periodicBoxSize
.
x
-
blockSizeX
.
x
>=
CUTOFF
&&
0.5
f
*
periodicBoxSize
.
y
-
blockSizeX
.
y
>=
CUTOFF
&&
0.5
f
*
periodicBoxSize
.
z
-
blockSizeX
.
z
>=
CUTOFF
);
#else
y
=
(
int
)
floor
(
NUM_BLOCKS
+
0.5
f
-
SQRT
((
NUM_BLOCKS
+
0.5
f
)
*
(
NUM_BLOCKS
+
0.5
f
)
-
2
*
pos
));
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
if
(
x
<
y
||
x
>=
NUM_BLOCKS
)
{
// Occasionally happens due to roundoff error.
y
+=
(
x
<
y
?
-
1
:
1
);
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
}
// Skip over tiles that have exclusions, since they were already processed.
while
(
skipTiles
[
tbx
+
TILE_SIZE
-
1
]
<
pos
)
{
if
(
skipBase
+
tgx
<
NUM_TILES_WITH_EXCLUSIONS
)
{
ushort2
tile
=
exclusionTiles
[
skipBase
+
tgx
];
skipTiles
[
threadIdx
.
x
]
=
tile
.
x
+
tile
.
y
*
NUM_BLOCKS
-
tile
.
y
*
(
tile
.
y
+
1
)
/
2
;
}
else
skipTiles
[
threadIdx
.
x
]
=
end
;
skipBase
+=
TILE_SIZE
;
currentSkipIndex
=
tbx
;
}
while
(
skipTiles
[
currentSkipIndex
]
<
pos
)
currentSkipIndex
++
;
includeTile
=
(
skipTiles
[
currentSkipIndex
]
!=
pos
);
#endif
if
(
includeTile
)
{
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
// Load atom data for this tile.
real4
posq1
=
posq
[
atom1
];
real
charge1
=
charge
[
atom1
];
float2
params1
=
global_params
[
atom1
];
#ifdef USE_CUTOFF
unsigned
int
j
=
interactingAtoms
[
pos
*
TILE_SIZE
+
tgx
];
#else
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
#endif
atomIndices
[
threadIdx
.
x
]
=
j
;
if
(
j
<
PADDED_NUM_ATOMS
)
{
real4
tempPosq
=
posq
[
j
];
localData
[
threadIdx
.
x
].
x
=
tempPosq
.
x
;
localData
[
threadIdx
.
x
].
y
=
tempPosq
.
y
;
localData
[
threadIdx
.
x
].
z
=
tempPosq
.
z
;
localData
[
threadIdx
.
x
].
q
=
charge
[
j
];
float2
tempParams
=
global_params
[
j
];
localData
[
threadIdx
.
x
].
radius
=
tempParams
.
x
;
localData
[
threadIdx
.
x
].
scaledRadius
=
tempParams
.
y
;
localData
[
threadIdx
.
x
].
bornSum
=
0.0
f
;
}
#ifdef USE_PERIODIC
if
(
singlePeriodicCopy
)
{
// The box is small enough that we can just translate all the atoms into a single periodic
// box, then skip having to apply periodic boundary conditions later.
real4
blockCenterX
=
blockCenter
[
x
];
APPLY_PERIODIC_TO_POS_WITH_CENTER
(
posq1
,
blockCenterX
)
APPLY_PERIODIC_TO_POS_WITH_CENTER
(
localData
[
threadIdx
.
x
],
blockCenterX
)
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
real3
delta
=
make_real3
(
localData
[
tbx
+
tj
].
x
-
posq1
.
x
,
localData
[
tbx
+
tj
].
y
-
posq1
.
y
,
localData
[
tbx
+
tj
].
z
-
posq1
.
z
);
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
int
atom2
=
atomIndices
[
tbx
+
tj
];
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
&&
r2
<
CUTOFF_SQUARED
)
{
real
invR
=
RSQRT
(
r2
);
real
r
=
r2
*
invR
;
float2
params2
=
make_float2
(
localData
[
tbx
+
tj
].
radius
,
localData
[
tbx
+
tj
].
scaledRadius
);
real
rScaledRadiusJ
=
r
+
params2
.
y
;
if
(
params1
.
x
<
rScaledRadiusJ
)
{
real
l_ij
=
RECIP
(
max
(
params1
.
x
,
fabs
(
r
-
params2
.
y
)));
real
u_ij
=
RECIP
(
rScaledRadiusJ
);
real
l_ij2
=
l_ij
*
l_ij
;
real
u_ij2
=
u_ij
*
u_ij
;
real
ratio
=
LOG
(
u_ij
*
RECIP
(
l_ij
));
bornSum
+=
l_ij
-
u_ij
+
(
0.50
f
*
invR
*
ratio
)
+
0.25
f
*
(
r
*
(
u_ij2
-
l_ij2
)
+
(
params2
.
y
*
params2
.
y
*
invR
)
*
(
l_ij2
-
u_ij2
));
bornSum
+=
(
params1
.
x
<
params2
.
y
-
r
?
2.0
f
*
(
RECIP
(
params1
.
x
)
-
l_ij
)
:
0
);
}
real
rScaledRadiusI
=
r
+
params1
.
y
;
if
(
params2
.
x
<
rScaledRadiusI
)
{
real
l_ij
=
RECIP
(
max
(
params2
.
x
,
fabs
(
r
-
params1
.
y
)));
real
u_ij
=
RECIP
(
rScaledRadiusI
);
real
l_ij2
=
l_ij
*
l_ij
;
real
u_ij2
=
u_ij
*
u_ij
;
real
ratio
=
LOG
(
u_ij
*
RECIP
(
l_ij
));
real
term
=
l_ij
-
u_ij
+
(
0.50
f
*
invR
*
ratio
)
+
0.25
f
*
(
r
*
(
u_ij2
-
l_ij2
)
+
(
params1
.
y
*
params1
.
y
*
invR
)
*
(
l_ij2
-
u_ij2
));
term
+=
(
params2
.
x
<
params1
.
y
-
r
?
2.0
f
*
(
RECIP
(
params2
.
x
)
-
l_ij
)
:
0
);
localData
[
tbx
+
tj
].
bornSum
+=
term
;
}
}
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
else
#endif
{
// We need to apply periodic boundary conditions separately for each interaction.
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
real3
delta
=
make_real3
(
localData
[
tbx
+
tj
].
x
-
posq1
.
x
,
localData
[
tbx
+
tj
].
y
-
posq1
.
y
,
localData
[
tbx
+
tj
].
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
delta
)
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
int
atom2
=
atomIndices
[
tbx
+
tj
];
#ifdef USE_CUTOFF
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
&&
r2
<
CUTOFF_SQUARED
)
{
#else
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
#endif
real
invR
=
RSQRT
(
r2
);
real
r
=
r2
*
invR
;
float2
params2
=
make_float2
(
localData
[
tbx
+
tj
].
radius
,
localData
[
tbx
+
tj
].
scaledRadius
);
real
rScaledRadiusJ
=
r
+
params2
.
y
;
if
(
params1
.
x
<
rScaledRadiusJ
)
{
real
l_ij
=
RECIP
(
max
(
params1
.
x
,
fabs
(
r
-
params2
.
y
)));
real
u_ij
=
RECIP
(
rScaledRadiusJ
);
real
l_ij2
=
l_ij
*
l_ij
;
real
u_ij2
=
u_ij
*
u_ij
;
real
ratio
=
LOG
(
u_ij
*
RECIP
(
l_ij
));
bornSum
+=
l_ij
-
u_ij
+
(
0.50
f
*
invR
*
ratio
)
+
0.25
f
*
(
r
*
(
u_ij2
-
l_ij2
)
+
(
params2
.
y
*
params2
.
y
*
invR
)
*
(
l_ij2
-
u_ij2
));
bornSum
+=
(
params1
.
x
<
params2
.
y
-
r
?
2.0
f
*
(
RECIP
(
params1
.
x
)
-
l_ij
)
:
0
);
}
real
rScaledRadiusI
=
r
+
params1
.
y
;
if
(
params2
.
x
<
rScaledRadiusI
)
{
real
l_ij
=
RECIP
(
max
(
params2
.
x
,
fabs
(
r
-
params1
.
y
)));
real
u_ij
=
RECIP
(
rScaledRadiusI
);
real
l_ij2
=
l_ij
*
l_ij
;
real
u_ij2
=
u_ij
*
u_ij
;
real
ratio
=
LOG
(
u_ij
*
RECIP
(
l_ij
));
real
term
=
l_ij
-
u_ij
+
(
0.50
f
*
invR
*
ratio
)
+
0.25
f
*
(
r
*
(
u_ij2
-
l_ij2
)
+
(
params1
.
y
*
params1
.
y
*
invR
)
*
(
l_ij2
-
u_ij2
));
term
+=
(
params2
.
x
<
params1
.
y
-
r
?
2.0
f
*
(
RECIP
(
params2
.
x
)
-
l_ij
)
:
0
);
localData
[
tbx
+
tj
].
bornSum
+=
term
;
}
}
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
// Write results.
atomicAdd
(
&
global_bornSum
[
atom1
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
bornSum
*
0x100000000
)));
#ifdef USE_CUTOFF
unsigned
int
atom2
=
atomIndices
[
threadIdx
.
x
];
#else
unsigned
int
atom2
=
y
*
TILE_SIZE
+
tgx
;
#endif
if
(
atom2
<
PADDED_NUM_ATOMS
)
atomicAdd
(
&
global_bornSum
[
atom2
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
bornSum
*
0x100000000
)));
}
pos
++
;
}
}
typedef
struct
{
real
x
,
y
,
z
;
real
q
;
real
fx
,
fy
,
fz
,
fw
;
real
bornRadius
;
}
AtomData2
;
/**
* First part of computing the GBSA interaction.
*/
extern
"C"
__global__
void
computeGBSAForce1
(
unsigned
long
long
*
__restrict__
forceBuffers
,
unsigned
long
long
*
__restrict__
global_bornForce
,
mixed
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
posq
,
const
real
*
__restrict__
charge
,
const
real
*
__restrict__
global_bornRadii
,
bool
needEnergy
,
#ifdef USE_CUTOFF
const
int
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
unsigned
int
maxTiles
,
const
real4
*
__restrict__
blockCenter
,
const
real4
*
__restrict__
blockSize
,
const
unsigned
int
*
__restrict__
interactingAtoms
,
#else
unsigned
int
numTiles
,
#endif
const
ushort2
*
__restrict__
exclusionTiles
)
{
const
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
const
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
TILE_SIZE
;
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
mixed
energy
=
0
;
__shared__
AtomData2
localData
[
FORCE_WORK_GROUP_SIZE
];
// First loop: process tiles that contain exclusions.
const
unsigned
int
firstExclusionTile
=
FIRST_EXCLUSION_TILE
+
warp
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
totalWarps
;
const
unsigned
int
lastExclusionTile
=
FIRST_EXCLUSION_TILE
+
(
warp
+
1
)
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
totalWarps
;
for
(
int
pos
=
firstExclusionTile
;
pos
<
lastExclusionTile
;
pos
++
)
{
const
ushort2
tileIndices
=
exclusionTiles
[
pos
];
const
unsigned
int
x
=
tileIndices
.
x
;
const
unsigned
int
y
=
tileIndices
.
y
;
real4
force
=
make_real4
(
0
);
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
real4
posq1
=
posq
[
atom1
];
real
charge1
=
charge
[
atom1
];
real
bornRadius1
=
global_bornRadii
[
atom1
];
if
(
x
==
y
)
{
// This tile is on the diagonal.
localData
[
threadIdx
.
x
].
x
=
posq1
.
x
;
localData
[
threadIdx
.
x
].
y
=
posq1
.
y
;
localData
[
threadIdx
.
x
].
z
=
posq1
.
z
;
localData
[
threadIdx
.
x
].
q
=
charge1
;
localData
[
threadIdx
.
x
].
bornRadius
=
bornRadius1
;
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
if
(
atom1
<
NUM_ATOMS
&&
y
*
TILE_SIZE
+
j
<
NUM_ATOMS
)
{
real3
pos2
=
make_real3
(
localData
[
tbx
+
j
].
x
,
localData
[
tbx
+
j
].
y
,
localData
[
tbx
+
j
].
z
);
real
charge2
=
localData
[
tbx
+
j
].
q
;
real3
delta
=
make_real3
(
pos2
.
x
-
posq1
.
x
,
pos2
.
y
-
posq1
.
y
,
pos2
.
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
delta
)
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
#ifdef USE_CUTOFF
if
(
r2
<
CUTOFF_SQUARED
)
{
#endif
real
invR
=
RSQRT
(
r2
);
real
r
=
r2
*
invR
;
real
bornRadius2
=
localData
[
tbx
+
j
].
bornRadius
;
real
alpha2_ij
=
bornRadius1
*
bornRadius2
;
real
D_ij
=
r2
*
RECIP
(
4.0
f
*
alpha2_ij
);
real
expTerm
=
EXP
(
-
D_ij
);
real
denominator2
=
r2
+
alpha2_ij
*
expTerm
;
real
denominator
=
SQRT
(
denominator2
);
real
scaledChargeProduct
=
PREFACTOR
*
charge1
*
charge2
;
real
tempEnergy
=
scaledChargeProduct
*
RECIP
(
denominator
);
real
Gpol
=
tempEnergy
*
RECIP
(
denominator2
);
real
dGpol_dalpha2_ij
=
-
0.5
f
*
Gpol
*
expTerm
*
(
1.0
f
+
D_ij
);
real
dEdR
=
Gpol
*
(
1.0
f
-
0.25
f
*
expTerm
);
force
.
w
+=
dGpol_dalpha2_ij
*
bornRadius2
;
#ifdef USE_CUTOFF
if
(
atom1
!=
y
*
TILE_SIZE
+
j
)
tempEnergy
-=
scaledChargeProduct
/
CUTOFF
;
#endif
if
(
needEnergy
)
energy
+=
0.5
f
*
tempEnergy
;
delta
*=
dEdR
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
z
-=
delta
.
z
;
#ifdef USE_CUTOFF
}
#endif
}
}
}
else
{
// This is an off-diagonal tile.
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
real4
tempPosq
=
posq
[
j
];
localData
[
threadIdx
.
x
].
x
=
tempPosq
.
x
;
localData
[
threadIdx
.
x
].
y
=
tempPosq
.
y
;
localData
[
threadIdx
.
x
].
z
=
tempPosq
.
z
;
localData
[
threadIdx
.
x
].
q
=
charge
[
j
];
localData
[
threadIdx
.
x
].
bornRadius
=
global_bornRadii
[
j
];
localData
[
threadIdx
.
x
].
fx
=
0.0
f
;
localData
[
threadIdx
.
x
].
fy
=
0.0
f
;
localData
[
threadIdx
.
x
].
fz
=
0.0
f
;
localData
[
threadIdx
.
x
].
fw
=
0.0
f
;
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
if
(
atom1
<
NUM_ATOMS
&&
y
*
TILE_SIZE
+
tj
<
NUM_ATOMS
)
{
real3
pos2
=
make_real3
(
localData
[
tbx
+
tj
].
x
,
localData
[
tbx
+
tj
].
y
,
localData
[
tbx
+
tj
].
z
);
real
charge2
=
localData
[
tbx
+
tj
].
q
;
real3
delta
=
make_real3
(
pos2
.
x
-
posq1
.
x
,
pos2
.
y
-
posq1
.
y
,
pos2
.
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
delta
)
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
#ifdef USE_CUTOFF
if
(
r2
<
CUTOFF_SQUARED
)
{
#endif
real
invR
=
RSQRT
(
r2
);
real
r
=
r2
*
invR
;
real
bornRadius2
=
localData
[
tbx
+
tj
].
bornRadius
;
real
alpha2_ij
=
bornRadius1
*
bornRadius2
;
real
D_ij
=
r2
*
RECIP
(
4.0
f
*
alpha2_ij
);
real
expTerm
=
EXP
(
-
D_ij
);
real
denominator2
=
r2
+
alpha2_ij
*
expTerm
;
real
denominator
=
SQRT
(
denominator2
);
real
scaledChargeProduct
=
PREFACTOR
*
charge1
*
charge2
;
real
tempEnergy
=
scaledChargeProduct
*
RECIP
(
denominator
);
real
Gpol
=
tempEnergy
*
RECIP
(
denominator2
);
real
dGpol_dalpha2_ij
=
-
0.5
f
*
Gpol
*
expTerm
*
(
1.0
f
+
D_ij
);
real
dEdR
=
Gpol
*
(
1.0
f
-
0.25
f
*
expTerm
);
force
.
w
+=
dGpol_dalpha2_ij
*
bornRadius2
;
#ifdef USE_CUTOFF
tempEnergy
-=
scaledChargeProduct
/
CUTOFF
;
#endif
if
(
needEnergy
)
energy
+=
tempEnergy
;
delta
*=
dEdR
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
z
-=
delta
.
z
;
localData
[
tbx
+
tj
].
fx
+=
delta
.
x
;
localData
[
tbx
+
tj
].
fy
+=
delta
.
y
;
localData
[
tbx
+
tj
].
fz
+=
delta
.
z
;
localData
[
tbx
+
tj
].
fw
+=
dGpol_dalpha2_ij
*
bornRadius1
;
#ifdef USE_CUTOFF
}
#endif
}
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
// Write results.
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
z
*
0x100000000
)));
atomicAdd
(
&
global_bornForce
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
w
*
0x100000000
)));
if
(
x
!=
y
)
{
offset
=
y
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fx
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fy
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fz
*
0x100000000
)));
atomicAdd
(
&
global_bornForce
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fw
*
0x100000000
)));
}
}
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
// of them (no cutoff).
#ifdef USE_CUTOFF
unsigned
int
numTiles
=
interactionCount
[
0
];
if
(
numTiles
>
maxTiles
)
return
;
// There wasn't enough memory for the neighbor list.
int
pos
=
(
int
)
(
warp
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
((
long
long
)
NUM_BLOCKS
+
1
)
/
2
:
(
long
)
numTiles
)
/
totalWarps
);
int
end
=
(
int
)
((
warp
+
1
)
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
((
long
long
)
NUM_BLOCKS
+
1
)
/
2
:
(
long
)
numTiles
)
/
totalWarps
);
#else
int
pos
=
(
int
)
(
warp
*
(
long
long
)
numTiles
/
totalWarps
);
int
end
=
(
int
)
((
warp
+
1
)
*
(
long
long
)
numTiles
/
totalWarps
);
#endif
int
skipBase
=
0
;
int
currentSkipIndex
=
tbx
;
__shared__
int
atomIndices
[
FORCE_WORK_GROUP_SIZE
];
__shared__
volatile
int
skipTiles
[
FORCE_WORK_GROUP_SIZE
];
skipTiles
[
threadIdx
.
x
]
=
-
1
;
while
(
pos
<
end
)
{
real4
force
=
make_real4
(
0
);
bool
includeTile
=
true
;
// Extract the coordinates of this tile.
int
x
,
y
;
bool
singlePeriodicCopy
=
false
;
#ifdef USE_CUTOFF
x
=
tiles
[
pos
];
real4
blockSizeX
=
blockSize
[
x
];
singlePeriodicCopy
=
(
0.5
f
*
periodicBoxSize
.
x
-
blockSizeX
.
x
>=
CUTOFF
&&
0.5
f
*
periodicBoxSize
.
y
-
blockSizeX
.
y
>=
CUTOFF
&&
0.5
f
*
periodicBoxSize
.
z
-
blockSizeX
.
z
>=
CUTOFF
);
#else
y
=
(
int
)
floor
(
NUM_BLOCKS
+
0.5
f
-
SQRT
((
NUM_BLOCKS
+
0.5
f
)
*
(
NUM_BLOCKS
+
0.5
f
)
-
2
*
pos
));
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
if
(
x
<
y
||
x
>=
NUM_BLOCKS
)
{
// Occasionally happens due to roundoff error.
y
+=
(
x
<
y
?
-
1
:
1
);
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
}
// Skip over tiles that have exclusions, since they were already processed.
while
(
skipTiles
[
tbx
+
TILE_SIZE
-
1
]
<
pos
)
{
if
(
skipBase
+
tgx
<
NUM_TILES_WITH_EXCLUSIONS
)
{
ushort2
tile
=
exclusionTiles
[
skipBase
+
tgx
];
skipTiles
[
threadIdx
.
x
]
=
tile
.
x
+
tile
.
y
*
NUM_BLOCKS
-
tile
.
y
*
(
tile
.
y
+
1
)
/
2
;
}
else
skipTiles
[
threadIdx
.
x
]
=
end
;
skipBase
+=
TILE_SIZE
;
currentSkipIndex
=
tbx
;
}
while
(
skipTiles
[
currentSkipIndex
]
<
pos
)
currentSkipIndex
++
;
includeTile
=
(
skipTiles
[
currentSkipIndex
]
!=
pos
);
#endif
if
(
includeTile
)
{
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
// Load atom data for this tile.
real4
posq1
=
posq
[
atom1
];
real
charge1
=
charge
[
atom1
];
real
bornRadius1
=
global_bornRadii
[
atom1
];
#ifdef USE_CUTOFF
unsigned
int
j
=
interactingAtoms
[
pos
*
TILE_SIZE
+
tgx
];
#else
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
#endif
atomIndices
[
threadIdx
.
x
]
=
j
;
if
(
j
<
PADDED_NUM_ATOMS
)
{
real4
tempPosq
=
posq
[
j
];
localData
[
threadIdx
.
x
].
x
=
tempPosq
.
x
;
localData
[
threadIdx
.
x
].
y
=
tempPosq
.
y
;
localData
[
threadIdx
.
x
].
z
=
tempPosq
.
z
;
localData
[
threadIdx
.
x
].
q
=
charge
[
j
];
localData
[
threadIdx
.
x
].
bornRadius
=
global_bornRadii
[
j
];
localData
[
threadIdx
.
x
].
fx
=
0.0
f
;
localData
[
threadIdx
.
x
].
fy
=
0.0
f
;
localData
[
threadIdx
.
x
].
fz
=
0.0
f
;
localData
[
threadIdx
.
x
].
fw
=
0.0
f
;
}
#ifdef USE_PERIODIC
if
(
singlePeriodicCopy
)
{
// The box is small enough that we can just translate all the atoms into a single periodic
// box, then skip having to apply periodic boundary conditions later.
real4
blockCenterX
=
blockCenter
[
x
];
APPLY_PERIODIC_TO_POS_WITH_CENTER
(
posq1
,
blockCenterX
)
APPLY_PERIODIC_TO_POS_WITH_CENTER
(
localData
[
threadIdx
.
x
],
blockCenterX
)
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
atomIndices
[
tbx
+
tj
];
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
real3
pos2
=
make_real3
(
localData
[
tbx
+
tj
].
x
,
localData
[
tbx
+
tj
].
y
,
localData
[
tbx
+
tj
].
z
);
real
charge2
=
localData
[
tbx
+
tj
].
q
;
real3
delta
=
make_real3
(
pos2
.
x
-
posq1
.
x
,
pos2
.
y
-
posq1
.
y
,
pos2
.
z
-
posq1
.
z
);
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
if
(
r2
<
CUTOFF_SQUARED
)
{
real
invR
=
RSQRT
(
r2
);
real
r
=
r2
*
invR
;
real
bornRadius2
=
localData
[
tbx
+
tj
].
bornRadius
;
real
alpha2_ij
=
bornRadius1
*
bornRadius2
;
real
D_ij
=
r2
*
RECIP
(
4.0
f
*
alpha2_ij
);
real
expTerm
=
EXP
(
-
D_ij
);
real
denominator2
=
r2
+
alpha2_ij
*
expTerm
;
real
denominator
=
SQRT
(
denominator2
);
real
scaledChargeProduct
=
PREFACTOR
*
charge1
*
charge2
;
real
tempEnergy
=
scaledChargeProduct
*
RECIP
(
denominator
);
real
Gpol
=
tempEnergy
*
RECIP
(
denominator2
);
real
dGpol_dalpha2_ij
=
-
0.5
f
*
Gpol
*
expTerm
*
(
1.0
f
+
D_ij
);
real
dEdR
=
Gpol
*
(
1.0
f
-
0.25
f
*
expTerm
);
force
.
w
+=
dGpol_dalpha2_ij
*
bornRadius2
;
#ifdef USE_CUTOFF
tempEnergy
-=
scaledChargeProduct
/
CUTOFF
;
#endif
if
(
needEnergy
)
energy
+=
tempEnergy
;
delta
*=
dEdR
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
z
-=
delta
.
z
;
localData
[
tbx
+
tj
].
fx
+=
delta
.
x
;
localData
[
tbx
+
tj
].
fy
+=
delta
.
y
;
localData
[
tbx
+
tj
].
fz
+=
delta
.
z
;
localData
[
tbx
+
tj
].
fw
+=
dGpol_dalpha2_ij
*
bornRadius1
;
}
}
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
else
#endif
{
// We need to apply periodic boundary conditions separately for each interaction.
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
atomIndices
[
tbx
+
tj
];
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
real3
pos2
=
make_real3
(
localData
[
tbx
+
tj
].
x
,
localData
[
tbx
+
tj
].
y
,
localData
[
tbx
+
tj
].
z
);
real
charge2
=
localData
[
tbx
+
tj
].
q
;
real3
delta
=
make_real3
(
pos2
.
x
-
posq1
.
x
,
pos2
.
y
-
posq1
.
y
,
pos2
.
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
delta
)
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
#ifdef USE_CUTOFF
if
(
r2
<
CUTOFF_SQUARED
)
{
#endif
real
invR
=
RSQRT
(
r2
);
real
r
=
r2
*
invR
;
real
bornRadius2
=
localData
[
tbx
+
tj
].
bornRadius
;
real
alpha2_ij
=
bornRadius1
*
bornRadius2
;
real
D_ij
=
r2
*
RECIP
(
4.0
f
*
alpha2_ij
);
real
expTerm
=
EXP
(
-
D_ij
);
real
denominator2
=
r2
+
alpha2_ij
*
expTerm
;
real
denominator
=
SQRT
(
denominator2
);
real
scaledChargeProduct
=
PREFACTOR
*
charge1
*
charge2
;
real
tempEnergy
=
scaledChargeProduct
*
RECIP
(
denominator
);
real
Gpol
=
tempEnergy
*
RECIP
(
denominator2
);
real
dGpol_dalpha2_ij
=
-
0.5
f
*
Gpol
*
expTerm
*
(
1.0
f
+
D_ij
);
real
dEdR
=
Gpol
*
(
1.0
f
-
0.25
f
*
expTerm
);
force
.
w
+=
dGpol_dalpha2_ij
*
bornRadius2
;
#ifdef USE_CUTOFF
tempEnergy
-=
scaledChargeProduct
/
CUTOFF
;
#endif
if
(
needEnergy
)
energy
+=
tempEnergy
;
delta
*=
dEdR
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
z
-=
delta
.
z
;
localData
[
tbx
+
tj
].
fx
+=
delta
.
x
;
localData
[
tbx
+
tj
].
fy
+=
delta
.
y
;
localData
[
tbx
+
tj
].
fz
+=
delta
.
z
;
localData
[
tbx
+
tj
].
fw
+=
dGpol_dalpha2_ij
*
bornRadius1
;
#ifdef USE_CUTOFF
}
#endif
}
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
// Write results.
atomicAdd
(
&
forceBuffers
[
atom1
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom1
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom1
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
z
*
0x100000000
)));
atomicAdd
(
&
global_bornForce
[
atom1
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
w
*
0x100000000
)));
#ifdef USE_CUTOFF
unsigned
int
atom2
=
atomIndices
[
threadIdx
.
x
];
#else
unsigned
int
atom2
=
y
*
TILE_SIZE
+
tgx
;
#endif
if
(
atom2
<
PADDED_NUM_ATOMS
)
{
atomicAdd
(
&
forceBuffers
[
atom2
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fx
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom2
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fy
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom2
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fz
*
0x100000000
)));
atomicAdd
(
&
global_bornForce
[
atom2
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fw
*
0x100000000
)));
}
}
pos
++
;
}
energyBuffer
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
energy
;
}
platforms/cuda/src/kernels/nonbondedExceptions.cu
View file @
5a06df78
float4
exceptionParams
=
PARAMS
[
index
];
float4
exceptionParams
=
PARAMS
[
index
];
real3
delta
=
make_real3
(
pos2
.
x
-
pos1
.
x
,
pos2
.
y
-
pos1
.
y
,
pos2
.
z
-
pos1
.
z
);
real3
delta
=
make_real3
(
pos2
.
x
-
pos1
.
x
,
pos2
.
y
-
pos1
.
y
,
pos2
.
z
-
pos1
.
z
);
#if APPLY_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
delta
)
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
real
invR
=
RSQRT
(
r2
);
real
invR
=
RSQRT
(
r2
);
real
sig2
=
invR
*
exceptionParams
.
y
;
real
sig2
=
invR
*
exceptionParams
.
y
;
...
...
platforms/cuda/src/kernels/noseHooverChain.cu
0 → 100644
View file @
5a06df78
#include <initializer_list>
extern
"C"
__global__
void
propagateNoseHooverChain
(
mixed2
*
__restrict__
chainData
,
const
mixed2
*
__restrict__
energySum
,
mixed2
*
__restrict__
scaleFactor
,
mixed
*
__restrict__
chainMasses
,
mixed
*
__restrict__
chainForces
,
int
chainType
,
int
chainLength
,
int
numMTS
,
int
numDOFs
,
float
timeStep
,
mixed
kT
,
float
frequency
){
const
mixed
&
kineticEnergy
=
chainType
?
energySum
[
0
].
y
:
energySum
[
0
].
x
;
mixed
&
scale
=
chainType
?
scaleFactor
[
0
].
y
:
scaleFactor
[
0
].
x
;
scale
=
(
mixed
)
1
;
if
(
kineticEnergy
<
1e-8
)
return
;
for
(
int
bead
=
0
;
bead
<
chainLength
;
++
bead
)
chainMasses
[
bead
]
=
kT
/
(
frequency
*
frequency
);
chainMasses
[
0
]
*=
numDOFs
;
mixed
KE2
=
2.0
f
*
kineticEnergy
;
mixed
timeOverMTS
=
timeStep
/
numMTS
;
chainForces
[
0
]
=
(
KE2
-
numDOFs
*
kT
)
/
chainMasses
[
0
];
for
(
int
bead
=
0
;
bead
<
chainLength
-
1
;
++
bead
)
{
chainForces
[
bead
+
1
]
=
(
chainMasses
[
bead
]
*
chainData
[
bead
].
y
*
chainData
[
bead
].
y
-
kT
)
/
chainMasses
[
bead
+
1
];
}
for
(
int
mts
=
0
;
mts
<
numMTS
;
++
mts
)
{
BEGIN_YS_LOOP
mixed
wdt
=
ys
*
timeOverMTS
;
chainData
[
chainLength
-
1
].
y
+=
0.25
f
*
wdt
*
chainForces
[
chainLength
-
1
];
for
(
int
bead
=
chainLength
-
2
;
bead
>=
0
;
--
bead
)
{
mixed
aa
=
MIXEDEXP
(
-
0.125
f
*
wdt
*
chainData
[
bead
+
1
].
y
);
chainData
[
bead
].
y
=
aa
*
(
chainData
[
bead
].
y
*
aa
+
0.25
f
*
wdt
*
chainForces
[
bead
]);
}
// update particle velocities
mixed
aa
=
MIXEDEXP
(
-
0.5
f
*
wdt
*
chainData
[
0
].
y
);
scale
*=
aa
;
// update the thermostat positions
for
(
int
bead
=
0
;
bead
<
chainLength
;
++
bead
)
{
chainData
[
bead
].
x
+=
0.5
f
*
chainData
[
bead
].
y
*
wdt
;
}
// update the forces
chainForces
[
0
]
=
(
scale
*
scale
*
KE2
-
numDOFs
*
kT
)
/
chainMasses
[
0
];
// update thermostat velocities
for
(
int
bead
=
0
;
bead
<
chainLength
-
1
;
++
bead
)
{
mixed
aa
=
MIXEDEXP
(
-
0.125
f
*
wdt
*
chainData
[
bead
+
1
].
y
);
chainData
[
bead
].
y
=
aa
*
(
aa
*
chainData
[
bead
].
y
+
0.25
f
*
wdt
*
chainForces
[
bead
]);
chainForces
[
bead
+
1
]
=
(
chainMasses
[
bead
]
*
chainData
[
bead
].
y
*
chainData
[
bead
].
y
-
kT
)
/
chainMasses
[
bead
+
1
];
}
chainData
[
chainLength
-
1
].
y
+=
0.25
f
*
wdt
*
chainForces
[
chainLength
-
1
];
END_YS_LOOP
}
// MTS loop
}
/**
* Compute total (potential + kinetic) energy of the Nose-Hoover beads
*/
extern
"C"
__global__
void
computeHeatBathEnergy
(
mixed
*
__restrict__
heatBathEnergy
,
int
chainLength
,
int
numDOFs
,
mixed
kT
,
float
frequency
,
const
mixed2
*
__restrict__
chainData
){
// Note that this is always incremented; make sure it's zeroed properly before the first call
mixed
&
energy
=
heatBathEnergy
[
0
];
for
(
int
i
=
0
;
i
<
chainLength
;
++
i
)
{
mixed
prefac
=
i
?
1
:
numDOFs
;
mixed
mass
=
prefac
*
kT
/
(
frequency
*
frequency
);
mixed
velocity
=
chainData
[
i
].
y
;
// The kinetic energy of this bead
energy
+=
0.5
f
*
mass
*
velocity
*
velocity
;
// The potential energy of this bead
mixed
position
=
chainData
[
i
].
x
;
energy
+=
prefac
*
kT
*
position
;
}
}
extern
"C"
__global__
void
computeAtomsKineticEnergy
(
mixed2
*
__restrict__
energyBuffer
,
int
numAtoms
,
const
mixed4
*
__restrict__
velm
,
const
int
*
__restrict__
atoms
){
mixed2
energy
=
make_mixed2
(
0
,
0
);
//energy = 1; return;
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
numAtoms
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
atom
=
atoms
[
index
];
mixed4
v
=
velm
[
atom
];
mixed
mass
=
v
.
w
==
0
?
0
:
1
/
v
.
w
;
energy
.
x
+=
0.5
f
*
mass
*
(
v
.
x
*
v
.
x
+
v
.
y
*
v
.
y
+
v
.
z
*
v
.
z
);
}
energyBuffer
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
=
energy
;
}
extern
"C"
__global__
void
computePairsKineticEnergy
(
mixed2
*
__restrict__
energyBuffer
,
int
numPairs
,
const
mixed4
*
__restrict__
velm
,
const
int2
*
__restrict__
pairs
){
mixed2
energy
=
make_mixed2
(
0
,
0
);
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
numPairs
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
int2
pair
=
pairs
[
index
];
int
atom1
=
pair
.
x
;
int
atom2
=
pair
.
y
;
mixed4
v1
=
velm
[
atom1
];
mixed4
v2
=
velm
[
atom2
];
mixed
m1
=
v1
.
w
==
0
?
0
:
1
/
v1
.
w
;
mixed
m2
=
v2
.
w
==
0
?
0
:
1
/
v2
.
w
;
mixed4
cv
;
cv
.
x
=
(
m1
*
v1
.
x
+
m2
*
v2
.
x
)
/
(
m1
+
m2
);
cv
.
y
=
(
m1
*
v1
.
y
+
m2
*
v2
.
y
)
/
(
m1
+
m2
);
cv
.
z
=
(
m1
*
v1
.
z
+
m2
*
v2
.
z
)
/
(
m1
+
m2
);
mixed4
rv
;
rv
.
x
=
v2
.
x
-
v1
.
x
;
rv
.
y
=
v2
.
y
-
v1
.
y
;
rv
.
z
=
v2
.
z
-
v1
.
z
;
energy
.
x
+=
0.5
f
*
(
m1
+
m2
)
*
(
cv
.
x
*
cv
.
x
+
cv
.
y
*
cv
.
y
+
cv
.
z
*
cv
.
z
);
energy
.
y
+=
0.5
f
*
(
m1
*
m2
/
(
m1
+
m2
))
*
(
rv
.
x
*
rv
.
x
+
rv
.
y
*
rv
.
y
+
rv
.
z
*
rv
.
z
);
}
// The atoms version of this has been called already, so accumulate instead of assigning here
energyBuffer
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
].
x
+=
energy
.
x
;
energyBuffer
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
].
y
+=
energy
.
y
;
}
extern
"C"
__global__
void
scaleAtomsVelocities
(
mixed2
*
__restrict__
scaleFactor
,
int
numAtoms
,
mixed4
*
__restrict__
velm
,
const
int
*
__restrict__
atoms
){
const
mixed
&
scale
=
scaleFactor
[
0
].
x
;
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
numAtoms
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
atom
=
atoms
[
index
];
mixed4
&
v
=
velm
[
atom
];
v
.
x
*=
scale
;
v
.
y
*=
scale
;
v
.
z
*=
scale
;
}
}
extern
"C"
__global__
void
scalePairsVelocities
(
mixed2
*
__restrict__
scaleFactor
,
int
numPairs
,
mixed4
*
__restrict__
velm
,
const
int2
*
__restrict__
pairs
){
const
mixed
&
absScale
=
scaleFactor
[
0
].
x
;
const
mixed
&
relScale
=
scaleFactor
[
0
].
y
;
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
numPairs
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
atom1
=
pairs
[
index
].
x
;
int
atom2
=
pairs
[
index
].
y
;
mixed4
v1
=
velm
[
atom1
];
mixed4
v2
=
velm
[
atom2
];
mixed
m1
=
v1
.
w
==
0
?
0
:
1
/
v1
.
w
;
mixed
m2
=
v2
.
w
==
0
?
0
:
1
/
v2
.
w
;
mixed4
cv
;
cv
.
x
=
(
m1
*
v1
.
x
+
m2
*
v2
.
x
)
/
(
m1
+
m2
);
cv
.
y
=
(
m1
*
v1
.
y
+
m2
*
v2
.
y
)
/
(
m1
+
m2
);
cv
.
z
=
(
m1
*
v1
.
z
+
m2
*
v2
.
z
)
/
(
m1
+
m2
);
mixed4
rv
;
rv
.
x
=
v2
.
x
-
v1
.
x
;
rv
.
y
=
v2
.
y
-
v1
.
y
;
rv
.
z
=
v2
.
z
-
v1
.
z
;
v1
.
x
=
absScale
*
cv
.
x
-
relScale
*
rv
.
x
*
m2
/
(
m1
+
m2
);
v1
.
y
=
absScale
*
cv
.
y
-
relScale
*
rv
.
y
*
m2
/
(
m1
+
m2
);
v1
.
z
=
absScale
*
cv
.
z
-
relScale
*
rv
.
z
*
m2
/
(
m1
+
m2
);
v2
.
x
=
absScale
*
cv
.
x
+
relScale
*
rv
.
x
*
m1
/
(
m1
+
m2
);
v2
.
y
=
absScale
*
cv
.
y
+
relScale
*
rv
.
y
*
m1
/
(
m1
+
m2
);
v2
.
z
=
absScale
*
cv
.
z
+
relScale
*
rv
.
z
*
m1
/
(
m1
+
m2
);
velm
[
atom1
]
=
v1
;
velm
[
atom2
]
=
v2
;
}
}
/**
* Sum the energy buffer containing a pair of energies stored as mixed2. This is copied from utilities.cu with small modifications
*/
extern
"C"
__global__
void
reduceEnergyPair
(
const
mixed2
*
__restrict__
energyBuffer
,
mixed2
*
__restrict__
result
,
int
bufferSize
,
int
workGroupSize
)
{
__shared__
mixed2
tempBuffer
[
WORK_GROUP_SIZE
];
const
unsigned
int
thread
=
threadIdx
.
x
;
mixed2
sum
=
make_mixed2
(
0
,
0
);
for
(
unsigned
int
idx
=
thread
;
idx
<
bufferSize
;
idx
+=
blockDim
.
x
)
{
sum
.
x
+=
energyBuffer
[
idx
].
x
;
sum
.
y
+=
energyBuffer
[
idx
].
y
;
}
tempBuffer
[
thread
]
=
sum
;
for
(
int
i
=
1
;
i
<
workGroupSize
;
i
*=
2
)
{
__syncthreads
();
if
(
thread
%
(
i
*
2
)
==
0
&&
thread
+
i
<
workGroupSize
)
{
tempBuffer
[
thread
].
x
+=
tempBuffer
[
thread
+
i
].
x
;
tempBuffer
[
thread
].
y
+=
tempBuffer
[
thread
+
i
].
y
;
}
}
if
(
thread
==
0
)
*
result
=
tempBuffer
[
0
];
}
platforms/cuda/src/kernels/removeCM.cu
deleted
100644 → 0
View file @
8dd60914
/**
* Calculate the center of mass momentum.
*/
extern
"C"
__global__
void
calcCenterOfMassMomentum
(
int
numAtoms
,
const
mixed4
*
__restrict__
velm
,
float4
*
__restrict__
cmMomentum
)
{
extern
__shared__
volatile
float3
temp
[];
float3
cm
=
make_float3
(
0
,
0
,
0
);
for
(
unsigned
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
numAtoms
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
mixed4
velocity
=
velm
[
index
];
if
(
velocity
.
w
!=
0
)
{
mixed
mass
=
RECIP
(
velocity
.
w
);
cm
.
x
+=
(
float
)
(
velocity
.
x
*
mass
);
cm
.
y
+=
(
float
)
(
velocity
.
y
*
mass
);
cm
.
z
+=
(
float
)
(
velocity
.
z
*
mass
);
}
}
// Sum the threads in this group.
int
thread
=
threadIdx
.
x
;
temp
[
thread
].
x
=
cm
.
x
;
temp
[
thread
].
y
=
cm
.
y
;
temp
[
thread
].
z
=
cm
.
z
;
__syncthreads
();
if
(
thread
<
32
)
{
temp
[
thread
].
x
+=
temp
[
thread
+
32
].
x
;
temp
[
thread
].
y
+=
temp
[
thread
+
32
].
y
;
temp
[
thread
].
z
+=
temp
[
thread
+
32
].
z
;
if
(
thread
<
16
)
{
temp
[
thread
].
x
+=
temp
[
thread
+
16
].
x
;
temp
[
thread
].
y
+=
temp
[
thread
+
16
].
y
;
temp
[
thread
].
z
+=
temp
[
thread
+
16
].
z
;
}
if
(
thread
<
8
)
{
temp
[
thread
].
x
+=
temp
[
thread
+
8
].
x
;
temp
[
thread
].
y
+=
temp
[
thread
+
8
].
y
;
temp
[
thread
].
z
+=
temp
[
thread
+
8
].
z
;
}
if
(
thread
<
4
)
{
temp
[
thread
].
x
+=
temp
[
thread
+
4
].
x
;
temp
[
thread
].
y
+=
temp
[
thread
+
4
].
y
;
temp
[
thread
].
z
+=
temp
[
thread
+
4
].
z
;
}
if
(
thread
<
2
)
{
temp
[
thread
].
x
+=
temp
[
thread
+
2
].
x
;
temp
[
thread
].
y
+=
temp
[
thread
+
2
].
y
;
temp
[
thread
].
z
+=
temp
[
thread
+
2
].
z
;
}
}
if
(
thread
==
0
)
{
float3
sum
=
make_float3
(
temp
[
thread
].
x
+
temp
[
thread
+
1
].
x
,
temp
[
thread
].
y
+
temp
[
thread
+
1
].
y
,
temp
[
thread
].
z
+
temp
[
thread
+
1
].
z
);
cmMomentum
[
blockIdx
.
x
]
=
make_float4
(
sum
.
x
,
sum
.
y
,
sum
.
z
,
0.0
f
);
}
}
/**
* Remove center of mass motion.
*/
extern
"C"
__global__
void
removeCenterOfMassMomentum
(
unsigned
int
numAtoms
,
mixed4
*
__restrict__
velm
,
const
float4
*
__restrict__
cmMomentum
)
{
// First sum all of the momenta that were calculated by individual groups.
extern
volatile
float3
temp
[];
float3
cm
=
make_float3
(
0
,
0
,
0
);
for
(
unsigned
int
index
=
threadIdx
.
x
;
index
<
gridDim
.
x
;
index
+=
blockDim
.
x
)
{
float4
momentum
=
cmMomentum
[
index
];
cm
.
x
+=
momentum
.
x
;
cm
.
y
+=
momentum
.
y
;
cm
.
z
+=
momentum
.
z
;
}
int
thread
=
threadIdx
.
x
;
temp
[
thread
].
x
=
cm
.
x
;
temp
[
thread
].
y
=
cm
.
y
;
temp
[
thread
].
z
=
cm
.
z
;
__syncthreads
();
if
(
thread
<
32
)
{
temp
[
thread
].
x
+=
temp
[
thread
+
32
].
x
;
temp
[
thread
].
y
+=
temp
[
thread
+
32
].
y
;
temp
[
thread
].
z
+=
temp
[
thread
+
32
].
z
;
if
(
thread
<
16
)
{
temp
[
thread
].
x
+=
temp
[
thread
+
16
].
x
;
temp
[
thread
].
y
+=
temp
[
thread
+
16
].
y
;
temp
[
thread
].
z
+=
temp
[
thread
+
16
].
z
;
}
if
(
thread
<
8
)
{
temp
[
thread
].
x
+=
temp
[
thread
+
8
].
x
;
temp
[
thread
].
y
+=
temp
[
thread
+
8
].
y
;
temp
[
thread
].
z
+=
temp
[
thread
+
8
].
z
;
}
if
(
thread
<
4
)
{
temp
[
thread
].
x
+=
temp
[
thread
+
4
].
x
;
temp
[
thread
].
y
+=
temp
[
thread
+
4
].
y
;
temp
[
thread
].
z
+=
temp
[
thread
+
4
].
z
;
}
if
(
thread
<
2
)
{
temp
[
thread
].
x
+=
temp
[
thread
+
2
].
x
;
temp
[
thread
].
y
+=
temp
[
thread
+
2
].
y
;
temp
[
thread
].
z
+=
temp
[
thread
+
2
].
z
;
}
}
__syncthreads
();
cm
=
make_float3
(
INVERSE_TOTAL_MASS
*
(
temp
[
0
].
x
+
temp
[
1
].
x
),
INVERSE_TOTAL_MASS
*
(
temp
[
0
].
y
+
temp
[
1
].
y
),
INVERSE_TOTAL_MASS
*
(
temp
[
0
].
z
+
temp
[
1
].
z
));
// Now remove the center of mass velocity from each atom.
for
(
unsigned
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
numAtoms
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
mixed4
velocity
=
velm
[
index
];
velocity
.
x
-=
cm
.
x
;
velocity
.
y
-=
cm
.
y
;
velocity
.
z
-=
cm
.
z
;
velm
[
index
]
=
velocity
;
}
}
platforms/cuda/src/kernels/utilities.cu
View file @
5a06df78
...
@@ -99,4 +99,4 @@ __global__ void setCharges(real* __restrict__ charges, real4* __restrict__ posq,
...
@@ -99,4 +99,4 @@ __global__ void setCharges(real* __restrict__ charges, real4* __restrict__ posq,
for
(
int
i
=
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
;
i
<
numAtoms
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
for
(
int
i
=
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
;
i
<
numAtoms
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
posq
[
i
].
w
=
charges
[
atomOrder
[
i
]];
posq
[
i
].
w
=
charges
[
atomOrder
[
i
]];
}
}
}
}
\ No newline at end of file
platforms/cuda/src/kernels/vectorOps.cu
View file @
5a06df78
...
@@ -554,16 +554,16 @@ inline __device__ float3 cross(float3 a, float3 b) {
...
@@ -554,16 +554,16 @@ inline __device__ float3 cross(float3 a, float3 b) {
return
make_float3
(
a
.
y
*
b
.
z
-
a
.
z
*
b
.
y
,
a
.
z
*
b
.
x
-
a
.
x
*
b
.
z
,
a
.
x
*
b
.
y
-
a
.
y
*
b
.
x
);
return
make_float3
(
a
.
y
*
b
.
z
-
a
.
z
*
b
.
y
,
a
.
z
*
b
.
x
-
a
.
x
*
b
.
z
,
a
.
x
*
b
.
y
-
a
.
y
*
b
.
x
);
}
}
inline
__device__
float
3
cross
(
float4
a
,
float4
b
)
{
inline
__device__
float
4
cross
(
float4
a
,
float4
b
)
{
return
make_float
3
(
a
.
y
*
b
.
z
-
a
.
z
*
b
.
y
,
a
.
z
*
b
.
x
-
a
.
x
*
b
.
z
,
a
.
x
*
b
.
y
-
a
.
y
*
b
.
x
);
return
make_float
4
(
a
.
y
*
b
.
z
-
a
.
z
*
b
.
y
,
a
.
z
*
b
.
x
-
a
.
x
*
b
.
z
,
a
.
x
*
b
.
y
-
a
.
y
*
b
.
x
,
0.0
f
);
}
}
inline
__device__
double3
cross
(
double3
a
,
double3
b
)
{
inline
__device__
double3
cross
(
double3
a
,
double3
b
)
{
return
make_double3
(
a
.
y
*
b
.
z
-
a
.
z
*
b
.
y
,
a
.
z
*
b
.
x
-
a
.
x
*
b
.
z
,
a
.
x
*
b
.
y
-
a
.
y
*
b
.
x
);
return
make_double3
(
a
.
y
*
b
.
z
-
a
.
z
*
b
.
y
,
a
.
z
*
b
.
x
-
a
.
x
*
b
.
z
,
a
.
x
*
b
.
y
-
a
.
y
*
b
.
x
);
}
}
inline
__device__
double
3
cross
(
double4
a
,
double4
b
)
{
inline
__device__
double
4
cross
(
double4
a
,
double4
b
)
{
return
make_double
3
(
a
.
y
*
b
.
z
-
a
.
z
*
b
.
y
,
a
.
z
*
b
.
x
-
a
.
x
*
b
.
z
,
a
.
x
*
b
.
y
-
a
.
y
*
b
.
x
);
return
make_double
4
(
a
.
y
*
b
.
z
-
a
.
z
*
b
.
y
,
a
.
z
*
b
.
x
-
a
.
x
*
b
.
z
,
a
.
x
*
b
.
y
-
a
.
y
*
b
.
x
,
0.0
);
}
}
// Normalize a vector
// Normalize a vector
...
@@ -594,6 +594,14 @@ inline __device__ double4 normalize(double4 a) {
...
@@ -594,6 +594,14 @@ inline __device__ double4 normalize(double4 a) {
// Strip off the fourth component of a vector.
// Strip off the fourth component of a vector.
inline
__device__
short3
trimTo3
(
short4
v
)
{
return
make_short3
(
v
.
x
,
v
.
y
,
v
.
z
);
}
inline
__device__
int3
trimTo3
(
int4
v
)
{
return
make_int3
(
v
.
x
,
v
.
y
,
v
.
z
);
}
inline
__device__
float3
trimTo3
(
float4
v
)
{
inline
__device__
float3
trimTo3
(
float4
v
)
{
return
make_float3
(
v
.
x
,
v
.
y
,
v
.
z
);
return
make_float3
(
v
.
x
,
v
.
y
,
v
.
z
);
}
}
...
...
platforms/cuda/src/kernels/velocityVerlet.cu
0 → 100644
View file @
5a06df78
/**
* Perform the first step of Velocity Verlet integration.
*
* update displacements (posDelta) and velocities (velm)
*/
extern
"C"
__global__
void
integrateVelocityVerletPart1
(
int
numAtoms
,
int
numPairs
,
int
paddedNumAtoms
,
const
mixed2
*
__restrict__
dt
,
const
real4
*
__restrict__
posq
,
const
real4
*
__restrict__
posqCorrection
,
mixed4
*
__restrict__
velm
,
const
long
long
*
__restrict__
force
,
mixed4
*
__restrict__
posDelta
,
const
int
*
__restrict__
atomList
,
const
int2
*
__restrict__
pairList
)
{
const
mixed2
stepSize
=
dt
[
0
];
const
mixed
dtPos
=
stepSize
.
y
;
const
mixed
dtVel
=
0.5
f
*
(
stepSize
.
x
+
stepSize
.
y
);
const
mixed
scale
=
0.5
f
*
dtVel
/
(
mixed
)
0x100000000
;
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
numAtoms
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
atom
=
atomList
[
index
];
mixed4
velocity
=
velm
[
atom
];
if
(
velocity
.
w
!=
0.0
)
{
#ifdef USE_MIXED_PRECISION
real4
pos1
=
posq
[
atom
];
real4
pos2
=
posqCorrection
[
atom
];
mixed4
pos
=
make_mixed4
(
pos1
.
x
+
(
mixed
)
pos2
.
x
,
pos1
.
y
+
(
mixed
)
pos2
.
y
,
pos1
.
z
+
(
mixed
)
pos2
.
z
,
pos1
.
w
);
#else
real4
pos
=
posq
[
atom
];
#endif
velocity
.
x
+=
scale
*
force
[
atom
]
*
velocity
.
w
;
velocity
.
y
+=
scale
*
force
[
atom
+
paddedNumAtoms
]
*
velocity
.
w
;
velocity
.
z
+=
scale
*
force
[
atom
+
paddedNumAtoms
*
2
]
*
velocity
.
w
;
pos
.
x
=
velocity
.
x
*
dtPos
;
pos
.
y
=
velocity
.
y
*
dtPos
;
pos
.
z
=
velocity
.
z
*
dtPos
;
posDelta
[
atom
]
=
pos
;
velm
[
atom
]
=
velocity
;
}
}
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
numPairs
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
atom1
=
pairList
[
index
].
x
;
int
atom2
=
pairList
[
index
].
y
;
mixed4
v1
=
velm
[
atom1
];
mixed4
v2
=
velm
[
atom2
];
mixed
m1
=
v1
.
w
==
0.0
f
?
0.0
f
:
1.0
f
/
v1
.
w
;
mixed
m2
=
v2
.
w
==
0.0
f
?
0.0
f
:
1.0
f
/
v2
.
w
;
mixed
mass1fract
=
m1
/
(
m1
+
m2
);
mixed
mass2fract
=
m2
/
(
m1
+
m2
);
mixed
invRedMass
=
(
m1
*
m2
!=
0.0
f
)
?
(
m1
+
m2
)
/
(
m1
*
m2
)
:
0.0
f
;
mixed
invTotMass
=
(
m1
+
m2
!=
0.0
f
)
?
1.0
f
/
(
m1
+
m2
)
:
0.0
f
;
mixed3
comVel
;
comVel
.
x
=
v1
.
x
*
mass1fract
+
v2
.
x
*
mass2fract
;
comVel
.
y
=
v1
.
y
*
mass1fract
+
v2
.
y
*
mass2fract
;
comVel
.
z
=
v1
.
z
*
mass1fract
+
v2
.
z
*
mass2fract
;
mixed3
relVel
;
relVel
.
x
=
v2
.
x
-
v1
.
x
;
relVel
.
y
=
v2
.
y
-
v1
.
y
;
relVel
.
z
=
v2
.
z
-
v1
.
z
;
//
mixed3
comFrc
;
comFrc
.
x
=
force
[
atom1
]
+
force
[
atom2
];
comFrc
.
y
=
force
[
atom1
+
paddedNumAtoms
]
+
force
[
atom2
+
paddedNumAtoms
];
comFrc
.
z
=
force
[
atom1
+
paddedNumAtoms
*
2
]
+
force
[
atom2
+
paddedNumAtoms
*
2
];
mixed3
relFrc
;
relFrc
.
x
=
mass1fract
*
force
[
atom2
]
-
mass2fract
*
force
[
atom1
];
relFrc
.
y
=
mass1fract
*
force
[
atom2
+
paddedNumAtoms
]
-
mass2fract
*
force
[
atom1
+
paddedNumAtoms
];
relFrc
.
z
=
mass1fract
*
force
[
atom2
+
paddedNumAtoms
*
2
]
-
mass2fract
*
force
[
atom1
+
paddedNumAtoms
*
2
];
comVel
.
x
+=
comFrc
.
x
*
scale
*
invTotMass
;
comVel
.
y
+=
comFrc
.
y
*
scale
*
invTotMass
;
comVel
.
z
+=
comFrc
.
z
*
scale
*
invTotMass
;
relVel
.
x
+=
relFrc
.
x
*
scale
*
invRedMass
;
relVel
.
y
+=
relFrc
.
y
*
scale
*
invRedMass
;
relVel
.
z
+=
relFrc
.
z
*
scale
*
invRedMass
;
#ifdef USE_MIXED_PRECISION
real4
posv1
=
posq
[
atom1
];
real4
posv2
=
posq
[
atom2
];
real4
posc1
=
posqCorrection
[
atom1
];
real4
posc2
=
posqCorrection
[
atom2
];
mixed4
pos1
=
make_mixed4
(
posv1
.
x
+
(
mixed
)
posc1
.
x
,
posv1
.
y
+
(
mixed
)
posc1
.
y
,
posv1
.
z
+
(
mixed
)
posc1
.
z
,
posv1
.
w
);
mixed4
pos2
=
make_mixed4
(
posv2
.
x
+
(
mixed
)
posc2
.
x
,
posv2
.
y
+
(
mixed
)
posc2
.
y
,
posv2
.
z
+
(
mixed
)
posc2
.
z
,
posv2
.
w
);
#else
real4
pos1
=
posq
[
atom1
];
real4
pos2
=
posq
[
atom2
];
#endif
if
(
v1
.
w
!=
0.0
f
)
{
v1
.
x
=
comVel
.
x
-
relVel
.
x
*
mass2fract
;
v1
.
y
=
comVel
.
y
-
relVel
.
y
*
mass2fract
;
v1
.
z
=
comVel
.
z
-
relVel
.
z
*
mass2fract
;
pos1
.
x
=
v1
.
x
*
dtPos
;
pos1
.
y
=
v1
.
y
*
dtPos
;
pos1
.
z
=
v1
.
z
*
dtPos
;
posDelta
[
atom1
]
=
pos1
;
velm
[
atom1
]
=
v1
;
}
if
(
v2
.
w
!=
0.0
f
)
{
v2
.
x
=
comVel
.
x
+
relVel
.
x
*
mass1fract
;
v2
.
y
=
comVel
.
y
+
relVel
.
y
*
mass1fract
;
v2
.
z
=
comVel
.
z
+
relVel
.
z
*
mass1fract
;
pos2
.
x
=
v2
.
x
*
dtPos
;
pos2
.
y
=
v2
.
y
*
dtPos
;
pos2
.
z
=
v2
.
z
*
dtPos
;
posDelta
[
atom2
]
=
pos2
;
velm
[
atom2
]
=
v2
;
}
}
}
/**
* Perform the second step of Velocity Verlet integration.
*
* apply displacements to positions (posq) after constraints have been enforced
*/
extern
"C"
__global__
void
integrateVelocityVerletPart2
(
int
numAtoms
,
mixed2
*
__restrict__
dt
,
real4
*
__restrict__
posq
,
real4
*
__restrict__
posqCorrection
,
mixed4
*
__restrict__
velm
,
const
mixed4
*
__restrict__
posDelta
)
{
mixed2
stepSize
=
dt
[
0
];
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
if
(
index
==
0
)
dt
[
0
].
x
=
stepSize
.
y
;
for
(;
index
<
numAtoms
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
mixed4
velocity
=
velm
[
index
];
if
(
velocity
.
w
!=
0.0
)
{
#ifdef USE_MIXED_PRECISION
real4
pos1
=
posq
[
index
];
real4
pos2
=
posqCorrection
[
index
];
mixed4
pos
=
make_mixed4
(
pos1
.
x
+
(
mixed
)
pos2
.
x
,
pos1
.
y
+
(
mixed
)
pos2
.
y
,
pos1
.
z
+
(
mixed
)
pos2
.
z
,
pos1
.
w
);
#else
real4
pos
=
posq
[
index
];
#endif
mixed4
delta
=
posDelta
[
index
];
pos
.
x
+=
delta
.
x
;
pos
.
y
+=
delta
.
y
;
pos
.
z
+=
delta
.
z
;
#ifdef USE_MIXED_PRECISION
posq
[
index
]
=
make_real4
((
real
)
pos
.
x
,
(
real
)
pos
.
y
,
(
real
)
pos
.
z
,
(
real
)
pos
.
w
);
posqCorrection
[
index
]
=
make_real4
(
pos
.
x
-
(
real
)
pos
.
x
,
pos
.
y
-
(
real
)
pos
.
y
,
pos
.
z
-
(
real
)
pos
.
z
,
0
);
#else
posq
[
index
]
=
pos
;
#endif
}
}
}
/**
* Perform the third step of Velocity Verlet integration.
*
* modify the velocities (velm) after the force update
*/
extern
"C"
__global__
void
integrateVelocityVerletPart3
(
int
numAtoms
,
int
numPairs
,
int
paddedNumAtoms
,
mixed2
*
__restrict__
dt
,
real4
*
__restrict__
posq
,
real4
*
__restrict__
posqCorrection
,
mixed4
*
__restrict__
velm
,
const
long
long
*
__restrict__
force
,
const
mixed4
*
__restrict__
posDelta
,
const
int
*
__restrict__
atomList
,
const
int2
*
__restrict__
pairList
)
{
mixed2
stepSize
=
dt
[
0
];
#if __CUDA_ARCH__ >= 130
double
oneOverDt
=
1.0
/
stepSize
.
y
;
#else
float
oneOverDt
=
1.0
f
/
stepSize
.
y
;
float
correction
=
(
1.0
f
-
oneOverDt
*
stepSize
.
y
)
/
stepSize
.
y
;
#endif
const
mixed
dtVel
=
0.5
f
*
(
stepSize
.
x
+
stepSize
.
y
);
const
mixed
scale
=
0.5
f
*
dtVel
/
(
mixed
)
0x100000000
;
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
if
(
index
==
0
)
dt
[
0
].
x
=
stepSize
.
y
;
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
numAtoms
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
atom
=
atomList
[
index
];
mixed4
velocity
=
velm
[
atom
];
if
(
velocity
.
w
!=
0.0
)
{
mixed4
deltaXconstrained
=
posDelta
[
atom
];
velocity
.
x
+=
scale
*
force
[
atom
]
*
velocity
.
w
+
(
deltaXconstrained
.
x
-
velocity
.
x
*
stepSize
.
y
)
*
oneOverDt
;
velocity
.
y
+=
scale
*
force
[
atom
+
paddedNumAtoms
]
*
velocity
.
w
+
(
deltaXconstrained
.
y
-
velocity
.
y
*
stepSize
.
y
)
*
oneOverDt
;
velocity
.
z
+=
scale
*
force
[
atom
+
paddedNumAtoms
*
2
]
*
velocity
.
w
+
(
deltaXconstrained
.
z
-
velocity
.
z
*
stepSize
.
y
)
*
oneOverDt
;
#if __CUDA_ARCH__ < 130
velocity
.
x
+=
(
deltaXconstrained
.
x
-
velocity
.
x
*
stepSize
.
y
)
*
correction
;
velocity
.
y
+=
(
deltaXconstrained
.
y
-
velocity
.
y
*
stepSize
.
y
)
*
correction
;
velocity
.
z
+=
(
deltaXconstrained
.
z
-
velocity
.
z
*
stepSize
.
y
)
*
correction
;
#endif
velm
[
atom
]
=
velocity
;
}
}
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
numPairs
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
atom1
=
pairList
[
index
].
x
;
int
atom2
=
pairList
[
index
].
y
;
mixed4
v1
=
velm
[
atom1
];
mixed4
v2
=
velm
[
atom2
];
mixed
m1
=
v1
.
w
==
0.0
f
?
0.0
f
:
1.0
f
/
v1
.
w
;
mixed
m2
=
v2
.
w
==
0.0
f
?
0.0
f
:
1.0
f
/
v2
.
w
;
mixed
mass1fract
=
m1
/
(
m1
+
m2
);
mixed
mass2fract
=
m2
/
(
m1
+
m2
);
mixed
invRedMass
=
(
m1
*
m2
!=
0.0
f
)
?
(
m1
+
m2
)
/
(
m1
*
m2
)
:
0.0
f
;
mixed
invTotMass
=
(
m1
+
m2
!=
0.0
f
)
?
1.0
f
/
(
m1
+
m2
)
:
0.0
f
;
mixed3
comVel
;
comVel
.
x
=
v1
.
x
*
mass1fract
+
v2
.
x
*
mass2fract
;
comVel
.
y
=
v1
.
y
*
mass1fract
+
v2
.
y
*
mass2fract
;
comVel
.
z
=
v1
.
z
*
mass1fract
+
v2
.
z
*
mass2fract
;
mixed3
relVel
;
relVel
.
x
=
v2
.
x
-
v1
.
x
;
relVel
.
y
=
v2
.
y
-
v1
.
y
;
relVel
.
z
=
v2
.
z
-
v1
.
z
;
//
mixed3
comFrc
;
comFrc
.
x
=
force
[
atom1
]
+
force
[
atom2
];
comFrc
.
y
=
force
[
atom1
+
paddedNumAtoms
]
+
force
[
atom2
+
paddedNumAtoms
];
comFrc
.
z
=
force
[
atom1
+
paddedNumAtoms
*
2
]
+
force
[
atom2
+
paddedNumAtoms
*
2
];
mixed3
relFrc
;
relFrc
.
x
=
mass1fract
*
force
[
atom2
]
-
mass2fract
*
force
[
atom1
];
relFrc
.
y
=
mass1fract
*
force
[
atom2
+
paddedNumAtoms
]
-
mass2fract
*
force
[
atom1
+
paddedNumAtoms
];
relFrc
.
z
=
mass1fract
*
force
[
atom2
+
paddedNumAtoms
*
2
]
-
mass2fract
*
force
[
atom1
+
paddedNumAtoms
*
2
];
comVel
.
x
+=
comFrc
.
x
*
scale
*
invTotMass
;
comVel
.
y
+=
comFrc
.
y
*
scale
*
invTotMass
;
comVel
.
z
+=
comFrc
.
z
*
scale
*
invTotMass
;
relVel
.
x
+=
relFrc
.
x
*
scale
*
invRedMass
;
relVel
.
y
+=
relFrc
.
y
*
scale
*
invRedMass
;
relVel
.
z
+=
relFrc
.
z
*
scale
*
invRedMass
;
if
(
v1
.
w
!=
0.0
f
)
{
mixed4
deltaXconstrained
=
posDelta
[
atom1
];
v1
.
x
=
comVel
.
x
-
relVel
.
x
*
mass2fract
+
(
deltaXconstrained
.
x
-
v1
.
x
*
stepSize
.
y
)
*
oneOverDt
;
v1
.
y
=
comVel
.
y
-
relVel
.
y
*
mass2fract
+
(
deltaXconstrained
.
y
-
v1
.
y
*
stepSize
.
y
)
*
oneOverDt
;
v1
.
z
=
comVel
.
z
-
relVel
.
z
*
mass2fract
+
(
deltaXconstrained
.
z
-
v1
.
z
*
stepSize
.
y
)
*
oneOverDt
;
#if __CUDA_ARCH__ < 130
v1
.
x
+=
(
deltaXconstrained
.
x
-
v1
.
x
*
stepSize
.
y
)
*
correction
;
v1
.
y
+=
(
deltaXconstrained
.
y
-
v1
.
y
*
stepSize
.
y
)
*
correction
;
v1
.
z
+=
(
deltaXconstrained
.
z
-
v1
.
z
*
stepSize
.
y
)
*
correction
;
#endif
velm
[
atom1
]
=
v1
;
}
if
(
v2
.
w
!=
0.0
f
)
{
mixed4
deltaXconstrained
=
posDelta
[
atom2
];
v2
.
x
=
comVel
.
x
+
relVel
.
x
*
mass1fract
+
(
deltaXconstrained
.
x
-
v2
.
x
*
stepSize
.
y
)
*
oneOverDt
;
v2
.
y
=
comVel
.
y
+
relVel
.
y
*
mass1fract
+
(
deltaXconstrained
.
y
-
v2
.
y
*
stepSize
.
y
)
*
oneOverDt
;
v2
.
z
=
comVel
.
z
+
relVel
.
z
*
mass1fract
+
(
deltaXconstrained
.
z
-
v2
.
z
*
stepSize
.
y
)
*
oneOverDt
;
#if __CUDA_ARCH__ < 130
v2
.
x
+=
(
deltaXconstrained
.
x
-
v2
.
x
*
stepSize
.
y
)
*
correction
;
v2
.
y
+=
(
deltaXconstrained
.
y
-
v2
.
y
*
stepSize
.
y
)
*
correction
;
v2
.
z
+=
(
deltaXconstrained
.
z
-
v2
.
z
*
stepSize
.
y
)
*
correction
;
#endif
velm
[
atom2
]
=
v2
;
}
}
}
/**
* Apply the hard wall constraint
*/
extern
"C"
__global__
void
integrateVelocityVerletHardWall
(
int
numPairs
,
const
float
*
__restrict__
maxPairDistance
,
mixed2
*
__restrict__
dt
,
real4
*
__restrict__
posq
,
real4
*
__restrict__
posqCorrection
,
mixed4
*
__restrict__
velm
,
const
int2
*
__restrict__
pairList
,
const
float
*
__restrict__
pairTemperature
)
{
mixed
dtPos
=
dt
[
0
].
y
;
mixed
maxDelta
=
(
mixed
)
maxPairDistance
[
0
];
// Apply hard wall constraints.
if
(
maxDelta
>
0
)
{
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
numPairs
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
const
mixed
hardWallScale
=
sqrt
(
((
mixed
)
pairTemperature
[
index
])
*
((
mixed
)
BOLTZ
));
int2
atom
=
make_int2
(
pairList
[
index
].
x
,
pairList
[
index
].
y
);
#ifdef USE_MIXED_PRECISION
real4
posv1
=
posq
[
atom
.
x
];
real4
posc1
=
posqCorrection
[
atom
.
x
];
mixed4
pos1
=
make_mixed4
(
posv1
.
x
+
(
mixed
)
posc1
.
x
,
posv1
.
y
+
(
mixed
)
posc1
.
y
,
posv1
.
z
+
(
mixed
)
posc1
.
z
,
posv1
.
w
);
real4
posv2
=
posq
[
atom
.
y
];
real4
posc2
=
posqCorrection
[
atom
.
y
];
mixed4
pos2
=
make_mixed4
(
posv2
.
x
+
(
mixed
)
posc2
.
x
,
posv2
.
y
+
(
mixed
)
posc2
.
y
,
posv2
.
z
+
(
mixed
)
posc2
.
z
,
posv2
.
w
);
#else
real4
pos1
=
posq
[
atom
.
x
];
real4
pos2
=
posq
[
atom
.
y
];
#endif
mixed3
delta
=
make_mixed3
(
mixed
(
pos1
.
x
-
pos2
.
x
),
mixed
(
pos1
.
y
-
pos2
.
y
),
mixed
(
pos1
.
z
-
pos2
.
z
)
);
mixed
r
=
sqrt
(
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
);
mixed
rInv
=
1
/
r
;
if
(
rInv
*
maxDelta
<
1.0
)
{
// The constraint has been violated, so make the inter-particle distance "bounce"
// off the hard wall.
mixed3
bondDir
=
make_mixed3
(
delta
.
x
*
rInv
,
delta
.
y
*
rInv
,
delta
.
z
*
rInv
);
mixed3
vel1
=
make_mixed3
(
velm
[
atom
.
x
].
x
,
velm
[
atom
.
x
].
y
,
velm
[
atom
.
x
].
z
);
mixed3
vel2
=
make_mixed3
(
velm
[
atom
.
y
].
x
,
velm
[
atom
.
y
].
y
,
velm
[
atom
.
y
].
z
);
mixed
m1
=
velm
[
atom
.
x
].
w
!=
0.0
?
1.0
/
velm
[
atom
.
x
].
w
:
0.0
;
mixed
m2
=
velm
[
atom
.
y
].
w
!=
0.0
?
1.0
/
velm
[
atom
.
y
].
w
:
0.0
;
mixed
invTotMass
=
(
m1
+
m2
!=
0.0
)
?
1.0
/
(
m1
+
m2
)
:
0.0
;
mixed
deltaR
=
r
-
maxDelta
;
mixed
deltaT
=
dtPos
;
mixed
dt
=
dtPos
;
mixed
dotvr1
=
vel1
.
x
*
bondDir
.
x
+
vel1
.
y
*
bondDir
.
y
+
vel1
.
z
*
bondDir
.
z
;
mixed3
vb1
=
make_mixed3
(
bondDir
.
x
*
dotvr1
,
bondDir
.
y
*
dotvr1
,
bondDir
.
z
*
dotvr1
);
mixed3
vp1
=
make_mixed3
(
vel1
.
x
-
vb1
.
x
,
vel1
.
y
-
vb1
.
y
,
vel1
.
z
-
vb1
.
z
);
if
(
m2
==
0
)
{
// The parent particle is massless, so move only the Drude particle.
if
(
dotvr1
!=
0.0
)
deltaT
=
deltaR
/
fabs
(
dotvr1
);
if
(
deltaT
>
dtPos
)
deltaT
=
dtPos
;
dotvr1
=
-
dotvr1
*
hardWallScale
/
(
fabs
(
dotvr1
)
*
sqrt
(
m1
));
mixed
dr
=
-
deltaR
+
deltaT
*
dotvr1
;
pos1
.
x
+=
bondDir
.
x
*
dr
;
pos1
.
y
+=
bondDir
.
y
*
dr
;
pos1
.
z
+=
bondDir
.
z
*
dr
;
velm
[
atom
.
x
]
=
make_mixed4
(
vp1
.
x
+
bondDir
.
x
*
dotvr1
,
vp1
.
y
+
bondDir
.
y
*
dotvr1
,
vp1
.
z
+
bondDir
.
z
*
dotvr1
,
velm
[
atom
.
x
].
w
);
#ifdef USE_MIXED_PRECISION
posq
[
atom
.
x
]
=
make_real4
((
real
)
pos1
.
x
,
(
real
)
pos1
.
y
,
(
real
)
pos1
.
z
,
(
real
)
pos1
.
w
);
posqCorrection
[
atom
.
x
]
=
make_real4
(
pos1
.
x
-
(
real
)
pos1
.
x
,
pos1
.
y
-
(
real
)
pos1
.
y
,
pos1
.
z
-
(
real
)
pos1
.
z
,
0
);
#else
posq
[
atom
.
x
]
=
pos1
;
#endif
}
else
{
// Move both particles.
mixed
dotvr2
=
vel2
.
x
*
bondDir
.
x
+
vel2
.
y
*
bondDir
.
y
+
vel2
.
z
*
bondDir
.
z
;
mixed3
vb2
=
make_mixed3
(
bondDir
.
x
*
dotvr2
,
bondDir
.
y
*
dotvr2
,
bondDir
.
z
*
dotvr2
);
mixed3
vp2
=
make_mixed3
(
vel2
.
x
-
vb2
.
x
,
vel2
.
y
-
vb2
.
y
,
vel2
.
z
-
vb2
.
z
);
mixed
vbCMass
=
(
m1
*
dotvr1
+
m2
*
dotvr2
)
*
invTotMass
;
dotvr1
-=
vbCMass
;
dotvr2
-=
vbCMass
;
if
(
dotvr1
!=
dotvr2
)
deltaT
=
deltaR
/
fabs
(
dotvr1
-
dotvr2
);
if
(
deltaT
>
dt
)
deltaT
=
dt
;
mixed
vBond
=
hardWallScale
/
sqrt
(
m1
);
dotvr1
=
-
dotvr1
*
vBond
*
m2
*
invTotMass
/
fabs
(
dotvr1
);
dotvr2
=
-
dotvr2
*
vBond
*
m1
*
invTotMass
/
fabs
(
dotvr2
);
mixed
dr1
=
-
deltaR
*
m2
*
invTotMass
+
deltaT
*
dotvr1
;
mixed
dr2
=
deltaR
*
m1
*
invTotMass
+
deltaT
*
dotvr2
;
dotvr1
+=
vbCMass
;
dotvr2
+=
vbCMass
;
pos1
.
x
+=
bondDir
.
x
*
dr1
;
pos1
.
y
+=
bondDir
.
y
*
dr1
;
pos1
.
z
+=
bondDir
.
z
*
dr1
;
pos2
.
x
+=
bondDir
.
x
*
dr2
;
pos2
.
y
+=
bondDir
.
y
*
dr2
;
pos2
.
z
+=
bondDir
.
z
*
dr2
;
velm
[
atom
.
x
]
=
make_mixed4
(
vp1
.
x
+
bondDir
.
x
*
dotvr1
,
vp1
.
y
+
bondDir
.
y
*
dotvr1
,
vp1
.
z
+
bondDir
.
z
*
dotvr1
,
velm
[
atom
.
x
].
w
);
velm
[
atom
.
y
]
=
make_mixed4
(
vp2
.
x
+
bondDir
.
x
*
dotvr2
,
vp2
.
y
+
bondDir
.
y
*
dotvr2
,
vp2
.
z
+
bondDir
.
z
*
dotvr2
,
velm
[
atom
.
y
].
w
);
#ifdef USE_MIXED_PRECISION
posq
[
atom
.
x
]
=
make_real4
((
real
)
pos1
.
x
,
(
real
)
pos1
.
y
,
(
real
)
pos1
.
z
,
(
real
)
pos1
.
w
);
posq
[
atom
.
y
]
=
make_real4
((
real
)
pos2
.
x
,
(
real
)
pos2
.
y
,
(
real
)
pos2
.
z
,
(
real
)
pos2
.
w
);
posqCorrection
[
atom
.
x
]
=
make_real4
(
pos1
.
x
-
(
real
)
pos1
.
x
,
pos1
.
y
-
(
real
)
pos1
.
y
,
pos1
.
z
-
(
real
)
pos1
.
z
,
0
);
posqCorrection
[
atom
.
y
]
=
make_real4
(
pos2
.
x
-
(
real
)
pos2
.
x
,
pos2
.
y
-
(
real
)
pos2
.
y
,
pos2
.
z
-
(
real
)
pos2
.
z
,
0
);
#else
posq
[
atom
.
x
]
=
pos1
;
posq
[
atom
.
y
]
=
pos2
;
#endif
}
}
}
}
/* end of hard wall constraint part */
}
platforms/cuda/staticTarget/CMakeLists.txt
View file @
5a06df78
...
@@ -4,17 +4,17 @@
...
@@ -4,17 +4,17 @@
INCLUDE
(
FindCUDA
)
INCLUDE
(
FindCUDA
)
INCLUDE_DIRECTORIES
(
${
CUDA_TOOLKIT_INCLUDE
}
)
INCLUDE_DIRECTORIES
(
${
CUDA_TOOLKIT_INCLUDE
}
)
FILE
(
GLOB CUDA_KERNELS
${
CUDA
_SOURCE_DIR
}
/kernels/*.cu
)
FILE
(
GLOB CUDA_KERNELS
${
KERNEL
_SOURCE_DIR
}
/kernels/*.cu
)
ADD_CUSTOM_COMMAND
(
OUTPUT
${
CUDA_
KERNELS_CPP
}
${
CUDA_
KERNELS_H
}
ADD_CUSTOM_COMMAND
(
OUTPUT
${
KERNELS_CPP
}
${
KERNELS_H
}
COMMAND
${
CMAKE_COMMAND
}
COMMAND
${
CMAKE_COMMAND
}
ARGS -D
CUDA
_SOURCE_DIR=
${
CUDA
_SOURCE_DIR
}
-D
CUDA_
KERNELS_CPP=
${
CUDA_
KERNELS_CPP
}
-D
CUDA_
KERNELS_H=
${
CUDA_
KERNELS_H
}
-D
CUDA
_SOURCE_CLASS=
${
CUDA
_SOURCE_CLASS
}
-P
${
CMAKE_
CURRENT_
SOURCE_DIR
}
/
..
/Encode
CUDA
Files.cmake
ARGS -D
KERNEL
_SOURCE_DIR=
${
KERNEL
_SOURCE_DIR
}
-D KERNELS_CPP=
${
KERNELS_CPP
}
-D KERNELS_H=
${
KERNELS_H
}
-D
KERNEL
_SOURCE_CLASS=
${
KERNEL
_SOURCE_CLASS
}
-D KERNEL_FILE_EXTENSION=cu
-P
${
CMAKE_SOURCE_DIR
}
/
cmake_modules
/Encode
Kernel
Files.cmake
DEPENDS
${
CUDA_KERNELS
}
DEPENDS
${
CUDA_KERNELS
}
)
)
SET_SOURCE_FILES_PROPERTIES
(
${
CUDA_
KERNELS_CPP
}
${
CUDA_
KERNELS_H
}
PROPERTIES GENERATED TRUE
)
SET_SOURCE_FILES_PROPERTIES
(
${
KERNELS_CPP
}
${
KERNELS_H
}
PROPERTIES GENERATED TRUE
)
ADD_LIBRARY
(
${
STATIC_TARGET
}
STATIC
${
SOURCE_FILES
}
${
SOURCE_INCLUDE_FILES
}
${
API_ABS_INCLUDE_FILES
}
)
ADD_LIBRARY
(
${
STATIC_TARGET
}
STATIC
${
SOURCE_FILES
}
${
SOURCE_INCLUDE_FILES
}
${
API_ABS_INCLUDE_FILES
}
)
TARGET_LINK_LIBRARIES
(
${
STATIC_TARGET
}
${
OPENMM_LIBRARY_NAME
}
${
CUDA_CUDA_LIBRARY
}
${
CUDA_cufft_LIBRARY
}
${
PTHREADS_LIB_STATIC
}
)
TARGET_LINK_LIBRARIES
(
${
STATIC_TARGET
}
${
OPENMM_LIBRARY_NAME
}
${
CUDA_CUDA_LIBRARY
}
${
CUDA_cufft_LIBRARY
}
${
PTHREADS_LIB_STATIC
}
)
SET_TARGET_PROPERTIES
(
${
STATIC_TARGET
}
PROPERTIES COMPILE_FLAGS
"
${
EXTRA_COMPILE_FLAGS
}
-DOPENMM_C
UDA
_BUILDING_STATIC_LIBRARY"
)
SET_TARGET_PROPERTIES
(
${
STATIC_TARGET
}
PROPERTIES COMPILE_FLAGS
"
${
EXTRA_COMPILE_FLAGS
}
-DOPENMM_C
OMMON
_BUILDING_STATIC_LIBRARY"
)
IF
(
APPLE
)
IF
(
APPLE
)
SET_TARGET_PROPERTIES
(
${
STATIC_TARGET
}
PROPERTIES LINK_FLAGS
"
${
EXTRA_COMPILE_FLAGS
}
-F/Library/Frameworks -framework CUDA"
)
SET_TARGET_PROPERTIES
(
${
STATIC_TARGET
}
PROPERTIES LINK_FLAGS
"
${
EXTRA_COMPILE_FLAGS
}
-F/Library/Frameworks -framework CUDA"
)
ELSE
(
APPLE
)
ELSE
(
APPLE
)
...
...
platforms/cuda/tests/TestCuda
BAOAB
LangevinIntegrator.cpp
→
platforms/cuda/tests/TestCudaLangevin
Middle
Integrator.cpp
View file @
5a06df78
...
@@ -30,7 +30,7 @@
...
@@ -30,7 +30,7 @@
* -------------------------------------------------------------------------- */
* -------------------------------------------------------------------------- */
#include "CudaTests.h"
#include "CudaTests.h"
#include "Test
BAOAB
LangevinIntegrator.h"
#include "TestLangevin
Middle
Integrator.h"
void
runPlatformTests
()
{
void
runPlatformTests
()
{
}
}
platforms/cuda/tests/TestCudaNoseHooverIntegrator.cpp
0 → 100644
View file @
5a06df78
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2019 Stanford University and the Authors. *
* Authors: Andreas Krämer and Andrew C. Simmmonett *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include "CudaTests.h"
#include "TestNoseHooverIntegrator.h"
void
runPlatformTests
()
{
}
platforms/cuda/tests/TestCudaNoseHooverThermostat.cpp
0 → 100644
View file @
5a06df78
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2019 Stanford University and the Authors. *
* Authors: Andreas Krämer and Andrew C. Simmonett *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include "CudaTests.h"
#include "TestNoseHooverThermostat.h"
void
runPlatformTests
()
{
}
Prev
1
…
4
5
6
7
8
9
10
11
12
…
17
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment