Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
93c467b2
Commit
93c467b2
authored
Mar 22, 2013
by
Peter Eastman
Browse files
Merged 5.1Optimizations branch back to trunk
parent
f6d4557d
Changes
86
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
1353 additions
and
1263 deletions
+1353
-1263
plugins/amoeba/platforms/cuda/src/kernels/amoebaGk.cu
plugins/amoeba/platforms/cuda/src/kernels/amoebaGk.cu
+227
-153
plugins/amoeba/platforms/cuda/src/kernels/multipoleElectrostatics.cu
...eba/platforms/cuda/src/kernels/multipoleElectrostatics.cu
+249
-295
plugins/amoeba/platforms/cuda/src/kernels/multipoleFixedField.cu
.../amoeba/platforms/cuda/src/kernels/multipoleFixedField.cu
+229
-205
plugins/amoeba/platforms/cuda/src/kernels/multipoleInducedField.cu
...moeba/platforms/cuda/src/kernels/multipoleInducedField.cu
+187
-159
plugins/amoeba/platforms/cuda/src/kernels/multipolePme.cu
plugins/amoeba/platforms/cuda/src/kernels/multipolePme.cu
+270
-229
plugins/amoeba/platforms/cuda/src/kernels/pmeMultipoleElectrostatics.cu
.../platforms/cuda/src/kernels/pmeMultipoleElectrostatics.cu
+191
-222
No files found.
plugins/amoeba/platforms/cuda/src/kernels/amoebaGk.cu
View file @
93c467b2
...
...
@@ -606,181 +606,255 @@ __device__ float computePScaleFactor(uint2 covalent, unsigned int polarizationGr
*/
extern
"C"
__global__
void
computeEDiffForce
(
unsigned
long
long
*
__restrict__
forceBuffers
,
unsigned
long
long
*
__restrict__
torqueBuffers
,
real
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
posq
,
const
u
nsigned
int
*
__restrict__
exclusionIndice
s
,
const
unsigned
int
*
__restrict__
exclusionRowIndice
s
,
const
u
in
t2
*
__restrict__
covalentFlags
,
const
unsigned
int
*
__restrict__
polarizationGroupFlag
s
,
unsigned
int
startTileIndex
,
unsigned
int
numTileIndices
,
const
real4
*
__restrict__
posq
,
const
uint
2
*
__restrict__
covalentFlag
s
,
const
unsigned
int
*
__restrict__
polarizationGroupFlag
s
,
const
u
shor
t2
*
__restrict__
exclusionTile
s
,
unsigned
int
startTileIndex
,
unsigned
int
numTileIndices
,
const
real
*
__restrict__
labFrameDipole
,
const
real
*
__restrict__
labFrameQuadrupole
,
const
real
*
__restrict__
inducedDipole
,
const
real
*
__restrict__
inducedDipolePolar
,
const
real
*
__restrict__
inducedDipoleS
,
const
real
*
__restrict__
inducedDipolePolarS
,
const
float2
*
__restrict__
dampingAndThole
)
{
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
TILE_SIZE
;
const
unsigned
int
numTiles
=
numTileIndices
;
unsigned
int
pos
=
startTileIndex
+
warp
*
numTiles
/
totalWarps
;
unsigned
int
end
=
startTileIndex
+
(
warp
+
1
)
*
numTiles
/
totalWarps
;
const
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
const
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
TILE_SIZE
;
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
real
energy
=
0
;
__shared__
AtomData4
localData
[
EDIFF_THREAD_BLOCK_SIZE
];
__shared__
unsigned
int
exclusionRange
[
2
*
(
EDIFF_THREAD_BLOCK_SIZE
/
TILE_SIZE
)];
__shared__
int
exclusionIndex
[
EDIFF_THREAD_BLOCK_SIZE
/
TILE_SIZE
];
// First loop: process tiles that contain exclusions.
do
{
// Extract the coordinates of this tile
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
const
u
nsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
const
unsigned
int
localGroupIndex
=
threadIdx
.
x
/
TILE_SIZE
;
unsigned
int
x
,
y
;
const
unsigned
int
firstExclusionTile
=
FIRST_EXCLUSION_TILE
+
warp
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
totalWarps
;
const
unsigned
int
lastExclusionTile
=
FIRST_EXCLUSION_TILE
+
(
warp
+
1
)
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
totalWarps
;
for
(
int
pos
=
firstExclusionTile
;
pos
<
lastExclusionTile
;
pos
++
)
{
const
u
short2
tileIndices
=
exclusionTiles
[
pos
]
;
const
unsigned
int
x
=
tileIndices
.
x
;
const
unsigned
int
y
=
tileIndices
.
y
;
AtomData4
data
;
if
(
pos
<
end
)
{
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS
+
0.5
f
-
SQRT
((
NUM_BLOCKS
+
0.5
f
)
*
(
NUM_BLOCKS
+
0.5
f
)
-
2
*
pos
));
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
if
(
x
<
y
||
x
>=
NUM_BLOCKS
)
{
// Occasionally happens due to roundoff error.
y
+=
(
x
<
y
?
-
1
:
1
);
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
data
.
force
=
make_real3
(
0
);
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
loadAtomData4
(
data
,
atom1
,
posq
,
labFrameDipole
,
labFrameQuadrupole
,
inducedDipole
,
inducedDipolePolar
,
inducedDipoleS
,
inducedDipolePolarS
,
dampingAndThole
);
uint2
covalent
=
covalentFlags
[
pos
*
TILE_SIZE
+
tgx
];
unsigned
int
polarizationGroup
=
polarizationGroupFlags
[
pos
*
TILE_SIZE
+
tgx
];
if
(
x
==
y
)
{
// This tile is on the diagonal.
localData
[
threadIdx
.
x
].
pos
=
data
.
pos
;
localData
[
threadIdx
.
x
].
q
=
data
.
q
;
localData
[
threadIdx
.
x
].
dipole
=
data
.
dipole
;
localData
[
threadIdx
.
x
].
quadrupoleXX
=
data
.
quadrupoleXX
;
localData
[
threadIdx
.
x
].
quadrupoleXY
=
data
.
quadrupoleXY
;
localData
[
threadIdx
.
x
].
quadrupoleXZ
=
data
.
quadrupoleXZ
;
localData
[
threadIdx
.
x
].
quadrupoleYY
=
data
.
quadrupoleYY
;
localData
[
threadIdx
.
x
].
quadrupoleYZ
=
data
.
quadrupoleYZ
;
localData
[
threadIdx
.
x
].
quadrupoleZZ
=
data
.
quadrupoleZZ
;
localData
[
threadIdx
.
x
].
inducedDipole
=
data
.
inducedDipole
;
localData
[
threadIdx
.
x
].
inducedDipolePolar
=
data
.
inducedDipolePolar
;
localData
[
threadIdx
.
x
].
inducedDipoleS
=
data
.
inducedDipoleS
;
localData
[
threadIdx
.
x
].
inducedDipolePolarS
=
data
.
inducedDipolePolarS
;
localData
[
threadIdx
.
x
].
thole
=
data
.
thole
;
localData
[
threadIdx
.
x
].
damp
=
data
.
damp
;
// Compute forces.
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
y
*
TILE_SIZE
+
j
;
if
(
atom1
!=
atom2
&&
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
real3
tempForce
;
real
tempEnergy
;
float
d
=
computeDScaleFactor
(
polarizationGroup
,
j
);
float
p
=
computePScaleFactor
(
covalent
,
polarizationGroup
,
j
);
computeOneEDiffInteractionF1
(
data
,
localData
[
tbx
+
j
],
d
,
p
,
tempEnergy
,
tempForce
);
energy
+=
0.25
f
*
tempEnergy
;
data
.
force
+=
tempForce
;
}
}
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
loadAtomData4
(
data
,
atom1
,
posq
,
labFrameDipole
,
labFrameQuadrupole
,
inducedDipole
,
inducedDipolePolar
,
inducedDipoleS
,
inducedDipolePolarS
,
dampingAndThole
);
data
.
force
=
make_real3
(
0
);
// Locate the exclusion data for this tile.
if
(
tgx
<
2
)
exclusionRange
[
2
*
localGroupIndex
+
tgx
]
=
exclusionRowIndices
[
x
+
tgx
];
if
(
tgx
==
0
)
exclusionIndex
[
localGroupIndex
]
=
-
1
;
for
(
unsigned
int
i
=
exclusionRange
[
2
*
localGroupIndex
]
+
tgx
;
i
<
exclusionRange
[
2
*
localGroupIndex
+
1
];
i
+=
TILE_SIZE
)
if
(
exclusionIndices
[
i
]
==
y
)
exclusionIndex
[
localGroupIndex
]
=
i
*
TILE_SIZE
;
bool
hasExclusions
=
(
exclusionIndex
[
localGroupIndex
]
>
-
1
);
if
(
pos
>=
end
)
;
// This warp is done.
else
if
(
x
==
y
)
{
// This tile is on the diagonal.
data
.
force
*=
ENERGY_SCALE_FACTOR
;
atomicAdd
(
&
forceBuffers
[
atom1
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom1
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom1
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
z
*
0x100000000
)));
localData
[
threadIdx
.
x
].
pos
=
data
.
pos
;
localData
[
threadIdx
.
x
].
q
=
data
.
q
;
localData
[
threadIdx
.
x
].
dipole
=
data
.
dipole
;
localData
[
threadIdx
.
x
].
quadrupoleXX
=
data
.
quadrupoleXX
;
localData
[
threadIdx
.
x
].
quadrupoleXY
=
data
.
quadrupoleXY
;
localData
[
threadIdx
.
x
].
quadrupoleXZ
=
data
.
quadrupoleXZ
;
localData
[
threadIdx
.
x
].
quadrupoleYY
=
data
.
quadrupoleYY
;
localData
[
threadIdx
.
x
].
quadrupoleYZ
=
data
.
quadrupoleYZ
;
localData
[
threadIdx
.
x
].
quadrupoleZZ
=
data
.
quadrupoleZZ
;
localData
[
threadIdx
.
x
].
inducedDipole
=
data
.
inducedDipole
;
localData
[
threadIdx
.
x
].
inducedDipolePolar
=
data
.
inducedDipolePolar
;
localData
[
threadIdx
.
x
].
inducedDipoleS
=
data
.
inducedDipoleS
;
localData
[
threadIdx
.
x
].
inducedDipolePolarS
=
data
.
inducedDipolePolarS
;
localData
[
threadIdx
.
x
].
thole
=
data
.
thole
;
localData
[
threadIdx
.
x
].
damp
=
data
.
damp
;
uint2
covalent
=
covalentFlags
[
exclusionIndex
[
localGroupIndex
]
+
tgx
];
unsigned
int
polarizationGroup
=
polarizationGroupFlags
[
exclusionIndex
[
localGroupIndex
]
+
tgx
];
// Compute forces.
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
y
*
TILE_SIZE
+
j
;
if
(
atom1
!=
atom2
&&
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
real3
tempForce
;
real
tempEnergy
;
float
d
=
computeDScaleFactor
(
polarizationGroup
,
j
);
float
p
=
computePScaleFactor
(
covalent
,
polarizationGroup
,
j
);
computeOneEDiffInteractionF1
(
data
,
localData
[
tbx
+
j
],
d
,
p
,
tempEnergy
,
tempForce
);
energy
+=
0.25
f
*
tempEnergy
;
data
.
force
+=
tempForce
;
}
// Compute torques.
data
.
force
=
make_real3
(
0
);
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
y
*
TILE_SIZE
+
j
;
if
(
atom1
!=
atom2
&&
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
real3
tempTorque
;
float
d
=
computeDScaleFactor
(
polarizationGroup
,
j
);
float
p
=
computePScaleFactor
(
covalent
,
polarizationGroup
,
j
);
computeOneEDiffInteractionT1
(
data
,
localData
[
tbx
+
j
],
d
,
p
,
tempTorque
);
data
.
force
+=
tempTorque
;
}
data
.
force
*=
ENERGY_SCALE_FACTOR
;
atomicAdd
(
&
forceBuffers
[
atom1
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom1
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom1
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
z
*
0x100000000
)));
}
data
.
force
*=
ENERGY_SCALE_FACTOR
;
atomicAdd
(
&
torqueBuffers
[
atom1
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
x
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
atom1
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
y
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
atom1
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
z
*
0x100000000
)));
}
else
{
// This is an off-diagonal tile.
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
loadAtomData4
(
localData
[
threadIdx
.
x
],
j
,
posq
,
labFrameDipole
,
labFrameQuadrupole
,
inducedDipole
,
inducedDipolePolar
,
inducedDipoleS
,
inducedDipolePolarS
,
dampingAndThole
);
localData
[
threadIdx
.
x
].
force
=
make_real3
(
0
);
// Compute forces.
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
y
*
TILE_SIZE
+
tj
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
real3
tempForce
;
real
tempEnergy
;
float
d
=
computeDScaleFactor
(
polarizationGroup
,
tj
);
float
p
=
computePScaleFactor
(
covalent
,
polarizationGroup
,
tj
);
computeOneEDiffInteractionF1
(
data
,
localData
[
tbx
+
tj
],
d
,
p
,
tempEnergy
,
tempForce
);
energy
+=
0.5
f
*
tempEnergy
;
data
.
force
+=
tempForce
;
localData
[
tbx
+
tj
].
force
-=
tempForce
;
}
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
data
.
force
*=
ENERGY_SCALE_FACTOR
;
localData
[
threadIdx
.
x
].
force
*=
ENERGY_SCALE_FACTOR
;
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
z
*
0x100000000
)));
offset
=
y
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
z
*
0x100000000
)));
// Compute torques.
// Compute torques.
data
.
force
=
make_real3
(
0
);
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
y
*
TILE_SIZE
+
j
;
if
(
atom1
!=
atom2
&&
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
real3
tempTorque
;
float
d
=
computeDScaleFactor
(
polarizationGroup
,
j
);
float
p
=
computePScaleFactor
(
covalent
,
polarizationGroup
,
j
);
computeOneEDiffInteractionT1
(
data
,
localData
[
tbx
+
j
],
d
,
p
,
tempTorque
);
data
.
force
+=
tempTorque
;
}
data
.
force
=
make_real3
(
0
);
localData
[
threadIdx
.
x
].
force
=
make_real3
(
0
);
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
y
*
TILE_SIZE
+
tj
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
real3
tempTorque
;
float
d
=
computeDScaleFactor
(
polarizationGroup
,
tj
);
float
p
=
computePScaleFactor
(
covalent
,
polarizationGroup
,
tj
);
computeOneEDiffInteractionT1
(
data
,
localData
[
tbx
+
tj
],
d
,
p
,
tempTorque
);
data
.
force
+=
tempTorque
;
computeOneEDiffInteractionT3
(
data
,
localData
[
tbx
+
tj
],
d
,
p
,
tempTorque
);
localData
[
tbx
+
tj
].
force
+=
tempTorque
;
}
data
.
force
*=
ENERGY_SCALE_FACTOR
;
atomicAdd
(
&
torqueBuffers
[
atom1
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
x
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
atom1
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
y
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
atom1
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
z
*
0x100000000
)));
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
else
{
// This is an off-diagonal tile.
data
.
force
*=
ENERGY_SCALE_FACTOR
;
localData
[
threadIdx
.
x
].
force
*=
ENERGY_SCALE_FACTOR
;
offset
=
x
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
torqueBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
x
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
y
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
z
*
0x100000000
)));
offset
=
y
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
torqueBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
x
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
y
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
z
*
0x100000000
)));
}
}
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
loadAtomData4
(
localData
[
threadIdx
.
x
],
j
,
posq
,
labFrameDipole
,
labFrameQuadrupole
,
inducedDipole
,
inducedDipolePolar
,
inducedDipoleS
,
inducedDipolePolarS
,
dampingAndThole
);
localData
[
threadIdx
.
x
].
force
=
make_real3
(
0
);
uint2
covalent
=
(
hasExclusions
?
covalentFlags
[
exclusionIndex
[
localGroupIndex
]
+
tgx
]
:
make_uint2
(
0
,
0
));
unsigned
int
polarizationGroup
=
(
hasExclusions
?
polarizationGroupFlags
[
exclusionIndex
[
localGroupIndex
]
+
tgx
]
:
0
);
// Second loop: tiles without exclusions (by enumerating all of them, since there's no cutoff).
// Compute forces.
const
unsigned
int
numTiles
=
numTileIndices
;
int
pos
=
startTileIndex
+
warp
*
numTiles
/
totalWarps
;
int
end
=
startTileIndex
+
(
warp
+
1
)
*
numTiles
/
totalWarps
;
int
skipBase
=
0
;
int
currentSkipIndex
=
tbx
;
__shared__
int
skipTiles
[
EDIFF_THREAD_BLOCK_SIZE
];
skipTiles
[
threadIdx
.
x
]
=
-
1
;
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
y
*
TILE_SIZE
+
tj
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
real3
tempForce
;
real
tempEnergy
;
float
d
=
computeDScaleFactor
(
polarizationGroup
,
tj
);
float
p
=
computePScaleFactor
(
covalent
,
polarizationGroup
,
tj
);
computeOneEDiffInteractionF1
(
data
,
localData
[
tbx
+
tj
],
d
,
p
,
tempEnergy
,
tempForce
);
energy
+=
0.5
f
*
tempEnergy
;
data
.
force
+=
tempForce
;
localData
[
tbx
+
tj
].
force
-=
tempForce
;
}
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
data
.
force
*=
ENERGY_SCALE_FACTOR
;
localData
[
threadIdx
.
x
].
force
*=
ENERGY_SCALE_FACTOR
;
if
(
pos
<
end
)
{
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
z
*
0x100000000
)));
offset
=
y
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
z
*
0x100000000
)));
}
while
(
pos
<
end
)
{
// Extract the coordinates of this tile.
// Compute torques.
unsigned
int
x
,
y
;
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS
+
0.5
f
-
SQRT
((
NUM_BLOCKS
+
0.5
f
)
*
(
NUM_BLOCKS
+
0.5
f
)
-
2
*
pos
));
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
if
(
x
<
y
||
x
>=
NUM_BLOCKS
)
{
// Occasionally happens due to roundoff error.
y
+=
(
x
<
y
?
-
1
:
1
);
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
}
data
.
force
=
make_real3
(
0
);
localData
[
threadIdx
.
x
].
force
=
make_real3
(
0
);
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
y
*
TILE_SIZE
+
tj
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
real3
tempTorque
;
float
d
=
computeDScaleFactor
(
polarizationGroup
,
tj
);
float
p
=
computePScaleFactor
(
covalent
,
polarizationGroup
,
tj
);
computeOneEDiffInteractionT1
(
data
,
localData
[
tbx
+
tj
],
d
,
p
,
tempTorque
);
data
.
force
+=
tempTorque
;
computeOneEDiffInteractionT3
(
data
,
localData
[
tbx
+
tj
],
d
,
p
,
tempTorque
);
localData
[
tbx
+
tj
].
force
+=
tempTorque
;
}
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
// Skip over tiles that have exclusions, since they were already processed.
while
(
skipTiles
[
tbx
+
TILE_SIZE
-
1
]
<
pos
)
{
if
(
skipBase
+
tgx
<
NUM_TILES_WITH_EXCLUSIONS
)
{
ushort2
tile
=
exclusionTiles
[
skipBase
+
tgx
];
skipTiles
[
threadIdx
.
x
]
=
tile
.
x
+
tile
.
y
*
NUM_BLOCKS
-
tile
.
y
*
(
tile
.
y
+
1
)
/
2
;
}
else
skipTiles
[
threadIdx
.
x
]
=
end
;
skipBase
+=
TILE_SIZE
;
currentSkipIndex
=
tbx
;
}
while
(
skipTiles
[
currentSkipIndex
]
<
pos
)
currentSkipIndex
++
;
bool
includeTile
=
(
skipTiles
[
currentSkipIndex
]
!=
pos
);
if
(
includeTile
)
{
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
// Load atom data for this tile.
AtomData4
data
;
data
.
force
=
make_real3
(
0
);
loadAtomData4
(
data
,
atom1
,
posq
,
labFrameDipole
,
labFrameQuadrupole
,
inducedDipole
,
inducedDipolePolar
,
inducedDipoleS
,
inducedDipolePolarS
,
dampingAndThole
);
loadAtomData4
(
localData
[
threadIdx
.
x
],
atom1
,
posq
,
labFrameDipole
,
labFrameQuadrupole
,
inducedDipole
,
inducedDipolePolar
,
inducedDipoleS
,
inducedDipolePolarS
,
dampingAndThole
);
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
loadAtomData4
(
localData
[
threadIdx
.
x
],
j
,
posq
,
labFrameDipole
,
labFrameQuadrupole
,
inducedDipole
,
inducedDipolePolar
,
inducedDipoleS
,
inducedDipolePolarS
,
dampingAndThole
);
localData
[
threadIdx
.
x
].
force
=
make_real3
(
0
);
// Compute forces.
unsigned
int
tj
=
tgx
;
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
y
*
TILE_SIZE
+
tj
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
real3
tempForce
;
real
tempEnergy
;
computeOneEDiffInteractionF1
(
data
,
localData
[
tbx
+
tj
],
1
,
1
,
tempEnergy
,
tempForce
);
energy
+=
0.5
f
*
tempEnergy
;
data
.
force
+=
tempForce
;
localData
[
tbx
+
tj
].
force
-=
tempForce
;
}
data
.
force
*=
ENERGY_SCALE_FACTOR
;
localData
[
threadIdx
.
x
].
force
*=
ENERGY_SCALE_FACTOR
;
if
(
pos
<
end
)
{
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
torqueBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
x
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
y
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
z
*
0x100000000
)));
offset
=
y
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
torqueBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
x
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
y
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
z
*
0x100000000
)));
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
data
.
force
*=
ENERGY_SCALE_FACTOR
;
localData
[
threadIdx
.
x
].
force
*=
ENERGY_SCALE_FACTOR
;
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
z
*
0x100000000
)));
offset
=
y
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
z
*
0x100000000
)));
// Compute torques.
data
.
force
=
make_real3
(
0
);
localData
[
threadIdx
.
x
].
force
=
make_real3
(
0
);
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
y
*
TILE_SIZE
+
tj
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
real3
tempTorque
;
computeOneEDiffInteractionT1
(
data
,
localData
[
tbx
+
tj
],
1
,
1
,
tempTorque
);
data
.
force
+=
tempTorque
;
computeOneEDiffInteractionT3
(
data
,
localData
[
tbx
+
tj
],
1
,
1
,
tempTorque
);
localData
[
tbx
+
tj
].
force
+=
tempTorque
;
}
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
data
.
force
*=
ENERGY_SCALE_FACTOR
;
localData
[
threadIdx
.
x
].
force
*=
ENERGY_SCALE_FACTOR
;
offset
=
x
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
torqueBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
x
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
y
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
z
*
0x100000000
)));
offset
=
y
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
torqueBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
x
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
y
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
z
*
0x100000000
)));
}
pos
++
;
}
while
(
pos
<
end
);
}
energyBuffer
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
energy
*
ENERGY_SCALE_FACTOR
;
}
plugins/amoeba/platforms/cuda/src/kernels/multipoleElectrostatics.cu
View file @
93c467b2
#define TILE_SIZE 32
#define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)
typedef
struct
{
...
...
@@ -59,331 +58,286 @@ __device__ float computePScaleFactor(uint2 covalent, unsigned int polarizationGr
*/
extern
"C"
__global__
void
computeElectrostatics
(
unsigned
long
long
*
__restrict__
forceBuffers
,
unsigned
long
long
*
__restrict__
torqueBuffers
,
real
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
posq
,
const
u
nsigned
int
*
__restrict__
exclusionIndice
s
,
const
unsigned
int
*
__restrict__
exclusionRowIndice
s
,
const
u
in
t2
*
__restrict__
covalentFlags
,
const
unsigned
int
*
__restrict__
polarizationGroupFlag
s
,
unsigned
int
startTileIndex
,
unsigned
int
numTileIndices
,
const
real4
*
__restrict__
posq
,
const
uint
2
*
__restrict__
covalentFlag
s
,
const
unsigned
int
*
__restrict__
polarizationGroupFlag
s
,
const
u
shor
t2
*
__restrict__
exclusionTile
s
,
unsigned
int
startTileIndex
,
unsigned
int
numTileIndices
,
#ifdef USE_CUTOFF
const
ushort2
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
unsigned
int
maxTiles
,
const
unsigned
int
*
__restrict__
interacti
onFlag
s
,
const
ushort2
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
unsigned
int
maxTiles
,
const
real4
*
__restrict__
blockCenter
,
const
unsigned
int
*
__restrict__
interacti
ngAtom
s
,
#endif
const
real
*
__restrict__
labFrameDipole
,
const
real
*
__restrict__
labFrameQuadrupole
,
const
real
*
__restrict__
inducedDipole
,
const
real
*
__restrict__
inducedDipolePolar
,
const
float2
*
__restrict__
dampingAndThole
)
{
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
TILE_SIZE
;
const
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
const
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
TILE_SIZE
;
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
real
energy
=
0
;
__shared__
AtomData
localData
[
THREAD_BLOCK_SIZE
];
// First loop: process tiles that contain exclusions.
const
unsigned
int
firstExclusionTile
=
FIRST_EXCLUSION_TILE
+
warp
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
totalWarps
;
const
unsigned
int
lastExclusionTile
=
FIRST_EXCLUSION_TILE
+
(
warp
+
1
)
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
totalWarps
;
for
(
int
pos
=
firstExclusionTile
;
pos
<
lastExclusionTile
;
pos
++
)
{
const
ushort2
tileIndices
=
exclusionTiles
[
pos
];
const
unsigned
int
x
=
tileIndices
.
x
;
const
unsigned
int
y
=
tileIndices
.
y
;
AtomData
data
;
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
loadAtomData
(
data
,
atom1
,
posq
,
labFrameDipole
,
labFrameQuadrupole
,
inducedDipole
,
inducedDipolePolar
,
dampingAndThole
);
data
.
force
=
make_real3
(
0
);
uint2
covalent
=
covalentFlags
[
pos
*
TILE_SIZE
+
tgx
];
unsigned
int
polarizationGroup
=
polarizationGroupFlags
[
pos
*
TILE_SIZE
+
tgx
];
if
(
x
==
y
)
{
// This tile is on the diagonal.
localData
[
threadIdx
.
x
].
posq
=
data
.
posq
;
localData
[
threadIdx
.
x
].
dipole
=
data
.
dipole
;
localData
[
threadIdx
.
x
].
quadrupoleXX
=
data
.
quadrupoleXX
;
localData
[
threadIdx
.
x
].
quadrupoleXY
=
data
.
quadrupoleXY
;
localData
[
threadIdx
.
x
].
quadrupoleXZ
=
data
.
quadrupoleXZ
;
localData
[
threadIdx
.
x
].
quadrupoleYY
=
data
.
quadrupoleYY
;
localData
[
threadIdx
.
x
].
quadrupoleYZ
=
data
.
quadrupoleYZ
;
localData
[
threadIdx
.
x
].
inducedDipole
=
data
.
inducedDipole
;
localData
[
threadIdx
.
x
].
inducedDipolePolar
=
data
.
inducedDipolePolar
;
localData
[
threadIdx
.
x
].
thole
=
data
.
thole
;
localData
[
threadIdx
.
x
].
damp
=
data
.
damp
;
// Compute forces.
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
y
*
TILE_SIZE
+
j
;
if
(
atom1
!=
atom2
&&
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
real3
tempForce
;
real
tempEnergy
;
float
d
=
computeDScaleFactor
(
polarizationGroup
,
j
);
float
p
=
computePScaleFactor
(
covalent
,
polarizationGroup
,
j
);
float
m
=
computeMScaleFactor
(
covalent
,
j
);
computeOneInteractionF1
(
data
,
localData
[
tbx
+
j
],
d
,
p
,
m
,
tempEnergy
,
tempForce
);
data
.
force
+=
tempForce
;
energy
+=
0.5
f
*
tempEnergy
;
}
}
data
.
force
*=
ENERGY_SCALE_FACTOR
;
atomicAdd
(
&
forceBuffers
[
atom1
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom1
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom1
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
z
*
0x100000000
)));
// Compute torques.
data
.
force
=
make_real3
(
0
);
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
y
*
TILE_SIZE
+
j
;
if
(
atom1
!=
atom2
&&
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
real3
tempForce
;
float
d
=
computeDScaleFactor
(
polarizationGroup
,
j
);
float
p
=
computePScaleFactor
(
covalent
,
polarizationGroup
,
j
);
float
m
=
computeMScaleFactor
(
covalent
,
j
);
computeOneInteractionT1
(
data
,
localData
[
tbx
+
j
],
d
,
p
,
m
,
tempForce
);
data
.
force
+=
tempForce
;
}
}
data
.
force
*=
ENERGY_SCALE_FACTOR
;
atomicAdd
(
&
torqueBuffers
[
atom1
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
x
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
atom1
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
y
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
atom1
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
z
*
0x100000000
)));
}
else
{
// This is an off-diagonal tile.
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
loadAtomData
(
localData
[
threadIdx
.
x
],
j
,
posq
,
labFrameDipole
,
labFrameQuadrupole
,
inducedDipole
,
inducedDipolePolar
,
dampingAndThole
);
localData
[
threadIdx
.
x
].
force
=
make_real3
(
0
);
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
y
*
TILE_SIZE
+
tj
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
real3
tempForce
;
real
tempEnergy
;
float
d
=
computeDScaleFactor
(
polarizationGroup
,
tj
);
float
p
=
computePScaleFactor
(
covalent
,
polarizationGroup
,
tj
);
float
m
=
computeMScaleFactor
(
covalent
,
tj
);
computeOneInteractionF1
(
data
,
localData
[
tbx
+
tj
],
d
,
p
,
m
,
tempEnergy
,
tempForce
);
data
.
force
+=
tempForce
;
localData
[
tbx
+
tj
].
force
-=
tempForce
;
energy
+=
tempEnergy
;
}
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
data
.
force
*=
ENERGY_SCALE_FACTOR
;
localData
[
threadIdx
.
x
].
force
*=
ENERGY_SCALE_FACTOR
;
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
z
*
0x100000000
)));
offset
=
y
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
z
*
0x100000000
)));
// Compute torques.
data
.
force
=
make_real3
(
0
);
localData
[
threadIdx
.
x
].
force
=
make_real3
(
0
);
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
y
*
TILE_SIZE
+
tj
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
real3
tempForce
;
float
d
=
computeDScaleFactor
(
polarizationGroup
,
tj
);
float
p
=
computePScaleFactor
(
covalent
,
polarizationGroup
,
tj
);
float
m
=
computeMScaleFactor
(
covalent
,
tj
);
computeOneInteractionT1
(
data
,
localData
[
tbx
+
tj
],
d
,
p
,
m
,
tempForce
);
data
.
force
+=
tempForce
;
computeOneInteractionT3
(
data
,
localData
[
tbx
+
tj
],
d
,
p
,
m
,
tempForce
);
localData
[
tbx
+
tj
].
force
+=
tempForce
;
}
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
data
.
force
*=
ENERGY_SCALE_FACTOR
;
localData
[
threadIdx
.
x
].
force
*=
ENERGY_SCALE_FACTOR
;
offset
=
x
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
torqueBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
x
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
y
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
z
*
0x100000000
)));
offset
=
y
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
torqueBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
x
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
y
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
z
*
0x100000000
)));
}
}
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
// of them (no cutoff).
#ifdef USE_CUTOFF
const
unsigned
int
numTiles
=
interactionCount
[
0
];
unsigned
int
pos
=
(
numTiles
>
maxTiles
?
startTileIndex
+
warp
*
numTileIndices
/
totalWarps
:
warp
*
numTiles
/
totalWarps
);
unsigned
int
end
=
(
numTiles
>
maxTiles
?
startTileIndex
+
(
warp
+
1
)
*
numTileIndices
/
totalWarps
:
(
warp
+
1
)
*
numTiles
/
totalWarps
);
int
pos
=
(
numTiles
>
maxTiles
?
startTileIndex
+
warp
*
numTileIndices
/
totalWarps
:
warp
*
numTiles
/
totalWarps
);
int
end
=
(
numTiles
>
maxTiles
?
startTileIndex
+
(
warp
+
1
)
*
numTileIndices
/
totalWarps
:
(
warp
+
1
)
*
numTiles
/
totalWarps
);
#else
const
unsigned
int
numTiles
=
numTileIndices
;
unsigned
int
pos
=
startTileIndex
+
warp
*
numTiles
/
totalWarps
;
unsigned
int
end
=
startTileIndex
+
(
warp
+
1
)
*
numTiles
/
totalWarps
;
#endif
real
energy
=
0
;
__shared__
AtomData
localData
[
THREAD_BLOCK_SIZE
];
__shared__
unsigned
int
exclusionRange
[
2
*
WARPS_PER_GROUP
];
__shared__
int
exclusionIndex
[
WARPS_PER_GROUP
];
#ifndef ENABLE_SHUFFLE
__shared__
real
tempBuffer
[
3
*
THREAD_BLOCK_SIZE
];
int
pos
=
startTileIndex
+
warp
*
numTiles
/
totalWarps
;
int
end
=
startTileIndex
+
(
warp
+
1
)
*
numTiles
/
totalWarps
;
#endif
int
skipBase
=
0
;
int
currentSkipIndex
=
tbx
;
__shared__
int
atomIndices
[
THREAD_BLOCK_SIZE
];
__shared__
int
skipTiles
[
THREAD_BLOCK_SIZE
];
skipTiles
[
threadIdx
.
x
]
=
-
1
;
do
{
// Extract the coordinates of this tile
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
const
unsigned
int
localGroupIndex
=
threadIdx
.
x
/
TILE_SIZE
;
while
(
pos
<
end
)
{
bool
includeTile
=
true
;
// Extract the coordinates of this tile.
unsigned
int
x
,
y
;
AtomData
data
;
if
(
pos
<
end
)
{
#ifdef USE_CUTOFF
if
(
numTiles
<=
maxTiles
)
{
ushort2
tileIndices
=
tiles
[
pos
];
x
=
tileIndices
.
x
;
y
=
tileIndices
.
y
;
}
else
if
(
numTiles
<=
maxTiles
)
{
ushort2
tileIndices
=
tiles
[
pos
];
x
=
tileIndices
.
x
;
}
else
#endif
{
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS
+
0.5
f
-
SQRT
((
NUM_BLOCKS
+
0.5
f
)
*
(
NUM_BLOCKS
+
0.5
f
)
-
2
*
pos
));
{
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS
+
0.5
f
-
SQRT
((
NUM_BLOCKS
+
0.5
f
)
*
(
NUM_BLOCKS
+
0.5
f
)
-
2
*
pos
));
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
if
(
x
<
y
||
x
>=
NUM_BLOCKS
)
{
// Occasionally happens due to roundoff error.
y
+=
(
x
<
y
?
-
1
:
1
);
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
if
(
x
<
y
||
x
>=
NUM_BLOCKS
)
{
// Occasionally happens due to roundoff error.
y
+=
(
x
<
y
?
-
1
:
1
);
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
}
// Skip over tiles that have exclusions, since they were already processed.
while
(
skipTiles
[
tbx
+
TILE_SIZE
-
1
]
<
pos
)
{
if
(
skipBase
+
tgx
<
NUM_TILES_WITH_EXCLUSIONS
)
{
ushort2
tile
=
exclusionTiles
[
skipBase
+
tgx
];
skipTiles
[
threadIdx
.
x
]
=
tile
.
x
+
tile
.
y
*
NUM_BLOCKS
-
tile
.
y
*
(
tile
.
y
+
1
)
/
2
;
}
else
skipTiles
[
threadIdx
.
x
]
=
end
;
skipBase
+=
TILE_SIZE
;
currentSkipIndex
=
tbx
;
}
while
(
skipTiles
[
currentSkipIndex
]
<
pos
)
currentSkipIndex
++
;
includeTile
=
(
skipTiles
[
currentSkipIndex
]
!=
pos
);
}
if
(
includeTile
)
{
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
// Load atom data for this tile.
AtomData
data
;
loadAtomData
(
data
,
atom1
,
posq
,
labFrameDipole
,
labFrameQuadrupole
,
inducedDipole
,
inducedDipolePolar
,
dampingAndThole
);
data
.
force
=
make_real3
(
0
);
// Locate the exclusion data for this tile.
#ifdef USE_CUTOFF
unsigned
int
j
=
(
numTiles
<=
maxTiles
?
interactingAtoms
[
pos
*
TILE_SIZE
+
tgx
]
:
y
*
TILE_SIZE
+
tgx
);
#else
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
#endif
atomIndices
[
threadIdx
.
x
]
=
j
;
loadAtomData
(
localData
[
threadIdx
.
x
],
j
,
posq
,
labFrameDipole
,
labFrameQuadrupole
,
inducedDipole
,
inducedDipolePolar
,
dampingAndThole
);
localData
[
threadIdx
.
x
].
force
=
make_real3
(
0
);
if
(
tgx
<
2
)
exclusionRange
[
2
*
localGroupIndex
+
tgx
]
=
exclusionRowIndices
[
x
+
tgx
];
if
(
tgx
==
0
)
exclusionIndex
[
localGroupIndex
]
=
-
1
;
for
(
unsigned
int
i
=
exclusionRange
[
2
*
localGroupIndex
]
+
tgx
;
i
<
exclusionRange
[
2
*
localGroupIndex
+
1
];
i
+=
TILE_SIZE
)
if
(
exclusionIndices
[
i
]
==
y
)
exclusionIndex
[
localGroupIndex
]
=
i
*
TILE_SIZE
;
bool
hasExclusions
=
(
exclusionIndex
[
localGroupIndex
]
>
-
1
);
if
(
pos
>=
end
)
;
// This warp is done.
else
if
(
x
==
y
)
{
// This tile is on the diagonal.
// Compute forces.
localData
[
threadIdx
.
x
].
posq
=
data
.
posq
;
localData
[
threadIdx
.
x
].
dipole
=
data
.
dipole
;
localData
[
threadIdx
.
x
].
quadrupoleXX
=
data
.
quadrupoleXX
;
localData
[
threadIdx
.
x
].
quadrupoleXY
=
data
.
quadrupoleXY
;
localData
[
threadIdx
.
x
].
quadrupoleXZ
=
data
.
quadrupoleXZ
;
localData
[
threadIdx
.
x
].
quadrupoleYY
=
data
.
quadrupoleYY
;
localData
[
threadIdx
.
x
].
quadrupoleYZ
=
data
.
quadrupoleYZ
;
localData
[
threadIdx
.
x
].
inducedDipole
=
data
.
inducedDipole
;
localData
[
threadIdx
.
x
].
inducedDipolePolar
=
data
.
inducedDipolePolar
;
localData
[
threadIdx
.
x
].
thole
=
data
.
thole
;
localData
[
threadIdx
.
x
].
damp
=
data
.
damp
;
uint2
covalent
=
covalentFlags
[
exclusionIndex
[
localGroupIndex
]
+
tgx
];
unsigned
int
polarizationGroup
=
polarizationGroupFlags
[
exclusionIndex
[
localGroupIndex
]
+
tgx
];
// Compute forces.
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
y
*
TILE_SIZE
+
j
;
if
(
atom1
!=
atom2
&&
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
real3
tempForce
;
real
tempEnergy
;
float
d
=
computeDScaleFactor
(
polarizationGroup
,
j
);
float
p
=
computePScaleFactor
(
covalent
,
polarizationGroup
,
j
);
float
m
=
computeMScaleFactor
(
covalent
,
j
);
computeOneInteractionF1
(
data
,
localData
[
tbx
+
j
],
d
,
p
,
m
,
tempEnergy
,
tempForce
);
data
.
force
+=
tempForce
;
energy
+=
0.5
f
*
tempEnergy
;
}
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
atomIndices
[
tbx
+
tj
];
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
real3
tempForce
;
real
tempEnergy
;
computeOneInteractionF1
(
data
,
localData
[
tbx
+
tj
],
1
,
1
,
1
,
tempEnergy
,
tempForce
);
data
.
force
+=
tempForce
;
localData
[
tbx
+
tj
].
force
-=
tempForce
;
energy
+=
tempEnergy
;
}
data
.
force
*=
ENERGY_SCALE_FACTOR
;
atomicAdd
(
&
forceBuffers
[
atom1
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom1
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom1
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
z
*
0x100000000
)));
// Compute torques.
data
.
force
=
make_real3
(
0
);
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
y
*
TILE_SIZE
+
j
;
if
(
atom1
!=
atom2
&&
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
real3
tempForce
;
float
d
=
computeDScaleFactor
(
polarizationGroup
,
j
);
float
p
=
computePScaleFactor
(
covalent
,
polarizationGroup
,
j
);
float
m
=
computeMScaleFactor
(
covalent
,
j
);
computeOneInteractionT1
(
data
,
localData
[
tbx
+
j
],
d
,
p
,
m
,
tempForce
);
data
.
force
+=
tempForce
;
}
}
data
.
force
*=
ENERGY_SCALE_FACTOR
;
atomicAdd
(
&
torqueBuffers
[
atom1
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
x
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
atom1
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
y
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
atom1
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
z
*
0x100000000
)));
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
else
{
// This is an off-diagonal tile.
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
loadAtomData
(
localData
[
threadIdx
.
x
],
j
,
posq
,
labFrameDipole
,
labFrameQuadrupole
,
inducedDipole
,
inducedDipolePolar
,
dampingAndThole
);
localData
[
threadIdx
.
x
].
force
=
make_real3
(
0
);
data
.
force
*=
ENERGY_SCALE_FACTOR
;
localData
[
threadIdx
.
x
].
force
*=
ENERGY_SCALE_FACTOR
;
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
x
*
0x100000000
)))
;
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
y
*
0x100000000
))
);
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
z
*
0x100000000
))
);
#ifdef USE_CUTOFF
unsigned
int
flags
=
(
numTiles
<=
maxTiles
?
interactionFlags
[
pos
]
:
0xFFFFFFFF
);
if
(
!
hasExclusions
&&
flags
!=
0xFFFFFFFF
)
{
if
(
flags
==
0
)
{
// No interactions in this tile.
}
else
{
// Compute only a subset of the interactions in this tile.
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
if
((
flags
&
(
1
<<
j
))
!=
0
)
{
int
atom2
=
tbx
+
j
;
real3
delta
=
make_real3
(
localData
[
atom2
].
posq
.
x
-
data
.
posq
.
x
,
localData
[
atom2
].
posq
.
y
-
data
.
posq
.
y
,
localData
[
atom2
].
posq
.
z
-
data
.
posq
.
z
);
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
#endif
real3
tempForce
;
real
tempEnergy
;
computeOneInteractionF1
(
data
,
localData
[
atom2
],
1
,
1
,
1
,
tempEnergy
,
tempForce
);
data
.
force
+=
tempForce
;
localData
[
atom2
].
force
-=
tempForce
;
energy
+=
tempEnergy
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
#ifdef ENABLE_SHUFFLE
for
(
int
i
=
16
;
i
>=
1
;
i
/=
2
)
{
tempForce
.
x
+=
__shfl_xor
(
tempForce
.
x
,
i
,
32
);
tempForce
.
y
+=
__shfl_xor
(
tempForce
.
y
,
i
,
32
);
tempForce
.
z
+=
__shfl_xor
(
tempForce
.
z
,
i
,
32
);
}
if
(
tgx
==
0
)
localData
[
atom2
].
force
-=
tempForce
;
offset
=
atomIndices
[
threadIdx
.
x
];
#else
int
bufferIndex
=
3
*
threadIdx
.
x
;
tempBuffer
[
bufferIndex
]
=
tempForce
.
x
;
tempBuffer
[
bufferIndex
+
1
]
=
tempForce
.
y
;
tempBuffer
[
bufferIndex
+
2
]
=
tempForce
.
z
;
if
(
tgx
%
4
==
0
)
{
tempBuffer
[
bufferIndex
]
+=
tempBuffer
[
bufferIndex
+
3
]
+
tempBuffer
[
bufferIndex
+
6
]
+
tempBuffer
[
bufferIndex
+
9
];
tempBuffer
[
bufferIndex
+
1
]
+=
tempBuffer
[
bufferIndex
+
4
]
+
tempBuffer
[
bufferIndex
+
7
]
+
tempBuffer
[
bufferIndex
+
10
];
tempBuffer
[
bufferIndex
+
2
]
+=
tempBuffer
[
bufferIndex
+
5
]
+
tempBuffer
[
bufferIndex
+
8
]
+
tempBuffer
[
bufferIndex
+
11
];
}
if
(
tgx
==
0
)
{
localData
[
atom2
].
force
.
x
-=
tempBuffer
[
bufferIndex
]
+
tempBuffer
[
bufferIndex
+
12
]
+
tempBuffer
[
bufferIndex
+
24
]
+
tempBuffer
[
bufferIndex
+
36
]
+
tempBuffer
[
bufferIndex
+
48
]
+
tempBuffer
[
bufferIndex
+
60
]
+
tempBuffer
[
bufferIndex
+
72
]
+
tempBuffer
[
bufferIndex
+
84
];
localData
[
atom2
].
force
.
y
-=
tempBuffer
[
bufferIndex
+
1
]
+
tempBuffer
[
bufferIndex
+
13
]
+
tempBuffer
[
bufferIndex
+
25
]
+
tempBuffer
[
bufferIndex
+
37
]
+
tempBuffer
[
bufferIndex
+
49
]
+
tempBuffer
[
bufferIndex
+
61
]
+
tempBuffer
[
bufferIndex
+
73
]
+
tempBuffer
[
bufferIndex
+
85
];
localData
[
atom2
].
force
.
z
-=
tempBuffer
[
bufferIndex
+
2
]
+
tempBuffer
[
bufferIndex
+
14
]
+
tempBuffer
[
bufferIndex
+
26
]
+
tempBuffer
[
bufferIndex
+
38
]
+
tempBuffer
[
bufferIndex
+
50
]
+
tempBuffer
[
bufferIndex
+
62
]
+
tempBuffer
[
bufferIndex
+
74
]
+
tempBuffer
[
bufferIndex
+
86
];
}
offset
=
y
*
TILE_SIZE
+
tgx
;
#endif
}
}
}
data
.
force
*=
ENERGY_SCALE_FACTOR
;
localData
[
threadIdx
.
x
].
force
*=
ENERGY_SCALE_FACTOR
;
if
(
pos
<
end
)
{
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
z
*
0x100000000
)));
offset
=
y
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
z
*
0x100000000
)));
}
// Compute torques.
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
z
*
0x100000000
)));
data
.
force
=
make_real3
(
0
);
localData
[
threadIdx
.
x
].
force
=
make_real3
(
0
);
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
if
((
flags
&
(
1
<<
j
))
!=
0
)
{
int
atom2
=
tbx
+
j
;
real3
delta
=
make_real3
(
localData
[
atom2
].
posq
.
x
-
data
.
posq
.
x
,
localData
[
atom2
].
posq
.
y
-
data
.
posq
.
y
,
localData
[
atom2
].
posq
.
z
-
data
.
posq
.
z
);
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
#endif
real3
tempForce
;
computeOneInteractionT1
(
data
,
localData
[
atom2
],
1
,
1
,
1
,
tempForce
);
data
.
force
+=
tempForce
;
computeOneInteractionT3
(
data
,
localData
[
atom2
],
1
,
1
,
1
,
tempForce
);
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
#ifdef ENABLE_SHUFFLE
for
(
int
i
=
16
;
i
>=
1
;
i
/=
2
)
{
tempForce
.
x
+=
__shfl_xor
(
tempForce
.
x
,
i
,
32
);
tempForce
.
y
+=
__shfl_xor
(
tempForce
.
y
,
i
,
32
);
tempForce
.
z
+=
__shfl_xor
(
tempForce
.
z
,
i
,
32
);
}
if
(
tgx
==
0
)
localData
[
atom2
].
force
-=
tempForce
;
#else
int
bufferIndex
=
3
*
threadIdx
.
x
;
tempBuffer
[
bufferIndex
]
=
tempForce
.
x
;
tempBuffer
[
bufferIndex
+
1
]
=
tempForce
.
y
;
tempBuffer
[
bufferIndex
+
2
]
=
tempForce
.
z
;
if
(
tgx
%
4
==
0
)
{
tempBuffer
[
bufferIndex
]
+=
tempBuffer
[
bufferIndex
+
3
]
+
tempBuffer
[
bufferIndex
+
6
]
+
tempBuffer
[
bufferIndex
+
9
];
tempBuffer
[
bufferIndex
+
1
]
+=
tempBuffer
[
bufferIndex
+
4
]
+
tempBuffer
[
bufferIndex
+
7
]
+
tempBuffer
[
bufferIndex
+
10
];
tempBuffer
[
bufferIndex
+
2
]
+=
tempBuffer
[
bufferIndex
+
5
]
+
tempBuffer
[
bufferIndex
+
8
]
+
tempBuffer
[
bufferIndex
+
11
];
}
if
(
tgx
==
0
)
{
localData
[
atom2
].
force
.
x
+=
tempBuffer
[
bufferIndex
]
+
tempBuffer
[
bufferIndex
+
12
]
+
tempBuffer
[
bufferIndex
+
24
]
+
tempBuffer
[
bufferIndex
+
36
]
+
tempBuffer
[
bufferIndex
+
48
]
+
tempBuffer
[
bufferIndex
+
60
]
+
tempBuffer
[
bufferIndex
+
72
]
+
tempBuffer
[
bufferIndex
+
84
];
localData
[
atom2
].
force
.
y
+=
tempBuffer
[
bufferIndex
+
1
]
+
tempBuffer
[
bufferIndex
+
13
]
+
tempBuffer
[
bufferIndex
+
25
]
+
tempBuffer
[
bufferIndex
+
37
]
+
tempBuffer
[
bufferIndex
+
49
]
+
tempBuffer
[
bufferIndex
+
61
]
+
tempBuffer
[
bufferIndex
+
73
]
+
tempBuffer
[
bufferIndex
+
85
];
localData
[
atom2
].
force
.
z
+=
tempBuffer
[
bufferIndex
+
2
]
+
tempBuffer
[
bufferIndex
+
14
]
+
tempBuffer
[
bufferIndex
+
26
]
+
tempBuffer
[
bufferIndex
+
38
]
+
tempBuffer
[
bufferIndex
+
50
]
+
tempBuffer
[
bufferIndex
+
62
]
+
tempBuffer
[
bufferIndex
+
74
]
+
tempBuffer
[
bufferIndex
+
86
];
}
#endif
}
}
}
data
.
force
*=
ENERGY_SCALE_FACTOR
;
localData
[
threadIdx
.
x
].
force
*=
ENERGY_SCALE_FACTOR
;
if
(
pos
<
end
)
{
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
torqueBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
x
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
y
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
z
*
0x100000000
)));
offset
=
y
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
torqueBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
x
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
y
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
z
*
0x100000000
)));
}
}
}
else
#endif
{
// Compute the full set of interactions in this tile.
// Compute torques.
uint2
covalent
=
(
hasExclusions
?
covalentFlags
[
exclusionIndex
[
localGroupIndex
]
+
tgx
]
:
make_uint2
(
0
,
0
));
unsigned
int
polarizationGroup
=
(
hasExclusions
?
polarizationGroupFlags
[
exclusionIndex
[
localGroupIndex
]
+
tgx
]
:
0
);
// Compute forces.
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
y
*
TILE_SIZE
+
tj
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
real3
tempForce
;
real
tempEnergy
;
float
d
=
computeDScaleFactor
(
polarizationGroup
,
tj
);
float
p
=
computePScaleFactor
(
covalent
,
polarizationGroup
,
tj
);
float
m
=
computeMScaleFactor
(
covalent
,
tj
);
computeOneInteractionF1
(
data
,
localData
[
tbx
+
tj
],
d
,
p
,
m
,
tempEnergy
,
tempForce
);
data
.
force
+=
tempForce
;
localData
[
tbx
+
tj
].
force
-=
tempForce
;
energy
+=
tempEnergy
;
}
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
data
.
force
*=
ENERGY_SCALE_FACTOR
;
localData
[
threadIdx
.
x
].
force
*=
ENERGY_SCALE_FACTOR
;
if
(
pos
<
end
)
{
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
z
*
0x100000000
)));
offset
=
y
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
z
*
0x100000000
)));
}
// Compute torques.
data
.
force
=
make_real3
(
0
);
localData
[
threadIdx
.
x
].
force
=
make_real3
(
0
);
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
y
*
TILE_SIZE
+
tj
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
real3
tempForce
;
float
d
=
computeDScaleFactor
(
polarizationGroup
,
tj
);
float
p
=
computePScaleFactor
(
covalent
,
polarizationGroup
,
tj
);
float
m
=
computeMScaleFactor
(
covalent
,
tj
);
computeOneInteractionT1
(
data
,
localData
[
tbx
+
tj
],
d
,
p
,
m
,
tempForce
);
data
.
force
+=
tempForce
;
computeOneInteractionT3
(
data
,
localData
[
tbx
+
tj
],
d
,
p
,
m
,
tempForce
);
localData
[
tbx
+
tj
].
force
+=
tempForce
;
}
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
data
.
force
*=
ENERGY_SCALE_FACTOR
;
localData
[
threadIdx
.
x
].
force
*=
ENERGY_SCALE_FACTOR
;
if
(
pos
<
end
)
{
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
torqueBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
x
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
y
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
z
*
0x100000000
)));
offset
=
y
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
torqueBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
x
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
y
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
z
*
0x100000000
)));
}
data
.
force
=
make_real3
(
0
);
localData
[
threadIdx
.
x
].
force
=
make_real3
(
0
);
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
y
*
TILE_SIZE
+
tj
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
real3
tempForce
;
computeOneInteractionT1
(
data
,
localData
[
tbx
+
tj
],
1
,
1
,
1
,
tempForce
);
data
.
force
+=
tempForce
;
computeOneInteractionT3
(
data
,
localData
[
tbx
+
tj
],
1
,
1
,
1
,
tempForce
);
localData
[
tbx
+
tj
].
force
+=
tempForce
;
}
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
data
.
force
*=
ENERGY_SCALE_FACTOR
;
localData
[
threadIdx
.
x
].
force
*=
ENERGY_SCALE_FACTOR
;
offset
=
x
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
torqueBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
x
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
y
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
z
*
0x100000000
)));
#ifdef USE_CUTOFF
offset
=
atomIndices
[
threadIdx
.
x
];
#else
offset
=
y
*
TILE_SIZE
+
tgx
;
#endif
atomicAdd
(
&
torqueBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
x
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
y
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
z
*
0x100000000
)));
}
pos
++
;
}
while
(
pos
<
end
);
}
energyBuffer
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
energy
*
ENERGY_SCALE_FACTOR
;
}
plugins/amoeba/platforms/cuda/src/kernels/multipoleFixedField.cu
View file @
93c467b2
// Width of one interaction tile, in atoms/threads. The kernels in this file
// derive a thread's index within its tile as threadIdx.x & (TILE_SIZE - 1),
// which relies on this value being a power of two.
#define TILE_SIZE 32
// Number of TILE_SIZE-wide thread groups in one thread block; used to size the
// per-group shared arrays (exclusionRange, exclusionIndex), which are indexed
// by threadIdx.x / TILE_SIZE.
// NOTE(review): THREAD_BLOCK_SIZE is not defined in this view — presumably
// supplied at kernel compile time; confirm against the host code.
#define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)
typedef
struct
{
...
...
@@ -398,245 +397,268 @@ __device__ float computePScaleFactor(uint2 covalent, unsigned int polarizationGr
*/
extern
"C"
__global__
void
computeFixedField
(
unsigned
long
long
*
__restrict__
fieldBuffers
,
unsigned
long
long
*
__restrict__
fieldPolarBuffers
,
const
real4
*
__restrict__
posq
,
const
u
nsigned
int
*
__restrict__
exclusionIndices
,
const
unsigned
int
*
__restrict__
exclusion
RowIndic
es
,
const
uint2
*
__restrict__
covalentFlags
,
const
unsigned
int
*
__restrict__
polarizationGroupFlags
,
unsigned
int
startTileIndex
,
unsigned
int
numTileIndices
,
const
u
int2
*
__restrict__
covalentFlags
,
const
unsigned
int
*
__restrict__
polarizationGroupFlags
,
const
ushort2
*
__restrict__
exclusion
Til
es
,
unsigned
int
startTileIndex
,
unsigned
int
numTileIndices
,
#ifdef USE_CUTOFF
const
ushort2
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
unsigned
int
maxTiles
,
const
unsigned
int
*
__restrict__
interacti
onFlag
s
,
const
ushort2
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
unsigned
int
maxTiles
,
const
real4
*
__restrict__
blockCenter
,
const
unsigned
int
*
__restrict__
interacti
ngAtom
s
,
#elif defined USE_GK
const
real
*
__restrict__
bornRadii
,
unsigned
long
long
*
__restrict__
gkFieldBuffers
,
#endif
const
real
*
__restrict__
labFrameDipole
,
const
real
*
__restrict__
labFrameQuadrupole
,
const
float2
*
__restrict__
dampingAndThole
)
{
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
TILE_SIZE
;
#ifdef USE_CUTOFF
const
unsigned
int
numTiles
=
interactionCount
[
0
];
unsigned
int
pos
=
(
numTiles
>
maxTiles
?
startTileIndex
+
warp
*
numTileIndices
/
totalWarps
:
warp
*
numTiles
/
totalWarps
);
unsigned
int
end
=
(
numTiles
>
maxTiles
?
startTileIndex
+
(
warp
+
1
)
*
numTileIndices
/
totalWarps
:
(
warp
+
1
)
*
numTiles
/
totalWarps
);
#else
const
unsigned
int
numTiles
=
numTileIndices
;
unsigned
int
pos
=
startTileIndex
+
warp
*
numTiles
/
totalWarps
;
unsigned
int
end
=
startTileIndex
+
(
warp
+
1
)
*
numTiles
/
totalWarps
;
#endif
const
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
const
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
TILE_SIZE
;
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
__shared__
AtomData
localData
[
THREAD_BLOCK_SIZE
];
__shared__
unsigned
int
exclusionRange
[
2
*
WARPS_PER_GROUP
];
__shared__
int
exclusionIndex
[
WARPS_PER_GROUP
];
#ifndef ENABLE_SHUFFLE
__shared__
real
tempBuffer
[
3
*
THREAD_BLOCK_SIZE
];
#endif
// First loop: process tiles that contain exclusions.
do
{
// Extract the coordinates of this tile
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
const
u
nsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
const
unsigned
int
localGroupIndex
=
threadIdx
.
x
/
TILE_SIZE
;
unsigned
int
x
,
y
;
const
unsigned
int
firstExclusionTile
=
FIRST_EXCLUSION_TILE
+
warp
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
totalWarps
;
const
unsigned
int
lastExclusionTile
=
FIRST_EXCLUSION_TILE
+
(
warp
+
1
)
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
totalWarps
;
for
(
int
pos
=
firstExclusionTile
;
pos
<
lastExclusionTile
;
pos
++
)
{
const
u
short2
tileIndices
=
exclusionTiles
[
pos
]
;
const
unsigned
int
x
=
tileIndices
.
x
;
const
unsigned
int
y
=
tileIndices
.
y
;
AtomData
data
;
data
.
field
=
make_real3
(
0
);
data
.
fieldPolar
=
make_real3
(
0
);
#ifdef USE_GK
data
.
gkField
=
make_real3
(
0
);
#endif
if
(
pos
<
end
)
{
#ifdef USE_CUTOFF
if
(
numTiles
<=
maxTiles
)
{
ushort2
tileIndices
=
tiles
[
pos
];
x
=
tileIndices
.
x
;
y
=
tileIndices
.
y
;
}
else
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
loadAtomData
(
data
,
atom1
,
posq
,
labFrameDipole
,
labFrameQuadrupole
,
dampingAndThole
);
#ifdef USE_GK
data
.
bornRadius
=
bornRadii
[
atom1
];
#endif
{
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS
+
0.5
f
-
SQRT
((
NUM_BLOCKS
+
0.5
f
)
*
(
NUM_BLOCKS
+
0.5
f
)
-
2
*
pos
));
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
if
(
x
<
y
||
x
>=
NUM_BLOCKS
)
{
// Occasionally happens due to roundoff error.
y
+=
(
x
<
y
?
-
1
:
1
);
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
uint2
covalent
=
covalentFlags
[
pos
*
TILE_SIZE
+
tgx
];
unsigned
int
polarizationGroup
=
polarizationGroupFlags
[
pos
*
TILE_SIZE
+
tgx
];
if
(
x
==
y
)
{
// This tile is on the diagonal.
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
localData
[
localAtomIndex
].
posq
=
data
.
posq
;
localData
[
localAtomIndex
].
dipole
=
data
.
dipole
;
localData
[
localAtomIndex
].
quadrupoleXX
=
data
.
quadrupoleXX
;
localData
[
localAtomIndex
].
quadrupoleXY
=
data
.
quadrupoleXY
;
localData
[
localAtomIndex
].
quadrupoleXZ
=
data
.
quadrupoleXZ
;
localData
[
localAtomIndex
].
quadrupoleYY
=
data
.
quadrupoleYY
;
localData
[
localAtomIndex
].
quadrupoleYZ
=
data
.
quadrupoleYZ
;
localData
[
localAtomIndex
].
quadrupoleZZ
=
data
.
quadrupoleZZ
;
localData
[
localAtomIndex
].
thole
=
data
.
thole
;
localData
[
localAtomIndex
].
damp
=
data
.
damp
;
#ifdef USE_GK
localData
[
localAtomIndex
].
bornRadius
=
data
.
bornRadius
;
#endif
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
real3
delta
=
trimTo3
(
localData
[
tbx
+
j
].
posq
-
data
.
posq
);
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
#endif
int
atom2
=
y
*
TILE_SIZE
+
j
;
if
(
atom1
!=
atom2
&&
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
real3
fields
[
4
];
float
d
=
computeDScaleFactor
(
polarizationGroup
,
j
);
float
p
=
computePScaleFactor
(
covalent
,
polarizationGroup
,
j
);
computeOneInteraction
(
data
,
localData
[
tbx
+
j
],
delta
,
d
,
p
,
fields
);
data
.
field
+=
fields
[
0
];
data
.
fieldPolar
+=
fields
[
1
];
}
}
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
loadAtomData
(
data
,
atom1
,
posq
,
labFrameDipole
,
labFrameQuadrupole
,
dampingAndThole
);
#ifdef USE_GK
data
.
bornRadius
=
bornRadii
[
atom1
];
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
real3
fields
[
2
];
computeOneGkInteraction
(
data
,
localData
[
tbx
+
j
],
delta
,
fields
);
data
.
gkField
+=
fields
[
0
];
}
#endif
// Locate the exclusion data for this tile.
if
(
tgx
<
2
)
exclusionRange
[
2
*
localGroupIndex
+
tgx
]
=
exclusionRowIndices
[
x
+
tgx
];
if
(
tgx
==
0
)
exclusionIndex
[
localGroupIndex
]
=
-
1
;
for
(
unsigned
int
i
=
exclusionRange
[
2
*
localGroupIndex
]
+
tgx
;
i
<
exclusionRange
[
2
*
localGroupIndex
+
1
];
i
+=
TILE_SIZE
)
if
(
exclusionIndices
[
i
]
==
y
)
exclusionIndex
[
localGroupIndex
]
=
i
*
TILE_SIZE
;
bool
hasExclusions
=
(
exclusionIndex
[
localGroupIndex
]
>
-
1
);
if
(
pos
>=
end
)
;
// This warp is done.
else
if
(
x
==
y
)
{
// This tile is on the diagonal.
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
localData
[
localAtomIndex
].
posq
=
data
.
posq
;
localData
[
localAtomIndex
].
dipole
=
data
.
dipole
;
localData
[
localAtomIndex
].
quadrupoleXX
=
data
.
quadrupoleXX
;
localData
[
localAtomIndex
].
quadrupoleXY
=
data
.
quadrupoleXY
;
localData
[
localAtomIndex
].
quadrupoleXZ
=
data
.
quadrupoleXZ
;
localData
[
localAtomIndex
].
quadrupoleYY
=
data
.
quadrupoleYY
;
localData
[
localAtomIndex
].
quadrupoleYZ
=
data
.
quadrupoleYZ
;
localData
[
localAtomIndex
].
quadrupoleZZ
=
data
.
quadrupoleZZ
;
localData
[
localAtomIndex
].
thole
=
data
.
thole
;
localData
[
localAtomIndex
].
damp
=
data
.
damp
;
}
}
else
{
// This is an off-diagonal tile.
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
loadAtomData
(
localData
[
localAtomIndex
],
j
,
posq
,
labFrameDipole
,
labFrameQuadrupole
,
dampingAndThole
);
localData
[
localAtomIndex
].
field
=
make_real3
(
0
);
localData
[
localAtomIndex
].
fieldPolar
=
make_real3
(
0
);
#ifdef USE_GK
localData
[
localAtomIndex
].
bornRadius
=
data
.
bornRadius
;
localData
[
localAtomIndex
].
bornRadius
=
bornRadii
[
j
];
localData
[
localAtomIndex
].
gkField
=
make_real3
(
0
);
#endif
uint2
covalent
=
covalentFlags
[
exclusionIndex
[
localGroupIndex
]
+
tgx
];
unsigned
int
polarizationGroup
=
polarizationGroupFlags
[
exclusionIndex
[
localGroupIndex
]
+
tgx
];
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
real3
delta
=
trimTo3
(
localData
[
tbx
+
j
].
posq
-
data
.
posq
);
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
real3
delta
=
trimTo3
(
localData
[
tbx
+
tj
].
posq
-
data
.
posq
);
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
#endif
int
atom2
=
y
*
TILE_SIZE
+
j
;
if
(
atom1
!=
atom2
&&
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
real3
fields
[
4
];
float
d
=
computeDScaleFactor
(
polarizationGroup
,
j
);
float
p
=
computePScaleFactor
(
covalent
,
polarizationGroup
,
j
);
computeOneInteraction
(
data
,
localData
[
tbx
+
j
],
delta
,
d
,
p
,
fields
);
data
.
field
+=
fields
[
0
];
data
.
fieldPolar
+=
fields
[
1
];
}
int
atom2
=
y
*
TILE_SIZE
+
tj
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
real3
fields
[
4
];
float
d
=
computeDScaleFactor
(
polarizationGroup
,
tj
);
float
p
=
computePScaleFactor
(
covalent
,
polarizationGroup
,
tj
);
computeOneInteraction
(
data
,
localData
[
tbx
+
tj
],
delta
,
d
,
p
,
fields
);
data
.
field
+=
fields
[
0
];
data
.
fieldPolar
+=
fields
[
1
];
localData
[
tbx
+
tj
].
field
+=
fields
[
2
];
localData
[
tbx
+
tj
].
fieldPolar
+=
fields
[
3
];
#ifdef USE_GK
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
real3
fields
[
2
];
computeOneGkInteraction
(
data
,
localData
[
tbx
+
j
],
delta
,
fields
);
data
.
gkField
+=
fields
[
0
];
}
computeOneGkInteraction
(
data
,
localData
[
tbx
+
tj
],
delta
,
fields
);
data
.
gkField
+=
fields
[
0
];
localData
[
tbx
+
tj
].
gkField
+=
fields
[
1
];
#endif
}
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
else
{
// This is an off-diagonal tile.
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
loadAtomData
(
localData
[
localAtomIndex
],
j
,
posq
,
labFrameDipole
,
labFrameQuadrupole
,
dampingAndThole
);
localData
[
localAtomIndex
].
field
=
make_real3
(
0
);
localData
[
localAtomIndex
].
fieldPolar
=
make_real3
(
0
);
}
// Write results.
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
fieldBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
field
.
x
*
0x100000000
)));
atomicAdd
(
&
fieldBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
field
.
y
*
0x100000000
)));
atomicAdd
(
&
fieldBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
field
.
z
*
0x100000000
)));
atomicAdd
(
&
fieldPolarBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
fieldPolar
.
x
*
0x100000000
)));
atomicAdd
(
&
fieldPolarBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
fieldPolar
.
y
*
0x100000000
)));
atomicAdd
(
&
fieldPolarBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
fieldPolar
.
z
*
0x100000000
)));
#ifdef USE_GK
localData
[
localAtomIndex
].
bornRadius
=
bornRadii
[
j
];
localData
[
localAtomIndex
].
gkField
=
make_real3
(
0
);
atomicAdd
(
&
gkFieldBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
gkField
.
x
*
0x100000000
)));
atomicAdd
(
&
gkFieldBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
gkField
.
y
*
0x100000000
)));
atomicAdd
(
&
gkFieldBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
gkField
.
z
*
0x100000000
)));
#endif
#ifdef USE_CUTOFF
unsigned
int
flags
=
(
numTiles
<=
maxTiles
?
interactionFlags
[
pos
]
:
0xFFFFFFFF
);
if
(
!
hasExclusions
&&
flags
==
0
)
{
// TODO: Why doesn't the flags != 0 block work?
// if (!hasExclusions && flags != 0xFFFFFFFF) {
if
(
flags
==
0
)
{
// No interactions in this tile.
}
else
{
// Compute only a subset of the interactions in this tile.
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
if
((
flags
&
(
1
<<
j
))
!=
0
)
{
int
atom2
=
tbx
+
j
;
real3
delta
=
make_real3
(
localData
[
atom2
].
posq
.
x
-
data
.
posq
.
x
,
localData
[
atom2
].
posq
.
y
-
data
.
posq
.
y
,
localData
[
atom2
].
posq
.
z
-
data
.
posq
.
z
);
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
if
(
x
!=
y
)
{
offset
=
y
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
fieldBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
field
.
x
*
0x100000000
)));
atomicAdd
(
&
fieldBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
field
.
y
*
0x100000000
)));
atomicAdd
(
&
fieldBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
field
.
z
*
0x100000000
)));
atomicAdd
(
&
fieldPolarBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fieldPolar
.
x
*
0x100000000
)));
atomicAdd
(
&
fieldPolarBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fieldPolar
.
y
*
0x100000000
)));
atomicAdd
(
&
fieldPolarBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fieldPolar
.
z
*
0x100000000
)));
#ifdef USE_GK
atomicAdd
(
&
gkFieldBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
gkField
.
x
*
0x100000000
)));
atomicAdd
(
&
gkFieldBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
gkField
.
y
*
0x100000000
)));
atomicAdd
(
&
gkFieldBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
gkField
.
z
*
0x100000000
)));
#endif
real3
fields
[
4
];
computeOneInteraction
(
data
,
localData
[
atom2
],
delta
,
1
,
1
,
fields
);
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
#ifdef ENABLE_SHUFFLE
for
(
int
i
=
16
;
i
>=
1
;
i
/=
2
)
{
fields
[
2
].
x
+=
__shfl_xor
(
fields
[
2
].
x
,
i
,
32
);
fields
[
2
].
y
+=
__shfl_xor
(
fields
[
2
].
y
,
i
,
32
);
fields
[
2
].
z
+=
__shfl_xor
(
fields
[
2
].
z
,
i
,
32
);
fields
[
3
].
x
+=
__shfl_xor
(
fields
[
3
].
x
,
i
,
32
);
fields
[
3
].
y
+=
__shfl_xor
(
fields
[
3
].
y
,
i
,
32
);
fields
[
3
].
z
+=
__shfl_xor
(
fields
[
3
].
z
,
i
,
32
);
}
if
(
tgx
==
0
)
{
localData
[
atom2
].
field
+=
fields
[
2
];
localData
[
atom2
].
fieldPolar
+=
fields
[
3
];
}
}
}
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
// of them (no cutoff).
#ifdef USE_CUTOFF
const
unsigned
int
numTiles
=
interactionCount
[
0
];
int
pos
=
(
numTiles
>
maxTiles
?
startTileIndex
+
warp
*
numTileIndices
/
totalWarps
:
warp
*
numTiles
/
totalWarps
);
int
end
=
(
numTiles
>
maxTiles
?
startTileIndex
+
(
warp
+
1
)
*
numTileIndices
/
totalWarps
:
(
warp
+
1
)
*
numTiles
/
totalWarps
);
#else
int
bufferIndex
=
3
*
threadIdx
.
x
;
tempBuffer
[
bufferIndex
]
=
fields
[
2
].
x
;
tempBuffer
[
bufferIndex
+
1
]
=
fields
[
2
].
y
;
tempBuffer
[
bufferIndex
+
2
]
=
fields
[
2
].
z
;
if
(
tgx
%
4
==
0
)
{
tempBuffer
[
bufferIndex
]
+=
tempBuffer
[
bufferIndex
+
3
]
+
tempBuffer
[
bufferIndex
+
6
]
+
tempBuffer
[
bufferIndex
+
9
];
tempBuffer
[
bufferIndex
+
1
]
+=
tempBuffer
[
bufferIndex
+
4
]
+
tempBuffer
[
bufferIndex
+
7
]
+
tempBuffer
[
bufferIndex
+
10
];
tempBuffer
[
bufferIndex
+
2
]
+=
tempBuffer
[
bufferIndex
+
5
]
+
tempBuffer
[
bufferIndex
+
8
]
+
tempBuffer
[
bufferIndex
+
11
];
}
if
(
tgx
==
0
)
{
localData
[
atom2
].
field
.
x
+=
tempBuffer
[
bufferIndex
]
+
tempBuffer
[
bufferIndex
+
12
]
+
tempBuffer
[
bufferIndex
+
24
]
+
tempBuffer
[
bufferIndex
+
36
]
+
tempBuffer
[
bufferIndex
+
48
]
+
tempBuffer
[
bufferIndex
+
60
]
+
tempBuffer
[
bufferIndex
+
72
]
+
tempBuffer
[
bufferIndex
+
84
];
localData
[
atom2
].
field
.
y
+=
tempBuffer
[
bufferIndex
+
1
]
+
tempBuffer
[
bufferIndex
+
13
]
+
tempBuffer
[
bufferIndex
+
25
]
+
tempBuffer
[
bufferIndex
+
37
]
+
tempBuffer
[
bufferIndex
+
49
]
+
tempBuffer
[
bufferIndex
+
61
]
+
tempBuffer
[
bufferIndex
+
73
]
+
tempBuffer
[
bufferIndex
+
85
];
localData
[
atom2
].
field
.
z
+=
tempBuffer
[
bufferIndex
+
2
]
+
tempBuffer
[
bufferIndex
+
14
]
+
tempBuffer
[
bufferIndex
+
26
]
+
tempBuffer
[
bufferIndex
+
38
]
+
tempBuffer
[
bufferIndex
+
50
]
+
tempBuffer
[
bufferIndex
+
62
]
+
tempBuffer
[
bufferIndex
+
74
]
+
tempBuffer
[
bufferIndex
+
86
];
}
tempBuffer
[
bufferIndex
]
=
fields
[
3
].
x
;
tempBuffer
[
bufferIndex
+
1
]
=
fields
[
3
].
y
;
tempBuffer
[
bufferIndex
+
2
]
=
fields
[
3
].
z
;
if
(
tgx
%
4
==
0
)
{
tempBuffer
[
bufferIndex
]
+=
tempBuffer
[
bufferIndex
+
3
]
+
tempBuffer
[
bufferIndex
+
6
]
+
tempBuffer
[
bufferIndex
+
9
];
tempBuffer
[
bufferIndex
+
1
]
+=
tempBuffer
[
bufferIndex
+
4
]
+
tempBuffer
[
bufferIndex
+
7
]
+
tempBuffer
[
bufferIndex
+
10
];
tempBuffer
[
bufferIndex
+
2
]
+=
tempBuffer
[
bufferIndex
+
5
]
+
tempBuffer
[
bufferIndex
+
8
]
+
tempBuffer
[
bufferIndex
+
11
];
}
if
(
tgx
==
0
)
{
localData
[
atom2
].
fieldPolar
.
x
+=
tempBuffer
[
bufferIndex
]
+
tempBuffer
[
bufferIndex
+
12
]
+
tempBuffer
[
bufferIndex
+
24
]
+
tempBuffer
[
bufferIndex
+
36
]
+
tempBuffer
[
bufferIndex
+
48
]
+
tempBuffer
[
bufferIndex
+
60
]
+
tempBuffer
[
bufferIndex
+
72
]
+
tempBuffer
[
bufferIndex
+
84
];
localData
[
atom2
].
fieldPolar
.
y
+=
tempBuffer
[
bufferIndex
+
1
]
+
tempBuffer
[
bufferIndex
+
13
]
+
tempBuffer
[
bufferIndex
+
25
]
+
tempBuffer
[
bufferIndex
+
37
]
+
tempBuffer
[
bufferIndex
+
49
]
+
tempBuffer
[
bufferIndex
+
61
]
+
tempBuffer
[
bufferIndex
+
73
]
+
tempBuffer
[
bufferIndex
+
85
];
localData
[
atom2
].
fieldPolar
.
z
+=
tempBuffer
[
bufferIndex
+
2
]
+
tempBuffer
[
bufferIndex
+
14
]
+
tempBuffer
[
bufferIndex
+
26
]
+
tempBuffer
[
bufferIndex
+
38
]
+
tempBuffer
[
bufferIndex
+
50
]
+
tempBuffer
[
bufferIndex
+
62
]
+
tempBuffer
[
bufferIndex
+
74
]
+
tempBuffer
[
bufferIndex
+
86
];
}
const
unsigned
int
numTiles
=
numTileIndices
;
int
pos
=
startTileIndex
+
warp
*
numTiles
/
totalWarps
;
int
end
=
startTileIndex
+
(
warp
+
1
)
*
numTiles
/
totalWarps
;
#endif
}
}
}
}
int
skipBase
=
0
;
int
currentSkipIndex
=
tbx
;
__shared__
int
atomIndices
[
THREAD_BLOCK_SIZE
];
__shared__
int
skipTiles
[
THREAD_BLOCK_SIZE
];
skipTiles
[
threadIdx
.
x
]
=
-
1
;
while
(
pos
<
end
)
{
bool
includeTile
=
true
;
// Extract the coordinates of this tile.
unsigned
int
x
,
y
;
#ifdef USE_CUTOFF
if
(
numTiles
<=
maxTiles
)
{
ushort2
tileIndices
=
tiles
[
pos
];
x
=
tileIndices
.
x
;
}
else
#endif
{
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS
+
0.5
f
-
SQRT
((
NUM_BLOCKS
+
0.5
f
)
*
(
NUM_BLOCKS
+
0.5
f
)
-
2
*
pos
));
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
if
(
x
<
y
||
x
>=
NUM_BLOCKS
)
{
// Occasionally happens due to roundoff error.
y
+=
(
x
<
y
?
-
1
:
1
);
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
}
// Skip over tiles that have exclusions, since they were already processed.
while
(
skipTiles
[
tbx
+
TILE_SIZE
-
1
]
<
pos
)
{
if
(
skipBase
+
tgx
<
NUM_TILES_WITH_EXCLUSIONS
)
{
ushort2
tile
=
exclusionTiles
[
skipBase
+
tgx
];
skipTiles
[
threadIdx
.
x
]
=
tile
.
x
+
tile
.
y
*
NUM_BLOCKS
-
tile
.
y
*
(
tile
.
y
+
1
)
/
2
;
}
else
skipTiles
[
threadIdx
.
x
]
=
end
;
skipBase
+=
TILE_SIZE
;
currentSkipIndex
=
tbx
;
}
while
(
skipTiles
[
currentSkipIndex
]
<
pos
)
currentSkipIndex
++
;
includeTile
=
(
skipTiles
[
currentSkipIndex
]
!=
pos
);
}
if
(
includeTile
)
{
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
// Load atom data for this tile.
AtomData
data
;
data
.
field
=
make_real3
(
0
);
data
.
fieldPolar
=
make_real3
(
0
);
#ifdef USE_GK
data
.
gkField
=
make_real3
(
0
);
#endif
loadAtomData
(
data
,
atom1
,
posq
,
labFrameDipole
,
labFrameQuadrupole
,
dampingAndThole
);
#ifdef USE_GK
data
.
bornRadius
=
bornRadii
[
atom1
];
#endif
#ifdef USE_CUTOFF
unsigned
int
j
=
(
numTiles
<=
maxTiles
?
interactingAtoms
[
pos
*
TILE_SIZE
+
tgx
]
:
y
*
TILE_SIZE
+
tgx
);
#else
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
#endif
{
// Compute the full set of interactions in this tile.
uint2
covalent
=
(
hasExclusions
?
covalentFlags
[
exclusionIndex
[
localGroupIndex
]
+
tgx
]
:
make_uint2
(
0
,
0
));
unsigned
int
polarizationGroup
=
(
hasExclusions
?
polarizationGroupFlags
[
exclusionIndex
[
localGroupIndex
]
+
tgx
]
:
0
);
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
real3
delta
=
trimTo3
(
localData
[
tbx
+
tj
].
posq
-
data
.
posq
);
atomIndices
[
threadIdx
.
x
]
=
j
;
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
loadAtomData
(
localData
[
localAtomIndex
],
j
,
posq
,
labFrameDipole
,
labFrameQuadrupole
,
dampingAndThole
);
localData
[
localAtomIndex
].
field
=
make_real3
(
0
);
localData
[
localAtomIndex
].
fieldPolar
=
make_real3
(
0
);
#ifdef USE_GK
localData
[
localAtomIndex
].
bornRadius
=
bornRadii
[
j
];
localData
[
localAtomIndex
].
gkField
=
make_real3
(
0
);
#endif
// Compute the full set of interactions in this tile.
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
real3
delta
=
trimTo3
(
localData
[
tbx
+
tj
].
posq
-
data
.
posq
);
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
#endif
int
atom2
=
y
*
TILE_SIZE
+
tj
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
real3
fields
[
4
];
float
d
=
computeDScaleFactor
(
polarizationGroup
,
tj
);
float
p
=
computePScaleFactor
(
covalent
,
polarizationGroup
,
tj
);
computeOneInteraction
(
data
,
localData
[
tbx
+
tj
],
delta
,
d
,
p
,
fields
);
data
.
field
+=
fields
[
0
];
data
.
fieldPolar
+=
fields
[
1
];
localData
[
tbx
+
tj
].
field
+=
fields
[
2
];
localData
[
tbx
+
tj
].
fieldPolar
+=
fields
[
3
];
int
atom2
=
atomIndices
[
tbx
+
tj
];
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
real3
fields
[
4
];
computeOneInteraction
(
data
,
localData
[
tbx
+
tj
],
delta
,
1
,
1
,
fields
);
data
.
field
+=
fields
[
0
];
data
.
fieldPolar
+=
fields
[
1
];
localData
[
tbx
+
tj
].
field
+=
fields
[
2
];
localData
[
tbx
+
tj
].
fieldPolar
+=
fields
[
3
];
#ifdef USE_GK
computeOneGkInteraction
(
data
,
localData
[
tbx
+
tj
],
delta
,
fields
);
data
.
gkField
+=
fields
[
0
];
localData
[
tbx
+
tj
].
gkField
+=
fields
[
1
];
computeOneGkInteraction
(
data
,
localData
[
tbx
+
tj
],
delta
,
fields
);
data
.
gkField
+=
fields
[
0
];
localData
[
tbx
+
tj
].
gkField
+=
fields
[
1
];
#endif
}
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
// Write results.
if
(
pos
<
end
)
{
const
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
// Write results.
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
fieldBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
field
.
x
*
0x100000000
)));
atomicAdd
(
&
fieldBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
field
.
y
*
0x100000000
)));
atomicAdd
(
&
fieldBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
field
.
z
*
0x100000000
)));
...
...
@@ -648,9 +670,11 @@ extern "C" __global__ void computeFixedField(
atomicAdd
(
&
gkFieldBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
gkField
.
y
*
0x100000000
)));
atomicAdd
(
&
gkFieldBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
gkField
.
z
*
0x100000000
)));
#endif
}
if
(
pos
<
end
&&
x
!=
y
)
{
const
unsigned
int
offset
=
y
*
TILE_SIZE
+
tgx
;
#ifdef USE_CUTOFF
offset
=
atomIndices
[
threadIdx
.
x
];
#else
offset
=
y
*
TILE_SIZE
+
tgx
;
#endif
atomicAdd
(
&
fieldBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
field
.
x
*
0x100000000
)));
atomicAdd
(
&
fieldBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
field
.
y
*
0x100000000
)));
atomicAdd
(
&
fieldBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
field
.
z
*
0x100000000
)));
...
...
@@ -664,5 +688,5 @@ extern "C" __global__ void computeFixedField(
#endif
}
pos
++
;
}
while
(
pos
<
end
);
}
}
plugins/amoeba/platforms/cuda/src/kernels/multipoleInducedField.cu
View file @
93c467b2
#define TILE_SIZE 32
#define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)
typedef
struct
{
...
...
@@ -199,194 +198,221 @@ __device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, real3 de
* Compute the mutual induced field.
*/
extern
"C"
__global__
void
computeInducedField
(
unsigned
long
long
*
__restrict__
field
,
unsigned
long
long
*
__restrict__
fieldPolar
,
const
real4
*
__restrict__
posq
,
unsigned
long
long
*
__restrict__
field
,
unsigned
long
long
*
__restrict__
fieldPolar
,
const
real4
*
__restrict__
posq
,
const
ushort2
*
__restrict__
exclusionTiles
,
const
real
*
__restrict__
inducedDipole
,
const
real
*
__restrict__
inducedDipolePolar
,
unsigned
int
startTileIndex
,
unsigned
int
numTileIndices
,
#ifdef USE_CUTOFF
const
ushort2
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
unsigned
int
maxTiles
,
const
unsigned
int
*
__restrict__
interacti
onFlag
s
,
const
ushort2
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
unsigned
int
maxTiles
,
const
real4
*
__restrict__
blockCenter
,
const
unsigned
int
*
__restrict__
interacti
ngAtom
s
,
#elif defined USE_GK
unsigned
long
long
*
__restrict__
fieldS
,
unsigned
long
long
*
__restrict__
fieldPolarS
,
const
real
*
__restrict__
inducedDipoleS
,
const
real
*
__restrict__
inducedDipolePolarS
,
const
real
*
__restrict__
bornRadii
,
#endif
const
float2
*
__restrict__
dampingAndThole
)
{
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
TILE_SIZE
;
#ifdef USE_CUTOFF
const
unsigned
int
numTiles
=
interactionCount
[
0
];
unsigned
int
pos
=
(
numTiles
>
maxTiles
?
startTileIndex
+
warp
*
numTileIndices
/
totalWarps
:
warp
*
numTiles
/
totalWarps
);
unsigned
int
end
=
(
numTiles
>
maxTiles
?
startTileIndex
+
(
warp
+
1
)
*
numTileIndices
/
totalWarps
:
(
warp
+
1
)
*
numTiles
/
totalWarps
);
#else
const
unsigned
int
numTiles
=
numTileIndices
;
unsigned
int
pos
=
startTileIndex
+
warp
*
numTiles
/
totalWarps
;
unsigned
int
end
=
startTileIndex
+
(
warp
+
1
)
*
numTiles
/
totalWarps
;
#endif
const
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
const
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
TILE_SIZE
;
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
__shared__
AtomData
localData
[
THREAD_BLOCK_SIZE
];
#ifndef ENABLE_SHUFFLE
// __shared__ real tempBuffer[3*THREAD_BLOCK_SIZE];
#endif
// First loop: process tiles that contain exclusions.
do
{
// Extract the coordinates of this tile
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
unsigned
int
x
,
y
;
const
unsigned
int
firstExclusionTile
=
FIRST_EXCLUSION_TILE
+
warp
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
totalWarps
;
const
unsigned
int
lastExclusionTile
=
FIRST_EXCLUSION_TILE
+
(
warp
+
1
)
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
totalWarps
;
for
(
int
pos
=
firstExclusionTile
;
pos
<
lastExclusionTile
;
pos
++
)
{
const
ushort2
tileIndices
=
exclusionTiles
[
pos
];
const
unsigned
int
x
=
tileIndices
.
x
;
const
unsigned
int
y
=
tileIndices
.
y
;
AtomData
data
;
zeroAtomData
(
data
);
if
(
pos
<
end
)
{
#ifdef USE_CUTOFF
if
(
numTiles
<=
maxTiles
)
{
ushort2
tileIndices
=
tiles
[
pos
];
x
=
tileIndices
.
x
;
y
=
tileIndices
.
y
;
}
else
#endif
{
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS
+
0.5
f
-
SQRT
((
NUM_BLOCKS
+
0.5
f
)
*
(
NUM_BLOCKS
+
0.5
f
)
-
2
*
pos
));
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
if
(
x
<
y
||
x
>=
NUM_BLOCKS
)
{
// Occasionally happens due to roundoff error.
y
+=
(
x
<
y
?
-
1
:
1
);
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
}
}
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
#ifdef USE_GK
loadAtomData
(
data
,
atom1
,
posq
,
inducedDipole
,
inducedDipolePolar
,
dampingAndThole
,
inducedDipoleS
,
inducedDipolePolarS
,
bornRadii
);
loadAtomData
(
data
,
atom1
,
posq
,
inducedDipole
,
inducedDipolePolar
,
dampingAndThole
,
inducedDipoleS
,
inducedDipolePolarS
,
bornRadii
);
#else
loadAtomData
(
data
,
atom1
,
posq
,
inducedDipole
,
inducedDipolePolar
,
dampingAndThole
);
loadAtomData
(
data
,
atom1
,
posq
,
inducedDipole
,
inducedDipolePolar
,
dampingAndThole
);
#endif
if
(
pos
>=
end
)
;
// This warp is done.
else
if
(
x
==
y
)
{
// This tile is on the diagonal.
localData
[
threadIdx
.
x
].
pos
=
data
.
pos
;
localData
[
threadIdx
.
x
].
inducedDipole
=
data
.
inducedDipole
;
localData
[
threadIdx
.
x
].
inducedDipolePolar
=
data
.
inducedDipolePolar
;
localData
[
threadIdx
.
x
].
thole
=
data
.
thole
;
localData
[
threadIdx
.
x
].
damp
=
data
.
damp
;
if
(
x
==
y
)
{
// This tile is on the diagonal.
localData
[
threadIdx
.
x
].
pos
=
data
.
pos
;
localData
[
threadIdx
.
x
].
inducedDipole
=
data
.
inducedDipole
;
localData
[
threadIdx
.
x
].
inducedDipolePolar
=
data
.
inducedDipolePolar
;
localData
[
threadIdx
.
x
].
thole
=
data
.
thole
;
localData
[
threadIdx
.
x
].
damp
=
data
.
damp
;
#ifdef USE_GK
localData
[
threadIdx
.
x
].
inducedDipoleS
=
data
.
inducedDipoleS
;
localData
[
threadIdx
.
x
].
inducedDipolePolarS
=
data
.
inducedDipolePolarS
;
localData
[
threadIdx
.
x
].
bornRadius
=
data
.
bornRadius
;
localData
[
threadIdx
.
x
].
inducedDipoleS
=
data
.
inducedDipoleS
;
localData
[
threadIdx
.
x
].
inducedDipolePolarS
=
data
.
inducedDipolePolarS
;
localData
[
threadIdx
.
x
].
bornRadius
=
data
.
bornRadius
;
#endif
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
real3
delta
=
localData
[
tbx
+
j
].
pos
-
data
.
pos
;
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
real3
delta
=
localData
[
tbx
+
j
].
pos
-
data
.
pos
;
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
#endif
int
atom2
=
y
*
TILE_SIZE
+
j
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
computeOneInteraction
(
data
,
localData
[
tbx
+
j
],
delta
,
atom1
==
atom2
);
}
int
atom2
=
y
*
TILE_SIZE
+
j
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
computeOneInteraction
(
data
,
localData
[
tbx
+
j
],
delta
,
atom1
==
atom2
);
}
else
{
// This is an off-diagonal tile.
}
else
{
// This is an off-diagonal tile.
#ifdef USE_GK
loadAtomData
(
localData
[
threadIdx
.
x
],
y
*
TILE_SIZE
+
tgx
,
posq
,
inducedDipole
,
inducedDipolePolar
,
dampingAndThole
,
inducedDipoleS
,
inducedDipolePolarS
,
bornRadii
);
loadAtomData
(
localData
[
threadIdx
.
x
],
y
*
TILE_SIZE
+
tgx
,
posq
,
inducedDipole
,
inducedDipolePolar
,
dampingAndThole
,
inducedDipoleS
,
inducedDipolePolarS
,
bornRadii
);
#else
loadAtomData
(
localData
[
threadIdx
.
x
],
y
*
TILE_SIZE
+
tgx
,
posq
,
inducedDipole
,
inducedDipolePolar
,
dampingAndThole
);
loadAtomData
(
localData
[
threadIdx
.
x
],
y
*
TILE_SIZE
+
tgx
,
posq
,
inducedDipole
,
inducedDipolePolar
,
dampingAndThole
);
#endif
zeroAtomData
(
localData
[
threadIdx
.
x
]);
#ifdef USE_CUTOFF
unsigned
int
flags
=
(
numTiles
<=
maxTiles
?
interactionFlags
[
pos
]
:
0xFFFFFFFF
);
if
(
flags
==
0
)
{
// TODO: Figure out why the flags != 0 case doesn't work!!!
// if (flags != 0xFFFFFFFF) {
if
(
flags
==
0
)
{
// No interactions in this tile.
}
/* else {
// Compute only a subset of the interactions in this tile.
for (int j = 0; j < TILE_SIZE; j++) {
if ((flags&(1<<j)) != 0) {
int atom2 = tbx+j;
real3 delta = localData[atom2].pos-data.pos;
zeroAtomData
(
localData
[
threadIdx
.
x
]);
unsigned
int
tj
=
tgx
;
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
real3
delta
=
localData
[
tbx
+
tj
].
pos
-
data
.
pos
;
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
#endif
int
atom2
=
y
*
TILE_SIZE
+
j
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
computeOneInteraction
(
data
,
localData
[
tbx
+
tj
],
delta
,
false
);
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
// Write results.
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
field
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
field
.
x
*
0x100000000
)));
atomicAdd
(
&
field
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
field
.
y
*
0x100000000
)));
atomicAdd
(
&
field
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
field
.
z
*
0x100000000
)));
atomicAdd
(
&
fieldPolar
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
fieldPolar
.
x
*
0x100000000
)));
atomicAdd
(
&
fieldPolar
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
fieldPolar
.
y
*
0x100000000
)));
atomicAdd
(
&
fieldPolar
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
fieldPolar
.
z
*
0x100000000
)));
#ifdef USE_GK
atomicAdd
(
&
fieldS
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
fieldS
.
x
*
0x100000000
)));
atomicAdd
(
&
fieldS
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
fieldS
.
y
*
0x100000000
)));
atomicAdd
(
&
fieldS
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
fieldS
.
z
*
0x100000000
)));
atomicAdd
(
&
fieldPolarS
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
fieldPolarS
.
x
*
0x100000000
)));
atomicAdd
(
&
fieldPolarS
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
fieldPolarS
.
y
*
0x100000000
)));
atomicAdd
(
&
fieldPolarS
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
fieldPolarS
.
z
*
0x100000000
)));
#endif
if
(
x
!=
y
)
{
offset
=
y
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
field
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
field
.
x
*
0x100000000
)));
atomicAdd
(
&
field
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
field
.
y
*
0x100000000
)));
atomicAdd
(
&
field
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
field
.
z
*
0x100000000
)));
atomicAdd
(
&
fieldPolar
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fieldPolar
.
x
*
0x100000000
)));
atomicAdd
(
&
fieldPolar
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fieldPolar
.
y
*
0x100000000
)));
atomicAdd
(
&
fieldPolar
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fieldPolar
.
z
*
0x100000000
)));
#ifdef USE_GK
atomicAdd
(
&
fieldS
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fieldS
.
x
*
0x100000000
)));
atomicAdd
(
&
fieldS
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fieldS
.
y
*
0x100000000
)));
atomicAdd
(
&
fieldS
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fieldS
.
z
*
0x100000000
)));
atomicAdd
(
&
fieldPolarS
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fieldPolarS
.
x
*
0x100000000
)));
atomicAdd
(
&
fieldPolarS
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fieldPolarS
.
y
*
0x100000000
)));
atomicAdd
(
&
fieldPolarS
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fieldPolarS
.
z
*
0x100000000
)));
#endif
real3 fields[4];
computeOneInteraction(data, localData[atom2], delta, fields);
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
data.field += fields[0];
data.fieldPolar += fields[1];
#ifdef ENABLE_SHUFFLE
for (int i = 16; i >= 1; i /= 2) {
fields[2].x += __shfl_xor(fields[2].x, i, 32);
fields[2].y += __shfl_xor(fields[2].y, i, 32);
fields[2].z += __shfl_xor(fields[2].z, i, 32);
fields[3].x += __shfl_xor(fields[3].x, i, 32);
fields[3].y += __shfl_xor(fields[3].y, i, 32);
fields[3].z += __shfl_xor(fields[3].z, i, 32);
}
if (tgx == 0) {
localData[atom2].field += fields[2];
localData[atom2].fieldPolar += fields[3];
}
}
}
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
// of them (no cutoff).
#ifdef USE_CUTOFF
const
unsigned
int
numTiles
=
interactionCount
[
0
];
int
pos
=
(
numTiles
>
maxTiles
?
startTileIndex
+
warp
*
numTileIndices
/
totalWarps
:
warp
*
numTiles
/
totalWarps
);
int
end
=
(
numTiles
>
maxTiles
?
startTileIndex
+
(
warp
+
1
)
*
numTileIndices
/
totalWarps
:
(
warp
+
1
)
*
numTiles
/
totalWarps
);
#else
int bufferIndex = 3*threadIdx.x;
tempBuffer[bufferIndex] = fields[2].x;
tempBuffer[bufferIndex+1] = fields[2].y;
tempBuffer[bufferIndex+2] = fields[2].z;
if (tgx % 4 == 0) {
tempBuffer[bufferIndex] += tempBuffer[bufferIndex+3]+tempBuffer[bufferIndex+6]+tempBuffer[bufferIndex+9];
tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+4]+tempBuffer[bufferIndex+7]+tempBuffer[bufferIndex+10];
tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+5]+tempBuffer[bufferIndex+8]+tempBuffer[bufferIndex+11];
}
if (tgx == 0) {
localData[atom2].field.x += tempBuffer[bufferIndex]+tempBuffer[bufferIndex+12]+tempBuffer[bufferIndex+24]+tempBuffer[bufferIndex+36]+tempBuffer[bufferIndex+48]+tempBuffer[bufferIndex+60]+tempBuffer[bufferIndex+72]+tempBuffer[bufferIndex+84];
localData[atom2].field.y += tempBuffer[bufferIndex+1]+tempBuffer[bufferIndex+13]+tempBuffer[bufferIndex+25]+tempBuffer[bufferIndex+37]+tempBuffer[bufferIndex+49]+tempBuffer[bufferIndex+61]+tempBuffer[bufferIndex+73]+tempBuffer[bufferIndex+85];
localData[atom2].field.z += tempBuffer[bufferIndex+2]+tempBuffer[bufferIndex+14]+tempBuffer[bufferIndex+26]+tempBuffer[bufferIndex+38]+tempBuffer[bufferIndex+50]+tempBuffer[bufferIndex+62]+tempBuffer[bufferIndex+74]+tempBuffer[bufferIndex+86];
}
tempBuffer[bufferIndex] = fields[3].x;
tempBuffer[bufferIndex+1] = fields[3].y;
tempBuffer[bufferIndex+2] = fields[3].z;
if (tgx % 4 == 0) {
tempBuffer[bufferIndex] += tempBuffer[bufferIndex+3]+tempBuffer[bufferIndex+6]+tempBuffer[bufferIndex+9];
tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+4]+tempBuffer[bufferIndex+7]+tempBuffer[bufferIndex+10];
tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+5]+tempBuffer[bufferIndex+8]+tempBuffer[bufferIndex+11];
}
if (tgx == 0) {
localData[atom2].fieldPolar.x += tempBuffer[bufferIndex]+tempBuffer[bufferIndex+12]+tempBuffer[bufferIndex+24]+tempBuffer[bufferIndex+36]+tempBuffer[bufferIndex+48]+tempBuffer[bufferIndex+60]+tempBuffer[bufferIndex+72]+tempBuffer[bufferIndex+84];
localData[atom2].fieldPolar.y += tempBuffer[bufferIndex+1]+tempBuffer[bufferIndex+13]+tempBuffer[bufferIndex+25]+tempBuffer[bufferIndex+37]+tempBuffer[bufferIndex+49]+tempBuffer[bufferIndex+61]+tempBuffer[bufferIndex+73]+tempBuffer[bufferIndex+85];
localData[atom2].fieldPolar.z += tempBuffer[bufferIndex+2]+tempBuffer[bufferIndex+14]+tempBuffer[bufferIndex+26]+tempBuffer[bufferIndex+38]+tempBuffer[bufferIndex+50]+tempBuffer[bufferIndex+62]+tempBuffer[bufferIndex+74]+tempBuffer[bufferIndex+86];
}
const
unsigned
int
numTiles
=
numTileIndices
;
int
pos
=
startTileIndex
+
warp
*
numTiles
/
totalWarps
;
int
end
=
startTileIndex
+
(
warp
+
1
)
*
numTiles
/
totalWarps
;
#endif
int
skipBase
=
0
;
int
currentSkipIndex
=
tbx
;
__shared__
int
atomIndices
[
THREAD_BLOCK_SIZE
];
__shared__
int
skipTiles
[
THREAD_BLOCK_SIZE
];
skipTiles
[
threadIdx
.
x
]
=
-
1
;
while
(
pos
<
end
)
{
bool
includeTile
=
true
;
// Extract the coordinates of this tile.
unsigned
int
x
,
y
;
#ifdef USE_CUTOFF
if
(
numTiles
<=
maxTiles
)
{
ushort2
tileIndices
=
tiles
[
pos
];
x
=
tileIndices
.
x
;
}
else
#endif
}
}
}
}*/
{
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS
+
0.5
f
-
SQRT
((
NUM_BLOCKS
+
0.5
f
)
*
(
NUM_BLOCKS
+
0.5
f
)
-
2
*
pos
));
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
if
(
x
<
y
||
x
>=
NUM_BLOCKS
)
{
// Occasionally happens due to roundoff error.
y
+=
(
x
<
y
?
-
1
:
1
);
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
}
// Skip over tiles that have exclusions, since they were already processed.
while
(
skipTiles
[
tbx
+
TILE_SIZE
-
1
]
<
pos
)
{
if
(
skipBase
+
tgx
<
NUM_TILES_WITH_EXCLUSIONS
)
{
ushort2
tile
=
exclusionTiles
[
skipBase
+
tgx
];
skipTiles
[
threadIdx
.
x
]
=
tile
.
x
+
tile
.
y
*
NUM_BLOCKS
-
tile
.
y
*
(
tile
.
y
+
1
)
/
2
;
}
else
skipTiles
[
threadIdx
.
x
]
=
end
;
skipBase
+=
TILE_SIZE
;
currentSkipIndex
=
tbx
;
}
while
(
skipTiles
[
currentSkipIndex
]
<
pos
)
currentSkipIndex
++
;
includeTile
=
(
skipTiles
[
currentSkipIndex
]
!=
pos
);
}
if
(
includeTile
)
{
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
// Load atom data for this tile.
AtomData
data
;
zeroAtomData
(
data
);
#ifdef USE_GK
loadAtomData
(
data
,
atom1
,
posq
,
inducedDipole
,
inducedDipolePolar
,
dampingAndThole
,
inducedDipoleS
,
inducedDipolePolarS
,
bornRadii
);
#else
loadAtomData
(
data
,
atom1
,
posq
,
inducedDipole
,
inducedDipolePolar
,
dampingAndThole
);
#endif
#ifdef USE_CUTOFF
unsigned
int
j
=
(
numTiles
<=
maxTiles
?
interactingAtoms
[
pos
*
TILE_SIZE
+
tgx
]
:
y
*
TILE_SIZE
+
tgx
);
#else
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
#endif
{
// Compute the full set of interactions in this tile.
atomIndices
[
threadIdx
.
x
]
=
j
;
#ifdef USE_GK
loadAtomData
(
localData
[
threadIdx
.
x
],
j
,
posq
,
inducedDipole
,
inducedDipolePolar
,
dampingAndThole
,
inducedDipoleS
,
inducedDipolePolarS
,
bornRadii
);
#else
loadAtomData
(
localData
[
threadIdx
.
x
],
j
,
posq
,
inducedDipole
,
inducedDipolePolar
,
dampingAndThole
);
#endif
zeroAtomData
(
localData
[
threadIdx
.
x
]);
unsigned
int
tj
=
tgx
;
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
real3
delta
=
localData
[
tbx
+
tj
].
pos
-
data
.
pos
;
// Compute the full set of interactions in this tile.
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
real3
delta
=
localData
[
tbx
+
tj
].
pos
-
data
.
pos
;
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
#endif
int
atom2
=
y
*
TILE_SIZE
+
j
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
computeOneInteraction
(
data
,
localData
[
tbx
+
tj
],
delta
,
false
);
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
int
atom2
=
atomIndices
[
tbx
+
tj
];
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
computeOneInteraction
(
data
,
localData
[
tbx
+
tj
],
delta
,
false
);
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
// Write results.
if
(
pos
<
end
)
{
const
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
// Write results.
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
field
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
field
.
x
*
0x100000000
)));
atomicAdd
(
&
field
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
field
.
y
*
0x100000000
)));
atomicAdd
(
&
field
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
field
.
z
*
0x100000000
)));
...
...
@@ -401,9 +427,11 @@ extern "C" __global__ void computeInducedField(
atomicAdd
(
&
fieldPolarS
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
fieldPolarS
.
y
*
0x100000000
)));
atomicAdd
(
&
fieldPolarS
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
fieldPolarS
.
z
*
0x100000000
)));
#endif
}
if
(
pos
<
end
&&
x
!=
y
)
{
const
unsigned
int
offset
=
y
*
TILE_SIZE
+
tgx
;
#ifdef USE_CUTOFF
offset
=
atomIndices
[
threadIdx
.
x
];
#else
offset
=
y
*
TILE_SIZE
+
tgx
;
#endif
atomicAdd
(
&
field
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
field
.
x
*
0x100000000
)));
atomicAdd
(
&
field
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
field
.
y
*
0x100000000
)));
atomicAdd
(
&
field
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
field
.
z
*
0x100000000
)));
...
...
@@ -420,7 +448,7 @@ extern "C" __global__ void computeInducedField(
#endif
}
pos
++
;
}
while
(
pos
<
end
);
}
}
extern
"C"
__global__
void
updateInducedFieldBySOR
(
const
long
long
*
__restrict__
fixedField
,
const
long
long
*
__restrict__
fixedFieldPolar
,
...
...
plugins/amoeba/platforms/cuda/src/kernels/multipolePme.cu
View file @
93c467b2
#define ARRAY(x,y) array[(x)-1+((y)-1)*PME_ORDER]
/**
 * This is called from updateBsplines(). It calculates the spline coefficients for a single atom along a single axis.
 * Calculate the spline coefficients for a single atom along a single axis.
 */
__device__
void
computeBSplinePoint
(
real4
*
thetai
,
real
w
,
real
*
array
)
{
// initialization to get to 2nd order recursion
...
...
@@ -70,15 +70,10 @@ __device__ void computeBSplinePoint(real4* thetai, real w, real* array) {
}
/**
 * Compute bspline coefficients.
 * Compute the index of the grid point each atom is associated with.
 */
extern
"C"
__global__
void
updateBsplines
(
const
real4
*
__restrict__
posq
,
int4
*
__restrict__
igrid
,
int2
*
__restrict__
pmeAtomGridIndex
,
real4
*
__restrict__
theta1
,
real4
*
__restrict__
theta2
,
real4
*
__restrict__
theta3
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
)
{
extern
__shared__
real
bsplines_cache
[];
// size = block_size*pme_order*pme_order
real
*
array
=
&
bsplines_cache
[
threadIdx
.
x
*
PME_ORDER
*
PME_ORDER
];
// get the B-spline coefficients for each multipole site
extern
"C"
__global__
void
findAtomGridIndex
(
const
real4
*
__restrict__
posq
,
int2
*
__restrict__
pmeAtomGridIndex
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
)
{
for
(
int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
i
<
NUM_ATOMS
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
real4
pos
=
posq
[
i
];
pos
.
x
-=
floor
(
pos
.
x
*
invPeriodicBoxSize
.
x
)
*
periodicBoxSize
.
x
;
...
...
@@ -90,256 +85,226 @@ extern "C" __global__ void updateBsplines(const real4* __restrict__ posq, int4*
real
w
=
pos
.
x
*
invPeriodicBoxSize
.
x
;
real
fr
=
GRID_SIZE_X
*
(
w
-
(
int
)(
w
+
0.5
f
)
+
0.5
f
);
int
ifr
=
(
int
)
fr
;
w
=
fr
-
ifr
;
int
igrid1
=
ifr
-
PME_ORDER
+
1
;
computeBSplinePoint
(
&
theta1
[
i
*
PME_ORDER
],
w
,
array
);
// Second axis.
w
=
pos
.
y
*
invPeriodicBoxSize
.
y
;
fr
=
GRID_SIZE_Y
*
(
w
-
(
int
)(
w
+
0.5
f
)
+
0.5
f
);
ifr
=
(
int
)
fr
;
w
=
fr
-
ifr
;
int
igrid2
=
ifr
-
PME_ORDER
+
1
;
computeBSplinePoint
(
&
theta2
[
i
*
PME_ORDER
],
w
,
array
);
// Third axis.
w
=
pos
.
z
*
invPeriodicBoxSize
.
z
;
fr
=
GRID_SIZE_Z
*
(
w
-
(
int
)(
w
+
0.5
f
)
+
0.5
f
);
ifr
=
(
int
)
fr
;
w
=
fr
-
ifr
;
int
igrid3
=
ifr
-
PME_ORDER
+
1
;
computeBSplinePoint
(
&
theta3
[
i
*
PME_ORDER
],
w
,
array
);
// Record the grid point.
igrid1
+=
(
igrid1
<
0
?
GRID_SIZE_X
:
0
);
igrid2
+=
(
igrid2
<
0
?
GRID_SIZE_Y
:
0
);
igrid3
+=
(
igrid3
<
0
?
GRID_SIZE_Z
:
0
);
igrid
[
i
]
=
make_int4
(
igrid1
,
igrid2
,
igrid3
,
0
);
pmeAtomGridIndex
[
i
]
=
make_int2
(
i
,
igrid1
*
GRID_SIZE_Y
*
GRID_SIZE_Z
+
igrid2
*
GRID_SIZE_Z
+
igrid3
);
}
}
/**
 * For each grid point, find the range of sorted atoms associated with that point.
 *
 * The atom list is split into contiguous slices, one per launched thread.  Each thread walks
 * its slice and, whenever the grid index (the .y component of pmeAtomGridIndex) changes, stores
 * the current atom position into pmeAtomRange for every grid index that was skipped over, up to
 * and including the new one.  The last thread then fills the remaining entries with NUM_ATOMS.
 *
 * @param pmeAtomGridIndex  per-atom entries; only .y (the grid index) is read here.
 *                          NOTE(review): the algorithm relies on .y being non-decreasing along
 *                          the array (i.e. already sorted by grid index) - confirm in host code.
 * @param pmeAtomRange      output; entries 0..GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z inclusive are
 *                          written, so it must hold at least gridSize+1 elements.
 * @param posq, periodicBoxSize, invPeriodicBoxSize  not used by this kernel; kept so the
 *                          signature stays unchanged for callers.
 */
extern "C" __global__ void findAtomRangeForGrid(int2* __restrict__ pmeAtomGridIndex, int* __restrict__ pmeAtomRange, const real4* __restrict__ posq, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
    const int thread = blockIdx.x*blockDim.x+threadIdx.x;
    const long long totalThreads = (long long) blockDim.x*gridDim.x;

    // The slice bounds are computed in 64 bits: NUM_ATOMS*(thread+1) exceeds the range of a
    // 32-bit int once (number of atoms) x (number of threads) passes 2^31.
    const int start = (int) (((long long) NUM_ATOMS*thread)/totalThreads);
    const int end = (int) (((long long) NUM_ATOMS*(thread+1))/totalThreads);

    // Grid index of the atom just before this slice, or -1 if the slice starts at atom 0.
    int last = (start == 0 ? -1 : pmeAtomGridIndex[start-1].y);
    for (int i = start; i < end; ++i) {
        int2 atomData = pmeAtomGridIndex[i];
        int gridIndex = atomData.y;
        if (gridIndex != last) {
            // Atom i is the first one seen at gridIndex; grid points between last and
            // gridIndex received no atoms, so they get the same starting position.
            for (int j = last+1; j <= gridIndex; ++j)
                pmeAtomRange[j] = i;
            last = gridIndex;
        }
    }

    // Fill in values beyond the last atom.
    if (thread == totalThreads-1) {
        int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
        for (int j = last+1; j <= gridSize; ++j)
            pmeAtomRange[j] = NUM_ATOMS;
    }
}
/**
 * The grid index won't be needed again.  Reuse that component to hold the z index, thus saving
 * some work in the charge spreading kernel.
 *
 * The atom list is split into contiguous slices, one per launched thread.  For every entry in
 * its slice a thread looks up the atom named by pmeAtomGridIndex[i].x, wraps its z coordinate
 * into the periodic box, converts it to a grid coordinate and overwrites pmeAtomGridIndex[i].y
 * with (int) fr - PME_ORDER + 1.  The stored value is not wrapped here, so it can be negative.
 *
 * @param pmeAtomGridIndex    per-entry (atom index, value) pairs; .x is read, .y is overwritten.
 * @param posq                atom data indexed by atom index; only .z is read.
 * @param periodicBoxSize     periodic box dimensions; only .z is used.
 * @param invPeriodicBoxSize  reciprocal box dimensions; only .z is used.
 */
extern "C" __global__ void recordZIndex(int2* __restrict__ pmeAtomGridIndex, const real4* __restrict__ posq, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
    const int thread = blockIdx.x*blockDim.x+threadIdx.x;
    const long long totalThreads = (long long) blockDim.x*gridDim.x;

    // The slice bounds are computed in 64 bits: NUM_ATOMS*(thread+1) exceeds the range of a
    // 32-bit int once (number of atoms) x (number of threads) passes 2^31.
    const int start = (int) (((long long) NUM_ATOMS*thread)/totalThreads);
    const int end = (int) (((long long) NUM_ATOMS*(thread+1))/totalThreads);

    for (int i = start; i < end; ++i) {
        real posz = posq[pmeAtomGridIndex[i].x].z;
        // Wrap the coordinate into the periodic box.
        posz -= floor(posz*invPeriodicBoxSize.z)*periodicBoxSize.z;
        // Fractional position along z, scaled to grid units.
        real w = posz*invPeriodicBoxSize.z;
        real fr = GRID_SIZE_Z*(w-(int)(w+0.5f)+0.5f);
        int z = ((int) fr)-PME_ORDER+1;
        pmeAtomGridIndex[i].y = z;
    }
}
extern
"C"
__global__
void
gridSpreadFixedMultipoles
(
const
real4
*
__restrict__
posq
,
const
real
*
__restrict__
labFrameDipole
,
const
real
*
__restrict__
labFrameQuadrupole
,
real2
*
__restrict__
pmeGrid
,
int2
*
__restrict__
pmeAtomGridIndex
,
int
*
__restrict__
pmeAtomRange
,
const
real4
*
__restrict__
theta1
,
const
real4
*
__restrict__
theta2
,
const
real4
*
__restrict__
theta3
,
real4
invPeriodicBoxSize
)
{
const
real
*
__restrict__
labFrameQuadrupole
,
real2
*
__restrict__
pmeGrid
,
int2
*
__restrict__
pmeAtomGridIndex
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
)
{
const
real
xscale
=
GRID_SIZE_X
*
invPeriodicBoxSize
.
x
;
const
real
yscale
=
GRID_SIZE_Y
*
invPeriodicBoxSize
.
y
;
const
real
zscale
=
GRID_SIZE_Z
*
invPeriodicBoxSize
.
z
;
unsigned
int
numGridPoints
=
GRID_SIZE_X
*
GRID_SIZE_Y
*
GRID_SIZE_Z
;
unsigned
int
numThreads
=
gridDim
.
x
*
blockDim
.
x
;
for
(
int
gridIndex
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
gridIndex
<
numGridPoints
;
gridIndex
+=
numThreads
)
{
int3
gridPoint
;
gridPoint
.
x
=
gridIndex
/
(
GRID_SIZE_Y
*
GRID_SIZE_Z
);
int
remainder
=
gridIndex
-
gridPoint
.
x
*
GRID_SIZE_Y
*
GRID_SIZE_Z
;
gridPoint
.
y
=
remainder
/
GRID_SIZE_Z
;
gridPoint
.
z
=
remainder
-
gridPoint
.
y
*
GRID_SIZE_Z
;
real
result
=
0
;
for
(
int
ix
=
0
;
ix
<
PME_ORDER
;
++
ix
)
{
int
x
=
gridPoint
.
x
-
ix
+
(
gridPoint
.
x
>=
ix
?
0
:
GRID_SIZE_X
);
for
(
int
iy
=
0
;
iy
<
PME_ORDER
;
++
iy
)
{
int
y
=
gridPoint
.
y
-
iy
+
(
gridPoint
.
y
>=
iy
?
0
:
GRID_SIZE_Y
);
int
z1
=
gridPoint
.
z
-
PME_ORDER
+
1
;
z1
+=
(
z1
>=
0
?
0
:
GRID_SIZE_Z
);
int
z2
=
(
z1
<
gridPoint
.
z
?
gridPoint
.
z
:
GRID_SIZE_Z
-
1
);
int
gridIndex1
=
x
*
GRID_SIZE_Y
*
GRID_SIZE_Z
+
y
*
GRID_SIZE_Z
+
z1
;
int
gridIndex2
=
x
*
GRID_SIZE_Y
*
GRID_SIZE_Z
+
y
*
GRID_SIZE_Z
+
z2
;
int
firstAtom
=
pmeAtomRange
[
gridIndex1
];
int
lastAtom
=
pmeAtomRange
[
gridIndex2
+
1
];
for
(
int
i
=
firstAtom
;
i
<
lastAtom
;
++
i
)
{
int2
atomData
=
pmeAtomGridIndex
[
i
];
int
atomIndex
=
atomData
.
x
;
int
z
=
atomData
.
y
;
int
iz
=
gridPoint
.
z
-
z
+
(
gridPoint
.
z
>=
z
?
0
:
GRID_SIZE_Z
);
if
(
iz
>=
GRID_SIZE_Z
)
iz
-=
GRID_SIZE_Z
;
real
atomCharge
=
posq
[
atomIndex
].
w
;
real
atomDipoleX
=
xscale
*
labFrameDipole
[
atomIndex
*
3
];
real
atomDipoleY
=
yscale
*
labFrameDipole
[
atomIndex
*
3
+
1
];
real
atomDipoleZ
=
zscale
*
labFrameDipole
[
atomIndex
*
3
+
2
];
real
atomQuadrupoleXX
=
xscale
*
xscale
*
labFrameQuadrupole
[
atomIndex
*
5
];
real
atomQuadrupoleXY
=
2
*
xscale
*
yscale
*
labFrameQuadrupole
[
atomIndex
*
5
+
1
];
real
atomQuadrupoleXZ
=
2
*
xscale
*
zscale
*
labFrameQuadrupole
[
atomIndex
*
5
+
2
];
real
atomQuadrupoleYY
=
yscale
*
yscale
*
labFrameQuadrupole
[
atomIndex
*
5
+
3
];
real
atomQuadrupoleYZ
=
2
*
yscale
*
zscale
*
labFrameQuadrupole
[
atomIndex
*
5
+
4
];
real
atomQuadrupoleZZ
=
-
zscale
*
zscale
*
(
labFrameQuadrupole
[
atomIndex
*
5
]
+
labFrameQuadrupole
[
atomIndex
*
5
+
3
]);
real4
t
=
theta1
[
atomIndex
*
PME_ORDER
+
ix
];
real4
u
=
theta2
[
atomIndex
*
PME_ORDER
+
iy
];
real4
v
=
theta3
[
atomIndex
*
PME_ORDER
+
iz
];
real
array
[
PME_ORDER
*
PME_ORDER
];
real4
theta1
[
PME_ORDER
];
real4
theta2
[
PME_ORDER
];
real4
theta3
[
PME_ORDER
];
// Process the atoms in spatially sorted order. This improves cache performance when loading
// the grid values.
for
(
int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
i
<
NUM_ATOMS
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
m
=
pmeAtomGridIndex
[
i
].
x
;
real4
pos
=
posq
[
m
];
pos
.
x
-=
floor
(
pos
.
x
*
invPeriodicBoxSize
.
x
)
*
periodicBoxSize
.
x
;
pos
.
y
-=
floor
(
pos
.
y
*
invPeriodicBoxSize
.
y
)
*
periodicBoxSize
.
y
;
pos
.
z
-=
floor
(
pos
.
z
*
invPeriodicBoxSize
.
z
)
*
periodicBoxSize
.
z
;
// Since we need the full set of thetas, it's faster to compute them here than load them
// from global memory.
real
w
=
pos
.
x
*
invPeriodicBoxSize
.
x
;
real
fr
=
GRID_SIZE_X
*
(
w
-
(
int
)(
w
+
0.5
f
)
+
0.5
f
);
int
ifr
=
(
int
)
fr
;
w
=
fr
-
ifr
;
int
igrid1
=
ifr
-
PME_ORDER
+
1
;
computeBSplinePoint
(
theta1
,
w
,
array
);
w
=
pos
.
y
*
invPeriodicBoxSize
.
y
;
fr
=
GRID_SIZE_Y
*
(
w
-
(
int
)(
w
+
0.5
f
)
+
0.5
f
);
ifr
=
(
int
)
fr
;
w
=
fr
-
ifr
;
int
igrid2
=
ifr
-
PME_ORDER
+
1
;
computeBSplinePoint
(
theta2
,
w
,
array
);
w
=
pos
.
z
*
invPeriodicBoxSize
.
z
;
fr
=
GRID_SIZE_Z
*
(
w
-
(
int
)(
w
+
0.5
f
)
+
0.5
f
);
ifr
=
(
int
)
fr
;
w
=
fr
-
ifr
;
int
igrid3
=
ifr
-
PME_ORDER
+
1
;
computeBSplinePoint
(
theta3
,
w
,
array
);
igrid1
+=
(
igrid1
<
0
?
GRID_SIZE_X
:
0
);
igrid2
+=
(
igrid2
<
0
?
GRID_SIZE_Y
:
0
);
igrid3
+=
(
igrid3
<
0
?
GRID_SIZE_Z
:
0
);
// Spread the charge from this atom onto each grid point.
for
(
int
ix
=
0
;
ix
<
PME_ORDER
;
ix
++
)
{
int
xbase
=
igrid1
+
ix
;
xbase
-=
(
xbase
>=
GRID_SIZE_X
?
GRID_SIZE_X
:
0
);
xbase
=
xbase
*
GRID_SIZE_Y
*
GRID_SIZE_Z
;
real4
t
=
theta1
[
ix
];
for
(
int
iy
=
0
;
iy
<
PME_ORDER
;
iy
++
)
{
int
ybase
=
igrid2
+
iy
;
ybase
-=
(
ybase
>=
GRID_SIZE_Y
?
GRID_SIZE_Y
:
0
);
ybase
=
xbase
+
ybase
*
GRID_SIZE_Z
;
real4
u
=
theta2
[
iy
];
for
(
int
iz
=
0
;
iz
<
PME_ORDER
;
iz
++
)
{
int
zindex
=
igrid3
+
iz
;
zindex
-=
(
zindex
>=
GRID_SIZE_Z
?
GRID_SIZE_Z
:
0
);
int
index
=
ybase
+
zindex
;
real4
v
=
theta3
[
iz
];
real
atomCharge
=
pos
.
w
;
real
atomDipoleX
=
xscale
*
labFrameDipole
[
m
*
3
];
real
atomDipoleY
=
yscale
*
labFrameDipole
[
m
*
3
+
1
];
real
atomDipoleZ
=
zscale
*
labFrameDipole
[
m
*
3
+
2
];
real
atomQuadrupoleXX
=
xscale
*
xscale
*
labFrameQuadrupole
[
m
*
5
];
real
atomQuadrupoleXY
=
2
*
xscale
*
yscale
*
labFrameQuadrupole
[
m
*
5
+
1
];
real
atomQuadrupoleXZ
=
2
*
xscale
*
zscale
*
labFrameQuadrupole
[
m
*
5
+
2
];
real
atomQuadrupoleYY
=
yscale
*
yscale
*
labFrameQuadrupole
[
m
*
5
+
3
];
real
atomQuadrupoleYZ
=
2
*
yscale
*
zscale
*
labFrameQuadrupole
[
m
*
5
+
4
];
real
atomQuadrupoleZZ
=
-
zscale
*
zscale
*
(
labFrameQuadrupole
[
m
*
5
]
+
labFrameQuadrupole
[
m
*
5
+
3
]);
real
term0
=
atomCharge
*
u
.
x
*
v
.
x
+
atomDipoleY
*
u
.
y
*
v
.
x
+
atomDipoleZ
*
u
.
x
*
v
.
y
+
atomQuadrupoleYY
*
u
.
z
*
v
.
x
+
atomQuadrupoleZZ
*
u
.
x
*
v
.
z
+
atomQuadrupoleYZ
*
u
.
y
*
v
.
y
;
real
term1
=
atomDipoleX
*
u
.
x
*
v
.
x
+
atomQuadrupoleXY
*
u
.
y
*
v
.
x
+
atomQuadrupoleXZ
*
u
.
x
*
v
.
y
;
real
term2
=
atomQuadrupoleXX
*
u
.
x
*
v
.
x
;
result
+=
term0
*
t
.
x
+
term1
*
t
.
y
+
term2
*
t
.
z
;
}
if
(
z1
>
gridPoint
.
z
)
{
gridIndex1
=
x
*
GRID_SIZE_Y
*
GRID_SIZE_Z
+
y
*
GRID_SIZE_Z
;
gridIndex2
=
x
*
GRID_SIZE_Y
*
GRID_SIZE_Z
+
y
*
GRID_SIZE_Z
+
gridPoint
.
z
;
firstAtom
=
pmeAtomRange
[
gridIndex1
];
lastAtom
=
pmeAtomRange
[
gridIndex2
+
1
];
for
(
int
i
=
firstAtom
;
i
<
lastAtom
;
++
i
)
{
int2
atomData
=
pmeAtomGridIndex
[
i
];
int
atomIndex
=
atomData
.
x
;
int
z
=
atomData
.
y
;
int
iz
=
gridPoint
.
z
-
z
+
(
gridPoint
.
z
>=
z
?
0
:
GRID_SIZE_Z
);
if
(
iz
>=
GRID_SIZE_Z
)
iz
-=
GRID_SIZE_Z
;
real
atomCharge
=
posq
[
atomIndex
].
w
;
real
atomDipoleX
=
xscale
*
labFrameDipole
[
atomIndex
*
3
];
real
atomDipoleY
=
yscale
*
labFrameDipole
[
atomIndex
*
3
+
1
];
real
atomDipoleZ
=
zscale
*
labFrameDipole
[
atomIndex
*
3
+
2
];
real
atomQuadrupoleXX
=
xscale
*
xscale
*
labFrameQuadrupole
[
atomIndex
*
5
];
real
atomQuadrupoleXY
=
2
*
xscale
*
yscale
*
labFrameQuadrupole
[
atomIndex
*
5
+
1
];
real
atomQuadrupoleXZ
=
2
*
xscale
*
zscale
*
labFrameQuadrupole
[
atomIndex
*
5
+
2
];
real
atomQuadrupoleYY
=
yscale
*
yscale
*
labFrameQuadrupole
[
atomIndex
*
5
+
3
];
real
atomQuadrupoleYZ
=
2
*
yscale
*
zscale
*
labFrameQuadrupole
[
atomIndex
*
5
+
4
];
real
atomQuadrupoleZZ
=
-
zscale
*
zscale
*
(
labFrameQuadrupole
[
atomIndex
*
5
]
+
labFrameQuadrupole
[
atomIndex
*
5
+
3
]);
real4
t
=
theta1
[
atomIndex
*
PME_ORDER
+
ix
];
real4
u
=
theta2
[
atomIndex
*
PME_ORDER
+
iy
];
real4
v
=
theta3
[
atomIndex
*
PME_ORDER
+
iz
];
real
term0
=
atomCharge
*
u
.
x
*
v
.
x
+
atomDipoleY
*
u
.
y
*
v
.
x
+
atomDipoleZ
*
u
.
x
*
v
.
y
+
atomQuadrupoleYY
*
u
.
z
*
v
.
x
+
atomQuadrupoleZZ
*
u
.
x
*
v
.
z
+
atomQuadrupoleYZ
*
u
.
y
*
v
.
y
;
real
term1
=
atomDipoleX
*
u
.
x
*
v
.
x
+
atomQuadrupoleXY
*
u
.
y
*
v
.
x
+
atomQuadrupoleXZ
*
u
.
x
*
v
.
y
;
real
term2
=
atomQuadrupoleXX
*
u
.
x
*
v
.
x
;
result
+=
term0
*
t
.
x
+
term1
*
t
.
y
+
term2
*
t
.
z
;
}
real
add
=
term0
*
t
.
x
+
term1
*
t
.
y
+
term2
*
t
.
z
;
#ifdef USE_DOUBLE_PRECISION
unsigned
long
long
*
ulonglong_p
=
(
unsigned
long
long
*
)
pmeGrid
;
atomicAdd
(
&
ulonglong_p
[
2
*
index
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
add
*
0x100000000
)));
#else
atomicAdd
(
&
pmeGrid
[
index
].
x
,
add
);
#endif
}
}
}
pmeGrid
[
gridIndex
]
=
make_real2
(
result
,
0
);
}
}
extern
"C"
__global__
void
gridSpreadInducedDipoles
(
const
real4
*
__restrict__
posq
,
const
real
*
__restrict__
inducedDipole
,
const
real
*
__restrict__
inducedDipolePolar
,
real2
*
__restrict__
pmeGrid
,
int2
*
__restrict__
pmeAtomGridIndex
,
int
*
__restrict__
pmeAtomRange
,
const
real4
*
__restrict__
theta1
,
const
real4
*
__restrict__
theta2
,
const
real4
*
__restrict__
theta3
,
real4
invPeriodicBoxSize
)
{
const
real
*
__restrict__
inducedDipolePolar
,
real2
*
__restrict__
pmeGrid
,
int2
*
__restrict__
pmeAtomGridIndex
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
)
{
const
real
xscale
=
GRID_SIZE_X
*
invPeriodicBoxSize
.
x
;
const
real
yscale
=
GRID_SIZE_Y
*
invPeriodicBoxSize
.
y
;
const
real
zscale
=
GRID_SIZE_Z
*
invPeriodicBoxSize
.
z
;
unsigned
int
numGridPoints
=
GRID_SIZE_X
*
GRID_SIZE_Y
*
GRID_SIZE_Z
;
unsigned
int
numThreads
=
gridDim
.
x
*
blockDim
.
x
;
for
(
int
gridIndex
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
gridIndex
<
numGridPoints
;
gridIndex
+=
numThreads
)
{
int3
gridPoint
;
gridPoint
.
x
=
gridIndex
/
(
GRID_SIZE_Y
*
GRID_SIZE_Z
);
int
remainder
=
gridIndex
-
gridPoint
.
x
*
GRID_SIZE_Y
*
GRID_SIZE_Z
;
gridPoint
.
y
=
remainder
/
GRID_SIZE_Z
;
gridPoint
.
z
=
remainder
-
gridPoint
.
y
*
GRID_SIZE_Z
;
real2
result
=
make_real2
(
0
,
0
);
for
(
int
ix
=
0
;
ix
<
PME_ORDER
;
++
ix
)
{
int
x
=
gridPoint
.
x
-
ix
+
(
gridPoint
.
x
>=
ix
?
0
:
GRID_SIZE_X
);
for
(
int
iy
=
0
;
iy
<
PME_ORDER
;
++
iy
)
{
int
y
=
gridPoint
.
y
-
iy
+
(
gridPoint
.
y
>=
iy
?
0
:
GRID_SIZE_Y
);
int
z1
=
gridPoint
.
z
-
PME_ORDER
+
1
;
z1
+=
(
z1
>=
0
?
0
:
GRID_SIZE_Z
);
int
z2
=
(
z1
<
gridPoint
.
z
?
gridPoint
.
z
:
GRID_SIZE_Z
-
1
);
int
gridIndex1
=
x
*
GRID_SIZE_Y
*
GRID_SIZE_Z
+
y
*
GRID_SIZE_Z
+
z1
;
int
gridIndex2
=
x
*
GRID_SIZE_Y
*
GRID_SIZE_Z
+
y
*
GRID_SIZE_Z
+
z2
;
int
firstAtom
=
pmeAtomRange
[
gridIndex1
];
int
lastAtom
=
pmeAtomRange
[
gridIndex2
+
1
];
for
(
int
i
=
firstAtom
;
i
<
lastAtom
;
++
i
)
{
int2
atomData
=
pmeAtomGridIndex
[
i
];
int
atomIndex
=
atomData
.
x
;
int
z
=
atomData
.
y
;
int
iz
=
gridPoint
.
z
-
z
+
(
gridPoint
.
z
>=
z
?
0
:
GRID_SIZE_Z
);
if
(
iz
>=
GRID_SIZE_Z
)
iz
-=
GRID_SIZE_Z
;
real
inducedDipoleX
=
xscale
*
inducedDipole
[
atomIndex
*
3
];
real
inducedDipoleY
=
yscale
*
inducedDipole
[
atomIndex
*
3
+
1
];
real
inducedDipoleZ
=
zscale
*
inducedDipole
[
atomIndex
*
3
+
2
];
real
inducedDipolePolarX
=
xscale
*
inducedDipolePolar
[
atomIndex
*
3
];
real
inducedDipolePolarY
=
yscale
*
inducedDipolePolar
[
atomIndex
*
3
+
1
];
real
inducedDipolePolarZ
=
zscale
*
inducedDipolePolar
[
atomIndex
*
3
+
2
];
real4
t
=
theta1
[
atomIndex
*
PME_ORDER
+
ix
];
real4
u
=
theta2
[
atomIndex
*
PME_ORDER
+
iy
];
real4
v
=
theta3
[
atomIndex
*
PME_ORDER
+
iz
];
real
array
[
PME_ORDER
*
PME_ORDER
];
real4
theta1
[
PME_ORDER
];
real4
theta2
[
PME_ORDER
];
real4
theta3
[
PME_ORDER
];
// Process the atoms in spatially sorted order. This improves cache performance when loading
// the grid values.
for
(
int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
i
<
NUM_ATOMS
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
m
=
pmeAtomGridIndex
[
i
].
x
;
real4
pos
=
posq
[
m
];
pos
.
x
-=
floor
(
pos
.
x
*
invPeriodicBoxSize
.
x
)
*
periodicBoxSize
.
x
;
pos
.
y
-=
floor
(
pos
.
y
*
invPeriodicBoxSize
.
y
)
*
periodicBoxSize
.
y
;
pos
.
z
-=
floor
(
pos
.
z
*
invPeriodicBoxSize
.
z
)
*
periodicBoxSize
.
z
;
// Since we need the full set of thetas, it's faster to compute them here than load them
// from global memory.
real
w
=
pos
.
x
*
invPeriodicBoxSize
.
x
;
real
fr
=
GRID_SIZE_X
*
(
w
-
(
int
)(
w
+
0.5
f
)
+
0.5
f
);
int
ifr
=
(
int
)
fr
;
w
=
fr
-
ifr
;
int
igrid1
=
ifr
-
PME_ORDER
+
1
;
computeBSplinePoint
(
theta1
,
w
,
array
);
w
=
pos
.
y
*
invPeriodicBoxSize
.
y
;
fr
=
GRID_SIZE_Y
*
(
w
-
(
int
)(
w
+
0.5
f
)
+
0.5
f
);
ifr
=
(
int
)
fr
;
w
=
fr
-
ifr
;
int
igrid2
=
ifr
-
PME_ORDER
+
1
;
computeBSplinePoint
(
theta2
,
w
,
array
);
w
=
pos
.
z
*
invPeriodicBoxSize
.
z
;
fr
=
GRID_SIZE_Z
*
(
w
-
(
int
)(
w
+
0.5
f
)
+
0.5
f
);
ifr
=
(
int
)
fr
;
w
=
fr
-
ifr
;
int
igrid3
=
ifr
-
PME_ORDER
+
1
;
computeBSplinePoint
(
theta3
,
w
,
array
);
igrid1
+=
(
igrid1
<
0
?
GRID_SIZE_X
:
0
);
igrid2
+=
(
igrid2
<
0
?
GRID_SIZE_Y
:
0
);
igrid3
+=
(
igrid3
<
0
?
GRID_SIZE_Z
:
0
);
// Spread the charge from this atom onto each grid point.
for
(
int
ix
=
0
;
ix
<
PME_ORDER
;
ix
++
)
{
int
xbase
=
igrid1
+
ix
;
xbase
-=
(
xbase
>=
GRID_SIZE_X
?
GRID_SIZE_X
:
0
);
xbase
=
xbase
*
GRID_SIZE_Y
*
GRID_SIZE_Z
;
real4
t
=
theta1
[
ix
];
for
(
int
iy
=
0
;
iy
<
PME_ORDER
;
iy
++
)
{
int
ybase
=
igrid2
+
iy
;
ybase
-=
(
ybase
>=
GRID_SIZE_Y
?
GRID_SIZE_Y
:
0
);
ybase
=
xbase
+
ybase
*
GRID_SIZE_Z
;
real4
u
=
theta2
[
iy
];
for
(
int
iz
=
0
;
iz
<
PME_ORDER
;
iz
++
)
{
int
zindex
=
igrid3
+
iz
;
zindex
-=
(
zindex
>=
GRID_SIZE_Z
?
GRID_SIZE_Z
:
0
);
int
index
=
ybase
+
zindex
;
real4
v
=
theta3
[
iz
];
real
inducedDipoleX
=
xscale
*
inducedDipole
[
m
*
3
];
real
inducedDipoleY
=
yscale
*
inducedDipole
[
m
*
3
+
1
];
real
inducedDipoleZ
=
zscale
*
inducedDipole
[
m
*
3
+
2
];
real
inducedDipolePolarX
=
xscale
*
inducedDipolePolar
[
m
*
3
];
real
inducedDipolePolarY
=
yscale
*
inducedDipolePolar
[
m
*
3
+
1
];
real
inducedDipolePolarZ
=
zscale
*
inducedDipolePolar
[
m
*
3
+
2
];
real
term01
=
inducedDipoleY
*
u
.
y
*
v
.
x
+
inducedDipoleZ
*
u
.
x
*
v
.
y
;
real
term11
=
inducedDipoleX
*
u
.
x
*
v
.
x
;
real
term02
=
inducedDipolePolarY
*
u
.
y
*
v
.
x
+
inducedDipolePolarZ
*
u
.
x
*
v
.
y
;
real
term12
=
inducedDipolePolarX
*
u
.
x
*
v
.
x
;
result
.
x
+=
term01
*
t
.
x
+
term11
*
t
.
y
;
result
.
y
+=
term02
*
t
.
x
+
term12
*
t
.
y
;
}
if
(
z1
>
gridPoint
.
z
)
{
gridIndex1
=
x
*
GRID_SIZE_Y
*
GRID_SIZE_Z
+
y
*
GRID_SIZE_Z
;
gridIndex2
=
x
*
GRID_SIZE_Y
*
GRID_SIZE_Z
+
y
*
GRID_SIZE_Z
+
gridPoint
.
z
;
firstAtom
=
pmeAtomRange
[
gridIndex1
];
lastAtom
=
pmeAtomRange
[
gridIndex2
+
1
];
for
(
int
i
=
firstAtom
;
i
<
lastAtom
;
++
i
)
{
int2
atomData
=
pmeAtomGridIndex
[
i
];
int
atomIndex
=
atomData
.
x
;
int
z
=
atomData
.
y
;
int
iz
=
gridPoint
.
z
-
z
+
(
gridPoint
.
z
>=
z
?
0
:
GRID_SIZE_Z
);
if
(
iz
>=
GRID_SIZE_Z
)
iz
-=
GRID_SIZE_Z
;
real
inducedDipoleX
=
xscale
*
inducedDipole
[
atomIndex
*
3
];
real
inducedDipoleY
=
yscale
*
inducedDipole
[
atomIndex
*
3
+
1
];
real
inducedDipoleZ
=
zscale
*
inducedDipole
[
atomIndex
*
3
+
2
];
real
inducedDipolePolarX
=
xscale
*
inducedDipolePolar
[
atomIndex
*
3
];
real
inducedDipolePolarY
=
yscale
*
inducedDipolePolar
[
atomIndex
*
3
+
1
];
real
inducedDipolePolarZ
=
zscale
*
inducedDipolePolar
[
atomIndex
*
3
+
2
];
real4
t
=
theta1
[
atomIndex
*
PME_ORDER
+
ix
];
real4
u
=
theta2
[
atomIndex
*
PME_ORDER
+
iy
];
real4
v
=
theta3
[
atomIndex
*
PME_ORDER
+
iz
];
real
term01
=
inducedDipoleY
*
u
.
y
*
v
.
x
+
inducedDipoleZ
*
u
.
x
*
v
.
y
;
real
term11
=
inducedDipoleX
*
u
.
x
*
v
.
x
;
real
term02
=
inducedDipolePolarY
*
u
.
y
*
v
.
x
+
inducedDipolePolarZ
*
u
.
x
*
v
.
y
;
real
term12
=
inducedDipolePolarX
*
u
.
x
*
v
.
x
;
result
.
x
+=
term01
*
t
.
x
+
term11
*
t
.
y
;
result
.
y
+=
term02
*
t
.
x
+
term12
*
t
.
y
;
}
real
add1
=
term01
*
t
.
x
+
term11
*
t
.
y
;
real
add2
=
term02
*
t
.
x
+
term12
*
t
.
y
;
#ifdef USE_DOUBLE_PRECISION
unsigned
long
long
*
ulonglong_p
=
(
unsigned
long
long
*
)
pmeGrid
;
atomicAdd
(
&
ulonglong_p
[
2
*
index
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
add1
*
0x100000000
)));
atomicAdd
(
&
ulonglong_p
[
2
*
index
+
1
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
add2
*
0x100000000
)));
#else
atomicAdd
(
&
pmeGrid
[
index
].
x
,
add1
);
atomicAdd
(
&
pmeGrid
[
index
].
y
,
add2
);
#endif
}
}
}
pmeGrid
[
gridIndex
]
=
result
;
}
}
/**
 * Convert the accumulated grid from fixed point back to floating point, in place.
 *
 * In double precision the spreading kernels accumulate grid values as 64-bit fixed point
 * numbers (scaled by 0x100000000), so each element is multiplied by 1/0x100000000 and stored
 * back through a real* view of the same buffer.  There are two values per grid point, hence
 * 2*GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z elements in total.
 *
 * NOTE(review): the in-place rewrite assumes sizeof(real) == sizeof(long long), i.e. that this
 * kernel is only launched in double precision builds - confirm against the host code.
 *
 * @param pmeGrid  the grid buffer holding fixed point values on entry and real values on exit.
 */
extern "C" __global__ void finishSpreadCharge(long long* __restrict__ pmeGrid) {
    const unsigned int numValues = 2*GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
    const real fixedToReal = 1/(real) 0x100000000;
    real* realGrid = (real*) pmeGrid;

    // Each thread starts at its global index and advances by the total number of launched threads.
    int element = blockIdx.x*blockDim.x+threadIdx.x;
    while (element < numValues) {
        realGrid[element] = fixedToReal*pmeGrid[element];
        element += blockDim.x*gridDim.x;
    }
}
extern
"C"
__global__
void
reciprocalConvolution
(
real2
*
__restrict__
pmeGrid
,
const
real
*
__restrict__
pmeBsplineModuliX
,
const
real
*
__restrict__
pmeBsplineModuliY
,
const
real
*
__restrict__
pmeBsplineModuliZ
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
)
{
const
unsigned
int
gridSize
=
GRID_SIZE_X
*
GRID_SIZE_Y
*
GRID_SIZE_Z
;
...
...
@@ -372,12 +337,50 @@ extern "C" __global__ void reciprocalConvolution(real2* __restrict__ pmeGrid, co
}
extern
"C"
__global__
void
computeFixedPotentialFromGrid
(
const
real2
*
__restrict__
pmeGrid
,
real
*
__restrict__
phi
,
long
long
*
__restrict__
fieldBuffers
,
long
long
*
__restrict__
fieldPolarBuffers
,
const
int4
*
__restrict__
igrid
,
const
real4
*
__restrict__
theta1
,
const
real4
*
__restrict__
theta2
,
const
real4
*
__restrict__
theta3
,
const
real
*
__restrict__
labFrameDipole
,
real4
invPeriodicBoxSize
)
{
// extract the permanent multipole field at each site
long
long
*
__restrict__
fieldBuffers
,
long
long
*
__restrict__
fieldPolarBuffers
,
const
real4
*
__restrict__
posq
,
const
real
*
__restrict__
labFrameDipole
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
int2
*
__restrict__
pmeAtomGridIndex
)
{
real
array
[
PME_ORDER
*
PME_ORDER
];
real4
theta1
[
PME_ORDER
];
real4
theta2
[
PME_ORDER
];
real4
theta3
[
PME_ORDER
];
// Process the atoms in spatially sorted order. This improves cache performance when loading
// the grid values.
for
(
int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
i
<
NUM_ATOMS
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
m
=
pmeAtomGridIndex
[
i
].
x
;
real4
pos
=
posq
[
m
];
pos
.
x
-=
floor
(
pos
.
x
*
invPeriodicBoxSize
.
x
)
*
periodicBoxSize
.
x
;
pos
.
y
-=
floor
(
pos
.
y
*
invPeriodicBoxSize
.
y
)
*
periodicBoxSize
.
y
;
pos
.
z
-=
floor
(
pos
.
z
*
invPeriodicBoxSize
.
z
)
*
periodicBoxSize
.
z
;
// Since we need the full set of thetas, it's faster to compute them here than load them
// from global memory.
real
w
=
pos
.
x
*
invPeriodicBoxSize
.
x
;
real
fr
=
GRID_SIZE_X
*
(
w
-
(
int
)(
w
+
0.5
f
)
+
0.5
f
);
int
ifr
=
(
int
)
fr
;
w
=
fr
-
ifr
;
int
igrid1
=
ifr
-
PME_ORDER
+
1
;
computeBSplinePoint
(
theta1
,
w
,
array
);
w
=
pos
.
y
*
invPeriodicBoxSize
.
y
;
fr
=
GRID_SIZE_Y
*
(
w
-
(
int
)(
w
+
0.5
f
)
+
0.5
f
);
ifr
=
(
int
)
fr
;
w
=
fr
-
ifr
;
int
igrid2
=
ifr
-
PME_ORDER
+
1
;
computeBSplinePoint
(
theta2
,
w
,
array
);
w
=
pos
.
z
*
invPeriodicBoxSize
.
z
;
fr
=
GRID_SIZE_Z
*
(
w
-
(
int
)(
w
+
0.5
f
)
+
0.5
f
);
ifr
=
(
int
)
fr
;
w
=
fr
-
ifr
;
int
igrid3
=
ifr
-
PME_ORDER
+
1
;
computeBSplinePoint
(
theta3
,
w
,
array
);
igrid1
+=
(
igrid1
<
0
?
GRID_SIZE_X
:
0
);
igrid2
+=
(
igrid2
<
0
?
GRID_SIZE_Y
:
0
);
igrid3
+=
(
igrid3
<
0
?
GRID_SIZE_Z
:
0
);
// Compute the potential from this grid point.
for
(
int
m
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
m
<
NUM_ATOMS
;
m
+=
blockDim
.
x
*
gridDim
.
x
)
{
int4
gridPoint
=
igrid
[
m
];
real
tuv000
=
0
;
real
tuv001
=
0
;
real
tuv010
=
0
;
...
...
@@ -399,8 +402,8 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict
real
tuv012
=
0
;
real
tuv111
=
0
;
for
(
int
iz
=
0
;
iz
<
PME_ORDER
;
iz
++
)
{
int
k
=
grid
Point
.
z
+
iz
-
(
grid
Point
.
z
+
iz
>=
GRID_SIZE_Z
?
GRID_SIZE_Z
:
0
);
real4
v
=
theta3
[
m
*
PME_ORDER
+
iz
];
int
k
=
i
grid
3
+
iz
-
(
i
grid
3
+
iz
>=
GRID_SIZE_Z
?
GRID_SIZE_Z
:
0
);
real4
v
=
theta3
[
iz
];
real
tu00
=
0
;
real
tu10
=
0
;
real
tu01
=
0
;
...
...
@@ -412,14 +415,14 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict
real
tu12
=
0
;
real
tu03
=
0
;
for
(
int
iy
=
0
;
iy
<
PME_ORDER
;
iy
++
)
{
int
j
=
grid
Point
.
y
+
iy
-
(
grid
Point
.
y
+
iy
>=
GRID_SIZE_Y
?
GRID_SIZE_Y
:
0
);
real4
u
=
theta2
[
m
*
PME_ORDER
+
iy
];
int
j
=
i
grid
2
+
iy
-
(
i
grid
2
+
iy
>=
GRID_SIZE_Y
?
GRID_SIZE_Y
:
0
);
real4
u
=
theta2
[
iy
];
real4
t
=
make_real4
(
0
,
0
,
0
,
0
);
for
(
int
ix
=
0
;
ix
<
PME_ORDER
;
ix
++
)
{
int
i
=
grid
Point
.
x
+
ix
-
(
grid
Point
.
x
+
ix
>=
GRID_SIZE_X
?
GRID_SIZE_X
:
0
);
int
i
=
i
grid
1
+
ix
-
(
i
grid
1
+
ix
>=
GRID_SIZE_X
?
GRID_SIZE_X
:
0
);
int
gridIndex
=
i
*
GRID_SIZE_Y
*
GRID_SIZE_Z
+
j
*
GRID_SIZE_Z
+
k
;
real
tq
=
pmeGrid
[
gridIndex
].
x
;
real4
tadd
=
theta1
[
m
*
PME_ORDER
+
ix
];
real4
tadd
=
theta1
[
ix
];
t
.
x
+=
tq
*
tadd
.
x
;
t
.
y
+=
tq
*
tadd
.
y
;
t
.
z
+=
tq
*
tadd
.
z
;
...
...
@@ -491,12 +494,50 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict
}
extern
"C"
__global__
void
computeInducedPotentialFromGrid
(
const
real2
*
__restrict__
pmeGrid
,
real
*
__restrict__
phid
,
real
*
__restrict__
phip
,
real
*
__restrict__
phidp
,
const
int4
*
__restrict__
igrid
,
const
real4
*
__restrict__
theta1
,
const
real4
*
__restrict__
theta2
,
const
real4
*
__restrict__
theta3
,
real4
invPeriodicBoxSize
)
{
// extract the induced dipole field at each site
real
*
__restrict__
phip
,
real
*
__restrict__
phidp
,
const
real4
*
__restrict__
posq
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
int2
*
__restrict__
pmeAtomGridIndex
)
{
real
array
[
PME_ORDER
*
PME_ORDER
];
real4
theta1
[
PME_ORDER
];
real4
theta2
[
PME_ORDER
];
real4
theta3
[
PME_ORDER
];
// Process the atoms in spatially sorted order. This improves cache performance when loading
// the grid values.
for
(
int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
i
<
NUM_ATOMS
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
m
=
pmeAtomGridIndex
[
i
].
x
;
real4
pos
=
posq
[
m
];
pos
.
x
-=
floor
(
pos
.
x
*
invPeriodicBoxSize
.
x
)
*
periodicBoxSize
.
x
;
pos
.
y
-=
floor
(
pos
.
y
*
invPeriodicBoxSize
.
y
)
*
periodicBoxSize
.
y
;
pos
.
z
-=
floor
(
pos
.
z
*
invPeriodicBoxSize
.
z
)
*
periodicBoxSize
.
z
;
// Since we need the full set of thetas, it's faster to compute them here than load them
// from global memory.
real
w
=
pos
.
x
*
invPeriodicBoxSize
.
x
;
real
fr
=
GRID_SIZE_X
*
(
w
-
(
int
)(
w
+
0.5
f
)
+
0.5
f
);
int
ifr
=
(
int
)
fr
;
w
=
fr
-
ifr
;
int
igrid1
=
ifr
-
PME_ORDER
+
1
;
computeBSplinePoint
(
theta1
,
w
,
array
);
w
=
pos
.
y
*
invPeriodicBoxSize
.
y
;
fr
=
GRID_SIZE_Y
*
(
w
-
(
int
)(
w
+
0.5
f
)
+
0.5
f
);
ifr
=
(
int
)
fr
;
w
=
fr
-
ifr
;
int
igrid2
=
ifr
-
PME_ORDER
+
1
;
computeBSplinePoint
(
theta2
,
w
,
array
);
w
=
pos
.
z
*
invPeriodicBoxSize
.
z
;
fr
=
GRID_SIZE_Z
*
(
w
-
(
int
)(
w
+
0.5
f
)
+
0.5
f
);
ifr
=
(
int
)
fr
;
w
=
fr
-
ifr
;
int
igrid3
=
ifr
-
PME_ORDER
+
1
;
computeBSplinePoint
(
theta3
,
w
,
array
);
igrid1
+=
(
igrid1
<
0
?
GRID_SIZE_X
:
0
);
igrid2
+=
(
igrid2
<
0
?
GRID_SIZE_Y
:
0
);
igrid3
+=
(
igrid3
<
0
?
GRID_SIZE_Z
:
0
);
// Compute the potential from this grid point.
for
(
int
m
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
m
<
NUM_ATOMS
;
m
+=
blockDim
.
x
*
gridDim
.
x
)
{
int4
gridPoint
=
igrid
[
m
];
real
tuv100_1
=
0
;
real
tuv010_1
=
0
;
real
tuv001_1
=
0
;
...
...
@@ -536,8 +577,8 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri
real
tuv012
=
0
;
real
tuv111
=
0
;
for
(
int
iz
=
0
;
iz
<
PME_ORDER
;
iz
++
)
{
int
k
=
grid
Point
.
z
+
iz
-
(
grid
Point
.
z
+
iz
>=
GRID_SIZE_Z
?
GRID_SIZE_Z
:
0
);
real4
v
=
theta3
[
m
*
PME_ORDER
+
iz
];
int
k
=
i
grid
3
+
iz
-
(
i
grid
3
+
iz
>=
GRID_SIZE_Z
?
GRID_SIZE_Z
:
0
);
real4
v
=
theta3
[
iz
];
real
tu00_1
=
0
;
real
tu01_1
=
0
;
real
tu10_1
=
0
;
...
...
@@ -561,8 +602,8 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri
real
tu12
=
0
;
real
tu03
=
0
;
for
(
int
iy
=
0
;
iy
<
PME_ORDER
;
iy
++
)
{
int
j
=
grid
Point
.
y
+
iy
-
(
grid
Point
.
y
+
iy
>=
GRID_SIZE_Y
?
GRID_SIZE_Y
:
0
);
real4
u
=
theta2
[
m
*
PME_ORDER
+
iy
];
int
j
=
i
grid
2
+
iy
-
(
i
grid
2
+
iy
>=
GRID_SIZE_Y
?
GRID_SIZE_Y
:
0
);
real4
u
=
theta2
[
iy
];
real
t0_1
=
0
;
real
t1_1
=
0
;
real
t2_1
=
0
;
...
...
@@ -571,10 +612,10 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri
real
t2_2
=
0
;
real
t3
=
0
;
for
(
int
ix
=
0
;
ix
<
PME_ORDER
;
ix
++
)
{
int
i
=
grid
Point
.
x
+
ix
-
(
grid
Point
.
x
+
ix
>=
GRID_SIZE_X
?
GRID_SIZE_X
:
0
);
int
i
=
i
grid
1
+
ix
-
(
i
grid
1
+
ix
>=
GRID_SIZE_X
?
GRID_SIZE_X
:
0
);
int
gridIndex
=
i
*
GRID_SIZE_Y
*
GRID_SIZE_Z
+
j
*
GRID_SIZE_Z
+
k
;
real2
tq
=
pmeGrid
[
gridIndex
];
real4
tadd
=
theta1
[
m
*
PME_ORDER
+
ix
];
real4
tadd
=
theta1
[
ix
];
t0_1
+=
tq
.
x
*
tadd
.
x
;
t1_1
+=
tq
.
x
*
tadd
.
y
;
t2_1
+=
tq
.
x
*
tadd
.
z
;
...
...
plugins/amoeba/platforms/cuda/src/kernels/pmeMultipoleElectrostatics.cu
View file @
93c467b2
#define TILE_SIZE 32
#define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)
typedef
struct
{
...
...
@@ -182,253 +181,223 @@ __device__ void computeSelfEnergyAndTorque(AtomData& atom1, real& energy) {
*/
extern
"C"
__global__
void
computeElectrostatics
(
unsigned
long
long
*
__restrict__
forceBuffers
,
unsigned
long
long
*
__restrict__
torqueBuffers
,
real
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
posq
,
const
u
nsigned
int
*
__restrict__
exclusionIndice
s
,
const
unsigned
int
*
__restrict__
exclusionRowIndice
s
,
const
u
in
t2
*
__restrict__
covalentFlags
,
const
unsigned
int
*
__restrict__
polarizationGroupFlag
s
,
unsigned
int
startTileIndex
,
unsigned
int
numTileIndices
,
const
real4
*
__restrict__
posq
,
const
uint
2
*
__restrict__
covalentFlag
s
,
const
unsigned
int
*
__restrict__
polarizationGroupFlag
s
,
const
u
shor
t2
*
__restrict__
exclusionTile
s
,
unsigned
int
startTileIndex
,
unsigned
int
numTileIndices
,
#ifdef USE_CUTOFF
const
ushort2
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
unsigned
int
maxTiles
,
const
unsigned
int
*
__restrict__
interacti
onFlag
s
,
const
ushort2
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
unsigned
int
maxTiles
,
const
real4
*
__restrict__
blockCenter
,
const
unsigned
int
*
__restrict__
interacti
ngAtom
s
,
#endif
const
real
*
__restrict__
labFrameDipole
,
const
real
*
__restrict__
labFrameQuadrupole
,
const
real
*
__restrict__
inducedDipole
,
const
real
*
__restrict__
inducedDipolePolar
,
const
float2
*
__restrict__
dampingAndThole
)
{
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
TILE_SIZE
;
const
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
const
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
TILE_SIZE
;
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
real
energy
=
0
;
__shared__
AtomData
localData
[
THREAD_BLOCK_SIZE
];
// First loop: process tiles that contain exclusions.
const
unsigned
int
firstExclusionTile
=
FIRST_EXCLUSION_TILE
+
warp
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
totalWarps
;
const
unsigned
int
lastExclusionTile
=
FIRST_EXCLUSION_TILE
+
(
warp
+
1
)
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
totalWarps
;
for
(
int
pos
=
firstExclusionTile
;
pos
<
lastExclusionTile
;
pos
++
)
{
const
ushort2
tileIndices
=
exclusionTiles
[
pos
];
const
unsigned
int
x
=
tileIndices
.
x
;
const
unsigned
int
y
=
tileIndices
.
y
;
AtomData
data
;
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
loadAtomData
(
data
,
atom1
,
posq
,
labFrameDipole
,
labFrameQuadrupole
,
inducedDipole
,
inducedDipolePolar
,
dampingAndThole
);
data
.
force
=
make_real3
(
0
);
data
.
torque
=
make_real3
(
0
);
uint2
covalent
=
covalentFlags
[
pos
*
TILE_SIZE
+
tgx
];
unsigned
int
polarizationGroup
=
polarizationGroupFlags
[
pos
*
TILE_SIZE
+
tgx
];
if
(
x
==
y
)
{
// This tile is on the diagonal.
localData
[
threadIdx
.
x
].
pos
=
data
.
pos
;
localData
[
threadIdx
.
x
].
q
=
data
.
q
;
localData
[
threadIdx
.
x
].
dipole
=
data
.
dipole
;
localData
[
threadIdx
.
x
].
quadrupoleXX
=
data
.
quadrupoleXX
;
localData
[
threadIdx
.
x
].
quadrupoleXY
=
data
.
quadrupoleXY
;
localData
[
threadIdx
.
x
].
quadrupoleXZ
=
data
.
quadrupoleXZ
;
localData
[
threadIdx
.
x
].
quadrupoleYY
=
data
.
quadrupoleYY
;
localData
[
threadIdx
.
x
].
quadrupoleYZ
=
data
.
quadrupoleYZ
;
localData
[
threadIdx
.
x
].
inducedDipole
=
data
.
inducedDipole
;
localData
[
threadIdx
.
x
].
inducedDipolePolar
=
data
.
inducedDipolePolar
;
localData
[
threadIdx
.
x
].
thole
=
data
.
thole
;
localData
[
threadIdx
.
x
].
damp
=
data
.
damp
;
// Compute forces.
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
y
*
TILE_SIZE
+
j
;
if
(
atom1
!=
atom2
&&
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
float
d
=
computeDScaleFactor
(
polarizationGroup
,
j
);
float
p
=
computePScaleFactor
(
covalent
,
polarizationGroup
,
j
);
float
m
=
computeMScaleFactor
(
covalent
,
j
);
computeOneInteraction
(
data
,
localData
[
tbx
+
j
],
true
,
d
,
p
,
m
,
0.5
f
,
energy
,
periodicBoxSize
,
invPeriodicBoxSize
);
}
}
if
(
atom1
<
NUM_ATOMS
)
computeSelfEnergyAndTorque
(
data
,
energy
);
data
.
force
*=
-
ENERGY_SCALE_FACTOR
;
data
.
torque
*=
ENERGY_SCALE_FACTOR
;
atomicAdd
(
&
forceBuffers
[
atom1
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom1
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom1
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
z
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
atom1
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
torque
.
x
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
atom1
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
torque
.
y
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
atom1
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
torque
.
z
*
0x100000000
)));
}
else
{
// This is an off-diagonal tile.
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
loadAtomData
(
localData
[
threadIdx
.
x
],
j
,
posq
,
labFrameDipole
,
labFrameQuadrupole
,
inducedDipole
,
inducedDipolePolar
,
dampingAndThole
);
localData
[
threadIdx
.
x
].
force
=
make_real3
(
0
);
localData
[
threadIdx
.
x
].
torque
=
make_real3
(
0
);
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
y
*
TILE_SIZE
+
tj
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
float
d
=
computeDScaleFactor
(
polarizationGroup
,
tj
);
float
p
=
computePScaleFactor
(
covalent
,
polarizationGroup
,
tj
);
float
m
=
computeMScaleFactor
(
covalent
,
tj
);
computeOneInteraction
(
data
,
localData
[
tbx
+
tj
],
true
,
d
,
p
,
m
,
1
,
energy
,
periodicBoxSize
,
invPeriodicBoxSize
);
}
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
data
.
force
*=
-
ENERGY_SCALE_FACTOR
;
data
.
torque
*=
ENERGY_SCALE_FACTOR
;
localData
[
threadIdx
.
x
].
force
*=
-
ENERGY_SCALE_FACTOR
;
localData
[
threadIdx
.
x
].
torque
*=
ENERGY_SCALE_FACTOR
;
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
z
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
torque
.
x
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
torque
.
y
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
torque
.
z
*
0x100000000
)));
offset
=
y
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
z
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
torque
.
x
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
torque
.
y
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
torque
.
z
*
0x100000000
)));
}
}
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
// of them (no cutoff).
#ifdef USE_CUTOFF
const
unsigned
int
numTiles
=
interactionCount
[
0
];
unsigned
int
pos
=
(
numTiles
>
maxTiles
?
startTileIndex
+
warp
*
numTileIndices
/
totalWarps
:
warp
*
numTiles
/
totalWarps
);
unsigned
int
end
=
(
numTiles
>
maxTiles
?
startTileIndex
+
(
warp
+
1
)
*
numTileIndices
/
totalWarps
:
(
warp
+
1
)
*
numTiles
/
totalWarps
);
int
pos
=
(
numTiles
>
maxTiles
?
startTileIndex
+
warp
*
numTileIndices
/
totalWarps
:
warp
*
numTiles
/
totalWarps
);
int
end
=
(
numTiles
>
maxTiles
?
startTileIndex
+
(
warp
+
1
)
*
numTileIndices
/
totalWarps
:
(
warp
+
1
)
*
numTiles
/
totalWarps
);
#else
const
unsigned
int
numTiles
=
numTileIndices
;
unsigned
int
pos
=
startTileIndex
+
warp
*
numTiles
/
totalWarps
;
unsigned
int
end
=
startTileIndex
+
(
warp
+
1
)
*
numTiles
/
totalWarps
;
#endif
real
energy
=
0
;
__shared__
AtomData
localData
[
THREAD_BLOCK_SIZE
];
__shared__
unsigned
int
exclusionRange
[
2
*
WARPS_PER_GROUP
];
__shared__
int
exclusionIndex
[
WARPS_PER_GROUP
];
#ifndef ENABLE_SHUFFLE
__shared__
real
tempBuffer
[
3
*
THREAD_BLOCK_SIZE
];
int
pos
=
startTileIndex
+
warp
*
numTiles
/
totalWarps
;
int
end
=
startTileIndex
+
(
warp
+
1
)
*
numTiles
/
totalWarps
;
#endif
int
skipBase
=
0
;
int
currentSkipIndex
=
tbx
;
__shared__
int
atomIndices
[
THREAD_BLOCK_SIZE
];
__shared__
int
skipTiles
[
THREAD_BLOCK_SIZE
];
skipTiles
[
threadIdx
.
x
]
=
-
1
;
do
{
// Extract the coordinates of this tile
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
const
unsigned
int
localGroupIndex
=
threadIdx
.
x
/
TILE_SIZE
;
while
(
pos
<
end
)
{
bool
includeTile
=
true
;
// Extract the coordinates of this tile.
unsigned
int
x
,
y
;
AtomData
data
;
if
(
pos
<
end
)
{
#ifdef USE_CUTOFF
if
(
numTiles
<=
maxTiles
)
{
ushort2
tileIndices
=
tiles
[
pos
];
x
=
tileIndices
.
x
;
y
=
tileIndices
.
y
;
}
else
if
(
numTiles
<=
maxTiles
)
{
ushort2
tileIndices
=
tiles
[
pos
];
x
=
tileIndices
.
x
;
}
else
#endif
{
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS
+
0.5
f
-
SQRT
((
NUM_BLOCKS
+
0.5
f
)
*
(
NUM_BLOCKS
+
0.5
f
)
-
2
*
pos
));
{
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS
+
0.5
f
-
SQRT
((
NUM_BLOCKS
+
0.5
f
)
*
(
NUM_BLOCKS
+
0.5
f
)
-
2
*
pos
));
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
if
(
x
<
y
||
x
>=
NUM_BLOCKS
)
{
// Occasionally happens due to roundoff error.
y
+=
(
x
<
y
?
-
1
:
1
);
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
if
(
x
<
y
||
x
>=
NUM_BLOCKS
)
{
// Occasionally happens due to roundoff error.
y
+=
(
x
<
y
?
-
1
:
1
);
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
}
// Skip over tiles that have exclusions, since they were already processed.
while
(
skipTiles
[
tbx
+
TILE_SIZE
-
1
]
<
pos
)
{
if
(
skipBase
+
tgx
<
NUM_TILES_WITH_EXCLUSIONS
)
{
ushort2
tile
=
exclusionTiles
[
skipBase
+
tgx
];
skipTiles
[
threadIdx
.
x
]
=
tile
.
x
+
tile
.
y
*
NUM_BLOCKS
-
tile
.
y
*
(
tile
.
y
+
1
)
/
2
;
}
else
skipTiles
[
threadIdx
.
x
]
=
end
;
skipBase
+=
TILE_SIZE
;
currentSkipIndex
=
tbx
;
}
while
(
skipTiles
[
currentSkipIndex
]
<
pos
)
currentSkipIndex
++
;
includeTile
=
(
skipTiles
[
currentSkipIndex
]
!=
pos
);
}
if
(
includeTile
)
{
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
// Load atom data for this tile.
AtomData
data
;
loadAtomData
(
data
,
atom1
,
posq
,
labFrameDipole
,
labFrameQuadrupole
,
inducedDipole
,
inducedDipolePolar
,
dampingAndThole
);
data
.
force
=
make_real3
(
0
);
data
.
torque
=
make_real3
(
0
);
// Locate the exclusion data for this tile.
if
(
tgx
<
2
)
exclusionRange
[
2
*
localGroupIndex
+
tgx
]
=
exclusionRowIndices
[
x
+
tgx
];
if
(
tgx
==
0
)
exclusionIndex
[
localGroupIndex
]
=
-
1
;
for
(
unsigned
int
i
=
exclusionRange
[
2
*
localGroupIndex
]
+
tgx
;
i
<
exclusionRange
[
2
*
localGroupIndex
+
1
];
i
+=
TILE_SIZE
)
if
(
exclusionIndices
[
i
]
==
y
)
exclusionIndex
[
localGroupIndex
]
=
i
*
TILE_SIZE
;
bool
hasExclusions
=
(
exclusionIndex
[
localGroupIndex
]
>
-
1
);
if
(
pos
>=
end
)
;
// This warp is done.
else
if
(
x
==
y
)
{
// This tile is on the diagonal.
localData
[
threadIdx
.
x
].
pos
=
data
.
pos
;
localData
[
threadIdx
.
x
].
q
=
data
.
q
;
localData
[
threadIdx
.
x
].
dipole
=
data
.
dipole
;
localData
[
threadIdx
.
x
].
quadrupoleXX
=
data
.
quadrupoleXX
;
localData
[
threadIdx
.
x
].
quadrupoleXY
=
data
.
quadrupoleXY
;
localData
[
threadIdx
.
x
].
quadrupoleXZ
=
data
.
quadrupoleXZ
;
localData
[
threadIdx
.
x
].
quadrupoleYY
=
data
.
quadrupoleYY
;
localData
[
threadIdx
.
x
].
quadrupoleYZ
=
data
.
quadrupoleYZ
;
localData
[
threadIdx
.
x
].
inducedDipole
=
data
.
inducedDipole
;
localData
[
threadIdx
.
x
].
inducedDipolePolar
=
data
.
inducedDipolePolar
;
localData
[
threadIdx
.
x
].
thole
=
data
.
thole
;
localData
[
threadIdx
.
x
].
damp
=
data
.
damp
;
uint2
covalent
=
covalentFlags
[
exclusionIndex
[
localGroupIndex
]
+
tgx
];
unsigned
int
polarizationGroup
=
polarizationGroupFlags
[
exclusionIndex
[
localGroupIndex
]
+
tgx
];
// Compute forces.
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
y
*
TILE_SIZE
+
j
;
if
(
atom1
!=
atom2
&&
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
float
d
=
computeDScaleFactor
(
polarizationGroup
,
j
);
float
p
=
computePScaleFactor
(
covalent
,
polarizationGroup
,
j
);
float
m
=
computeMScaleFactor
(
covalent
,
j
);
computeOneInteraction
(
data
,
localData
[
tbx
+
j
],
hasExclusions
,
d
,
p
,
m
,
0.5
f
,
energy
,
periodicBoxSize
,
invPeriodicBoxSize
);
}
}
if
(
atom1
<
NUM_ATOMS
)
computeSelfEnergyAndTorque
(
data
,
energy
);
data
.
force
*=
-
ENERGY_SCALE_FACTOR
;
data
.
torque
*=
ENERGY_SCALE_FACTOR
;
atomicAdd
(
&
forceBuffers
[
atom1
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom1
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom1
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
z
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
atom1
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
torque
.
x
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
atom1
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
torque
.
y
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
atom1
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
torque
.
z
*
0x100000000
)));
}
else
{
// This is an off-diagonal tile.
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
loadAtomData
(
localData
[
threadIdx
.
x
],
j
,
posq
,
labFrameDipole
,
labFrameQuadrupole
,
inducedDipole
,
inducedDipolePolar
,
dampingAndThole
);
localData
[
threadIdx
.
x
].
force
=
make_real3
(
0
);
localData
[
threadIdx
.
x
].
torque
=
make_real3
(
0
);
#ifdef USE_CUTOFF
unsigned
int
flags
=
(
numTiles
<=
maxTiles
?
interactionFlags
[
pos
]
:
0xFFFFFFFF
);
if
(
!
hasExclusions
&&
flags
==
0
)
{
// TODO: Why doesn't the flags != 0 block work?
// if (!hasExclusions && flags != 0xFFFFFFFF) {
if
(
flags
==
0
)
{
// No interactions in this tile.
}
else
{
// Compute only a subset of the interactions in this tile.
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
if
((
flags
&
(
1
<<
j
))
!=
0
)
{
int
atom2
=
tbx
+
j
;
real3
oldForce
=
localData
[
atom2
].
force
;
real3
oldTorque
=
localData
[
atom2
].
torque
;
localData
[
atom2
].
force
=
make_real3
(
0
);
localData
[
atom2
].
torque
=
make_real3
(
0
);
computeOneInteraction
(
data
,
localData
[
tbx
+
j
],
false
,
1
,
1
,
1
,
1
,
energy
,
periodicBoxSize
,
invPeriodicBoxSize
);
real3
newForce
=
localData
[
atom2
].
force
;
real3
newTorque
=
localData
[
atom2
].
torque
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
#ifdef ENABLE_SHUFFLE
for
(
int
i
=
16
;
i
>=
1
;
i
/=
2
)
{
newForce
.
x
+=
__shfl_xor
(
newForce
.
x
,
i
,
32
);
newForce
.
y
+=
__shfl_xor
(
newForce
.
y
,
i
,
32
);
newForce
.
z
+=
__shfl_xor
(
newForce
.
z
,
i
,
32
);
newTorque
.
x
+=
__shfl_xor
(
newTorque
.
x
,
i
,
32
);
newTorque
.
y
+=
__shfl_xor
(
newTorque
.
y
,
i
,
32
);
newTorque
.
z
+=
__shfl_xor
(
newTorque
.
z
,
i
,
32
);
}
if
(
tgx
==
0
)
{
localData
[
atom2
].
force
-=
newForce
;
localData
[
atom2
].
torque
-=
newTorque
;
}
unsigned
int
j
=
(
numTiles
<=
maxTiles
?
interactingAtoms
[
pos
*
TILE_SIZE
+
tgx
]
:
y
*
TILE_SIZE
+
tgx
);
#else
int
bufferIndex
=
3
*
threadIdx
.
x
;
tempBuffer
[
bufferIndex
]
=
newForce
.
x
;
tempBuffer
[
bufferIndex
+
1
]
=
newForce
.
y
;
tempBuffer
[
bufferIndex
+
2
]
=
newForce
.
z
;
if
(
tgx
%
4
==
0
)
{
tempBuffer
[
bufferIndex
]
+=
tempBuffer
[
bufferIndex
+
3
]
+
tempBuffer
[
bufferIndex
+
6
]
+
tempBuffer
[
bufferIndex
+
9
];
tempBuffer
[
bufferIndex
+
1
]
+=
tempBuffer
[
bufferIndex
+
4
]
+
tempBuffer
[
bufferIndex
+
7
]
+
tempBuffer
[
bufferIndex
+
10
];
tempBuffer
[
bufferIndex
+
2
]
+=
tempBuffer
[
bufferIndex
+
5
]
+
tempBuffer
[
bufferIndex
+
8
]
+
tempBuffer
[
bufferIndex
+
11
];
}
if
(
tgx
==
0
)
{
localData
[
atom2
].
force
.
x
-=
tempBuffer
[
bufferIndex
]
+
tempBuffer
[
bufferIndex
+
12
]
+
tempBuffer
[
bufferIndex
+
24
]
+
tempBuffer
[
bufferIndex
+
36
]
+
tempBuffer
[
bufferIndex
+
48
]
+
tempBuffer
[
bufferIndex
+
60
]
+
tempBuffer
[
bufferIndex
+
72
]
+
tempBuffer
[
bufferIndex
+
84
];
localData
[
atom2
].
force
.
y
-=
tempBuffer
[
bufferIndex
+
1
]
+
tempBuffer
[
bufferIndex
+
13
]
+
tempBuffer
[
bufferIndex
+
25
]
+
tempBuffer
[
bufferIndex
+
37
]
+
tempBuffer
[
bufferIndex
+
49
]
+
tempBuffer
[
bufferIndex
+
61
]
+
tempBuffer
[
bufferIndex
+
73
]
+
tempBuffer
[
bufferIndex
+
85
];
localData
[
atom2
].
force
.
z
-=
tempBuffer
[
bufferIndex
+
2
]
+
tempBuffer
[
bufferIndex
+
14
]
+
tempBuffer
[
bufferIndex
+
26
]
+
tempBuffer
[
bufferIndex
+
38
]
+
tempBuffer
[
bufferIndex
+
50
]
+
tempBuffer
[
bufferIndex
+
62
]
+
tempBuffer
[
bufferIndex
+
74
]
+
tempBuffer
[
bufferIndex
+
86
];
}
tempBuffer
[
bufferIndex
]
=
newTorque
.
x
;
tempBuffer
[
bufferIndex
+
1
]
=
newTorque
.
y
;
tempBuffer
[
bufferIndex
+
2
]
=
newTorque
.
z
;
if
(
tgx
%
4
==
0
)
{
tempBuffer
[
bufferIndex
]
+=
tempBuffer
[
bufferIndex
+
3
]
+
tempBuffer
[
bufferIndex
+
6
]
+
tempBuffer
[
bufferIndex
+
9
];
tempBuffer
[
bufferIndex
+
1
]
+=
tempBuffer
[
bufferIndex
+
4
]
+
tempBuffer
[
bufferIndex
+
7
]
+
tempBuffer
[
bufferIndex
+
10
];
tempBuffer
[
bufferIndex
+
2
]
+=
tempBuffer
[
bufferIndex
+
5
]
+
tempBuffer
[
bufferIndex
+
8
]
+
tempBuffer
[
bufferIndex
+
11
];
}
if
(
tgx
==
0
)
{
localData
[
atom2
].
torque
.
x
-=
tempBuffer
[
bufferIndex
]
+
tempBuffer
[
bufferIndex
+
12
]
+
tempBuffer
[
bufferIndex
+
24
]
+
tempBuffer
[
bufferIndex
+
36
]
+
tempBuffer
[
bufferIndex
+
48
]
+
tempBuffer
[
bufferIndex
+
60
]
+
tempBuffer
[
bufferIndex
+
72
]
+
tempBuffer
[
bufferIndex
+
84
];
localData
[
atom2
].
torque
.
y
-=
tempBuffer
[
bufferIndex
+
1
]
+
tempBuffer
[
bufferIndex
+
13
]
+
tempBuffer
[
bufferIndex
+
25
]
+
tempBuffer
[
bufferIndex
+
37
]
+
tempBuffer
[
bufferIndex
+
49
]
+
tempBuffer
[
bufferIndex
+
61
]
+
tempBuffer
[
bufferIndex
+
73
]
+
tempBuffer
[
bufferIndex
+
85
];
localData
[
atom2
].
torque
.
z
-=
tempBuffer
[
bufferIndex
+
2
]
+
tempBuffer
[
bufferIndex
+
14
]
+
tempBuffer
[
bufferIndex
+
26
]
+
tempBuffer
[
bufferIndex
+
38
]
+
tempBuffer
[
bufferIndex
+
50
]
+
tempBuffer
[
bufferIndex
+
62
]
+
tempBuffer
[
bufferIndex
+
74
]
+
tempBuffer
[
bufferIndex
+
86
];
}
#endif
}
}
}
data
.
force
*=
-
ENERGY_SCALE_FACTOR
;
data
.
torque
*=
-
ENERGY_SCALE_FACTOR
;
localData
[
threadIdx
.
x
].
force
*=
-
ENERGY_SCALE_FACTOR
;
localData
[
threadIdx
.
x
].
torque
*=
-
ENERGY_SCALE_FACTOR
;
if
(
pos
<
end
)
{
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
z
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
torque
.
x
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
torque
.
y
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
torque
.
z
*
0x100000000
)));
offset
=
y
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
z
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
torque
.
x
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
torque
.
y
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
torque
.
z
*
0x100000000
)));
}
}
}
else
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
#endif
{
// Compute the full set of interactions in this tile.
uint2
covalent
=
(
hasExclusions
?
covalentFlags
[
exclusionIndex
[
localGroupIndex
]
+
tgx
]
:
make_uint2
(
0
,
0
));
unsigned
int
polarizationGroup
=
(
hasExclusions
?
polarizationGroupFlags
[
exclusionIndex
[
localGroupIndex
]
+
tgx
]
:
0
);
// Compute forces.
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
y
*
TILE_SIZE
+
tj
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
float
d
=
computeDScaleFactor
(
polarizationGroup
,
tj
);
float
p
=
computePScaleFactor
(
covalent
,
polarizationGroup
,
tj
);
float
m
=
computeMScaleFactor
(
covalent
,
tj
);
computeOneInteraction
(
data
,
localData
[
tbx
+
tj
],
hasExclusions
,
d
,
p
,
m
,
1
,
energy
,
periodicBoxSize
,
invPeriodicBoxSize
);
}
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
data
.
force
*=
-
ENERGY_SCALE_FACTOR
;
data
.
torque
*=
ENERGY_SCALE_FACTOR
;
localData
[
threadIdx
.
x
].
force
*=
-
ENERGY_SCALE_FACTOR
;
localData
[
threadIdx
.
x
].
torque
*=
ENERGY_SCALE_FACTOR
;
if
(
pos
<
end
)
{
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
z
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
torque
.
x
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
torque
.
y
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
torque
.
z
*
0x100000000
)));
offset
=
y
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
z
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
torque
.
x
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
torque
.
y
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
torque
.
z
*
0x100000000
)));
}
atomIndices
[
threadIdx
.
x
]
=
j
;
loadAtomData
(
localData
[
threadIdx
.
x
],
j
,
posq
,
labFrameDipole
,
labFrameQuadrupole
,
inducedDipole
,
inducedDipolePolar
,
dampingAndThole
);
localData
[
threadIdx
.
x
].
force
=
make_real3
(
0
);
localData
[
threadIdx
.
x
].
torque
=
make_real3
(
0
);
// Compute forces.
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
atomIndices
[
tbx
+
tj
];
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
computeOneInteraction
(
data
,
localData
[
tbx
+
tj
],
false
,
1
,
1
,
1
,
1
,
energy
,
periodicBoxSize
,
invPeriodicBoxSize
);
}
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
data
.
force
*=
-
ENERGY_SCALE_FACTOR
;
data
.
torque
*=
ENERGY_SCALE_FACTOR
;
localData
[
threadIdx
.
x
].
force
*=
-
ENERGY_SCALE_FACTOR
;
localData
[
threadIdx
.
x
].
torque
*=
ENERGY_SCALE_FACTOR
;
// Write results.
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
z
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
torque
.
x
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
torque
.
y
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
torque
.
z
*
0x100000000
)));
#ifdef USE_CUTOFF
offset
=
atomIndices
[
threadIdx
.
x
];
#else
offset
=
y
*
TILE_SIZE
+
tgx
;
#endif
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
z
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
torque
.
x
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
torque
.
y
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
torque
.
z
*
0x100000000
)));
}
pos
++
;
}
while
(
pos
<
end
);
}
energyBuffer
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
energy
*
ENERGY_SCALE_FACTOR
;
}
Prev
1
2
3
4
5
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment