Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
93c467b2
"src/webui/vscode:/vscode.git/clone" did not exist on "a2b2c56d93e1c65ba540fb0667e53a3f6cdc412c"
Commit
93c467b2
authored
Mar 22, 2013
by
Peter Eastman
Browse files
Merged 5.1Optimizations branch back to trunk
parent
f6d4557d
Changes
86
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
1353 additions
and
1263 deletions
+1353
-1263
plugins/amoeba/platforms/cuda/src/kernels/amoebaGk.cu
plugins/amoeba/platforms/cuda/src/kernels/amoebaGk.cu
+227
-153
plugins/amoeba/platforms/cuda/src/kernels/multipoleElectrostatics.cu
...eba/platforms/cuda/src/kernels/multipoleElectrostatics.cu
+249
-295
plugins/amoeba/platforms/cuda/src/kernels/multipoleFixedField.cu
.../amoeba/platforms/cuda/src/kernels/multipoleFixedField.cu
+229
-205
plugins/amoeba/platforms/cuda/src/kernels/multipoleInducedField.cu
...moeba/platforms/cuda/src/kernels/multipoleInducedField.cu
+187
-159
plugins/amoeba/platforms/cuda/src/kernels/multipolePme.cu
plugins/amoeba/platforms/cuda/src/kernels/multipolePme.cu
+270
-229
plugins/amoeba/platforms/cuda/src/kernels/pmeMultipoleElectrostatics.cu
.../platforms/cuda/src/kernels/pmeMultipoleElectrostatics.cu
+191
-222
No files found.
plugins/amoeba/platforms/cuda/src/kernels/amoebaGk.cu
View file @
93c467b2
...
@@ -606,181 +606,255 @@ __device__ float computePScaleFactor(uint2 covalent, unsigned int polarizationGr
...
@@ -606,181 +606,255 @@ __device__ float computePScaleFactor(uint2 covalent, unsigned int polarizationGr
*/
*/
extern
"C"
__global__
void
computeEDiffForce
(
extern
"C"
__global__
void
computeEDiffForce
(
unsigned
long
long
*
__restrict__
forceBuffers
,
unsigned
long
long
*
__restrict__
torqueBuffers
,
real
*
__restrict__
energyBuffer
,
unsigned
long
long
*
__restrict__
forceBuffers
,
unsigned
long
long
*
__restrict__
torqueBuffers
,
real
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
posq
,
const
u
nsigned
int
*
__restrict__
exclusionIndice
s
,
const
unsigned
int
*
__restrict__
exclusionRowIndice
s
,
const
real4
*
__restrict__
posq
,
const
uint
2
*
__restrict__
covalentFlag
s
,
const
unsigned
int
*
__restrict__
polarizationGroupFlag
s
,
const
u
in
t2
*
__restrict__
covalentFlags
,
const
unsigned
int
*
__restrict__
polarizationGroupFlag
s
,
unsigned
int
startTileIndex
,
unsigned
int
numTileIndices
,
const
u
shor
t2
*
__restrict__
exclusionTile
s
,
unsigned
int
startTileIndex
,
unsigned
int
numTileIndices
,
const
real
*
__restrict__
labFrameDipole
,
const
real
*
__restrict__
labFrameQuadrupole
,
const
real
*
__restrict__
inducedDipole
,
const
real
*
__restrict__
labFrameDipole
,
const
real
*
__restrict__
labFrameQuadrupole
,
const
real
*
__restrict__
inducedDipole
,
const
real
*
__restrict__
inducedDipolePolar
,
const
real
*
__restrict__
inducedDipoleS
,
const
real
*
__restrict__
inducedDipolePolarS
,
const
real
*
__restrict__
inducedDipolePolar
,
const
real
*
__restrict__
inducedDipoleS
,
const
real
*
__restrict__
inducedDipolePolarS
,
const
float2
*
__restrict__
dampingAndThole
)
{
const
float2
*
__restrict__
dampingAndThole
)
{
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
const
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
TILE_SIZE
;
const
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
TILE_SIZE
;
const
unsigned
int
numTiles
=
numTileIndices
;
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
unsigned
int
pos
=
startTileIndex
+
warp
*
numTiles
/
totalWarps
;
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
unsigned
int
end
=
startTileIndex
+
(
warp
+
1
)
*
numTiles
/
totalWarps
;
real
energy
=
0
;
real
energy
=
0
;
__shared__
AtomData4
localData
[
EDIFF_THREAD_BLOCK_SIZE
];
__shared__
AtomData4
localData
[
EDIFF_THREAD_BLOCK_SIZE
];
__shared__
unsigned
int
exclusionRange
[
2
*
(
EDIFF_THREAD_BLOCK_SIZE
/
TILE_SIZE
)];
__shared__
int
exclusionIndex
[
EDIFF_THREAD_BLOCK_SIZE
/
TILE_SIZE
];
// First loop: process tiles that contain exclusions.
do
{
const
unsigned
int
firstExclusionTile
=
FIRST_EXCLUSION_TILE
+
warp
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
totalWarps
;
// Extract the coordinates of this tile
const
unsigned
int
lastExclusionTile
=
FIRST_EXCLUSION_TILE
+
(
warp
+
1
)
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
totalWarps
;
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
for
(
int
pos
=
firstExclusionTile
;
pos
<
lastExclusionTile
;
pos
++
)
{
const
u
nsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
const
u
short2
tileIndices
=
exclusionTiles
[
pos
]
;
const
unsigned
int
localGroupIndex
=
threadIdx
.
x
/
TILE_SIZE
;
const
unsigned
int
x
=
tileIndices
.
x
;
unsigned
int
x
,
y
;
const
unsigned
int
y
=
tileIndices
.
y
;
AtomData4
data
;
AtomData4
data
;
if
(
pos
<
end
)
{
data
.
force
=
make_real3
(
0
);
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS
+
0.5
f
-
SQRT
((
NUM_BLOCKS
+
0.5
f
)
*
(
NUM_BLOCKS
+
0.5
f
)
-
2
*
pos
));
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
loadAtomData4
(
data
,
atom1
,
posq
,
labFrameDipole
,
labFrameQuadrupole
,
inducedDipole
,
inducedDipolePolar
,
inducedDipoleS
,
inducedDipolePolarS
,
dampingAndThole
);
if
(
x
<
y
||
x
>=
NUM_BLOCKS
)
{
// Occasionally happens due to roundoff error.
uint2
covalent
=
covalentFlags
[
pos
*
TILE_SIZE
+
tgx
];
y
+=
(
x
<
y
?
-
1
:
1
);
unsigned
int
polarizationGroup
=
polarizationGroupFlags
[
pos
*
TILE_SIZE
+
tgx
];
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
if
(
x
==
y
)
{
// This tile is on the diagonal.
localData
[
threadIdx
.
x
].
pos
=
data
.
pos
;
localData
[
threadIdx
.
x
].
q
=
data
.
q
;
localData
[
threadIdx
.
x
].
dipole
=
data
.
dipole
;
localData
[
threadIdx
.
x
].
quadrupoleXX
=
data
.
quadrupoleXX
;
localData
[
threadIdx
.
x
].
quadrupoleXY
=
data
.
quadrupoleXY
;
localData
[
threadIdx
.
x
].
quadrupoleXZ
=
data
.
quadrupoleXZ
;
localData
[
threadIdx
.
x
].
quadrupoleYY
=
data
.
quadrupoleYY
;
localData
[
threadIdx
.
x
].
quadrupoleYZ
=
data
.
quadrupoleYZ
;
localData
[
threadIdx
.
x
].
quadrupoleZZ
=
data
.
quadrupoleZZ
;
localData
[
threadIdx
.
x
].
inducedDipole
=
data
.
inducedDipole
;
localData
[
threadIdx
.
x
].
inducedDipolePolar
=
data
.
inducedDipolePolar
;
localData
[
threadIdx
.
x
].
inducedDipoleS
=
data
.
inducedDipoleS
;
localData
[
threadIdx
.
x
].
inducedDipolePolarS
=
data
.
inducedDipolePolarS
;
localData
[
threadIdx
.
x
].
thole
=
data
.
thole
;
localData
[
threadIdx
.
x
].
damp
=
data
.
damp
;
// Compute forces.
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
y
*
TILE_SIZE
+
j
;
if
(
atom1
!=
atom2
&&
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
real3
tempForce
;
real
tempEnergy
;
float
d
=
computeDScaleFactor
(
polarizationGroup
,
j
);
float
p
=
computePScaleFactor
(
covalent
,
polarizationGroup
,
j
);
computeOneEDiffInteractionF1
(
data
,
localData
[
tbx
+
j
],
d
,
p
,
tempEnergy
,
tempForce
);
energy
+=
0.25
f
*
tempEnergy
;
data
.
force
+=
tempForce
;
}
}
}
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
data
.
force
*=
ENERGY_SCALE_FACTOR
;
loadAtomData4
(
data
,
atom1
,
posq
,
labFrameDipole
,
labFrameQuadrupole
,
inducedDipole
,
inducedDipolePolar
,
inducedDipoleS
,
inducedDipolePolarS
,
dampingAndThole
);
atomicAdd
(
&
forceBuffers
[
atom1
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
x
*
0x100000000
)));
data
.
force
=
make_real3
(
0
);
atomicAdd
(
&
forceBuffers
[
atom1
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom1
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
z
*
0x100000000
)));
// Locate the exclusion data for this tile.
if
(
tgx
<
2
)
exclusionRange
[
2
*
localGroupIndex
+
tgx
]
=
exclusionRowIndices
[
x
+
tgx
];
if
(
tgx
==
0
)
exclusionIndex
[
localGroupIndex
]
=
-
1
;
for
(
unsigned
int
i
=
exclusionRange
[
2
*
localGroupIndex
]
+
tgx
;
i
<
exclusionRange
[
2
*
localGroupIndex
+
1
];
i
+=
TILE_SIZE
)
if
(
exclusionIndices
[
i
]
==
y
)
exclusionIndex
[
localGroupIndex
]
=
i
*
TILE_SIZE
;
bool
hasExclusions
=
(
exclusionIndex
[
localGroupIndex
]
>
-
1
);
if
(
pos
>=
end
)
;
// This warp is done.
else
if
(
x
==
y
)
{
// This tile is on the diagonal.
localData
[
threadIdx
.
x
].
pos
=
data
.
pos
;
// Compute torques.
localData
[
threadIdx
.
x
].
q
=
data
.
q
;
localData
[
threadIdx
.
x
].
dipole
=
data
.
dipole
;
data
.
force
=
make_real3
(
0
);
localData
[
threadIdx
.
x
].
quadrupoleXX
=
data
.
quadrupoleXX
;
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
localData
[
threadIdx
.
x
].
quadrupoleXY
=
data
.
quadrupoleXY
;
int
atom2
=
y
*
TILE_SIZE
+
j
;
localData
[
threadIdx
.
x
].
quadrupoleXZ
=
data
.
quadrupoleXZ
;
if
(
atom1
!=
atom2
&&
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
localData
[
threadIdx
.
x
].
quadrupoleYY
=
data
.
quadrupoleYY
;
real3
tempTorque
;
localData
[
threadIdx
.
x
].
quadrupoleYZ
=
data
.
quadrupoleYZ
;
float
d
=
computeDScaleFactor
(
polarizationGroup
,
j
);
localData
[
threadIdx
.
x
].
quadrupoleZZ
=
data
.
quadrupoleZZ
;
float
p
=
computePScaleFactor
(
covalent
,
polarizationGroup
,
j
);
localData
[
threadIdx
.
x
].
inducedDipole
=
data
.
inducedDipole
;
computeOneEDiffInteractionT1
(
data
,
localData
[
tbx
+
j
],
d
,
p
,
tempTorque
);
localData
[
threadIdx
.
x
].
inducedDipolePolar
=
data
.
inducedDipolePolar
;
data
.
force
+=
tempTorque
;
localData
[
threadIdx
.
x
].
inducedDipoleS
=
data
.
inducedDipoleS
;
localData
[
threadIdx
.
x
].
inducedDipolePolarS
=
data
.
inducedDipolePolarS
;
localData
[
threadIdx
.
x
].
thole
=
data
.
thole
;
localData
[
threadIdx
.
x
].
damp
=
data
.
damp
;
uint2
covalent
=
covalentFlags
[
exclusionIndex
[
localGroupIndex
]
+
tgx
];
unsigned
int
polarizationGroup
=
polarizationGroupFlags
[
exclusionIndex
[
localGroupIndex
]
+
tgx
];
// Compute forces.
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
y
*
TILE_SIZE
+
j
;
if
(
atom1
!=
atom2
&&
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
real3
tempForce
;
real
tempEnergy
;
float
d
=
computeDScaleFactor
(
polarizationGroup
,
j
);
float
p
=
computePScaleFactor
(
covalent
,
polarizationGroup
,
j
);
computeOneEDiffInteractionF1
(
data
,
localData
[
tbx
+
j
],
d
,
p
,
tempEnergy
,
tempForce
);
energy
+=
0.25
f
*
tempEnergy
;
data
.
force
+=
tempForce
;
}
}
}
data
.
force
*=
ENERGY_SCALE_FACTOR
;
}
atomicAdd
(
&
forceBuffers
[
atom1
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
x
*
0x100000000
)));
data
.
force
*=
ENERGY_SCALE_FACTOR
;
atomicAdd
(
&
forceBuffers
[
atom1
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
y
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
atom1
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom1
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
z
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
atom1
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
y
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
atom1
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
z
*
0x100000000
)));
}
else
{
// This is an off-diagonal tile.
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
loadAtomData4
(
localData
[
threadIdx
.
x
],
j
,
posq
,
labFrameDipole
,
labFrameQuadrupole
,
inducedDipole
,
inducedDipolePolar
,
inducedDipoleS
,
inducedDipolePolarS
,
dampingAndThole
);
localData
[
threadIdx
.
x
].
force
=
make_real3
(
0
);
// Compute forces.
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
y
*
TILE_SIZE
+
tj
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
real3
tempForce
;
real
tempEnergy
;
float
d
=
computeDScaleFactor
(
polarizationGroup
,
tj
);
float
p
=
computePScaleFactor
(
covalent
,
polarizationGroup
,
tj
);
computeOneEDiffInteractionF1
(
data
,
localData
[
tbx
+
tj
],
d
,
p
,
tempEnergy
,
tempForce
);
energy
+=
0.5
f
*
tempEnergy
;
data
.
force
+=
tempForce
;
localData
[
tbx
+
tj
].
force
-=
tempForce
;
}
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
data
.
force
*=
ENERGY_SCALE_FACTOR
;
localData
[
threadIdx
.
x
].
force
*=
ENERGY_SCALE_FACTOR
;
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
z
*
0x100000000
)));
offset
=
y
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
z
*
0x100000000
)));
// Compute torques.
data
.
force
=
make_real3
(
0
);
// Compute torques.
localData
[
threadIdx
.
x
].
force
=
make_real3
(
0
);
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
data
.
force
=
make_real3
(
0
);
int
atom2
=
y
*
TILE_SIZE
+
tj
;
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
int
atom2
=
y
*
TILE_SIZE
+
j
;
real3
tempTorque
;
if
(
atom1
!=
atom2
&&
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
float
d
=
computeDScaleFactor
(
polarizationGroup
,
tj
);
real3
tempTorque
;
float
p
=
computePScaleFactor
(
covalent
,
polarizationGroup
,
tj
);
float
d
=
computeDScaleFactor
(
polarizationGroup
,
j
);
computeOneEDiffInteractionT1
(
data
,
localData
[
tbx
+
tj
],
d
,
p
,
tempTorque
);
float
p
=
computePScaleFactor
(
covalent
,
polarizationGroup
,
j
);
data
.
force
+=
tempTorque
;
computeOneEDiffInteractionT1
(
data
,
localData
[
tbx
+
j
],
d
,
p
,
tempTorque
);
computeOneEDiffInteractionT3
(
data
,
localData
[
tbx
+
tj
],
d
,
p
,
tempTorque
);
data
.
force
+=
tempTorque
;
localData
[
tbx
+
tj
].
force
+=
tempTorque
;
}
}
}
data
.
force
*=
ENERGY_SCALE_FACTOR
;
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
atomicAdd
(
&
torqueBuffers
[
atom1
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
x
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
atom1
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
y
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
atom1
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
z
*
0x100000000
)));
}
}
else
{
data
.
force
*=
ENERGY_SCALE_FACTOR
;
// This is an off-diagonal tile.
localData
[
threadIdx
.
x
].
force
*=
ENERGY_SCALE_FACTOR
;
offset
=
x
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
torqueBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
x
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
y
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
z
*
0x100000000
)));
offset
=
y
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
torqueBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
x
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
y
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
z
*
0x100000000
)));
}
}
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
// Second loop: tiles without exclusions (by enumerating all of them, since there's no cutoff).
loadAtomData4
(
localData
[
threadIdx
.
x
],
j
,
posq
,
labFrameDipole
,
labFrameQuadrupole
,
inducedDipole
,
inducedDipolePolar
,
inducedDipoleS
,
inducedDipolePolarS
,
dampingAndThole
);
localData
[
threadIdx
.
x
].
force
=
make_real3
(
0
);
uint2
covalent
=
(
hasExclusions
?
covalentFlags
[
exclusionIndex
[
localGroupIndex
]
+
tgx
]
:
make_uint2
(
0
,
0
));
unsigned
int
polarizationGroup
=
(
hasExclusions
?
polarizationGroupFlags
[
exclusionIndex
[
localGroupIndex
]
+
tgx
]
:
0
);
// Compute forces.
const
unsigned
int
numTiles
=
numTileIndices
;
int
pos
=
startTileIndex
+
warp
*
numTiles
/
totalWarps
;
int
end
=
startTileIndex
+
(
warp
+
1
)
*
numTiles
/
totalWarps
;
int
skipBase
=
0
;
int
currentSkipIndex
=
tbx
;
__shared__
int
skipTiles
[
EDIFF_THREAD_BLOCK_SIZE
];
skipTiles
[
threadIdx
.
x
]
=
-
1
;
unsigned
int
tj
=
tgx
;
while
(
pos
<
end
)
{
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
// Extract the coordinates of this tile.
int
atom2
=
y
*
TILE_SIZE
+
tj
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
real3
tempForce
;
real
tempEnergy
;
float
d
=
computeDScaleFactor
(
polarizationGroup
,
tj
);
float
p
=
computePScaleFactor
(
covalent
,
polarizationGroup
,
tj
);
computeOneEDiffInteractionF1
(
data
,
localData
[
tbx
+
tj
],
d
,
p
,
tempEnergy
,
tempForce
);
energy
+=
0.5
f
*
tempEnergy
;
data
.
force
+=
tempForce
;
localData
[
tbx
+
tj
].
force
-=
tempForce
;
}
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
data
.
force
*=
ENERGY_SCALE_FACTOR
;
localData
[
threadIdx
.
x
].
force
*=
ENERGY_SCALE_FACTOR
;
if
(
pos
<
end
)
{
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
z
*
0x100000000
)));
offset
=
y
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
z
*
0x100000000
)));
}
// Compute torques.
unsigned
int
x
,
y
;
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS
+
0.5
f
-
SQRT
((
NUM_BLOCKS
+
0.5
f
)
*
(
NUM_BLOCKS
+
0.5
f
)
-
2
*
pos
));
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
if
(
x
<
y
||
x
>=
NUM_BLOCKS
)
{
// Occasionally happens due to roundoff error.
y
+=
(
x
<
y
?
-
1
:
1
);
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
}
data
.
force
=
make_real3
(
0
);
// Skip over tiles that have exclusions, since they were already processed.
localData
[
threadIdx
.
x
].
force
=
make_real3
(
0
);
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
while
(
skipTiles
[
tbx
+
TILE_SIZE
-
1
]
<
pos
)
{
int
atom2
=
y
*
TILE_SIZE
+
tj
;
if
(
skipBase
+
tgx
<
NUM_TILES_WITH_EXCLUSIONS
)
{
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
ushort2
tile
=
exclusionTiles
[
skipBase
+
tgx
];
real3
tempTorque
;
skipTiles
[
threadIdx
.
x
]
=
tile
.
x
+
tile
.
y
*
NUM_BLOCKS
-
tile
.
y
*
(
tile
.
y
+
1
)
/
2
;
float
d
=
computeDScaleFactor
(
polarizationGroup
,
tj
);
}
float
p
=
computePScaleFactor
(
covalent
,
polarizationGroup
,
tj
);
else
computeOneEDiffInteractionT1
(
data
,
localData
[
tbx
+
tj
],
d
,
p
,
tempTorque
);
skipTiles
[
threadIdx
.
x
]
=
end
;
data
.
force
+=
tempTorque
;
skipBase
+=
TILE_SIZE
;
computeOneEDiffInteractionT3
(
data
,
localData
[
tbx
+
tj
],
d
,
p
,
tempTorque
);
currentSkipIndex
=
tbx
;
localData
[
tbx
+
tj
].
force
+=
tempTorque
;
}
}
while
(
skipTiles
[
currentSkipIndex
]
<
pos
)
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
currentSkipIndex
++
;
bool
includeTile
=
(
skipTiles
[
currentSkipIndex
]
!=
pos
);
if
(
includeTile
)
{
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
// Load atom data for this tile.
AtomData4
data
;
data
.
force
=
make_real3
(
0
);
loadAtomData4
(
data
,
atom1
,
posq
,
labFrameDipole
,
labFrameQuadrupole
,
inducedDipole
,
inducedDipolePolar
,
inducedDipoleS
,
inducedDipolePolarS
,
dampingAndThole
);
loadAtomData4
(
localData
[
threadIdx
.
x
],
atom1
,
posq
,
labFrameDipole
,
labFrameQuadrupole
,
inducedDipole
,
inducedDipolePolar
,
inducedDipoleS
,
inducedDipolePolarS
,
dampingAndThole
);
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
loadAtomData4
(
localData
[
threadIdx
.
x
],
j
,
posq
,
labFrameDipole
,
labFrameQuadrupole
,
inducedDipole
,
inducedDipolePolar
,
inducedDipoleS
,
inducedDipolePolarS
,
dampingAndThole
);
localData
[
threadIdx
.
x
].
force
=
make_real3
(
0
);
// Compute forces.
unsigned
int
tj
=
tgx
;
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
y
*
TILE_SIZE
+
tj
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
real3
tempForce
;
real
tempEnergy
;
computeOneEDiffInteractionF1
(
data
,
localData
[
tbx
+
tj
],
1
,
1
,
tempEnergy
,
tempForce
);
energy
+=
0.5
f
*
tempEnergy
;
data
.
force
+=
tempForce
;
localData
[
tbx
+
tj
].
force
-=
tempForce
;
}
}
data
.
force
*=
ENERGY_SCALE_FACTOR
;
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
localData
[
threadIdx
.
x
].
force
*=
ENERGY_SCALE_FACTOR
;
}
if
(
pos
<
end
)
{
data
.
force
*=
ENERGY_SCALE_FACTOR
;
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
localData
[
threadIdx
.
x
].
force
*=
ENERGY_SCALE_FACTOR
;
atomicAdd
(
&
torqueBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
x
*
0x100000000
)));
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
torqueBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
x
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
z
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
y
*
0x100000000
)));
offset
=
y
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
z
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
x
*
0x100000000
)));
offset
=
y
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
torqueBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
x
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
z
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
z
*
0x100000000
)));
// Compute torques.
data
.
force
=
make_real3
(
0
);
localData
[
threadIdx
.
x
].
force
=
make_real3
(
0
);
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
y
*
TILE_SIZE
+
tj
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
real3
tempTorque
;
computeOneEDiffInteractionT1
(
data
,
localData
[
tbx
+
tj
],
1
,
1
,
tempTorque
);
data
.
force
+=
tempTorque
;
computeOneEDiffInteractionT3
(
data
,
localData
[
tbx
+
tj
],
1
,
1
,
tempTorque
);
localData
[
tbx
+
tj
].
force
+=
tempTorque
;
}
}
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
data
.
force
*=
ENERGY_SCALE_FACTOR
;
localData
[
threadIdx
.
x
].
force
*=
ENERGY_SCALE_FACTOR
;
offset
=
x
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
torqueBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
x
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
y
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
z
*
0x100000000
)));
offset
=
y
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
torqueBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
x
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
y
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
z
*
0x100000000
)));
}
}
pos
++
;
pos
++
;
}
while
(
pos
<
end
);
}
energyBuffer
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
energy
*
ENERGY_SCALE_FACTOR
;
energyBuffer
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
energy
*
ENERGY_SCALE_FACTOR
;
}
}
plugins/amoeba/platforms/cuda/src/kernels/multipoleElectrostatics.cu
View file @
93c467b2
#define TILE_SIZE 32
#define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)
#define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)
typedef
struct
{
typedef
struct
{
...
@@ -59,331 +58,286 @@ __device__ float computePScaleFactor(uint2 covalent, unsigned int polarizationGr
...
@@ -59,331 +58,286 @@ __device__ float computePScaleFactor(uint2 covalent, unsigned int polarizationGr
*/
*/
extern
"C"
__global__
void
computeElectrostatics
(
extern
"C"
__global__
void
computeElectrostatics
(
unsigned
long
long
*
__restrict__
forceBuffers
,
unsigned
long
long
*
__restrict__
torqueBuffers
,
real
*
__restrict__
energyBuffer
,
unsigned
long
long
*
__restrict__
forceBuffers
,
unsigned
long
long
*
__restrict__
torqueBuffers
,
real
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
posq
,
const
u
nsigned
int
*
__restrict__
exclusionIndice
s
,
const
unsigned
int
*
__restrict__
exclusionRowIndice
s
,
const
real4
*
__restrict__
posq
,
const
uint
2
*
__restrict__
covalentFlag
s
,
const
unsigned
int
*
__restrict__
polarizationGroupFlag
s
,
const
u
in
t2
*
__restrict__
covalentFlags
,
const
unsigned
int
*
__restrict__
polarizationGroupFlag
s
,
unsigned
int
startTileIndex
,
unsigned
int
numTileIndices
,
const
u
shor
t2
*
__restrict__
exclusionTile
s
,
unsigned
int
startTileIndex
,
unsigned
int
numTileIndices
,
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
const
ushort2
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
unsigned
int
maxTiles
,
const
unsigned
int
*
__restrict__
interacti
onFlag
s
,
const
ushort2
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
unsigned
int
maxTiles
,
const
real4
*
__restrict__
blockCenter
,
const
unsigned
int
*
__restrict__
interacti
ngAtom
s
,
#endif
#endif
const
real
*
__restrict__
labFrameDipole
,
const
real
*
__restrict__
labFrameQuadrupole
,
const
real
*
__restrict__
inducedDipole
,
const
real
*
__restrict__
labFrameDipole
,
const
real
*
__restrict__
labFrameQuadrupole
,
const
real
*
__restrict__
inducedDipole
,
const
real
*
__restrict__
inducedDipolePolar
,
const
float2
*
__restrict__
dampingAndThole
)
{
const
real
*
__restrict__
inducedDipolePolar
,
const
float2
*
__restrict__
dampingAndThole
)
{
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
const
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
TILE_SIZE
;
const
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
TILE_SIZE
;
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
real
energy
=
0
;
__shared__
AtomData
localData
[
THREAD_BLOCK_SIZE
];
// First loop: process tiles that contain exclusions.
const
unsigned
int
firstExclusionTile
=
FIRST_EXCLUSION_TILE
+
warp
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
totalWarps
;
const
unsigned
int
lastExclusionTile
=
FIRST_EXCLUSION_TILE
+
(
warp
+
1
)
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
totalWarps
;
for
(
int
pos
=
firstExclusionTile
;
pos
<
lastExclusionTile
;
pos
++
)
{
const
ushort2
tileIndices
=
exclusionTiles
[
pos
];
const
unsigned
int
x
=
tileIndices
.
x
;
const
unsigned
int
y
=
tileIndices
.
y
;
AtomData
data
;
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
loadAtomData
(
data
,
atom1
,
posq
,
labFrameDipole
,
labFrameQuadrupole
,
inducedDipole
,
inducedDipolePolar
,
dampingAndThole
);
data
.
force
=
make_real3
(
0
);
uint2
covalent
=
covalentFlags
[
pos
*
TILE_SIZE
+
tgx
];
unsigned
int
polarizationGroup
=
polarizationGroupFlags
[
pos
*
TILE_SIZE
+
tgx
];
if
(
x
==
y
)
{
// This tile is on the diagonal.
localData
[
threadIdx
.
x
].
posq
=
data
.
posq
;
localData
[
threadIdx
.
x
].
dipole
=
data
.
dipole
;
localData
[
threadIdx
.
x
].
quadrupoleXX
=
data
.
quadrupoleXX
;
localData
[
threadIdx
.
x
].
quadrupoleXY
=
data
.
quadrupoleXY
;
localData
[
threadIdx
.
x
].
quadrupoleXZ
=
data
.
quadrupoleXZ
;
localData
[
threadIdx
.
x
].
quadrupoleYY
=
data
.
quadrupoleYY
;
localData
[
threadIdx
.
x
].
quadrupoleYZ
=
data
.
quadrupoleYZ
;
localData
[
threadIdx
.
x
].
inducedDipole
=
data
.
inducedDipole
;
localData
[
threadIdx
.
x
].
inducedDipolePolar
=
data
.
inducedDipolePolar
;
localData
[
threadIdx
.
x
].
thole
=
data
.
thole
;
localData
[
threadIdx
.
x
].
damp
=
data
.
damp
;
// Compute forces.
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
y
*
TILE_SIZE
+
j
;
if
(
atom1
!=
atom2
&&
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
real3
tempForce
;
real
tempEnergy
;
float
d
=
computeDScaleFactor
(
polarizationGroup
,
j
);
float
p
=
computePScaleFactor
(
covalent
,
polarizationGroup
,
j
);
float
m
=
computeMScaleFactor
(
covalent
,
j
);
computeOneInteractionF1
(
data
,
localData
[
tbx
+
j
],
d
,
p
,
m
,
tempEnergy
,
tempForce
);
data
.
force
+=
tempForce
;
energy
+=
0.5
f
*
tempEnergy
;
}
}
data
.
force
*=
ENERGY_SCALE_FACTOR
;
atomicAdd
(
&
forceBuffers
[
atom1
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom1
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom1
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
z
*
0x100000000
)));
// Compute torques.
data
.
force
=
make_real3
(
0
);
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
y
*
TILE_SIZE
+
j
;
if
(
atom1
!=
atom2
&&
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
real3
tempForce
;
float
d
=
computeDScaleFactor
(
polarizationGroup
,
j
);
float
p
=
computePScaleFactor
(
covalent
,
polarizationGroup
,
j
);
float
m
=
computeMScaleFactor
(
covalent
,
j
);
computeOneInteractionT1
(
data
,
localData
[
tbx
+
j
],
d
,
p
,
m
,
tempForce
);
data
.
force
+=
tempForce
;
}
}
data
.
force
*=
ENERGY_SCALE_FACTOR
;
atomicAdd
(
&
torqueBuffers
[
atom1
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
x
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
atom1
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
y
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
atom1
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
z
*
0x100000000
)));
}
else
{
// This is an off-diagonal tile.
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
loadAtomData
(
localData
[
threadIdx
.
x
],
j
,
posq
,
labFrameDipole
,
labFrameQuadrupole
,
inducedDipole
,
inducedDipolePolar
,
dampingAndThole
);
localData
[
threadIdx
.
x
].
force
=
make_real3
(
0
);
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
y
*
TILE_SIZE
+
tj
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
real3
tempForce
;
real
tempEnergy
;
float
d
=
computeDScaleFactor
(
polarizationGroup
,
tj
);
float
p
=
computePScaleFactor
(
covalent
,
polarizationGroup
,
tj
);
float
m
=
computeMScaleFactor
(
covalent
,
tj
);
computeOneInteractionF1
(
data
,
localData
[
tbx
+
tj
],
d
,
p
,
m
,
tempEnergy
,
tempForce
);
data
.
force
+=
tempForce
;
localData
[
tbx
+
tj
].
force
-=
tempForce
;
energy
+=
tempEnergy
;
}
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
data
.
force
*=
ENERGY_SCALE_FACTOR
;
localData
[
threadIdx
.
x
].
force
*=
ENERGY_SCALE_FACTOR
;
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
z
*
0x100000000
)));
offset
=
y
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
z
*
0x100000000
)));
// Compute torques.
data
.
force
=
make_real3
(
0
);
localData
[
threadIdx
.
x
].
force
=
make_real3
(
0
);
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
y
*
TILE_SIZE
+
tj
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
real3
tempForce
;
float
d
=
computeDScaleFactor
(
polarizationGroup
,
tj
);
float
p
=
computePScaleFactor
(
covalent
,
polarizationGroup
,
tj
);
float
m
=
computeMScaleFactor
(
covalent
,
tj
);
computeOneInteractionT1
(
data
,
localData
[
tbx
+
tj
],
d
,
p
,
m
,
tempForce
);
data
.
force
+=
tempForce
;
computeOneInteractionT3
(
data
,
localData
[
tbx
+
tj
],
d
,
p
,
m
,
tempForce
);
localData
[
tbx
+
tj
].
force
+=
tempForce
;
}
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
data
.
force
*=
ENERGY_SCALE_FACTOR
;
localData
[
threadIdx
.
x
].
force
*=
ENERGY_SCALE_FACTOR
;
offset
=
x
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
torqueBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
x
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
y
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
z
*
0x100000000
)));
offset
=
y
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
torqueBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
x
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
y
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
z
*
0x100000000
)));
}
}
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
// of them (no cutoff).
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
const
unsigned
int
numTiles
=
interactionCount
[
0
];
const
unsigned
int
numTiles
=
interactionCount
[
0
];
unsigned
int
pos
=
(
numTiles
>
maxTiles
?
startTileIndex
+
warp
*
numTileIndices
/
totalWarps
:
warp
*
numTiles
/
totalWarps
);
int
pos
=
(
numTiles
>
maxTiles
?
startTileIndex
+
warp
*
numTileIndices
/
totalWarps
:
warp
*
numTiles
/
totalWarps
);
unsigned
int
end
=
(
numTiles
>
maxTiles
?
startTileIndex
+
(
warp
+
1
)
*
numTileIndices
/
totalWarps
:
(
warp
+
1
)
*
numTiles
/
totalWarps
);
int
end
=
(
numTiles
>
maxTiles
?
startTileIndex
+
(
warp
+
1
)
*
numTileIndices
/
totalWarps
:
(
warp
+
1
)
*
numTiles
/
totalWarps
);
#else
#else
const
unsigned
int
numTiles
=
numTileIndices
;
const
unsigned
int
numTiles
=
numTileIndices
;
unsigned
int
pos
=
startTileIndex
+
warp
*
numTiles
/
totalWarps
;
int
pos
=
startTileIndex
+
warp
*
numTiles
/
totalWarps
;
unsigned
int
end
=
startTileIndex
+
(
warp
+
1
)
*
numTiles
/
totalWarps
;
int
end
=
startTileIndex
+
(
warp
+
1
)
*
numTiles
/
totalWarps
;
#endif
real
energy
=
0
;
__shared__
AtomData
localData
[
THREAD_BLOCK_SIZE
];
__shared__
unsigned
int
exclusionRange
[
2
*
WARPS_PER_GROUP
];
__shared__
int
exclusionIndex
[
WARPS_PER_GROUP
];
#ifndef ENABLE_SHUFFLE
__shared__
real
tempBuffer
[
3
*
THREAD_BLOCK_SIZE
];
#endif
#endif
int
skipBase
=
0
;
int
currentSkipIndex
=
tbx
;
__shared__
int
atomIndices
[
THREAD_BLOCK_SIZE
];
__shared__
int
skipTiles
[
THREAD_BLOCK_SIZE
];
skipTiles
[
threadIdx
.
x
]
=
-
1
;
do
{
while
(
pos
<
end
)
{
// Extract the coordinates of this tile
bool
includeTile
=
true
;
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
// Extract the coordinates of this tile.
const
unsigned
int
localGroupIndex
=
threadIdx
.
x
/
TILE_SIZE
;
unsigned
int
x
,
y
;
unsigned
int
x
,
y
;
AtomData
data
;
if
(
pos
<
end
)
{
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
if
(
numTiles
<=
maxTiles
)
{
if
(
numTiles
<=
maxTiles
)
{
ushort2
tileIndices
=
tiles
[
pos
];
ushort2
tileIndices
=
tiles
[
pos
];
x
=
tileIndices
.
x
;
x
=
tileIndices
.
x
;
y
=
tileIndices
.
y
;
}
}
else
else
#endif
#endif
{
{
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS
+
0.5
f
-
SQRT
((
NUM_BLOCKS
+
0.5
f
)
*
(
NUM_BLOCKS
+
0.5
f
)
-
2
*
pos
));
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS
+
0.5
f
-
SQRT
((
NUM_BLOCKS
+
0.5
f
)
*
(
NUM_BLOCKS
+
0.5
f
)
-
2
*
pos
));
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
if
(
x
<
y
||
x
>=
NUM_BLOCKS
)
{
// Occasionally happens due to roundoff error.
y
+=
(
x
<
y
?
-
1
:
1
);
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
if
(
x
<
y
||
x
>=
NUM_BLOCKS
)
{
// Occasionally happens due to roundoff error.
}
y
+=
(
x
<
y
?
-
1
:
1
);
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
// Skip over tiles that have exclusions, since they were already processed.
while
(
skipTiles
[
tbx
+
TILE_SIZE
-
1
]
<
pos
)
{
if
(
skipBase
+
tgx
<
NUM_TILES_WITH_EXCLUSIONS
)
{
ushort2
tile
=
exclusionTiles
[
skipBase
+
tgx
];
skipTiles
[
threadIdx
.
x
]
=
tile
.
x
+
tile
.
y
*
NUM_BLOCKS
-
tile
.
y
*
(
tile
.
y
+
1
)
/
2
;
}
}
else
skipTiles
[
threadIdx
.
x
]
=
end
;
skipBase
+=
TILE_SIZE
;
currentSkipIndex
=
tbx
;
}
}
while
(
skipTiles
[
currentSkipIndex
]
<
pos
)
currentSkipIndex
++
;
includeTile
=
(
skipTiles
[
currentSkipIndex
]
!=
pos
);
}
if
(
includeTile
)
{
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
// Load atom data for this tile.
AtomData
data
;
loadAtomData
(
data
,
atom1
,
posq
,
labFrameDipole
,
labFrameQuadrupole
,
inducedDipole
,
inducedDipolePolar
,
dampingAndThole
);
loadAtomData
(
data
,
atom1
,
posq
,
labFrameDipole
,
labFrameQuadrupole
,
inducedDipole
,
inducedDipolePolar
,
dampingAndThole
);
data
.
force
=
make_real3
(
0
);
data
.
force
=
make_real3
(
0
);
#ifdef USE_CUTOFF
// Locate the exclusion data for this tile.
unsigned
int
j
=
(
numTiles
<=
maxTiles
?
interactingAtoms
[
pos
*
TILE_SIZE
+
tgx
]
:
y
*
TILE_SIZE
+
tgx
);
#else
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
#endif
atomIndices
[
threadIdx
.
x
]
=
j
;
loadAtomData
(
localData
[
threadIdx
.
x
],
j
,
posq
,
labFrameDipole
,
labFrameQuadrupole
,
inducedDipole
,
inducedDipolePolar
,
dampingAndThole
);
localData
[
threadIdx
.
x
].
force
=
make_real3
(
0
);
if
(
tgx
<
2
)
// Compute forces.
exclusionRange
[
2
*
localGroupIndex
+
tgx
]
=
exclusionRowIndices
[
x
+
tgx
];
if
(
tgx
==
0
)
exclusionIndex
[
localGroupIndex
]
=
-
1
;
for
(
unsigned
int
i
=
exclusionRange
[
2
*
localGroupIndex
]
+
tgx
;
i
<
exclusionRange
[
2
*
localGroupIndex
+
1
];
i
+=
TILE_SIZE
)
if
(
exclusionIndices
[
i
]
==
y
)
exclusionIndex
[
localGroupIndex
]
=
i
*
TILE_SIZE
;
bool
hasExclusions
=
(
exclusionIndex
[
localGroupIndex
]
>
-
1
);
if
(
pos
>=
end
)
;
// This warp is done.
else
if
(
x
==
y
)
{
// This tile is on the diagonal.
localData
[
threadIdx
.
x
].
posq
=
data
.
posq
;
unsigned
int
tj
=
tgx
;
localData
[
threadIdx
.
x
].
dipole
=
data
.
dipole
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
localData
[
threadIdx
.
x
].
quadrupoleXX
=
data
.
quadrupoleXX
;
int
atom2
=
atomIndices
[
tbx
+
tj
];
localData
[
threadIdx
.
x
].
quadrupoleXY
=
data
.
quadrupoleXY
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
localData
[
threadIdx
.
x
].
quadrupoleXZ
=
data
.
quadrupoleXZ
;
real3
tempForce
;
localData
[
threadIdx
.
x
].
quadrupoleYY
=
data
.
quadrupoleYY
;
real
tempEnergy
;
localData
[
threadIdx
.
x
].
quadrupoleYZ
=
data
.
quadrupoleYZ
;
computeOneInteractionF1
(
data
,
localData
[
tbx
+
tj
],
1
,
1
,
1
,
tempEnergy
,
tempForce
);
localData
[
threadIdx
.
x
].
inducedDipole
=
data
.
inducedDipole
;
data
.
force
+=
tempForce
;
localData
[
threadIdx
.
x
].
inducedDipolePolar
=
data
.
inducedDipolePolar
;
localData
[
tbx
+
tj
].
force
-=
tempForce
;
localData
[
threadIdx
.
x
].
thole
=
data
.
thole
;
energy
+=
tempEnergy
;
localData
[
threadIdx
.
x
].
damp
=
data
.
damp
;
uint2
covalent
=
covalentFlags
[
exclusionIndex
[
localGroupIndex
]
+
tgx
];
unsigned
int
polarizationGroup
=
polarizationGroupFlags
[
exclusionIndex
[
localGroupIndex
]
+
tgx
];
// Compute forces.
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
y
*
TILE_SIZE
+
j
;
if
(
atom1
!=
atom2
&&
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
real3
tempForce
;
real
tempEnergy
;
float
d
=
computeDScaleFactor
(
polarizationGroup
,
j
);
float
p
=
computePScaleFactor
(
covalent
,
polarizationGroup
,
j
);
float
m
=
computeMScaleFactor
(
covalent
,
j
);
computeOneInteractionF1
(
data
,
localData
[
tbx
+
j
],
d
,
p
,
m
,
tempEnergy
,
tempForce
);
data
.
force
+=
tempForce
;
energy
+=
0.5
f
*
tempEnergy
;
}
}
}
data
.
force
*=
ENERGY_SCALE_FACTOR
;
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
atomicAdd
(
&
forceBuffers
[
atom1
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom1
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom1
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
z
*
0x100000000
)));
// Compute torques.
data
.
force
=
make_real3
(
0
);
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
y
*
TILE_SIZE
+
j
;
if
(
atom1
!=
atom2
&&
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
real3
tempForce
;
float
d
=
computeDScaleFactor
(
polarizationGroup
,
j
);
float
p
=
computePScaleFactor
(
covalent
,
polarizationGroup
,
j
);
float
m
=
computeMScaleFactor
(
covalent
,
j
);
computeOneInteractionT1
(
data
,
localData
[
tbx
+
j
],
d
,
p
,
m
,
tempForce
);
data
.
force
+=
tempForce
;
}
}
data
.
force
*=
ENERGY_SCALE_FACTOR
;
atomicAdd
(
&
torqueBuffers
[
atom1
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
x
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
atom1
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
y
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
atom1
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
z
*
0x100000000
)));
}
}
else
{
data
.
force
*=
ENERGY_SCALE_FACTOR
;
// This is an off-diagonal tile.
localData
[
threadIdx
.
x
].
force
*=
ENERGY_SCALE_FACTOR
;
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
x
*
0x100000000
)))
;
loadAtomData
(
localData
[
threadIdx
.
x
],
j
,
posq
,
labFrameDipole
,
labFrameQuadrupole
,
inducedDipole
,
inducedDipolePolar
,
dampingAndThole
);
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
y
*
0x100000000
))
);
localData
[
threadIdx
.
x
].
force
=
make_real3
(
0
);
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
z
*
0x100000000
))
);
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
unsigned
int
flags
=
(
numTiles
<=
maxTiles
?
interactionFlags
[
pos
]
:
0xFFFFFFFF
);
offset
=
atomIndices
[
threadIdx
.
x
];
if
(
!
hasExclusions
&&
flags
!=
0xFFFFFFFF
)
{
if
(
flags
==
0
)
{
// No interactions in this tile.
}
else
{
// Compute only a subset of the interactions in this tile.
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
if
((
flags
&
(
1
<<
j
))
!=
0
)
{
int
atom2
=
tbx
+
j
;
real3
delta
=
make_real3
(
localData
[
atom2
].
posq
.
x
-
data
.
posq
.
x
,
localData
[
atom2
].
posq
.
y
-
data
.
posq
.
y
,
localData
[
atom2
].
posq
.
z
-
data
.
posq
.
z
);
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
#endif
real3
tempForce
;
real
tempEnergy
;
computeOneInteractionF1
(
data
,
localData
[
atom2
],
1
,
1
,
1
,
tempEnergy
,
tempForce
);
data
.
force
+=
tempForce
;
localData
[
atom2
].
force
-=
tempForce
;
energy
+=
tempEnergy
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
#ifdef ENABLE_SHUFFLE
for
(
int
i
=
16
;
i
>=
1
;
i
/=
2
)
{
tempForce
.
x
+=
__shfl_xor
(
tempForce
.
x
,
i
,
32
);
tempForce
.
y
+=
__shfl_xor
(
tempForce
.
y
,
i
,
32
);
tempForce
.
z
+=
__shfl_xor
(
tempForce
.
z
,
i
,
32
);
}
if
(
tgx
==
0
)
localData
[
atom2
].
force
-=
tempForce
;
#else
#else
int
bufferIndex
=
3
*
threadIdx
.
x
;
offset
=
y
*
TILE_SIZE
+
tgx
;
tempBuffer
[
bufferIndex
]
=
tempForce
.
x
;
tempBuffer
[
bufferIndex
+
1
]
=
tempForce
.
y
;
tempBuffer
[
bufferIndex
+
2
]
=
tempForce
.
z
;
if
(
tgx
%
4
==
0
)
{
tempBuffer
[
bufferIndex
]
+=
tempBuffer
[
bufferIndex
+
3
]
+
tempBuffer
[
bufferIndex
+
6
]
+
tempBuffer
[
bufferIndex
+
9
];
tempBuffer
[
bufferIndex
+
1
]
+=
tempBuffer
[
bufferIndex
+
4
]
+
tempBuffer
[
bufferIndex
+
7
]
+
tempBuffer
[
bufferIndex
+
10
];
tempBuffer
[
bufferIndex
+
2
]
+=
tempBuffer
[
bufferIndex
+
5
]
+
tempBuffer
[
bufferIndex
+
8
]
+
tempBuffer
[
bufferIndex
+
11
];
}
if
(
tgx
==
0
)
{
localData
[
atom2
].
force
.
x
-=
tempBuffer
[
bufferIndex
]
+
tempBuffer
[
bufferIndex
+
12
]
+
tempBuffer
[
bufferIndex
+
24
]
+
tempBuffer
[
bufferIndex
+
36
]
+
tempBuffer
[
bufferIndex
+
48
]
+
tempBuffer
[
bufferIndex
+
60
]
+
tempBuffer
[
bufferIndex
+
72
]
+
tempBuffer
[
bufferIndex
+
84
];
localData
[
atom2
].
force
.
y
-=
tempBuffer
[
bufferIndex
+
1
]
+
tempBuffer
[
bufferIndex
+
13
]
+
tempBuffer
[
bufferIndex
+
25
]
+
tempBuffer
[
bufferIndex
+
37
]
+
tempBuffer
[
bufferIndex
+
49
]
+
tempBuffer
[
bufferIndex
+
61
]
+
tempBuffer
[
bufferIndex
+
73
]
+
tempBuffer
[
bufferIndex
+
85
];
localData
[
atom2
].
force
.
z
-=
tempBuffer
[
bufferIndex
+
2
]
+
tempBuffer
[
bufferIndex
+
14
]
+
tempBuffer
[
bufferIndex
+
26
]
+
tempBuffer
[
bufferIndex
+
38
]
+
tempBuffer
[
bufferIndex
+
50
]
+
tempBuffer
[
bufferIndex
+
62
]
+
tempBuffer
[
bufferIndex
+
74
]
+
tempBuffer
[
bufferIndex
+
86
];
}
#endif
#endif
}
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
x
*
0x100000000
)));
}
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
y
*
0x100000000
)));
}
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
z
*
0x100000000
)));
data
.
force
*=
ENERGY_SCALE_FACTOR
;
localData
[
threadIdx
.
x
].
force
*=
ENERGY_SCALE_FACTOR
;
if
(
pos
<
end
)
{
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
z
*
0x100000000
)));
offset
=
y
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
z
*
0x100000000
)));
}
// Compute torques.
data
.
force
=
make_real3
(
0
);
// Compute torques.
localData
[
threadIdx
.
x
].
force
=
make_real3
(
0
);
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
if
((
flags
&
(
1
<<
j
))
!=
0
)
{
int
atom2
=
tbx
+
j
;
real3
delta
=
make_real3
(
localData
[
atom2
].
posq
.
x
-
data
.
posq
.
x
,
localData
[
atom2
].
posq
.
y
-
data
.
posq
.
y
,
localData
[
atom2
].
posq
.
z
-
data
.
posq
.
z
);
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
#endif
real3
tempForce
;
computeOneInteractionT1
(
data
,
localData
[
atom2
],
1
,
1
,
1
,
tempForce
);
data
.
force
+=
tempForce
;
computeOneInteractionT3
(
data
,
localData
[
atom2
],
1
,
1
,
1
,
tempForce
);
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
#ifdef ENABLE_SHUFFLE
for
(
int
i
=
16
;
i
>=
1
;
i
/=
2
)
{
tempForce
.
x
+=
__shfl_xor
(
tempForce
.
x
,
i
,
32
);
tempForce
.
y
+=
__shfl_xor
(
tempForce
.
y
,
i
,
32
);
tempForce
.
z
+=
__shfl_xor
(
tempForce
.
z
,
i
,
32
);
}
if
(
tgx
==
0
)
localData
[
atom2
].
force
-=
tempForce
;
#else
int
bufferIndex
=
3
*
threadIdx
.
x
;
tempBuffer
[
bufferIndex
]
=
tempForce
.
x
;
tempBuffer
[
bufferIndex
+
1
]
=
tempForce
.
y
;
tempBuffer
[
bufferIndex
+
2
]
=
tempForce
.
z
;
if
(
tgx
%
4
==
0
)
{
tempBuffer
[
bufferIndex
]
+=
tempBuffer
[
bufferIndex
+
3
]
+
tempBuffer
[
bufferIndex
+
6
]
+
tempBuffer
[
bufferIndex
+
9
];
tempBuffer
[
bufferIndex
+
1
]
+=
tempBuffer
[
bufferIndex
+
4
]
+
tempBuffer
[
bufferIndex
+
7
]
+
tempBuffer
[
bufferIndex
+
10
];
tempBuffer
[
bufferIndex
+
2
]
+=
tempBuffer
[
bufferIndex
+
5
]
+
tempBuffer
[
bufferIndex
+
8
]
+
tempBuffer
[
bufferIndex
+
11
];
}
if
(
tgx
==
0
)
{
localData
[
atom2
].
force
.
x
+=
tempBuffer
[
bufferIndex
]
+
tempBuffer
[
bufferIndex
+
12
]
+
tempBuffer
[
bufferIndex
+
24
]
+
tempBuffer
[
bufferIndex
+
36
]
+
tempBuffer
[
bufferIndex
+
48
]
+
tempBuffer
[
bufferIndex
+
60
]
+
tempBuffer
[
bufferIndex
+
72
]
+
tempBuffer
[
bufferIndex
+
84
];
localData
[
atom2
].
force
.
y
+=
tempBuffer
[
bufferIndex
+
1
]
+
tempBuffer
[
bufferIndex
+
13
]
+
tempBuffer
[
bufferIndex
+
25
]
+
tempBuffer
[
bufferIndex
+
37
]
+
tempBuffer
[
bufferIndex
+
49
]
+
tempBuffer
[
bufferIndex
+
61
]
+
tempBuffer
[
bufferIndex
+
73
]
+
tempBuffer
[
bufferIndex
+
85
];
localData
[
atom2
].
force
.
z
+=
tempBuffer
[
bufferIndex
+
2
]
+
tempBuffer
[
bufferIndex
+
14
]
+
tempBuffer
[
bufferIndex
+
26
]
+
tempBuffer
[
bufferIndex
+
38
]
+
tempBuffer
[
bufferIndex
+
50
]
+
tempBuffer
[
bufferIndex
+
62
]
+
tempBuffer
[
bufferIndex
+
74
]
+
tempBuffer
[
bufferIndex
+
86
];
}
#endif
}
}
}
data
.
force
*=
ENERGY_SCALE_FACTOR
;
localData
[
threadIdx
.
x
].
force
*=
ENERGY_SCALE_FACTOR
;
if
(
pos
<
end
)
{
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
torqueBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
x
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
y
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
z
*
0x100000000
)));
offset
=
y
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
torqueBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
x
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
y
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
z
*
0x100000000
)));
}
}
}
else
#endif
{
// Compute the full set of interactions in this tile.
uint2
covalent
=
(
hasExclusions
?
covalentFlags
[
exclusionIndex
[
localGroupIndex
]
+
tgx
]
:
make_uint2
(
0
,
0
));
data
.
force
=
make_real3
(
0
);
unsigned
int
polarizationGroup
=
(
hasExclusions
?
polarizationGroupFlags
[
exclusionIndex
[
localGroupIndex
]
+
tgx
]
:
0
);
localData
[
threadIdx
.
x
].
force
=
make_real3
(
0
);
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
// Compute forces.
int
atom2
=
y
*
TILE_SIZE
+
tj
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
unsigned
int
tj
=
tgx
;
real3
tempForce
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
computeOneInteractionT1
(
data
,
localData
[
tbx
+
tj
],
1
,
1
,
1
,
tempForce
);
int
atom2
=
y
*
TILE_SIZE
+
tj
;
data
.
force
+=
tempForce
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
computeOneInteractionT3
(
data
,
localData
[
tbx
+
tj
],
1
,
1
,
1
,
tempForce
);
real3
tempForce
;
localData
[
tbx
+
tj
].
force
+=
tempForce
;
real
tempEnergy
;
float
d
=
computeDScaleFactor
(
polarizationGroup
,
tj
);
float
p
=
computePScaleFactor
(
covalent
,
polarizationGroup
,
tj
);
float
m
=
computeMScaleFactor
(
covalent
,
tj
);
computeOneInteractionF1
(
data
,
localData
[
tbx
+
tj
],
d
,
p
,
m
,
tempEnergy
,
tempForce
);
data
.
force
+=
tempForce
;
localData
[
tbx
+
tj
].
force
-=
tempForce
;
energy
+=
tempEnergy
;
}
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
data
.
force
*=
ENERGY_SCALE_FACTOR
;
localData
[
threadIdx
.
x
].
force
*=
ENERGY_SCALE_FACTOR
;
if
(
pos
<
end
)
{
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
z
*
0x100000000
)));
offset
=
y
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
z
*
0x100000000
)));
}
// Compute torques.
data
.
force
=
make_real3
(
0
);
localData
[
threadIdx
.
x
].
force
=
make_real3
(
0
);
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
y
*
TILE_SIZE
+
tj
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
real3
tempForce
;
float
d
=
computeDScaleFactor
(
polarizationGroup
,
tj
);
float
p
=
computePScaleFactor
(
covalent
,
polarizationGroup
,
tj
);
float
m
=
computeMScaleFactor
(
covalent
,
tj
);
computeOneInteractionT1
(
data
,
localData
[
tbx
+
tj
],
d
,
p
,
m
,
tempForce
);
data
.
force
+=
tempForce
;
computeOneInteractionT3
(
data
,
localData
[
tbx
+
tj
],
d
,
p
,
m
,
tempForce
);
localData
[
tbx
+
tj
].
force
+=
tempForce
;
}
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
data
.
force
*=
ENERGY_SCALE_FACTOR
;
localData
[
threadIdx
.
x
].
force
*=
ENERGY_SCALE_FACTOR
;
if
(
pos
<
end
)
{
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
torqueBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
x
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
y
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
z
*
0x100000000
)));
offset
=
y
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
torqueBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
x
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
y
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
z
*
0x100000000
)));
}
}
}
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
data
.
force
*=
ENERGY_SCALE_FACTOR
;
localData
[
threadIdx
.
x
].
force
*=
ENERGY_SCALE_FACTOR
;
offset
=
x
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
torqueBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
x
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
y
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
z
*
0x100000000
)));
#ifdef USE_CUTOFF
offset
=
atomIndices
[
threadIdx
.
x
];
#else
offset
=
y
*
TILE_SIZE
+
tgx
;
#endif
atomicAdd
(
&
torqueBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
x
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
y
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
z
*
0x100000000
)));
}
}
pos
++
;
pos
++
;
}
while
(
pos
<
end
);
}
energyBuffer
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
energy
*
ENERGY_SCALE_FACTOR
;
energyBuffer
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
energy
*
ENERGY_SCALE_FACTOR
;
}
}
plugins/amoeba/platforms/cuda/src/kernels/multipoleFixedField.cu
View file @
93c467b2
#define TILE_SIZE 32
#define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)
#define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)
typedef
struct
{
typedef
struct
{
...
@@ -398,245 +397,268 @@ __device__ float computePScaleFactor(uint2 covalent, unsigned int polarizationGr
...
@@ -398,245 +397,268 @@ __device__ float computePScaleFactor(uint2 covalent, unsigned int polarizationGr
*/
*/
extern
"C"
__global__
void
computeFixedField
(
extern
"C"
__global__
void
computeFixedField
(
unsigned
long
long
*
__restrict__
fieldBuffers
,
unsigned
long
long
*
__restrict__
fieldPolarBuffers
,
const
real4
*
__restrict__
posq
,
unsigned
long
long
*
__restrict__
fieldBuffers
,
unsigned
long
long
*
__restrict__
fieldPolarBuffers
,
const
real4
*
__restrict__
posq
,
const
u
nsigned
int
*
__restrict__
exclusionIndices
,
const
unsigned
int
*
__restrict__
exclusion
RowIndic
es
,
const
u
int2
*
__restrict__
covalentFlags
,
const
unsigned
int
*
__restrict__
polarizationGroupFlags
,
const
ushort2
*
__restrict__
exclusion
Til
es
,
const
uint2
*
__restrict__
covalentFlags
,
const
unsigned
int
*
__restrict__
polarizationGroupFlags
,
unsigned
int
startTileIndex
,
unsigned
int
numTileIndices
,
unsigned
int
startTileIndex
,
unsigned
int
numTileIndices
,
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
const
ushort2
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
unsigned
int
maxTiles
,
const
unsigned
int
*
__restrict__
interacti
onFlag
s
,
const
ushort2
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
unsigned
int
maxTiles
,
const
real4
*
__restrict__
blockCenter
,
const
unsigned
int
*
__restrict__
interacti
ngAtom
s
,
#elif defined USE_GK
#elif defined USE_GK
const
real
*
__restrict__
bornRadii
,
unsigned
long
long
*
__restrict__
gkFieldBuffers
,
const
real
*
__restrict__
bornRadii
,
unsigned
long
long
*
__restrict__
gkFieldBuffers
,
#endif
#endif
const
real
*
__restrict__
labFrameDipole
,
const
real
*
__restrict__
labFrameQuadrupole
,
const
float2
*
__restrict__
dampingAndThole
)
{
const
real
*
__restrict__
labFrameDipole
,
const
real
*
__restrict__
labFrameQuadrupole
,
const
float2
*
__restrict__
dampingAndThole
)
{
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
const
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
TILE_SIZE
;
const
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
TILE_SIZE
;
#ifdef USE_CUTOFF
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
const
unsigned
int
numTiles
=
interactionCount
[
0
];
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
unsigned
int
pos
=
(
numTiles
>
maxTiles
?
startTileIndex
+
warp
*
numTileIndices
/
totalWarps
:
warp
*
numTiles
/
totalWarps
);
unsigned
int
end
=
(
numTiles
>
maxTiles
?
startTileIndex
+
(
warp
+
1
)
*
numTileIndices
/
totalWarps
:
(
warp
+
1
)
*
numTiles
/
totalWarps
);
#else
const
unsigned
int
numTiles
=
numTileIndices
;
unsigned
int
pos
=
startTileIndex
+
warp
*
numTiles
/
totalWarps
;
unsigned
int
end
=
startTileIndex
+
(
warp
+
1
)
*
numTiles
/
totalWarps
;
#endif
__shared__
AtomData
localData
[
THREAD_BLOCK_SIZE
];
__shared__
AtomData
localData
[
THREAD_BLOCK_SIZE
];
__shared__
unsigned
int
exclusionRange
[
2
*
WARPS_PER_GROUP
];
__shared__
int
exclusionIndex
[
WARPS_PER_GROUP
];
// First loop: process tiles that contain exclusions.
#ifndef ENABLE_SHUFFLE
__shared__
real
tempBuffer
[
3
*
THREAD_BLOCK_SIZE
];
#endif
do
{
const
unsigned
int
firstExclusionTile
=
FIRST_EXCLUSION_TILE
+
warp
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
totalWarps
;
// Extract the coordinates of this tile
const
unsigned
int
lastExclusionTile
=
FIRST_EXCLUSION_TILE
+
(
warp
+
1
)
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
totalWarps
;
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
for
(
int
pos
=
firstExclusionTile
;
pos
<
lastExclusionTile
;
pos
++
)
{
const
u
nsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
const
u
short2
tileIndices
=
exclusionTiles
[
pos
]
;
const
unsigned
int
localGroupIndex
=
threadIdx
.
x
/
TILE_SIZE
;
const
unsigned
int
x
=
tileIndices
.
x
;
unsigned
int
x
,
y
;
const
unsigned
int
y
=
tileIndices
.
y
;
AtomData
data
;
AtomData
data
;
data
.
field
=
make_real3
(
0
);
data
.
field
=
make_real3
(
0
);
data
.
fieldPolar
=
make_real3
(
0
);
data
.
fieldPolar
=
make_real3
(
0
);
#ifdef USE_GK
#ifdef USE_GK
data
.
gkField
=
make_real3
(
0
);
data
.
gkField
=
make_real3
(
0
);
#endif
#endif
if
(
pos
<
end
)
{
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
#ifdef USE_CUTOFF
loadAtomData
(
data
,
atom1
,
posq
,
labFrameDipole
,
labFrameQuadrupole
,
dampingAndThole
);
if
(
numTiles
<=
maxTiles
)
{
#ifdef USE_GK
ushort2
tileIndices
=
tiles
[
pos
];
data
.
bornRadius
=
bornRadii
[
atom1
];
x
=
tileIndices
.
x
;
y
=
tileIndices
.
y
;
}
else
#endif
#endif
{
uint2
covalent
=
covalentFlags
[
pos
*
TILE_SIZE
+
tgx
];
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS
+
0.5
f
-
SQRT
((
NUM_BLOCKS
+
0.5
f
)
*
(
NUM_BLOCKS
+
0.5
f
)
-
2
*
pos
));
unsigned
int
polarizationGroup
=
polarizationGroupFlags
[
pos
*
TILE_SIZE
+
tgx
];
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
if
(
x
==
y
)
{
if
(
x
<
y
||
x
>=
NUM_BLOCKS
)
{
// Occasionally happens due to roundoff error.
// This tile is on the diagonal.
y
+=
(
x
<
y
?
-
1
:
1
);
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
localData
[
localAtomIndex
].
posq
=
data
.
posq
;
localData
[
localAtomIndex
].
dipole
=
data
.
dipole
;
localData
[
localAtomIndex
].
quadrupoleXX
=
data
.
quadrupoleXX
;
localData
[
localAtomIndex
].
quadrupoleXY
=
data
.
quadrupoleXY
;
localData
[
localAtomIndex
].
quadrupoleXZ
=
data
.
quadrupoleXZ
;
localData
[
localAtomIndex
].
quadrupoleYY
=
data
.
quadrupoleYY
;
localData
[
localAtomIndex
].
quadrupoleYZ
=
data
.
quadrupoleYZ
;
localData
[
localAtomIndex
].
quadrupoleZZ
=
data
.
quadrupoleZZ
;
localData
[
localAtomIndex
].
thole
=
data
.
thole
;
localData
[
localAtomIndex
].
damp
=
data
.
damp
;
#ifdef USE_GK
localData
[
localAtomIndex
].
bornRadius
=
data
.
bornRadius
;
#endif
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
real3
delta
=
trimTo3
(
localData
[
tbx
+
j
].
posq
-
data
.
posq
);
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
#endif
int
atom2
=
y
*
TILE_SIZE
+
j
;
if
(
atom1
!=
atom2
&&
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
real3
fields
[
4
];
float
d
=
computeDScaleFactor
(
polarizationGroup
,
j
);
float
p
=
computePScaleFactor
(
covalent
,
polarizationGroup
,
j
);
computeOneInteraction
(
data
,
localData
[
tbx
+
j
],
delta
,
d
,
p
,
fields
);
data
.
field
+=
fields
[
0
];
data
.
fieldPolar
+=
fields
[
1
];
}
}
}
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
loadAtomData
(
data
,
atom1
,
posq
,
labFrameDipole
,
labFrameQuadrupole
,
dampingAndThole
);
#ifdef USE_GK
#ifdef USE_GK
data
.
bornRadius
=
bornRadii
[
atom1
];
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
real3
fields
[
2
];
computeOneGkInteraction
(
data
,
localData
[
tbx
+
j
],
delta
,
fields
);
data
.
gkField
+=
fields
[
0
];
}
#endif
#endif
}
// Locate the exclusion data for this tile.
}
else
{
if
(
tgx
<
2
)
// This is an off-diagonal tile.
exclusionRange
[
2
*
localGroupIndex
+
tgx
]
=
exclusionRowIndices
[
x
+
tgx
];
if
(
tgx
==
0
)
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
exclusionIndex
[
localGroupIndex
]
=
-
1
;
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
for
(
unsigned
int
i
=
exclusionRange
[
2
*
localGroupIndex
]
+
tgx
;
i
<
exclusionRange
[
2
*
localGroupIndex
+
1
];
i
+=
TILE_SIZE
)
loadAtomData
(
localData
[
localAtomIndex
],
j
,
posq
,
labFrameDipole
,
labFrameQuadrupole
,
dampingAndThole
);
if
(
exclusionIndices
[
i
]
==
y
)
localData
[
localAtomIndex
].
field
=
make_real3
(
0
);
exclusionIndex
[
localGroupIndex
]
=
i
*
TILE_SIZE
;
localData
[
localAtomIndex
].
fieldPolar
=
make_real3
(
0
);
bool
hasExclusions
=
(
exclusionIndex
[
localGroupIndex
]
>
-
1
);
if
(
pos
>=
end
)
;
// This warp is done.
else
if
(
x
==
y
)
{
// This tile is on the diagonal.
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
localData
[
localAtomIndex
].
posq
=
data
.
posq
;
localData
[
localAtomIndex
].
dipole
=
data
.
dipole
;
localData
[
localAtomIndex
].
quadrupoleXX
=
data
.
quadrupoleXX
;
localData
[
localAtomIndex
].
quadrupoleXY
=
data
.
quadrupoleXY
;
localData
[
localAtomIndex
].
quadrupoleXZ
=
data
.
quadrupoleXZ
;
localData
[
localAtomIndex
].
quadrupoleYY
=
data
.
quadrupoleYY
;
localData
[
localAtomIndex
].
quadrupoleYZ
=
data
.
quadrupoleYZ
;
localData
[
localAtomIndex
].
quadrupoleZZ
=
data
.
quadrupoleZZ
;
localData
[
localAtomIndex
].
thole
=
data
.
thole
;
localData
[
localAtomIndex
].
damp
=
data
.
damp
;
#ifdef USE_GK
#ifdef USE_GK
localData
[
localAtomIndex
].
bornRadius
=
data
.
bornRadius
;
localData
[
localAtomIndex
].
bornRadius
=
bornRadii
[
j
];
localData
[
localAtomIndex
].
gkField
=
make_real3
(
0
);
#endif
#endif
uint2
covalent
=
covalentFlags
[
exclusionIndex
[
localGroupIndex
]
+
tgx
];
unsigned
int
tj
=
tgx
;
unsigned
int
polarizationGroup
=
polarizationGroupFlags
[
exclusionIndex
[
localGroupIndex
]
+
tgx
];
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
real3
delta
=
trimTo3
(
localData
[
tbx
+
tj
].
posq
-
data
.
posq
);
real3
delta
=
trimTo3
(
localData
[
tbx
+
j
].
posq
-
data
.
posq
);
#ifdef USE_PERIODIC
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
#endif
#endif
int
atom2
=
y
*
TILE_SIZE
+
j
;
int
atom2
=
y
*
TILE_SIZE
+
tj
;
if
(
atom1
!=
atom2
&&
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
real3
fields
[
4
];
real3
fields
[
4
];
float
d
=
computeDScaleFactor
(
polarizationGroup
,
j
);
float
d
=
computeDScaleFactor
(
polarizationGroup
,
tj
);
float
p
=
computePScaleFactor
(
covalent
,
polarizationGroup
,
j
);
float
p
=
computePScaleFactor
(
covalent
,
polarizationGroup
,
tj
);
computeOneInteraction
(
data
,
localData
[
tbx
+
j
],
delta
,
d
,
p
,
fields
);
computeOneInteraction
(
data
,
localData
[
tbx
+
tj
],
delta
,
d
,
p
,
fields
);
data
.
field
+=
fields
[
0
];
data
.
field
+=
fields
[
0
];
data
.
fieldPolar
+=
fields
[
1
];
data
.
fieldPolar
+=
fields
[
1
];
}
localData
[
tbx
+
tj
].
field
+=
fields
[
2
];
localData
[
tbx
+
tj
].
fieldPolar
+=
fields
[
3
];
#ifdef USE_GK
#ifdef USE_GK
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
computeOneGkInteraction
(
data
,
localData
[
tbx
+
tj
],
delta
,
fields
);
real3
fields
[
2
];
data
.
gkField
+=
fields
[
0
];
computeOneGkInteraction
(
data
,
localData
[
tbx
+
j
],
delta
,
fields
);
localData
[
tbx
+
tj
].
gkField
+=
fields
[
1
];
data
.
gkField
+=
fields
[
0
];
}
#endif
#endif
}
}
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
else
{
}
// This is an off-diagonal tile.
// Write results.
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
loadAtomData
(
localData
[
localAtomIndex
],
j
,
posq
,
labFrameDipole
,
labFrameQuadrupole
,
dampingAndThole
);
atomicAdd
(
&
fieldBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
field
.
x
*
0x100000000
)));
localData
[
localAtomIndex
].
field
=
make_real3
(
0
);
atomicAdd
(
&
fieldBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
field
.
y
*
0x100000000
)));
localData
[
localAtomIndex
].
fieldPolar
=
make_real3
(
0
);
atomicAdd
(
&
fieldBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
field
.
z
*
0x100000000
)));
atomicAdd
(
&
fieldPolarBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
fieldPolar
.
x
*
0x100000000
)));
atomicAdd
(
&
fieldPolarBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
fieldPolar
.
y
*
0x100000000
)));
atomicAdd
(
&
fieldPolarBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
fieldPolar
.
z
*
0x100000000
)));
#ifdef USE_GK
#ifdef USE_GK
localData
[
localAtomIndex
].
bornRadius
=
bornRadii
[
j
];
atomicAdd
(
&
gkFieldBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
gkField
.
x
*
0x100000000
)));
localData
[
localAtomIndex
].
gkField
=
make_real3
(
0
);
atomicAdd
(
&
gkFieldBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
gkField
.
y
*
0x100000000
)));
atomicAdd
(
&
gkFieldBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
gkField
.
z
*
0x100000000
)));
#endif
#endif
#ifdef USE_CUTOFF
if
(
x
!=
y
)
{
unsigned
int
flags
=
(
numTiles
<=
maxTiles
?
interactionFlags
[
pos
]
:
0xFFFFFFFF
);
offset
=
y
*
TILE_SIZE
+
tgx
;
if
(
!
hasExclusions
&&
flags
==
0
)
{
// TODO: Why doesn't the flags != 0 block work?
atomicAdd
(
&
fieldBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
field
.
x
*
0x100000000
)));
// if (!hasExclusions && flags != 0xFFFFFFFF) {
atomicAdd
(
&
fieldBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
field
.
y
*
0x100000000
)));
if
(
flags
==
0
)
{
atomicAdd
(
&
fieldBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
field
.
z
*
0x100000000
)));
// No interactions in this tile.
atomicAdd
(
&
fieldPolarBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fieldPolar
.
x
*
0x100000000
)));
}
atomicAdd
(
&
fieldPolarBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fieldPolar
.
y
*
0x100000000
)));
else
{
atomicAdd
(
&
fieldPolarBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fieldPolar
.
z
*
0x100000000
)));
// Compute only a subset of the interactions in this tile.
#ifdef USE_GK
atomicAdd
(
&
gkFieldBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
gkField
.
x
*
0x100000000
)));
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
atomicAdd
(
&
gkFieldBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
gkField
.
y
*
0x100000000
)));
if
((
flags
&
(
1
<<
j
))
!=
0
)
{
atomicAdd
(
&
gkFieldBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
gkField
.
z
*
0x100000000
)));
int
atom2
=
tbx
+
j
;
real3
delta
=
make_real3
(
localData
[
atom2
].
posq
.
x
-
data
.
posq
.
x
,
localData
[
atom2
].
posq
.
y
-
data
.
posq
.
y
,
localData
[
atom2
].
posq
.
z
-
data
.
posq
.
z
);
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
#endif
#endif
real3
fields
[
4
];
}
computeOneInteraction
(
data
,
localData
[
atom2
],
delta
,
1
,
1
,
fields
);
}
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
#ifdef ENABLE_SHUFFLE
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
for
(
int
i
=
16
;
i
>=
1
;
i
/=
2
)
{
// of them (no cutoff).
fields
[
2
].
x
+=
__shfl_xor
(
fields
[
2
].
x
,
i
,
32
);
fields
[
2
].
y
+=
__shfl_xor
(
fields
[
2
].
y
,
i
,
32
);
#ifdef USE_CUTOFF
fields
[
2
].
z
+=
__shfl_xor
(
fields
[
2
].
z
,
i
,
32
);
const
unsigned
int
numTiles
=
interactionCount
[
0
];
fields
[
3
].
x
+=
__shfl_xor
(
fields
[
3
].
x
,
i
,
32
);
int
pos
=
(
numTiles
>
maxTiles
?
startTileIndex
+
warp
*
numTileIndices
/
totalWarps
:
warp
*
numTiles
/
totalWarps
);
fields
[
3
].
y
+=
__shfl_xor
(
fields
[
3
].
y
,
i
,
32
);
int
end
=
(
numTiles
>
maxTiles
?
startTileIndex
+
(
warp
+
1
)
*
numTileIndices
/
totalWarps
:
(
warp
+
1
)
*
numTiles
/
totalWarps
);
fields
[
3
].
z
+=
__shfl_xor
(
fields
[
3
].
z
,
i
,
32
);
}
if
(
tgx
==
0
)
{
localData
[
atom2
].
field
+=
fields
[
2
];
localData
[
atom2
].
fieldPolar
+=
fields
[
3
];
}
#else
#else
int
bufferIndex
=
3
*
threadIdx
.
x
;
const
unsigned
int
numTiles
=
numTileIndices
;
tempBuffer
[
bufferIndex
]
=
fields
[
2
].
x
;
int
pos
=
startTileIndex
+
warp
*
numTiles
/
totalWarps
;
tempBuffer
[
bufferIndex
+
1
]
=
fields
[
2
].
y
;
int
end
=
startTileIndex
+
(
warp
+
1
)
*
numTiles
/
totalWarps
;
tempBuffer
[
bufferIndex
+
2
]
=
fields
[
2
].
z
;
if
(
tgx
%
4
==
0
)
{
tempBuffer
[
bufferIndex
]
+=
tempBuffer
[
bufferIndex
+
3
]
+
tempBuffer
[
bufferIndex
+
6
]
+
tempBuffer
[
bufferIndex
+
9
];
tempBuffer
[
bufferIndex
+
1
]
+=
tempBuffer
[
bufferIndex
+
4
]
+
tempBuffer
[
bufferIndex
+
7
]
+
tempBuffer
[
bufferIndex
+
10
];
tempBuffer
[
bufferIndex
+
2
]
+=
tempBuffer
[
bufferIndex
+
5
]
+
tempBuffer
[
bufferIndex
+
8
]
+
tempBuffer
[
bufferIndex
+
11
];
}
if
(
tgx
==
0
)
{
localData
[
atom2
].
field
.
x
+=
tempBuffer
[
bufferIndex
]
+
tempBuffer
[
bufferIndex
+
12
]
+
tempBuffer
[
bufferIndex
+
24
]
+
tempBuffer
[
bufferIndex
+
36
]
+
tempBuffer
[
bufferIndex
+
48
]
+
tempBuffer
[
bufferIndex
+
60
]
+
tempBuffer
[
bufferIndex
+
72
]
+
tempBuffer
[
bufferIndex
+
84
];
localData
[
atom2
].
field
.
y
+=
tempBuffer
[
bufferIndex
+
1
]
+
tempBuffer
[
bufferIndex
+
13
]
+
tempBuffer
[
bufferIndex
+
25
]
+
tempBuffer
[
bufferIndex
+
37
]
+
tempBuffer
[
bufferIndex
+
49
]
+
tempBuffer
[
bufferIndex
+
61
]
+
tempBuffer
[
bufferIndex
+
73
]
+
tempBuffer
[
bufferIndex
+
85
];
localData
[
atom2
].
field
.
z
+=
tempBuffer
[
bufferIndex
+
2
]
+
tempBuffer
[
bufferIndex
+
14
]
+
tempBuffer
[
bufferIndex
+
26
]
+
tempBuffer
[
bufferIndex
+
38
]
+
tempBuffer
[
bufferIndex
+
50
]
+
tempBuffer
[
bufferIndex
+
62
]
+
tempBuffer
[
bufferIndex
+
74
]
+
tempBuffer
[
bufferIndex
+
86
];
}
tempBuffer
[
bufferIndex
]
=
fields
[
3
].
x
;
tempBuffer
[
bufferIndex
+
1
]
=
fields
[
3
].
y
;
tempBuffer
[
bufferIndex
+
2
]
=
fields
[
3
].
z
;
if
(
tgx
%
4
==
0
)
{
tempBuffer
[
bufferIndex
]
+=
tempBuffer
[
bufferIndex
+
3
]
+
tempBuffer
[
bufferIndex
+
6
]
+
tempBuffer
[
bufferIndex
+
9
];
tempBuffer
[
bufferIndex
+
1
]
+=
tempBuffer
[
bufferIndex
+
4
]
+
tempBuffer
[
bufferIndex
+
7
]
+
tempBuffer
[
bufferIndex
+
10
];
tempBuffer
[
bufferIndex
+
2
]
+=
tempBuffer
[
bufferIndex
+
5
]
+
tempBuffer
[
bufferIndex
+
8
]
+
tempBuffer
[
bufferIndex
+
11
];
}
if
(
tgx
==
0
)
{
localData
[
atom2
].
fieldPolar
.
x
+=
tempBuffer
[
bufferIndex
]
+
tempBuffer
[
bufferIndex
+
12
]
+
tempBuffer
[
bufferIndex
+
24
]
+
tempBuffer
[
bufferIndex
+
36
]
+
tempBuffer
[
bufferIndex
+
48
]
+
tempBuffer
[
bufferIndex
+
60
]
+
tempBuffer
[
bufferIndex
+
72
]
+
tempBuffer
[
bufferIndex
+
84
];
localData
[
atom2
].
fieldPolar
.
y
+=
tempBuffer
[
bufferIndex
+
1
]
+
tempBuffer
[
bufferIndex
+
13
]
+
tempBuffer
[
bufferIndex
+
25
]
+
tempBuffer
[
bufferIndex
+
37
]
+
tempBuffer
[
bufferIndex
+
49
]
+
tempBuffer
[
bufferIndex
+
61
]
+
tempBuffer
[
bufferIndex
+
73
]
+
tempBuffer
[
bufferIndex
+
85
];
localData
[
atom2
].
fieldPolar
.
z
+=
tempBuffer
[
bufferIndex
+
2
]
+
tempBuffer
[
bufferIndex
+
14
]
+
tempBuffer
[
bufferIndex
+
26
]
+
tempBuffer
[
bufferIndex
+
38
]
+
tempBuffer
[
bufferIndex
+
50
]
+
tempBuffer
[
bufferIndex
+
62
]
+
tempBuffer
[
bufferIndex
+
74
]
+
tempBuffer
[
bufferIndex
+
86
];
}
#endif
#endif
}
int
skipBase
=
0
;
}
int
currentSkipIndex
=
tbx
;
}
__shared__
int
atomIndices
[
THREAD_BLOCK_SIZE
];
}
__shared__
int
skipTiles
[
THREAD_BLOCK_SIZE
];
skipTiles
[
threadIdx
.
x
]
=
-
1
;
while
(
pos
<
end
)
{
bool
includeTile
=
true
;
// Extract the coordinates of this tile.
unsigned
int
x
,
y
;
#ifdef USE_CUTOFF
if
(
numTiles
<=
maxTiles
)
{
ushort2
tileIndices
=
tiles
[
pos
];
x
=
tileIndices
.
x
;
}
else
#endif
{
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS
+
0.5
f
-
SQRT
((
NUM_BLOCKS
+
0.5
f
)
*
(
NUM_BLOCKS
+
0.5
f
)
-
2
*
pos
));
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
if
(
x
<
y
||
x
>=
NUM_BLOCKS
)
{
// Occasionally happens due to roundoff error.
y
+=
(
x
<
y
?
-
1
:
1
);
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
}
// Skip over tiles that have exclusions, since they were already processed.
while
(
skipTiles
[
tbx
+
TILE_SIZE
-
1
]
<
pos
)
{
if
(
skipBase
+
tgx
<
NUM_TILES_WITH_EXCLUSIONS
)
{
ushort2
tile
=
exclusionTiles
[
skipBase
+
tgx
];
skipTiles
[
threadIdx
.
x
]
=
tile
.
x
+
tile
.
y
*
NUM_BLOCKS
-
tile
.
y
*
(
tile
.
y
+
1
)
/
2
;
}
}
else
else
skipTiles
[
threadIdx
.
x
]
=
end
;
skipBase
+=
TILE_SIZE
;
currentSkipIndex
=
tbx
;
}
while
(
skipTiles
[
currentSkipIndex
]
<
pos
)
currentSkipIndex
++
;
includeTile
=
(
skipTiles
[
currentSkipIndex
]
!=
pos
);
}
if
(
includeTile
)
{
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
// Load atom data for this tile.
AtomData
data
;
data
.
field
=
make_real3
(
0
);
data
.
fieldPolar
=
make_real3
(
0
);
#ifdef USE_GK
data
.
gkField
=
make_real3
(
0
);
#endif
loadAtomData
(
data
,
atom1
,
posq
,
labFrameDipole
,
labFrameQuadrupole
,
dampingAndThole
);
#ifdef USE_GK
data
.
bornRadius
=
bornRadii
[
atom1
];
#endif
#ifdef USE_CUTOFF
unsigned
int
j
=
(
numTiles
<=
maxTiles
?
interactingAtoms
[
pos
*
TILE_SIZE
+
tgx
]
:
y
*
TILE_SIZE
+
tgx
);
#else
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
#endif
#endif
{
atomIndices
[
threadIdx
.
x
]
=
j
;
// Compute the full set of interactions in this tile.
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
loadAtomData
(
localData
[
localAtomIndex
],
j
,
posq
,
labFrameDipole
,
labFrameQuadrupole
,
dampingAndThole
);
uint2
covalent
=
(
hasExclusions
?
covalentFlags
[
exclusionIndex
[
localGroupIndex
]
+
tgx
]
:
make_uint2
(
0
,
0
));
localData
[
localAtomIndex
].
field
=
make_real3
(
0
);
unsigned
int
polarizationGroup
=
(
hasExclusions
?
polarizationGroupFlags
[
exclusionIndex
[
localGroupIndex
]
+
tgx
]
:
0
);
localData
[
localAtomIndex
].
fieldPolar
=
make_real3
(
0
);
unsigned
int
tj
=
tgx
;
#ifdef USE_GK
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
localData
[
localAtomIndex
].
bornRadius
=
bornRadii
[
j
];
real3
delta
=
trimTo3
(
localData
[
tbx
+
tj
].
posq
-
data
.
posq
);
localData
[
localAtomIndex
].
gkField
=
make_real3
(
0
);
#endif
// Compute the full set of interactions in this tile.
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
real3
delta
=
trimTo3
(
localData
[
tbx
+
tj
].
posq
-
data
.
posq
);
#ifdef USE_PERIODIC
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
#endif
#endif
int
atom2
=
y
*
TILE_SIZE
+
tj
;
int
atom2
=
atomIndices
[
tbx
+
tj
];
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
real3
fields
[
4
];
real3
fields
[
4
];
float
d
=
computeDScaleFactor
(
polarizationGroup
,
tj
);
computeOneInteraction
(
data
,
localData
[
tbx
+
tj
],
delta
,
1
,
1
,
fields
);
float
p
=
computePScaleFactor
(
covalent
,
polarizationGroup
,
tj
);
data
.
field
+=
fields
[
0
];
computeOneInteraction
(
data
,
localData
[
tbx
+
tj
],
delta
,
d
,
p
,
fields
);
data
.
fieldPolar
+=
fields
[
1
];
data
.
field
+=
fields
[
0
];
localData
[
tbx
+
tj
].
field
+=
fields
[
2
];
data
.
fieldPolar
+=
fields
[
1
];
localData
[
tbx
+
tj
].
fieldPolar
+=
fields
[
3
];
localData
[
tbx
+
tj
].
field
+=
fields
[
2
];
localData
[
tbx
+
tj
].
fieldPolar
+=
fields
[
3
];
#ifdef USE_GK
#ifdef USE_GK
computeOneGkInteraction
(
data
,
localData
[
tbx
+
tj
],
delta
,
fields
);
computeOneGkInteraction
(
data
,
localData
[
tbx
+
tj
],
delta
,
fields
);
data
.
gkField
+=
fields
[
0
];
data
.
gkField
+=
fields
[
0
];
localData
[
tbx
+
tj
].
gkField
+=
fields
[
1
];
localData
[
tbx
+
tj
].
gkField
+=
fields
[
1
];
#endif
#endif
}
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
}
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
}
// Write results.
// Write results.
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
if
(
pos
<
end
)
{
const
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
fieldBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
field
.
x
*
0x100000000
)));
atomicAdd
(
&
fieldBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
field
.
x
*
0x100000000
)));
atomicAdd
(
&
fieldBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
field
.
y
*
0x100000000
)));
atomicAdd
(
&
fieldBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
field
.
y
*
0x100000000
)));
atomicAdd
(
&
fieldBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
field
.
z
*
0x100000000
)));
atomicAdd
(
&
fieldBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
field
.
z
*
0x100000000
)));
...
@@ -648,9 +670,11 @@ extern "C" __global__ void computeFixedField(
...
@@ -648,9 +670,11 @@ extern "C" __global__ void computeFixedField(
atomicAdd
(
&
gkFieldBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
gkField
.
y
*
0x100000000
)));
atomicAdd
(
&
gkFieldBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
gkField
.
y
*
0x100000000
)));
atomicAdd
(
&
gkFieldBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
gkField
.
z
*
0x100000000
)));
atomicAdd
(
&
gkFieldBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
gkField
.
z
*
0x100000000
)));
#endif
#endif
}
#ifdef USE_CUTOFF
if
(
pos
<
end
&&
x
!=
y
)
{
offset
=
atomIndices
[
threadIdx
.
x
];
const
unsigned
int
offset
=
y
*
TILE_SIZE
+
tgx
;
#else
offset
=
y
*
TILE_SIZE
+
tgx
;
#endif
atomicAdd
(
&
fieldBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
field
.
x
*
0x100000000
)));
atomicAdd
(
&
fieldBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
field
.
x
*
0x100000000
)));
atomicAdd
(
&
fieldBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
field
.
y
*
0x100000000
)));
atomicAdd
(
&
fieldBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
field
.
y
*
0x100000000
)));
atomicAdd
(
&
fieldBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
field
.
z
*
0x100000000
)));
atomicAdd
(
&
fieldBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
field
.
z
*
0x100000000
)));
...
@@ -664,5 +688,5 @@ extern "C" __global__ void computeFixedField(
...
@@ -664,5 +688,5 @@ extern "C" __global__ void computeFixedField(
#endif
#endif
}
}
pos
++
;
pos
++
;
}
while
(
pos
<
end
);
}
}
}
plugins/amoeba/platforms/cuda/src/kernels/multipoleInducedField.cu
View file @
93c467b2
#define TILE_SIZE 32
#define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)
#define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)
typedef
struct
{
typedef
struct
{
...
@@ -199,194 +198,221 @@ __device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, real3 de
...
@@ -199,194 +198,221 @@ __device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, real3 de
* Compute the mutual induced field.
* Compute the mutual induced field.
*/
*/
extern
"C"
__global__
void
computeInducedField
(
extern
"C"
__global__
void
computeInducedField
(
unsigned
long
long
*
__restrict__
field
,
unsigned
long
long
*
__restrict__
fieldPolar
,
const
real4
*
__restrict__
posq
,
unsigned
long
long
*
__restrict__
field
,
unsigned
long
long
*
__restrict__
fieldPolar
,
const
real4
*
__restrict__
posq
,
const
ushort2
*
__restrict__
exclusionTiles
,
const
real
*
__restrict__
inducedDipole
,
const
real
*
__restrict__
inducedDipolePolar
,
unsigned
int
startTileIndex
,
unsigned
int
numTileIndices
,
const
real
*
__restrict__
inducedDipole
,
const
real
*
__restrict__
inducedDipolePolar
,
unsigned
int
startTileIndex
,
unsigned
int
numTileIndices
,
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
const
ushort2
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
unsigned
int
maxTiles
,
const
unsigned
int
*
__restrict__
interacti
onFlag
s
,
const
ushort2
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
unsigned
int
maxTiles
,
const
real4
*
__restrict__
blockCenter
,
const
unsigned
int
*
__restrict__
interacti
ngAtom
s
,
#elif defined USE_GK
#elif defined USE_GK
unsigned
long
long
*
__restrict__
fieldS
,
unsigned
long
long
*
__restrict__
fieldPolarS
,
const
real
*
__restrict__
inducedDipoleS
,
unsigned
long
long
*
__restrict__
fieldS
,
unsigned
long
long
*
__restrict__
fieldPolarS
,
const
real
*
__restrict__
inducedDipoleS
,
const
real
*
__restrict__
inducedDipolePolarS
,
const
real
*
__restrict__
bornRadii
,
const
real
*
__restrict__
inducedDipolePolarS
,
const
real
*
__restrict__
bornRadii
,
#endif
#endif
const
float2
*
__restrict__
dampingAndThole
)
{
const
float2
*
__restrict__
dampingAndThole
)
{
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
const
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
TILE_SIZE
;
const
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
TILE_SIZE
;
#ifdef USE_CUTOFF
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
const
unsigned
int
numTiles
=
interactionCount
[
0
];
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
unsigned
int
pos
=
(
numTiles
>
maxTiles
?
startTileIndex
+
warp
*
numTileIndices
/
totalWarps
:
warp
*
numTiles
/
totalWarps
);
unsigned
int
end
=
(
numTiles
>
maxTiles
?
startTileIndex
+
(
warp
+
1
)
*
numTileIndices
/
totalWarps
:
(
warp
+
1
)
*
numTiles
/
totalWarps
);
#else
const
unsigned
int
numTiles
=
numTileIndices
;
unsigned
int
pos
=
startTileIndex
+
warp
*
numTiles
/
totalWarps
;
unsigned
int
end
=
startTileIndex
+
(
warp
+
1
)
*
numTiles
/
totalWarps
;
#endif
__shared__
AtomData
localData
[
THREAD_BLOCK_SIZE
];
__shared__
AtomData
localData
[
THREAD_BLOCK_SIZE
];
#ifndef ENABLE_SHUFFLE
// __shared__ real tempBuffer[3*THREAD_BLOCK_SIZE];
// First loop: process tiles that contain exclusions.
#endif
do
{
const
unsigned
int
firstExclusionTile
=
FIRST_EXCLUSION_TILE
+
warp
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
totalWarps
;
// Extract the coordinates of this tile
const
unsigned
int
lastExclusionTile
=
FIRST_EXCLUSION_TILE
+
(
warp
+
1
)
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
totalWarps
;
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
for
(
int
pos
=
firstExclusionTile
;
pos
<
lastExclusionTile
;
pos
++
)
{
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
const
ushort2
tileIndices
=
exclusionTiles
[
pos
];
unsigned
int
x
,
y
;
const
unsigned
int
x
=
tileIndices
.
x
;
const
unsigned
int
y
=
tileIndices
.
y
;
AtomData
data
;
AtomData
data
;
zeroAtomData
(
data
);
zeroAtomData
(
data
);
if
(
pos
<
end
)
{
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
#ifdef USE_CUTOFF
if
(
numTiles
<=
maxTiles
)
{
ushort2
tileIndices
=
tiles
[
pos
];
x
=
tileIndices
.
x
;
y
=
tileIndices
.
y
;
}
else
#endif
{
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS
+
0.5
f
-
SQRT
((
NUM_BLOCKS
+
0.5
f
)
*
(
NUM_BLOCKS
+
0.5
f
)
-
2
*
pos
));
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
if
(
x
<
y
||
x
>=
NUM_BLOCKS
)
{
// Occasionally happens due to roundoff error.
y
+=
(
x
<
y
?
-
1
:
1
);
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
}
}
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
#ifdef USE_GK
#ifdef USE_GK
loadAtomData
(
data
,
atom1
,
posq
,
inducedDipole
,
inducedDipolePolar
,
dampingAndThole
,
inducedDipoleS
,
inducedDipolePolarS
,
bornRadii
);
loadAtomData
(
data
,
atom1
,
posq
,
inducedDipole
,
inducedDipolePolar
,
dampingAndThole
,
inducedDipoleS
,
inducedDipolePolarS
,
bornRadii
);
#else
#else
loadAtomData
(
data
,
atom1
,
posq
,
inducedDipole
,
inducedDipolePolar
,
dampingAndThole
);
loadAtomData
(
data
,
atom1
,
posq
,
inducedDipole
,
inducedDipolePolar
,
dampingAndThole
);
#endif
#endif
if
(
pos
>=
end
)
if
(
x
==
y
)
{
;
// This warp is done.
// This tile is on the diagonal.
else
if
(
x
==
y
)
{
// This tile is on the diagonal.
localData
[
threadIdx
.
x
].
pos
=
data
.
pos
;
localData
[
threadIdx
.
x
].
inducedDipole
=
data
.
inducedDipole
;
localData
[
threadIdx
.
x
].
pos
=
data
.
pos
;
localData
[
threadIdx
.
x
].
inducedDipolePolar
=
data
.
inducedDipolePolar
;
localData
[
threadIdx
.
x
].
inducedDipole
=
data
.
inducedDipole
;
localData
[
threadIdx
.
x
].
thole
=
data
.
thole
;
localData
[
threadIdx
.
x
].
inducedDipolePolar
=
data
.
inducedDipolePolar
;
localData
[
threadIdx
.
x
].
damp
=
data
.
damp
;
localData
[
threadIdx
.
x
].
thole
=
data
.
thole
;
localData
[
threadIdx
.
x
].
damp
=
data
.
damp
;
#ifdef USE_GK
#ifdef USE_GK
localData
[
threadIdx
.
x
].
inducedDipoleS
=
data
.
inducedDipoleS
;
localData
[
threadIdx
.
x
].
inducedDipoleS
=
data
.
inducedDipoleS
;
localData
[
threadIdx
.
x
].
inducedDipolePolarS
=
data
.
inducedDipolePolarS
;
localData
[
threadIdx
.
x
].
inducedDipolePolarS
=
data
.
inducedDipolePolarS
;
localData
[
threadIdx
.
x
].
bornRadius
=
data
.
bornRadius
;
localData
[
threadIdx
.
x
].
bornRadius
=
data
.
bornRadius
;
#endif
#endif
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
real3
delta
=
localData
[
tbx
+
j
].
pos
-
data
.
pos
;
real3
delta
=
localData
[
tbx
+
j
].
pos
-
data
.
pos
;
#ifdef USE_PERIODIC
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
#endif
#endif
int
atom2
=
y
*
TILE_SIZE
+
j
;
int
atom2
=
y
*
TILE_SIZE
+
j
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
computeOneInteraction
(
data
,
localData
[
tbx
+
j
],
delta
,
atom1
==
atom2
);
computeOneInteraction
(
data
,
localData
[
tbx
+
j
],
delta
,
atom1
==
atom2
);
}
}
}
else
{
}
// This is an off-diagonal tile.
else
{
// This is an off-diagonal tile.
#ifdef USE_GK
#ifdef USE_GK
loadAtomData
(
localData
[
threadIdx
.
x
],
y
*
TILE_SIZE
+
tgx
,
posq
,
inducedDipole
,
inducedDipolePolar
,
dampingAndThole
,
inducedDipoleS
,
inducedDipolePolarS
,
bornRadii
);
loadAtomData
(
localData
[
threadIdx
.
x
],
y
*
TILE_SIZE
+
tgx
,
posq
,
inducedDipole
,
inducedDipolePolar
,
dampingAndThole
,
inducedDipoleS
,
inducedDipolePolarS
,
bornRadii
);
#else
#else
loadAtomData
(
localData
[
threadIdx
.
x
],
y
*
TILE_SIZE
+
tgx
,
posq
,
inducedDipole
,
inducedDipolePolar
,
dampingAndThole
);
loadAtomData
(
localData
[
threadIdx
.
x
],
y
*
TILE_SIZE
+
tgx
,
posq
,
inducedDipole
,
inducedDipolePolar
,
dampingAndThole
);
#endif
#endif
zeroAtomData
(
localData
[
threadIdx
.
x
]);
zeroAtomData
(
localData
[
threadIdx
.
x
]);
#ifdef USE_CUTOFF
unsigned
int
tj
=
tgx
;
unsigned
int
flags
=
(
numTiles
<=
maxTiles
?
interactionFlags
[
pos
]
:
0xFFFFFFFF
);
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
if
(
flags
==
0
)
{
// TODO: Figure out what the flags != 0 case doesn't work!!!
real3
delta
=
localData
[
tbx
+
tj
].
pos
-
data
.
pos
;
// if (flags != 0xFFFFFFFF) {
if
(
flags
==
0
)
{
// No interactions in this tile.
}
/* else {
// Compute only a subset of the interactions in this tile.
for (int j = 0; j < TILE_SIZE; j++) {
if ((flags&(1<<j)) != 0) {
int atom2 = tbx+j;
real3 delta = localData[atom2].pos-data.pos;
#ifdef USE_PERIODIC
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
#endif
int
atom2
=
y
*
TILE_SIZE
+
j
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
computeOneInteraction
(
data
,
localData
[
tbx
+
tj
],
delta
,
false
);
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
// Write results.
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
field
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
field
.
x
*
0x100000000
)));
atomicAdd
(
&
field
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
field
.
y
*
0x100000000
)));
atomicAdd
(
&
field
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
field
.
z
*
0x100000000
)));
atomicAdd
(
&
fieldPolar
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
fieldPolar
.
x
*
0x100000000
)));
atomicAdd
(
&
fieldPolar
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
fieldPolar
.
y
*
0x100000000
)));
atomicAdd
(
&
fieldPolar
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
fieldPolar
.
z
*
0x100000000
)));
#ifdef USE_GK
atomicAdd
(
&
fieldS
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
fieldS
.
x
*
0x100000000
)));
atomicAdd
(
&
fieldS
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
fieldS
.
y
*
0x100000000
)));
atomicAdd
(
&
fieldS
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
fieldS
.
z
*
0x100000000
)));
atomicAdd
(
&
fieldPolarS
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
fieldPolarS
.
x
*
0x100000000
)));
atomicAdd
(
&
fieldPolarS
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
fieldPolarS
.
y
*
0x100000000
)));
atomicAdd
(
&
fieldPolarS
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
fieldPolarS
.
z
*
0x100000000
)));
#endif
if
(
x
!=
y
)
{
offset
=
y
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
field
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
field
.
x
*
0x100000000
)));
atomicAdd
(
&
field
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
field
.
y
*
0x100000000
)));
atomicAdd
(
&
field
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
field
.
z
*
0x100000000
)));
atomicAdd
(
&
fieldPolar
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fieldPolar
.
x
*
0x100000000
)));
atomicAdd
(
&
fieldPolar
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fieldPolar
.
y
*
0x100000000
)));
atomicAdd
(
&
fieldPolar
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fieldPolar
.
z
*
0x100000000
)));
#ifdef USE_GK
atomicAdd
(
&
fieldS
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fieldS
.
x
*
0x100000000
)));
atomicAdd
(
&
fieldS
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fieldS
.
y
*
0x100000000
)));
atomicAdd
(
&
fieldS
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fieldS
.
z
*
0x100000000
)));
atomicAdd
(
&
fieldPolarS
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fieldPolarS
.
x
*
0x100000000
)));
atomicAdd
(
&
fieldPolarS
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fieldPolarS
.
y
*
0x100000000
)));
atomicAdd
(
&
fieldPolarS
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fieldPolarS
.
z
*
0x100000000
)));
#endif
#endif
real3 fields[4];
}
computeOneInteraction(data, localData[atom2], delta, fields);
}
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
data.field += fields[0];
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
data.fieldPolar += fields[1];
// of them (no cutoff).
#ifdef ENABLE_SHUFFLE
for (int i = 16; i >= 1; i /= 2) {
#ifdef USE_CUTOFF
fields[2].x += __shfl_xor(fields[2].x, i, 32);
const
unsigned
int
numTiles
=
interactionCount
[
0
];
fields[2].y += __shfl_xor(fields[2].y, i, 32);
int
pos
=
(
numTiles
>
maxTiles
?
startTileIndex
+
warp
*
numTileIndices
/
totalWarps
:
warp
*
numTiles
/
totalWarps
);
fields[2].z += __shfl_xor(fields[2].z, i, 32);
int
end
=
(
numTiles
>
maxTiles
?
startTileIndex
+
(
warp
+
1
)
*
numTileIndices
/
totalWarps
:
(
warp
+
1
)
*
numTiles
/
totalWarps
);
fields[3].x += __shfl_xor(fields[3].x, i, 32);
fields[3].y += __shfl_xor(fields[3].y, i, 32);
fields[3].z += __shfl_xor(fields[3].z, i, 32);
}
if (tgx == 0) {
localData[atom2].field += fields[2];
localData[atom2].fieldPolar += fields[3];
}
#else
#else
int bufferIndex = 3*threadIdx.x;
const
unsigned
int
numTiles
=
numTileIndices
;
tempBuffer[bufferIndex] = fields[2].x;
int
pos
=
startTileIndex
+
warp
*
numTiles
/
totalWarps
;
tempBuffer[bufferIndex+1] = fields[2].y;
int
end
=
startTileIndex
+
(
warp
+
1
)
*
numTiles
/
totalWarps
;
tempBuffer[bufferIndex+2] = fields[2].z;
#endif
if (tgx % 4 == 0) {
int
skipBase
=
0
;
tempBuffer[bufferIndex] += tempBuffer[bufferIndex+3]+tempBuffer[bufferIndex+6]+tempBuffer[bufferIndex+9];
int
currentSkipIndex
=
tbx
;
tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+4]+tempBuffer[bufferIndex+7]+tempBuffer[bufferIndex+10];
__shared__
int
atomIndices
[
THREAD_BLOCK_SIZE
];
tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+5]+tempBuffer[bufferIndex+8]+tempBuffer[bufferIndex+11];
__shared__
int
skipTiles
[
THREAD_BLOCK_SIZE
];
}
skipTiles
[
threadIdx
.
x
]
=
-
1
;
if (tgx == 0) {
localData[atom2].field.x += tempBuffer[bufferIndex]+tempBuffer[bufferIndex+12]+tempBuffer[bufferIndex+24]+tempBuffer[bufferIndex+36]+tempBuffer[bufferIndex+48]+tempBuffer[bufferIndex+60]+tempBuffer[bufferIndex+72]+tempBuffer[bufferIndex+84];
while
(
pos
<
end
)
{
localData[atom2].field.y += tempBuffer[bufferIndex+1]+tempBuffer[bufferIndex+13]+tempBuffer[bufferIndex+25]+tempBuffer[bufferIndex+37]+tempBuffer[bufferIndex+49]+tempBuffer[bufferIndex+61]+tempBuffer[bufferIndex+73]+tempBuffer[bufferIndex+85];
bool
includeTile
=
true
;
localData[atom2].field.z += tempBuffer[bufferIndex+2]+tempBuffer[bufferIndex+14]+tempBuffer[bufferIndex+26]+tempBuffer[bufferIndex+38]+tempBuffer[bufferIndex+50]+tempBuffer[bufferIndex+62]+tempBuffer[bufferIndex+74]+tempBuffer[bufferIndex+86];
}
// Extract the coordinates of this tile.
tempBuffer[bufferIndex] = fields[3].x;
tempBuffer[bufferIndex+1] = fields[3].y;
unsigned
int
x
,
y
;
tempBuffer[bufferIndex+2] = fields[3].z;
#ifdef USE_CUTOFF
if (tgx % 4 == 0) {
if
(
numTiles
<=
maxTiles
)
{
tempBuffer[bufferIndex] += tempBuffer[bufferIndex+3]+tempBuffer[bufferIndex+6]+tempBuffer[bufferIndex+9];
ushort2
tileIndices
=
tiles
[
pos
];
tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+4]+tempBuffer[bufferIndex+7]+tempBuffer[bufferIndex+10];
x
=
tileIndices
.
x
;
tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+5]+tempBuffer[bufferIndex+8]+tempBuffer[bufferIndex+11];
}
}
else
if (tgx == 0) {
localData[atom2].fieldPolar.x += tempBuffer[bufferIndex]+tempBuffer[bufferIndex+12]+tempBuffer[bufferIndex+24]+tempBuffer[bufferIndex+36]+tempBuffer[bufferIndex+48]+tempBuffer[bufferIndex+60]+tempBuffer[bufferIndex+72]+tempBuffer[bufferIndex+84];
localData[atom2].fieldPolar.y += tempBuffer[bufferIndex+1]+tempBuffer[bufferIndex+13]+tempBuffer[bufferIndex+25]+tempBuffer[bufferIndex+37]+tempBuffer[bufferIndex+49]+tempBuffer[bufferIndex+61]+tempBuffer[bufferIndex+73]+tempBuffer[bufferIndex+85];
localData[atom2].fieldPolar.z += tempBuffer[bufferIndex+2]+tempBuffer[bufferIndex+14]+tempBuffer[bufferIndex+26]+tempBuffer[bufferIndex+38]+tempBuffer[bufferIndex+50]+tempBuffer[bufferIndex+62]+tempBuffer[bufferIndex+74]+tempBuffer[bufferIndex+86];
}
#endif
#endif
}
{
}
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS
+
0.5
f
-
SQRT
((
NUM_BLOCKS
+
0.5
f
)
*
(
NUM_BLOCKS
+
0.5
f
)
-
2
*
pos
));
}
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
}*/
if
(
x
<
y
||
x
>=
NUM_BLOCKS
)
{
// Occasionally happens due to roundoff error.
y
+=
(
x
<
y
?
-
1
:
1
);
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
}
// Skip over tiles that have exclusions, since they were already processed.
while
(
skipTiles
[
tbx
+
TILE_SIZE
-
1
]
<
pos
)
{
if
(
skipBase
+
tgx
<
NUM_TILES_WITH_EXCLUSIONS
)
{
ushort2
tile
=
exclusionTiles
[
skipBase
+
tgx
];
skipTiles
[
threadIdx
.
x
]
=
tile
.
x
+
tile
.
y
*
NUM_BLOCKS
-
tile
.
y
*
(
tile
.
y
+
1
)
/
2
;
}
}
else
else
skipTiles
[
threadIdx
.
x
]
=
end
;
skipBase
+=
TILE_SIZE
;
currentSkipIndex
=
tbx
;
}
while
(
skipTiles
[
currentSkipIndex
]
<
pos
)
currentSkipIndex
++
;
includeTile
=
(
skipTiles
[
currentSkipIndex
]
!=
pos
);
}
if
(
includeTile
)
{
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
// Load atom data for this tile.
AtomData
data
;
zeroAtomData
(
data
);
#ifdef USE_GK
loadAtomData
(
data
,
atom1
,
posq
,
inducedDipole
,
inducedDipolePolar
,
dampingAndThole
,
inducedDipoleS
,
inducedDipolePolarS
,
bornRadii
);
#else
loadAtomData
(
data
,
atom1
,
posq
,
inducedDipole
,
inducedDipolePolar
,
dampingAndThole
);
#endif
#ifdef USE_CUTOFF
unsigned
int
j
=
(
numTiles
<=
maxTiles
?
interactingAtoms
[
pos
*
TILE_SIZE
+
tgx
]
:
y
*
TILE_SIZE
+
tgx
);
#else
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
#endif
#endif
{
atomIndices
[
threadIdx
.
x
]
=
j
;
// Compute the full set of interactions in this tile.
#ifdef USE_GK
loadAtomData
(
localData
[
threadIdx
.
x
],
j
,
posq
,
inducedDipole
,
inducedDipolePolar
,
dampingAndThole
,
inducedDipoleS
,
inducedDipolePolarS
,
bornRadii
);
#else
loadAtomData
(
localData
[
threadIdx
.
x
],
j
,
posq
,
inducedDipole
,
inducedDipolePolar
,
dampingAndThole
);
#endif
zeroAtomData
(
localData
[
threadIdx
.
x
]);
unsigned
int
tj
=
tgx
;
// Compute the full set of interactions in this tile.
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
real3
delta
=
localData
[
tbx
+
tj
].
pos
-
data
.
pos
;
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
real3
delta
=
localData
[
tbx
+
tj
].
pos
-
data
.
pos
;
#ifdef USE_PERIODIC
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
#endif
#endif
int
atom2
=
y
*
TILE_SIZE
+
j
;
int
atom2
=
atomIndices
[
tbx
+
tj
];
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
computeOneInteraction
(
data
,
localData
[
tbx
+
tj
],
delta
,
false
);
computeOneInteraction
(
data
,
localData
[
tbx
+
tj
],
delta
,
false
);
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
}
}
}
// Write results.
// Write results.
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
if
(
pos
<
end
)
{
const
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
field
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
field
.
x
*
0x100000000
)));
atomicAdd
(
&
field
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
field
.
x
*
0x100000000
)));
atomicAdd
(
&
field
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
field
.
y
*
0x100000000
)));
atomicAdd
(
&
field
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
field
.
y
*
0x100000000
)));
atomicAdd
(
&
field
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
field
.
z
*
0x100000000
)));
atomicAdd
(
&
field
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
field
.
z
*
0x100000000
)));
...
@@ -401,9 +427,11 @@ extern "C" __global__ void computeInducedField(
...
@@ -401,9 +427,11 @@ extern "C" __global__ void computeInducedField(
atomicAdd
(
&
fieldPolarS
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
fieldPolarS
.
y
*
0x100000000
)));
atomicAdd
(
&
fieldPolarS
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
fieldPolarS
.
y
*
0x100000000
)));
atomicAdd
(
&
fieldPolarS
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
fieldPolarS
.
z
*
0x100000000
)));
atomicAdd
(
&
fieldPolarS
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
fieldPolarS
.
z
*
0x100000000
)));
#endif
#endif
}
#ifdef USE_CUTOFF
if
(
pos
<
end
&&
x
!=
y
)
{
offset
=
atomIndices
[
threadIdx
.
x
];
const
unsigned
int
offset
=
y
*
TILE_SIZE
+
tgx
;
#else
offset
=
y
*
TILE_SIZE
+
tgx
;
#endif
atomicAdd
(
&
field
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
field
.
x
*
0x100000000
)));
atomicAdd
(
&
field
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
field
.
x
*
0x100000000
)));
atomicAdd
(
&
field
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
field
.
y
*
0x100000000
)));
atomicAdd
(
&
field
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
field
.
y
*
0x100000000
)));
atomicAdd
(
&
field
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
field
.
z
*
0x100000000
)));
atomicAdd
(
&
field
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
field
.
z
*
0x100000000
)));
...
@@ -420,7 +448,7 @@ extern "C" __global__ void computeInducedField(
...
@@ -420,7 +448,7 @@ extern "C" __global__ void computeInducedField(
#endif
#endif
}
}
pos
++
;
pos
++
;
}
while
(
pos
<
end
);
}
}
}
extern
"C"
__global__
void
updateInducedFieldBySOR
(
const
long
long
*
__restrict__
fixedField
,
const
long
long
*
__restrict__
fixedFieldPolar
,
extern
"C"
__global__
void
updateInducedFieldBySOR
(
const
long
long
*
__restrict__
fixedField
,
const
long
long
*
__restrict__
fixedFieldPolar
,
...
...
plugins/amoeba/platforms/cuda/src/kernels/multipolePme.cu
View file @
93c467b2
#define ARRAY(x,y) array[(x)-1+((y)-1)*PME_ORDER]
#define ARRAY(x,y) array[(x)-1+((y)-1)*PME_ORDER]
/**
/**
*
This is called from updateBsplines(). It c
alculate
s
the spline coefficients for a single atom along a single axis.
*
C
alculate the spline coefficients for a single atom along a single axis.
*/
*/
__device__
void
computeBSplinePoint
(
real4
*
thetai
,
real
w
,
real
*
array
)
{
__device__
void
computeBSplinePoint
(
real4
*
thetai
,
real
w
,
real
*
array
)
{
// initialization to get to 2nd order recursion
// initialization to get to 2nd order recursion
...
@@ -70,15 +70,10 @@ __device__ void computeBSplinePoint(real4* thetai, real w, real* array) {
...
@@ -70,15 +70,10 @@ __device__ void computeBSplinePoint(real4* thetai, real w, real* array) {
}
}
/**
/**
* Compute
bspline coefficients
.
* Compute
the index of the grid point each atom is associated with
.
*/
*/
extern
"C"
__global__
void
updateBsplines
(
const
real4
*
__restrict__
posq
,
int4
*
__restrict__
igrid
,
int2
*
__restrict__
pmeAtomGridIndex
,
extern
"C"
__global__
void
findAtomGridIndex
(
const
real4
*
__restrict__
posq
,
int2
*
__restrict__
pmeAtomGridIndex
,
real4
*
__restrict__
theta1
,
real4
*
__restrict__
theta2
,
real4
*
__restrict__
theta3
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
)
{
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
)
{
extern
__shared__
real
bsplines_cache
[];
// size = block_size*pme_order*pme_order
real
*
array
=
&
bsplines_cache
[
threadIdx
.
x
*
PME_ORDER
*
PME_ORDER
];
// get the B-spline coefficients for each multipole site
for
(
int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
i
<
NUM_ATOMS
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
for
(
int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
i
<
NUM_ATOMS
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
real4
pos
=
posq
[
i
];
real4
pos
=
posq
[
i
];
pos
.
x
-=
floor
(
pos
.
x
*
invPeriodicBoxSize
.
x
)
*
periodicBoxSize
.
x
;
pos
.
x
-=
floor
(
pos
.
x
*
invPeriodicBoxSize
.
x
)
*
periodicBoxSize
.
x
;
...
@@ -90,256 +85,226 @@ extern "C" __global__ void updateBsplines(const real4* __restrict__ posq, int4*
...
@@ -90,256 +85,226 @@ extern "C" __global__ void updateBsplines(const real4* __restrict__ posq, int4*
real
w
=
pos
.
x
*
invPeriodicBoxSize
.
x
;
real
w
=
pos
.
x
*
invPeriodicBoxSize
.
x
;
real
fr
=
GRID_SIZE_X
*
(
w
-
(
int
)(
w
+
0.5
f
)
+
0.5
f
);
real
fr
=
GRID_SIZE_X
*
(
w
-
(
int
)(
w
+
0.5
f
)
+
0.5
f
);
int
ifr
=
(
int
)
fr
;
int
ifr
=
(
int
)
fr
;
w
=
fr
-
ifr
;
int
igrid1
=
ifr
-
PME_ORDER
+
1
;
int
igrid1
=
ifr
-
PME_ORDER
+
1
;
computeBSplinePoint
(
&
theta1
[
i
*
PME_ORDER
],
w
,
array
);
// Second axis.
// Second axis.
w
=
pos
.
y
*
invPeriodicBoxSize
.
y
;
w
=
pos
.
y
*
invPeriodicBoxSize
.
y
;
fr
=
GRID_SIZE_Y
*
(
w
-
(
int
)(
w
+
0.5
f
)
+
0.5
f
);
fr
=
GRID_SIZE_Y
*
(
w
-
(
int
)(
w
+
0.5
f
)
+
0.5
f
);
ifr
=
(
int
)
fr
;
ifr
=
(
int
)
fr
;
w
=
fr
-
ifr
;
int
igrid2
=
ifr
-
PME_ORDER
+
1
;
int
igrid2
=
ifr
-
PME_ORDER
+
1
;
computeBSplinePoint
(
&
theta2
[
i
*
PME_ORDER
],
w
,
array
);
// Third axis.
// Third axis.
w
=
pos
.
z
*
invPeriodicBoxSize
.
z
;
w
=
pos
.
z
*
invPeriodicBoxSize
.
z
;
fr
=
GRID_SIZE_Z
*
(
w
-
(
int
)(
w
+
0.5
f
)
+
0.5
f
);
fr
=
GRID_SIZE_Z
*
(
w
-
(
int
)(
w
+
0.5
f
)
+
0.5
f
);
ifr
=
(
int
)
fr
;
ifr
=
(
int
)
fr
;
w
=
fr
-
ifr
;
int
igrid3
=
ifr
-
PME_ORDER
+
1
;
int
igrid3
=
ifr
-
PME_ORDER
+
1
;
computeBSplinePoint
(
&
theta3
[
i
*
PME_ORDER
],
w
,
array
);
// Record the grid point.
// Record the grid point.
igrid1
+=
(
igrid1
<
0
?
GRID_SIZE_X
:
0
);
igrid1
+=
(
igrid1
<
0
?
GRID_SIZE_X
:
0
);
igrid2
+=
(
igrid2
<
0
?
GRID_SIZE_Y
:
0
);
igrid2
+=
(
igrid2
<
0
?
GRID_SIZE_Y
:
0
);
igrid3
+=
(
igrid3
<
0
?
GRID_SIZE_Z
:
0
);
igrid3
+=
(
igrid3
<
0
?
GRID_SIZE_Z
:
0
);
igrid
[
i
]
=
make_int4
(
igrid1
,
igrid2
,
igrid3
,
0
);
pmeAtomGridIndex
[
i
]
=
make_int2
(
i
,
igrid1
*
GRID_SIZE_Y
*
GRID_SIZE_Z
+
igrid2
*
GRID_SIZE_Z
+
igrid3
);
pmeAtomGridIndex
[
i
]
=
make_int2
(
i
,
igrid1
*
GRID_SIZE_Y
*
GRID_SIZE_Z
+
igrid2
*
GRID_SIZE_Z
+
igrid3
);
}
}
}
}
/**
* For each grid point, find the range of sorted atoms associated with that point.
*/
extern
"C"
__global__
void
findAtomRangeForGrid
(
int2
*
__restrict__
pmeAtomGridIndex
,
int
*
__restrict__
pmeAtomRange
,
const
real4
*
__restrict__
posq
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
)
{
int
thread
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
int
start
=
(
NUM_ATOMS
*
thread
)
/
(
blockDim
.
x
*
gridDim
.
x
);
int
end
=
(
NUM_ATOMS
*
(
thread
+
1
))
/
(
blockDim
.
x
*
gridDim
.
x
);
int
last
=
(
start
==
0
?
-
1
:
pmeAtomGridIndex
[
start
-
1
].
y
);
for
(
int
i
=
start
;
i
<
end
;
++
i
)
{
int2
atomData
=
pmeAtomGridIndex
[
i
];
int
gridIndex
=
atomData
.
y
;
if
(
gridIndex
!=
last
)
{
for
(
int
j
=
last
+
1
;
j
<=
gridIndex
;
++
j
)
pmeAtomRange
[
j
]
=
i
;
last
=
gridIndex
;
}
}
// Fill in values beyond the last atom.
if
(
thread
==
blockDim
.
x
*
gridDim
.
x
-
1
)
{
int
gridSize
=
GRID_SIZE_X
*
GRID_SIZE_Y
*
GRID_SIZE_Z
;
for
(
int
j
=
last
+
1
;
j
<=
gridSize
;
++
j
)
pmeAtomRange
[
j
]
=
NUM_ATOMS
;
}
}
/**
* The grid index won't be needed again. Reuse that component to hold the z index, thus saving
* some work in the charge spreading kernel.
*/
extern
"C"
__global__
void
recordZIndex
(
int2
*
__restrict__
pmeAtomGridIndex
,
const
real4
*
__restrict__
posq
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
)
{
int
thread
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
int
start
=
(
NUM_ATOMS
*
thread
)
/
(
blockDim
.
x
*
gridDim
.
x
);
int
end
=
(
NUM_ATOMS
*
(
thread
+
1
))
/
(
blockDim
.
x
*
gridDim
.
x
);
for
(
int
i
=
start
;
i
<
end
;
++
i
)
{
real
posz
=
posq
[
pmeAtomGridIndex
[
i
].
x
].
z
;
posz
-=
floor
(
posz
*
invPeriodicBoxSize
.
z
)
*
periodicBoxSize
.
z
;
real
w
=
posz
*
invPeriodicBoxSize
.
z
;
real
fr
=
GRID_SIZE_Z
*
(
w
-
(
int
)(
w
+
0.5
f
)
+
0.5
f
);
int
z
=
((
int
)
fr
)
-
PME_ORDER
+
1
;
pmeAtomGridIndex
[
i
].
y
=
z
;
}
}
extern
"C"
__global__
void
gridSpreadFixedMultipoles
(
const
real4
*
__restrict__
posq
,
const
real
*
__restrict__
labFrameDipole
,
extern
"C"
__global__
void
gridSpreadFixedMultipoles
(
const
real4
*
__restrict__
posq
,
const
real
*
__restrict__
labFrameDipole
,
const
real
*
__restrict__
labFrameQuadrupole
,
real2
*
__restrict__
pmeGrid
,
int2
*
__restrict__
pmeAtomGridIndex
,
int
*
__restrict__
pmeAtomRange
,
const
real
*
__restrict__
labFrameQuadrupole
,
real2
*
__restrict__
pmeGrid
,
int2
*
__restrict__
pmeAtomGridIndex
,
const
real4
*
__restrict__
theta1
,
const
real4
*
__restrict__
theta2
,
const
real4
*
__restrict__
theta3
,
real4
invPeriodicBoxSize
)
{
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
)
{
const
real
xscale
=
GRID_SIZE_X
*
invPeriodicBoxSize
.
x
;
const
real
xscale
=
GRID_SIZE_X
*
invPeriodicBoxSize
.
x
;
const
real
yscale
=
GRID_SIZE_Y
*
invPeriodicBoxSize
.
y
;
const
real
yscale
=
GRID_SIZE_Y
*
invPeriodicBoxSize
.
y
;
const
real
zscale
=
GRID_SIZE_Z
*
invPeriodicBoxSize
.
z
;
const
real
zscale
=
GRID_SIZE_Z
*
invPeriodicBoxSize
.
z
;
unsigned
int
numGridPoints
=
GRID_SIZE_X
*
GRID_SIZE_Y
*
GRID_SIZE_Z
;
real
array
[
PME_ORDER
*
PME_ORDER
];
unsigned
int
numThreads
=
gridDim
.
x
*
blockDim
.
x
;
real4
theta1
[
PME_ORDER
];
for
(
int
gridIndex
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
gridIndex
<
numGridPoints
;
gridIndex
+=
numThreads
)
{
real4
theta2
[
PME_ORDER
];
int3
gridPoint
;
real4
theta3
[
PME_ORDER
];
gridPoint
.
x
=
gridIndex
/
(
GRID_SIZE_Y
*
GRID_SIZE_Z
);
int
remainder
=
gridIndex
-
gridPoint
.
x
*
GRID_SIZE_Y
*
GRID_SIZE_Z
;
// Process the atoms in spatially sorted order. This improves cache performance when loading
gridPoint
.
y
=
remainder
/
GRID_SIZE_Z
;
// the grid values.
gridPoint
.
z
=
remainder
-
gridPoint
.
y
*
GRID_SIZE_Z
;
real
result
=
0
;
for
(
int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
i
<
NUM_ATOMS
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
for
(
int
ix
=
0
;
ix
<
PME_ORDER
;
++
ix
)
{
int
m
=
pmeAtomGridIndex
[
i
].
x
;
int
x
=
gridPoint
.
x
-
ix
+
(
gridPoint
.
x
>=
ix
?
0
:
GRID_SIZE_X
);
real4
pos
=
posq
[
m
];
for
(
int
iy
=
0
;
iy
<
PME_ORDER
;
++
iy
)
{
pos
.
x
-=
floor
(
pos
.
x
*
invPeriodicBoxSize
.
x
)
*
periodicBoxSize
.
x
;
int
y
=
gridPoint
.
y
-
iy
+
(
gridPoint
.
y
>=
iy
?
0
:
GRID_SIZE_Y
);
pos
.
y
-=
floor
(
pos
.
y
*
invPeriodicBoxSize
.
y
)
*
periodicBoxSize
.
y
;
int
z1
=
gridPoint
.
z
-
PME_ORDER
+
1
;
pos
.
z
-=
floor
(
pos
.
z
*
invPeriodicBoxSize
.
z
)
*
periodicBoxSize
.
z
;
z1
+=
(
z1
>=
0
?
0
:
GRID_SIZE_Z
);
int
z2
=
(
z1
<
gridPoint
.
z
?
gridPoint
.
z
:
GRID_SIZE_Z
-
1
);
// Since we need the full set of thetas, it's faster to compute them here than load them
int
gridIndex1
=
x
*
GRID_SIZE_Y
*
GRID_SIZE_Z
+
y
*
GRID_SIZE_Z
+
z1
;
// from global memory.
int
gridIndex2
=
x
*
GRID_SIZE_Y
*
GRID_SIZE_Z
+
y
*
GRID_SIZE_Z
+
z2
;
int
firstAtom
=
pmeAtomRange
[
gridIndex1
];
real
w
=
pos
.
x
*
invPeriodicBoxSize
.
x
;
int
lastAtom
=
pmeAtomRange
[
gridIndex2
+
1
];
real
fr
=
GRID_SIZE_X
*
(
w
-
(
int
)(
w
+
0.5
f
)
+
0.5
f
);
for
(
int
i
=
firstAtom
;
i
<
lastAtom
;
++
i
)
{
int
ifr
=
(
int
)
fr
;
int2
atomData
=
pmeAtomGridIndex
[
i
];
w
=
fr
-
ifr
;
int
atomIndex
=
atomData
.
x
;
int
igrid1
=
ifr
-
PME_ORDER
+
1
;
int
z
=
atomData
.
y
;
computeBSplinePoint
(
theta1
,
w
,
array
);
int
iz
=
gridPoint
.
z
-
z
+
(
gridPoint
.
z
>=
z
?
0
:
GRID_SIZE_Z
);
w
=
pos
.
y
*
invPeriodicBoxSize
.
y
;
if
(
iz
>=
GRID_SIZE_Z
)
fr
=
GRID_SIZE_Y
*
(
w
-
(
int
)(
w
+
0.5
f
)
+
0.5
f
);
iz
-=
GRID_SIZE_Z
;
ifr
=
(
int
)
fr
;
real
atomCharge
=
posq
[
atomIndex
].
w
;
w
=
fr
-
ifr
;
real
atomDipoleX
=
xscale
*
labFrameDipole
[
atomIndex
*
3
];
int
igrid2
=
ifr
-
PME_ORDER
+
1
;
real
atomDipoleY
=
yscale
*
labFrameDipole
[
atomIndex
*
3
+
1
];
computeBSplinePoint
(
theta2
,
w
,
array
);
real
atomDipoleZ
=
zscale
*
labFrameDipole
[
atomIndex
*
3
+
2
];
w
=
pos
.
z
*
invPeriodicBoxSize
.
z
;
real
atomQuadrupoleXX
=
xscale
*
xscale
*
labFrameQuadrupole
[
atomIndex
*
5
];
fr
=
GRID_SIZE_Z
*
(
w
-
(
int
)(
w
+
0.5
f
)
+
0.5
f
);
real
atomQuadrupoleXY
=
2
*
xscale
*
yscale
*
labFrameQuadrupole
[
atomIndex
*
5
+
1
];
ifr
=
(
int
)
fr
;
real
atomQuadrupoleXZ
=
2
*
xscale
*
zscale
*
labFrameQuadrupole
[
atomIndex
*
5
+
2
];
w
=
fr
-
ifr
;
real
atomQuadrupoleYY
=
yscale
*
yscale
*
labFrameQuadrupole
[
atomIndex
*
5
+
3
];
int
igrid3
=
ifr
-
PME_ORDER
+
1
;
real
atomQuadrupoleYZ
=
2
*
yscale
*
zscale
*
labFrameQuadrupole
[
atomIndex
*
5
+
4
];
computeBSplinePoint
(
theta3
,
w
,
array
);
real
atomQuadrupoleZZ
=
-
zscale
*
zscale
*
(
labFrameQuadrupole
[
atomIndex
*
5
]
+
labFrameQuadrupole
[
atomIndex
*
5
+
3
]);
igrid1
+=
(
igrid1
<
0
?
GRID_SIZE_X
:
0
);
real4
t
=
theta1
[
atomIndex
*
PME_ORDER
+
ix
];
igrid2
+=
(
igrid2
<
0
?
GRID_SIZE_Y
:
0
);
real4
u
=
theta2
[
atomIndex
*
PME_ORDER
+
iy
];
igrid3
+=
(
igrid3
<
0
?
GRID_SIZE_Z
:
0
);
real4
v
=
theta3
[
atomIndex
*
PME_ORDER
+
iz
];
// Spread the charge from this atom onto each grid point.
for
(
int
ix
=
0
;
ix
<
PME_ORDER
;
ix
++
)
{
int
xbase
=
igrid1
+
ix
;
xbase
-=
(
xbase
>=
GRID_SIZE_X
?
GRID_SIZE_X
:
0
);
xbase
=
xbase
*
GRID_SIZE_Y
*
GRID_SIZE_Z
;
real4
t
=
theta1
[
ix
];
for
(
int
iy
=
0
;
iy
<
PME_ORDER
;
iy
++
)
{
int
ybase
=
igrid2
+
iy
;
ybase
-=
(
ybase
>=
GRID_SIZE_Y
?
GRID_SIZE_Y
:
0
);
ybase
=
xbase
+
ybase
*
GRID_SIZE_Z
;
real4
u
=
theta2
[
iy
];
for
(
int
iz
=
0
;
iz
<
PME_ORDER
;
iz
++
)
{
int
zindex
=
igrid3
+
iz
;
zindex
-=
(
zindex
>=
GRID_SIZE_Z
?
GRID_SIZE_Z
:
0
);
int
index
=
ybase
+
zindex
;
real4
v
=
theta3
[
iz
];
real
atomCharge
=
pos
.
w
;
real
atomDipoleX
=
xscale
*
labFrameDipole
[
m
*
3
];
real
atomDipoleY
=
yscale
*
labFrameDipole
[
m
*
3
+
1
];
real
atomDipoleZ
=
zscale
*
labFrameDipole
[
m
*
3
+
2
];
real
atomQuadrupoleXX
=
xscale
*
xscale
*
labFrameQuadrupole
[
m
*
5
];
real
atomQuadrupoleXY
=
2
*
xscale
*
yscale
*
labFrameQuadrupole
[
m
*
5
+
1
];
real
atomQuadrupoleXZ
=
2
*
xscale
*
zscale
*
labFrameQuadrupole
[
m
*
5
+
2
];
real
atomQuadrupoleYY
=
yscale
*
yscale
*
labFrameQuadrupole
[
m
*
5
+
3
];
real
atomQuadrupoleYZ
=
2
*
yscale
*
zscale
*
labFrameQuadrupole
[
m
*
5
+
4
];
real
atomQuadrupoleZZ
=
-
zscale
*
zscale
*
(
labFrameQuadrupole
[
m
*
5
]
+
labFrameQuadrupole
[
m
*
5
+
3
]);
real
term0
=
atomCharge
*
u
.
x
*
v
.
x
+
atomDipoleY
*
u
.
y
*
v
.
x
+
atomDipoleZ
*
u
.
x
*
v
.
y
+
atomQuadrupoleYY
*
u
.
z
*
v
.
x
+
atomQuadrupoleZZ
*
u
.
x
*
v
.
z
+
atomQuadrupoleYZ
*
u
.
y
*
v
.
y
;
real
term0
=
atomCharge
*
u
.
x
*
v
.
x
+
atomDipoleY
*
u
.
y
*
v
.
x
+
atomDipoleZ
*
u
.
x
*
v
.
y
+
atomQuadrupoleYY
*
u
.
z
*
v
.
x
+
atomQuadrupoleZZ
*
u
.
x
*
v
.
z
+
atomQuadrupoleYZ
*
u
.
y
*
v
.
y
;
real
term1
=
atomDipoleX
*
u
.
x
*
v
.
x
+
atomQuadrupoleXY
*
u
.
y
*
v
.
x
+
atomQuadrupoleXZ
*
u
.
x
*
v
.
y
;
real
term1
=
atomDipoleX
*
u
.
x
*
v
.
x
+
atomQuadrupoleXY
*
u
.
y
*
v
.
x
+
atomQuadrupoleXZ
*
u
.
x
*
v
.
y
;
real
term2
=
atomQuadrupoleXX
*
u
.
x
*
v
.
x
;
real
term2
=
atomQuadrupoleXX
*
u
.
x
*
v
.
x
;
result
+=
term0
*
t
.
x
+
term1
*
t
.
y
+
term2
*
t
.
z
;
real
add
=
term0
*
t
.
x
+
term1
*
t
.
y
+
term2
*
t
.
z
;
}
#ifdef USE_DOUBLE_PRECISION
if
(
z1
>
gridPoint
.
z
)
{
unsigned
long
long
*
ulonglong_p
=
(
unsigned
long
long
*
)
pmeGrid
;
gridIndex1
=
x
*
GRID_SIZE_Y
*
GRID_SIZE_Z
+
y
*
GRID_SIZE_Z
;
atomicAdd
(
&
ulonglong_p
[
2
*
index
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
add
*
0x100000000
)));
gridIndex2
=
x
*
GRID_SIZE_Y
*
GRID_SIZE_Z
+
y
*
GRID_SIZE_Z
+
gridPoint
.
z
;
#else
firstAtom
=
pmeAtomRange
[
gridIndex1
];
atomicAdd
(
&
pmeGrid
[
index
].
x
,
add
);
lastAtom
=
pmeAtomRange
[
gridIndex2
+
1
];
#endif
for
(
int
i
=
firstAtom
;
i
<
lastAtom
;
++
i
)
{
int2
atomData
=
pmeAtomGridIndex
[
i
];
int
atomIndex
=
atomData
.
x
;
int
z
=
atomData
.
y
;
int
iz
=
gridPoint
.
z
-
z
+
(
gridPoint
.
z
>=
z
?
0
:
GRID_SIZE_Z
);
if
(
iz
>=
GRID_SIZE_Z
)
iz
-=
GRID_SIZE_Z
;
real
atomCharge
=
posq
[
atomIndex
].
w
;
real
atomDipoleX
=
xscale
*
labFrameDipole
[
atomIndex
*
3
];
real
atomDipoleY
=
yscale
*
labFrameDipole
[
atomIndex
*
3
+
1
];
real
atomDipoleZ
=
zscale
*
labFrameDipole
[
atomIndex
*
3
+
2
];
real
atomQuadrupoleXX
=
xscale
*
xscale
*
labFrameQuadrupole
[
atomIndex
*
5
];
real
atomQuadrupoleXY
=
2
*
xscale
*
yscale
*
labFrameQuadrupole
[
atomIndex
*
5
+
1
];
real
atomQuadrupoleXZ
=
2
*
xscale
*
zscale
*
labFrameQuadrupole
[
atomIndex
*
5
+
2
];
real
atomQuadrupoleYY
=
yscale
*
yscale
*
labFrameQuadrupole
[
atomIndex
*
5
+
3
];
real
atomQuadrupoleYZ
=
2
*
yscale
*
zscale
*
labFrameQuadrupole
[
atomIndex
*
5
+
4
];
real
atomQuadrupoleZZ
=
-
zscale
*
zscale
*
(
labFrameQuadrupole
[
atomIndex
*
5
]
+
labFrameQuadrupole
[
atomIndex
*
5
+
3
]);
real4
t
=
theta1
[
atomIndex
*
PME_ORDER
+
ix
];
real4
u
=
theta2
[
atomIndex
*
PME_ORDER
+
iy
];
real4
v
=
theta3
[
atomIndex
*
PME_ORDER
+
iz
];
real
term0
=
atomCharge
*
u
.
x
*
v
.
x
+
atomDipoleY
*
u
.
y
*
v
.
x
+
atomDipoleZ
*
u
.
x
*
v
.
y
+
atomQuadrupoleYY
*
u
.
z
*
v
.
x
+
atomQuadrupoleZZ
*
u
.
x
*
v
.
z
+
atomQuadrupoleYZ
*
u
.
y
*
v
.
y
;
real
term1
=
atomDipoleX
*
u
.
x
*
v
.
x
+
atomQuadrupoleXY
*
u
.
y
*
v
.
x
+
atomQuadrupoleXZ
*
u
.
x
*
v
.
y
;
real
term2
=
atomQuadrupoleXX
*
u
.
x
*
v
.
x
;
result
+=
term0
*
t
.
x
+
term1
*
t
.
y
+
term2
*
t
.
z
;
}
}
}
}
}
}
}
pmeGrid
[
gridIndex
]
=
make_real2
(
result
,
0
);
}
}
}
}
extern
"C"
__global__
void
gridSpreadInducedDipoles
(
const
real4
*
__restrict__
posq
,
const
real
*
__restrict__
inducedDipole
,
extern
"C"
__global__
void
gridSpreadInducedDipoles
(
const
real4
*
__restrict__
posq
,
const
real
*
__restrict__
inducedDipole
,
const
real
*
__restrict__
inducedDipolePolar
,
real2
*
__restrict__
pmeGrid
,
int2
*
__restrict__
pmeAtomGridIndex
,
int
*
__restrict__
pmeAtomRange
,
const
real
*
__restrict__
inducedDipolePolar
,
real2
*
__restrict__
pmeGrid
,
int2
*
__restrict__
pmeAtomGridIndex
,
const
real4
*
__restrict__
theta1
,
const
real4
*
__restrict__
theta2
,
const
real4
*
__restrict__
theta3
,
real4
invPeriodicBoxSize
)
{
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
)
{
const
real
xscale
=
GRID_SIZE_X
*
invPeriodicBoxSize
.
x
;
const
real
xscale
=
GRID_SIZE_X
*
invPeriodicBoxSize
.
x
;
const
real
yscale
=
GRID_SIZE_Y
*
invPeriodicBoxSize
.
y
;
const
real
yscale
=
GRID_SIZE_Y
*
invPeriodicBoxSize
.
y
;
const
real
zscale
=
GRID_SIZE_Z
*
invPeriodicBoxSize
.
z
;
const
real
zscale
=
GRID_SIZE_Z
*
invPeriodicBoxSize
.
z
;
unsigned
int
numGridPoints
=
GRID_SIZE_X
*
GRID_SIZE_Y
*
GRID_SIZE_Z
;
real
array
[
PME_ORDER
*
PME_ORDER
];
unsigned
int
numThreads
=
gridDim
.
x
*
blockDim
.
x
;
real4
theta1
[
PME_ORDER
];
for
(
int
gridIndex
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
gridIndex
<
numGridPoints
;
gridIndex
+=
numThreads
)
{
real4
theta2
[
PME_ORDER
];
int3
gridPoint
;
real4
theta3
[
PME_ORDER
];
gridPoint
.
x
=
gridIndex
/
(
GRID_SIZE_Y
*
GRID_SIZE_Z
);
int
remainder
=
gridIndex
-
gridPoint
.
x
*
GRID_SIZE_Y
*
GRID_SIZE_Z
;
// Process the atoms in spatially sorted order. This improves cache performance when loading
gridPoint
.
y
=
remainder
/
GRID_SIZE_Z
;
// the grid values.
gridPoint
.
z
=
remainder
-
gridPoint
.
y
*
GRID_SIZE_Z
;
real2
result
=
make_real2
(
0
,
0
);
for
(
int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
i
<
NUM_ATOMS
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
for
(
int
ix
=
0
;
ix
<
PME_ORDER
;
++
ix
)
{
int
m
=
pmeAtomGridIndex
[
i
].
x
;
int
x
=
gridPoint
.
x
-
ix
+
(
gridPoint
.
x
>=
ix
?
0
:
GRID_SIZE_X
);
real4
pos
=
posq
[
m
];
for
(
int
iy
=
0
;
iy
<
PME_ORDER
;
++
iy
)
{
pos
.
x
-=
floor
(
pos
.
x
*
invPeriodicBoxSize
.
x
)
*
periodicBoxSize
.
x
;
int
y
=
gridPoint
.
y
-
iy
+
(
gridPoint
.
y
>=
iy
?
0
:
GRID_SIZE_Y
);
pos
.
y
-=
floor
(
pos
.
y
*
invPeriodicBoxSize
.
y
)
*
periodicBoxSize
.
y
;
int
z1
=
gridPoint
.
z
-
PME_ORDER
+
1
;
pos
.
z
-=
floor
(
pos
.
z
*
invPeriodicBoxSize
.
z
)
*
periodicBoxSize
.
z
;
z1
+=
(
z1
>=
0
?
0
:
GRID_SIZE_Z
);
int
z2
=
(
z1
<
gridPoint
.
z
?
gridPoint
.
z
:
GRID_SIZE_Z
-
1
);
// Since we need the full set of thetas, it's faster to compute them here than load them
int
gridIndex1
=
x
*
GRID_SIZE_Y
*
GRID_SIZE_Z
+
y
*
GRID_SIZE_Z
+
z1
;
// from global memory.
int
gridIndex2
=
x
*
GRID_SIZE_Y
*
GRID_SIZE_Z
+
y
*
GRID_SIZE_Z
+
z2
;
int
firstAtom
=
pmeAtomRange
[
gridIndex1
];
real
w
=
pos
.
x
*
invPeriodicBoxSize
.
x
;
int
lastAtom
=
pmeAtomRange
[
gridIndex2
+
1
];
real
fr
=
GRID_SIZE_X
*
(
w
-
(
int
)(
w
+
0.5
f
)
+
0.5
f
);
for
(
int
i
=
firstAtom
;
i
<
lastAtom
;
++
i
)
{
int
ifr
=
(
int
)
fr
;
int2
atomData
=
pmeAtomGridIndex
[
i
];
w
=
fr
-
ifr
;
int
atomIndex
=
atomData
.
x
;
int
igrid1
=
ifr
-
PME_ORDER
+
1
;
int
z
=
atomData
.
y
;
computeBSplinePoint
(
theta1
,
w
,
array
);
int
iz
=
gridPoint
.
z
-
z
+
(
gridPoint
.
z
>=
z
?
0
:
GRID_SIZE_Z
);
w
=
pos
.
y
*
invPeriodicBoxSize
.
y
;
if
(
iz
>=
GRID_SIZE_Z
)
fr
=
GRID_SIZE_Y
*
(
w
-
(
int
)(
w
+
0.5
f
)
+
0.5
f
);
iz
-=
GRID_SIZE_Z
;
ifr
=
(
int
)
fr
;
real
inducedDipoleX
=
xscale
*
inducedDipole
[
atomIndex
*
3
];
w
=
fr
-
ifr
;
real
inducedDipoleY
=
yscale
*
inducedDipole
[
atomIndex
*
3
+
1
];
int
igrid2
=
ifr
-
PME_ORDER
+
1
;
real
inducedDipoleZ
=
zscale
*
inducedDipole
[
atomIndex
*
3
+
2
];
computeBSplinePoint
(
theta2
,
w
,
array
);
real
inducedDipolePolarX
=
xscale
*
inducedDipolePolar
[
atomIndex
*
3
];
w
=
pos
.
z
*
invPeriodicBoxSize
.
z
;
real
inducedDipolePolarY
=
yscale
*
inducedDipolePolar
[
atomIndex
*
3
+
1
];
fr
=
GRID_SIZE_Z
*
(
w
-
(
int
)(
w
+
0.5
f
)
+
0.5
f
);
real
inducedDipolePolarZ
=
zscale
*
inducedDipolePolar
[
atomIndex
*
3
+
2
];
ifr
=
(
int
)
fr
;
real4
t
=
theta1
[
atomIndex
*
PME_ORDER
+
ix
];
w
=
fr
-
ifr
;
real4
u
=
theta2
[
atomIndex
*
PME_ORDER
+
iy
];
int
igrid3
=
ifr
-
PME_ORDER
+
1
;
real4
v
=
theta3
[
atomIndex
*
PME_ORDER
+
iz
];
computeBSplinePoint
(
theta3
,
w
,
array
);
igrid1
+=
(
igrid1
<
0
?
GRID_SIZE_X
:
0
);
igrid2
+=
(
igrid2
<
0
?
GRID_SIZE_Y
:
0
);
igrid3
+=
(
igrid3
<
0
?
GRID_SIZE_Z
:
0
);
// Spread the charge from this atom onto each grid point.
for
(
int
ix
=
0
;
ix
<
PME_ORDER
;
ix
++
)
{
int
xbase
=
igrid1
+
ix
;
xbase
-=
(
xbase
>=
GRID_SIZE_X
?
GRID_SIZE_X
:
0
);
xbase
=
xbase
*
GRID_SIZE_Y
*
GRID_SIZE_Z
;
real4
t
=
theta1
[
ix
];
for
(
int
iy
=
0
;
iy
<
PME_ORDER
;
iy
++
)
{
int
ybase
=
igrid2
+
iy
;
ybase
-=
(
ybase
>=
GRID_SIZE_Y
?
GRID_SIZE_Y
:
0
);
ybase
=
xbase
+
ybase
*
GRID_SIZE_Z
;
real4
u
=
theta2
[
iy
];
for
(
int
iz
=
0
;
iz
<
PME_ORDER
;
iz
++
)
{
int
zindex
=
igrid3
+
iz
;
zindex
-=
(
zindex
>=
GRID_SIZE_Z
?
GRID_SIZE_Z
:
0
);
int
index
=
ybase
+
zindex
;
real4
v
=
theta3
[
iz
];
real
inducedDipoleX
=
xscale
*
inducedDipole
[
m
*
3
];
real
inducedDipoleY
=
yscale
*
inducedDipole
[
m
*
3
+
1
];
real
inducedDipoleZ
=
zscale
*
inducedDipole
[
m
*
3
+
2
];
real
inducedDipolePolarX
=
xscale
*
inducedDipolePolar
[
m
*
3
];
real
inducedDipolePolarY
=
yscale
*
inducedDipolePolar
[
m
*
3
+
1
];
real
inducedDipolePolarZ
=
zscale
*
inducedDipolePolar
[
m
*
3
+
2
];
real
term01
=
inducedDipoleY
*
u
.
y
*
v
.
x
+
inducedDipoleZ
*
u
.
x
*
v
.
y
;
real
term01
=
inducedDipoleY
*
u
.
y
*
v
.
x
+
inducedDipoleZ
*
u
.
x
*
v
.
y
;
real
term11
=
inducedDipoleX
*
u
.
x
*
v
.
x
;
real
term11
=
inducedDipoleX
*
u
.
x
*
v
.
x
;
real
term02
=
inducedDipolePolarY
*
u
.
y
*
v
.
x
+
inducedDipolePolarZ
*
u
.
x
*
v
.
y
;
real
term02
=
inducedDipolePolarY
*
u
.
y
*
v
.
x
+
inducedDipolePolarZ
*
u
.
x
*
v
.
y
;
real
term12
=
inducedDipolePolarX
*
u
.
x
*
v
.
x
;
real
term12
=
inducedDipolePolarX
*
u
.
x
*
v
.
x
;
result
.
x
+=
term01
*
t
.
x
+
term11
*
t
.
y
;
real
add1
=
term01
*
t
.
x
+
term11
*
t
.
y
;
result
.
y
+=
term02
*
t
.
x
+
term12
*
t
.
y
;
real
add2
=
term02
*
t
.
x
+
term12
*
t
.
y
;
}
#ifdef USE_DOUBLE_PRECISION
if
(
z1
>
gridPoint
.
z
)
{
unsigned
long
long
*
ulonglong_p
=
(
unsigned
long
long
*
)
pmeGrid
;
gridIndex1
=
x
*
GRID_SIZE_Y
*
GRID_SIZE_Z
+
y
*
GRID_SIZE_Z
;
atomicAdd
(
&
ulonglong_p
[
2
*
index
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
add1
*
0x100000000
)));
gridIndex2
=
x
*
GRID_SIZE_Y
*
GRID_SIZE_Z
+
y
*
GRID_SIZE_Z
+
gridPoint
.
z
;
atomicAdd
(
&
ulonglong_p
[
2
*
index
+
1
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
add2
*
0x100000000
)));
firstAtom
=
pmeAtomRange
[
gridIndex1
];
#else
lastAtom
=
pmeAtomRange
[
gridIndex2
+
1
];
atomicAdd
(
&
pmeGrid
[
index
].
x
,
add1
);
for
(
int
i
=
firstAtom
;
i
<
lastAtom
;
++
i
)
{
atomicAdd
(
&
pmeGrid
[
index
].
y
,
add2
);
int2
atomData
=
pmeAtomGridIndex
[
i
];
#endif
int
atomIndex
=
atomData
.
x
;
int
z
=
atomData
.
y
;
int
iz
=
gridPoint
.
z
-
z
+
(
gridPoint
.
z
>=
z
?
0
:
GRID_SIZE_Z
);
if
(
iz
>=
GRID_SIZE_Z
)
iz
-=
GRID_SIZE_Z
;
real
inducedDipoleX
=
xscale
*
inducedDipole
[
atomIndex
*
3
];
real
inducedDipoleY
=
yscale
*
inducedDipole
[
atomIndex
*
3
+
1
];
real
inducedDipoleZ
=
zscale
*
inducedDipole
[
atomIndex
*
3
+
2
];
real
inducedDipolePolarX
=
xscale
*
inducedDipolePolar
[
atomIndex
*
3
];
real
inducedDipolePolarY
=
yscale
*
inducedDipolePolar
[
atomIndex
*
3
+
1
];
real
inducedDipolePolarZ
=
zscale
*
inducedDipolePolar
[
atomIndex
*
3
+
2
];
real4
t
=
theta1
[
atomIndex
*
PME_ORDER
+
ix
];
real4
u
=
theta2
[
atomIndex
*
PME_ORDER
+
iy
];
real4
v
=
theta3
[
atomIndex
*
PME_ORDER
+
iz
];
real
term01
=
inducedDipoleY
*
u
.
y
*
v
.
x
+
inducedDipoleZ
*
u
.
x
*
v
.
y
;
real
term11
=
inducedDipoleX
*
u
.
x
*
v
.
x
;
real
term02
=
inducedDipolePolarY
*
u
.
y
*
v
.
x
+
inducedDipolePolarZ
*
u
.
x
*
v
.
y
;
real
term12
=
inducedDipolePolarX
*
u
.
x
*
v
.
x
;
result
.
x
+=
term01
*
t
.
x
+
term11
*
t
.
y
;
result
.
y
+=
term02
*
t
.
x
+
term12
*
t
.
y
;
}
}
}
}
}
}
}
pmeGrid
[
gridIndex
]
=
result
;
}
}
}
}
/**
 * Convert the PME grid from its fixed point representation back to floating point, in place.
 *
 * In double precision the grid values are accumulated as 64 bit fixed point integers
 * (each contribution is multiplied by 0x100000000 before being added), so every stored
 * value is multiplied by 1/0x100000000 to recover the real number.
 *
 * @param pmeGrid  the grid holding fixed point values on entry; it is overwritten with the
 *                 converted values, viewed as an array of real
 */
extern "C" __global__ void finishSpreadCharge(long long* __restrict__ pmeGrid) {
    // Two stored values per grid point.
    const unsigned int numValues = 2*GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
    const unsigned int stride = blockDim.x*gridDim.x;
    real fixedToReal = 1/(real) 0x100000000;

    // The converted values are written back over the same memory that is being read.
    // NOTE(review): this presumably relies on real being the same size as long long
    // (i.e. the kernel only being launched in double precision) -- confirm against the caller.
    real* realGrid = (real*) pmeGrid;

    int i = blockIdx.x*blockDim.x+threadIdx.x;
    while (i < numValues) {
        realGrid[i] = fixedToReal*pmeGrid[i];
        i += stride;
    }
}
extern
"C"
__global__
void
reciprocalConvolution
(
real2
*
__restrict__
pmeGrid
,
const
real
*
__restrict__
pmeBsplineModuliX
,
extern
"C"
__global__
void
reciprocalConvolution
(
real2
*
__restrict__
pmeGrid
,
const
real
*
__restrict__
pmeBsplineModuliX
,
const
real
*
__restrict__
pmeBsplineModuliY
,
const
real
*
__restrict__
pmeBsplineModuliZ
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
)
{
const
real
*
__restrict__
pmeBsplineModuliY
,
const
real
*
__restrict__
pmeBsplineModuliZ
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
)
{
const
unsigned
int
gridSize
=
GRID_SIZE_X
*
GRID_SIZE_Y
*
GRID_SIZE_Z
;
const
unsigned
int
gridSize
=
GRID_SIZE_X
*
GRID_SIZE_Y
*
GRID_SIZE_Z
;
...
@@ -372,12 +337,50 @@ extern "C" __global__ void reciprocalConvolution(real2* __restrict__ pmeGrid, co
...
@@ -372,12 +337,50 @@ extern "C" __global__ void reciprocalConvolution(real2* __restrict__ pmeGrid, co
}
}
extern
"C"
__global__
void
computeFixedPotentialFromGrid
(
const
real2
*
__restrict__
pmeGrid
,
real
*
__restrict__
phi
,
extern
"C"
__global__
void
computeFixedPotentialFromGrid
(
const
real2
*
__restrict__
pmeGrid
,
real
*
__restrict__
phi
,
long
long
*
__restrict__
fieldBuffers
,
long
long
*
__restrict__
fieldPolarBuffers
,
const
int4
*
__restrict__
igrid
,
const
real4
*
__restrict__
theta1
,
long
long
*
__restrict__
fieldBuffers
,
long
long
*
__restrict__
fieldPolarBuffers
,
const
real4
*
__restrict__
posq
,
const
real4
*
__restrict__
theta2
,
const
real4
*
__restrict__
theta3
,
const
real
*
__restrict__
labFrameDipole
,
real4
invPeriodicBoxSize
)
{
const
real
*
__restrict__
labFrameDipole
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
int2
*
__restrict__
pmeAtomGridIndex
)
{
// extract the permanent multipole field at each site
real
array
[
PME_ORDER
*
PME_ORDER
];
real4
theta1
[
PME_ORDER
];
real4
theta2
[
PME_ORDER
];
real4
theta3
[
PME_ORDER
];
// Process the atoms in spatially sorted order. This improves cache performance when loading
// the grid values.
for
(
int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
i
<
NUM_ATOMS
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
m
=
pmeAtomGridIndex
[
i
].
x
;
real4
pos
=
posq
[
m
];
pos
.
x
-=
floor
(
pos
.
x
*
invPeriodicBoxSize
.
x
)
*
periodicBoxSize
.
x
;
pos
.
y
-=
floor
(
pos
.
y
*
invPeriodicBoxSize
.
y
)
*
periodicBoxSize
.
y
;
pos
.
z
-=
floor
(
pos
.
z
*
invPeriodicBoxSize
.
z
)
*
periodicBoxSize
.
z
;
// Since we need the full set of thetas, it's faster to compute them here than load them
// from global memory.
real
w
=
pos
.
x
*
invPeriodicBoxSize
.
x
;
real
fr
=
GRID_SIZE_X
*
(
w
-
(
int
)(
w
+
0.5
f
)
+
0.5
f
);
int
ifr
=
(
int
)
fr
;
w
=
fr
-
ifr
;
int
igrid1
=
ifr
-
PME_ORDER
+
1
;
computeBSplinePoint
(
theta1
,
w
,
array
);
w
=
pos
.
y
*
invPeriodicBoxSize
.
y
;
fr
=
GRID_SIZE_Y
*
(
w
-
(
int
)(
w
+
0.5
f
)
+
0.5
f
);
ifr
=
(
int
)
fr
;
w
=
fr
-
ifr
;
int
igrid2
=
ifr
-
PME_ORDER
+
1
;
computeBSplinePoint
(
theta2
,
w
,
array
);
w
=
pos
.
z
*
invPeriodicBoxSize
.
z
;
fr
=
GRID_SIZE_Z
*
(
w
-
(
int
)(
w
+
0.5
f
)
+
0.5
f
);
ifr
=
(
int
)
fr
;
w
=
fr
-
ifr
;
int
igrid3
=
ifr
-
PME_ORDER
+
1
;
computeBSplinePoint
(
theta3
,
w
,
array
);
igrid1
+=
(
igrid1
<
0
?
GRID_SIZE_X
:
0
);
igrid2
+=
(
igrid2
<
0
?
GRID_SIZE_Y
:
0
);
igrid3
+=
(
igrid3
<
0
?
GRID_SIZE_Z
:
0
);
// Compute the potential from this grid point.
for
(
int
m
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
m
<
NUM_ATOMS
;
m
+=
blockDim
.
x
*
gridDim
.
x
)
{
int4
gridPoint
=
igrid
[
m
];
real
tuv000
=
0
;
real
tuv000
=
0
;
real
tuv001
=
0
;
real
tuv001
=
0
;
real
tuv010
=
0
;
real
tuv010
=
0
;
...
@@ -399,8 +402,8 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict
...
@@ -399,8 +402,8 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict
real
tuv012
=
0
;
real
tuv012
=
0
;
real
tuv111
=
0
;
real
tuv111
=
0
;
for
(
int
iz
=
0
;
iz
<
PME_ORDER
;
iz
++
)
{
for
(
int
iz
=
0
;
iz
<
PME_ORDER
;
iz
++
)
{
int
k
=
grid
Point
.
z
+
iz
-
(
grid
Point
.
z
+
iz
>=
GRID_SIZE_Z
?
GRID_SIZE_Z
:
0
);
int
k
=
i
grid
3
+
iz
-
(
i
grid
3
+
iz
>=
GRID_SIZE_Z
?
GRID_SIZE_Z
:
0
);
real4
v
=
theta3
[
m
*
PME_ORDER
+
iz
];
real4
v
=
theta3
[
iz
];
real
tu00
=
0
;
real
tu00
=
0
;
real
tu10
=
0
;
real
tu10
=
0
;
real
tu01
=
0
;
real
tu01
=
0
;
...
@@ -412,14 +415,14 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict
...
@@ -412,14 +415,14 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict
real
tu12
=
0
;
real
tu12
=
0
;
real
tu03
=
0
;
real
tu03
=
0
;
for
(
int
iy
=
0
;
iy
<
PME_ORDER
;
iy
++
)
{
for
(
int
iy
=
0
;
iy
<
PME_ORDER
;
iy
++
)
{
int
j
=
grid
Point
.
y
+
iy
-
(
grid
Point
.
y
+
iy
>=
GRID_SIZE_Y
?
GRID_SIZE_Y
:
0
);
int
j
=
i
grid
2
+
iy
-
(
i
grid
2
+
iy
>=
GRID_SIZE_Y
?
GRID_SIZE_Y
:
0
);
real4
u
=
theta2
[
m
*
PME_ORDER
+
iy
];
real4
u
=
theta2
[
iy
];
real4
t
=
make_real4
(
0
,
0
,
0
,
0
);
real4
t
=
make_real4
(
0
,
0
,
0
,
0
);
for
(
int
ix
=
0
;
ix
<
PME_ORDER
;
ix
++
)
{
for
(
int
ix
=
0
;
ix
<
PME_ORDER
;
ix
++
)
{
int
i
=
grid
Point
.
x
+
ix
-
(
grid
Point
.
x
+
ix
>=
GRID_SIZE_X
?
GRID_SIZE_X
:
0
);
int
i
=
i
grid
1
+
ix
-
(
i
grid
1
+
ix
>=
GRID_SIZE_X
?
GRID_SIZE_X
:
0
);
int
gridIndex
=
i
*
GRID_SIZE_Y
*
GRID_SIZE_Z
+
j
*
GRID_SIZE_Z
+
k
;
int
gridIndex
=
i
*
GRID_SIZE_Y
*
GRID_SIZE_Z
+
j
*
GRID_SIZE_Z
+
k
;
real
tq
=
pmeGrid
[
gridIndex
].
x
;
real
tq
=
pmeGrid
[
gridIndex
].
x
;
real4
tadd
=
theta1
[
m
*
PME_ORDER
+
ix
];
real4
tadd
=
theta1
[
ix
];
t
.
x
+=
tq
*
tadd
.
x
;
t
.
x
+=
tq
*
tadd
.
x
;
t
.
y
+=
tq
*
tadd
.
y
;
t
.
y
+=
tq
*
tadd
.
y
;
t
.
z
+=
tq
*
tadd
.
z
;
t
.
z
+=
tq
*
tadd
.
z
;
...
@@ -491,12 +494,50 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict
...
@@ -491,12 +494,50 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict
}
}
extern
"C"
__global__
void
computeInducedPotentialFromGrid
(
const
real2
*
__restrict__
pmeGrid
,
real
*
__restrict__
phid
,
extern
"C"
__global__
void
computeInducedPotentialFromGrid
(
const
real2
*
__restrict__
pmeGrid
,
real
*
__restrict__
phid
,
real
*
__restrict__
phip
,
real
*
__restrict__
phidp
,
const
int4
*
__restrict__
igrid
,
const
real4
*
__restrict__
theta1
,
real
*
__restrict__
phip
,
real
*
__restrict__
phidp
,
const
real4
*
__restrict__
posq
,
const
real4
*
__restrict__
theta2
,
const
real4
*
__restrict__
theta3
,
real4
invPeriodicBoxSize
)
{
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
int2
*
__restrict__
pmeAtomGridIndex
)
{
// extract the induced dipole field at each site
real
array
[
PME_ORDER
*
PME_ORDER
];
real4
theta1
[
PME_ORDER
];
real4
theta2
[
PME_ORDER
];
real4
theta3
[
PME_ORDER
];
// Process the atoms in spatially sorted order. This improves cache performance when loading
// the grid values.
for
(
int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
i
<
NUM_ATOMS
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
m
=
pmeAtomGridIndex
[
i
].
x
;
real4
pos
=
posq
[
m
];
pos
.
x
-=
floor
(
pos
.
x
*
invPeriodicBoxSize
.
x
)
*
periodicBoxSize
.
x
;
pos
.
y
-=
floor
(
pos
.
y
*
invPeriodicBoxSize
.
y
)
*
periodicBoxSize
.
y
;
pos
.
z
-=
floor
(
pos
.
z
*
invPeriodicBoxSize
.
z
)
*
periodicBoxSize
.
z
;
// Since we need the full set of thetas, it's faster to compute them here than load them
// from global memory.
real
w
=
pos
.
x
*
invPeriodicBoxSize
.
x
;
real
fr
=
GRID_SIZE_X
*
(
w
-
(
int
)(
w
+
0.5
f
)
+
0.5
f
);
int
ifr
=
(
int
)
fr
;
w
=
fr
-
ifr
;
int
igrid1
=
ifr
-
PME_ORDER
+
1
;
computeBSplinePoint
(
theta1
,
w
,
array
);
w
=
pos
.
y
*
invPeriodicBoxSize
.
y
;
fr
=
GRID_SIZE_Y
*
(
w
-
(
int
)(
w
+
0.5
f
)
+
0.5
f
);
ifr
=
(
int
)
fr
;
w
=
fr
-
ifr
;
int
igrid2
=
ifr
-
PME_ORDER
+
1
;
computeBSplinePoint
(
theta2
,
w
,
array
);
w
=
pos
.
z
*
invPeriodicBoxSize
.
z
;
fr
=
GRID_SIZE_Z
*
(
w
-
(
int
)(
w
+
0.5
f
)
+
0.5
f
);
ifr
=
(
int
)
fr
;
w
=
fr
-
ifr
;
int
igrid3
=
ifr
-
PME_ORDER
+
1
;
computeBSplinePoint
(
theta3
,
w
,
array
);
igrid1
+=
(
igrid1
<
0
?
GRID_SIZE_X
:
0
);
igrid2
+=
(
igrid2
<
0
?
GRID_SIZE_Y
:
0
);
igrid3
+=
(
igrid3
<
0
?
GRID_SIZE_Z
:
0
);
// Compute the potential from this grid point.
for
(
int
m
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
m
<
NUM_ATOMS
;
m
+=
blockDim
.
x
*
gridDim
.
x
)
{
int4
gridPoint
=
igrid
[
m
];
real
tuv100_1
=
0
;
real
tuv100_1
=
0
;
real
tuv010_1
=
0
;
real
tuv010_1
=
0
;
real
tuv001_1
=
0
;
real
tuv001_1
=
0
;
...
@@ -536,8 +577,8 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri
...
@@ -536,8 +577,8 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri
real
tuv012
=
0
;
real
tuv012
=
0
;
real
tuv111
=
0
;
real
tuv111
=
0
;
for
(
int
iz
=
0
;
iz
<
PME_ORDER
;
iz
++
)
{
for
(
int
iz
=
0
;
iz
<
PME_ORDER
;
iz
++
)
{
int
k
=
grid
Point
.
z
+
iz
-
(
grid
Point
.
z
+
iz
>=
GRID_SIZE_Z
?
GRID_SIZE_Z
:
0
);
int
k
=
i
grid
3
+
iz
-
(
i
grid
3
+
iz
>=
GRID_SIZE_Z
?
GRID_SIZE_Z
:
0
);
real4
v
=
theta3
[
m
*
PME_ORDER
+
iz
];
real4
v
=
theta3
[
iz
];
real
tu00_1
=
0
;
real
tu00_1
=
0
;
real
tu01_1
=
0
;
real
tu01_1
=
0
;
real
tu10_1
=
0
;
real
tu10_1
=
0
;
...
@@ -561,8 +602,8 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri
...
@@ -561,8 +602,8 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri
real
tu12
=
0
;
real
tu12
=
0
;
real
tu03
=
0
;
real
tu03
=
0
;
for
(
int
iy
=
0
;
iy
<
PME_ORDER
;
iy
++
)
{
for
(
int
iy
=
0
;
iy
<
PME_ORDER
;
iy
++
)
{
int
j
=
grid
Point
.
y
+
iy
-
(
grid
Point
.
y
+
iy
>=
GRID_SIZE_Y
?
GRID_SIZE_Y
:
0
);
int
j
=
i
grid
2
+
iy
-
(
i
grid
2
+
iy
>=
GRID_SIZE_Y
?
GRID_SIZE_Y
:
0
);
real4
u
=
theta2
[
m
*
PME_ORDER
+
iy
];
real4
u
=
theta2
[
iy
];
real
t0_1
=
0
;
real
t0_1
=
0
;
real
t1_1
=
0
;
real
t1_1
=
0
;
real
t2_1
=
0
;
real
t2_1
=
0
;
...
@@ -571,10 +612,10 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri
...
@@ -571,10 +612,10 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri
real
t2_2
=
0
;
real
t2_2
=
0
;
real
t3
=
0
;
real
t3
=
0
;
for
(
int
ix
=
0
;
ix
<
PME_ORDER
;
ix
++
)
{
for
(
int
ix
=
0
;
ix
<
PME_ORDER
;
ix
++
)
{
int
i
=
grid
Point
.
x
+
ix
-
(
grid
Point
.
x
+
ix
>=
GRID_SIZE_X
?
GRID_SIZE_X
:
0
);
int
i
=
i
grid
1
+
ix
-
(
i
grid
1
+
ix
>=
GRID_SIZE_X
?
GRID_SIZE_X
:
0
);
int
gridIndex
=
i
*
GRID_SIZE_Y
*
GRID_SIZE_Z
+
j
*
GRID_SIZE_Z
+
k
;
int
gridIndex
=
i
*
GRID_SIZE_Y
*
GRID_SIZE_Z
+
j
*
GRID_SIZE_Z
+
k
;
real2
tq
=
pmeGrid
[
gridIndex
];
real2
tq
=
pmeGrid
[
gridIndex
];
real4
tadd
=
theta1
[
m
*
PME_ORDER
+
ix
];
real4
tadd
=
theta1
[
ix
];
t0_1
+=
tq
.
x
*
tadd
.
x
;
t0_1
+=
tq
.
x
*
tadd
.
x
;
t1_1
+=
tq
.
x
*
tadd
.
y
;
t1_1
+=
tq
.
x
*
tadd
.
y
;
t2_1
+=
tq
.
x
*
tadd
.
z
;
t2_1
+=
tq
.
x
*
tadd
.
z
;
...
...
plugins/amoeba/platforms/cuda/src/kernels/pmeMultipoleElectrostatics.cu
View file @
93c467b2
#define TILE_SIZE 32
#define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)
#define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)
typedef
struct
{
typedef
struct
{
...
@@ -182,253 +181,223 @@ __device__ void computeSelfEnergyAndTorque(AtomData& atom1, real& energy) {
...
@@ -182,253 +181,223 @@ __device__ void computeSelfEnergyAndTorque(AtomData& atom1, real& energy) {
*/
*/
extern
"C"
__global__
void
computeElectrostatics
(
extern
"C"
__global__
void
computeElectrostatics
(
unsigned
long
long
*
__restrict__
forceBuffers
,
unsigned
long
long
*
__restrict__
torqueBuffers
,
real
*
__restrict__
energyBuffer
,
unsigned
long
long
*
__restrict__
forceBuffers
,
unsigned
long
long
*
__restrict__
torqueBuffers
,
real
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
posq
,
const
u
nsigned
int
*
__restrict__
exclusionIndice
s
,
const
unsigned
int
*
__restrict__
exclusionRowIndice
s
,
const
real4
*
__restrict__
posq
,
const
uint
2
*
__restrict__
covalentFlag
s
,
const
unsigned
int
*
__restrict__
polarizationGroupFlag
s
,
const
u
in
t2
*
__restrict__
covalentFlags
,
const
unsigned
int
*
__restrict__
polarizationGroupFlag
s
,
unsigned
int
startTileIndex
,
unsigned
int
numTileIndices
,
const
u
shor
t2
*
__restrict__
exclusionTile
s
,
unsigned
int
startTileIndex
,
unsigned
int
numTileIndices
,
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
const
ushort2
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
unsigned
int
maxTiles
,
const
unsigned
int
*
__restrict__
interacti
onFlag
s
,
const
ushort2
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
unsigned
int
maxTiles
,
const
real4
*
__restrict__
blockCenter
,
const
unsigned
int
*
__restrict__
interacti
ngAtom
s
,
#endif
#endif
const
real
*
__restrict__
labFrameDipole
,
const
real
*
__restrict__
labFrameQuadrupole
,
const
real
*
__restrict__
inducedDipole
,
const
real
*
__restrict__
labFrameDipole
,
const
real
*
__restrict__
labFrameQuadrupole
,
const
real
*
__restrict__
inducedDipole
,
const
real
*
__restrict__
inducedDipolePolar
,
const
float2
*
__restrict__
dampingAndThole
)
{
const
real
*
__restrict__
inducedDipolePolar
,
const
float2
*
__restrict__
dampingAndThole
)
{
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
const
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
TILE_SIZE
;
const
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
TILE_SIZE
;
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
real
energy
=
0
;
__shared__
AtomData
localData
[
THREAD_BLOCK_SIZE
];
// First loop: process tiles that contain exclusions.
const
unsigned
int
firstExclusionTile
=
FIRST_EXCLUSION_TILE
+
warp
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
totalWarps
;
const
unsigned
int
lastExclusionTile
=
FIRST_EXCLUSION_TILE
+
(
warp
+
1
)
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
totalWarps
;
for
(
int
pos
=
firstExclusionTile
;
pos
<
lastExclusionTile
;
pos
++
)
{
const
ushort2
tileIndices
=
exclusionTiles
[
pos
];
const
unsigned
int
x
=
tileIndices
.
x
;
const
unsigned
int
y
=
tileIndices
.
y
;
AtomData
data
;
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
loadAtomData
(
data
,
atom1
,
posq
,
labFrameDipole
,
labFrameQuadrupole
,
inducedDipole
,
inducedDipolePolar
,
dampingAndThole
);
data
.
force
=
make_real3
(
0
);
data
.
torque
=
make_real3
(
0
);
uint2
covalent
=
covalentFlags
[
pos
*
TILE_SIZE
+
tgx
];
unsigned
int
polarizationGroup
=
polarizationGroupFlags
[
pos
*
TILE_SIZE
+
tgx
];
if
(
x
==
y
)
{
// This tile is on the diagonal.
localData
[
threadIdx
.
x
].
pos
=
data
.
pos
;
localData
[
threadIdx
.
x
].
q
=
data
.
q
;
localData
[
threadIdx
.
x
].
dipole
=
data
.
dipole
;
localData
[
threadIdx
.
x
].
quadrupoleXX
=
data
.
quadrupoleXX
;
localData
[
threadIdx
.
x
].
quadrupoleXY
=
data
.
quadrupoleXY
;
localData
[
threadIdx
.
x
].
quadrupoleXZ
=
data
.
quadrupoleXZ
;
localData
[
threadIdx
.
x
].
quadrupoleYY
=
data
.
quadrupoleYY
;
localData
[
threadIdx
.
x
].
quadrupoleYZ
=
data
.
quadrupoleYZ
;
localData
[
threadIdx
.
x
].
inducedDipole
=
data
.
inducedDipole
;
localData
[
threadIdx
.
x
].
inducedDipolePolar
=
data
.
inducedDipolePolar
;
localData
[
threadIdx
.
x
].
thole
=
data
.
thole
;
localData
[
threadIdx
.
x
].
damp
=
data
.
damp
;
// Compute forces.
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
y
*
TILE_SIZE
+
j
;
if
(
atom1
!=
atom2
&&
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
float
d
=
computeDScaleFactor
(
polarizationGroup
,
j
);
float
p
=
computePScaleFactor
(
covalent
,
polarizationGroup
,
j
);
float
m
=
computeMScaleFactor
(
covalent
,
j
);
computeOneInteraction
(
data
,
localData
[
tbx
+
j
],
true
,
d
,
p
,
m
,
0.5
f
,
energy
,
periodicBoxSize
,
invPeriodicBoxSize
);
}
}
if
(
atom1
<
NUM_ATOMS
)
computeSelfEnergyAndTorque
(
data
,
energy
);
data
.
force
*=
-
ENERGY_SCALE_FACTOR
;
data
.
torque
*=
ENERGY_SCALE_FACTOR
;
atomicAdd
(
&
forceBuffers
[
atom1
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom1
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom1
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
z
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
atom1
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
torque
.
x
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
atom1
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
torque
.
y
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
atom1
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
torque
.
z
*
0x100000000
)));
}
else
{
// This is an off-diagonal tile.
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
loadAtomData
(
localData
[
threadIdx
.
x
],
j
,
posq
,
labFrameDipole
,
labFrameQuadrupole
,
inducedDipole
,
inducedDipolePolar
,
dampingAndThole
);
localData
[
threadIdx
.
x
].
force
=
make_real3
(
0
);
localData
[
threadIdx
.
x
].
torque
=
make_real3
(
0
);
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
y
*
TILE_SIZE
+
tj
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
float
d
=
computeDScaleFactor
(
polarizationGroup
,
tj
);
float
p
=
computePScaleFactor
(
covalent
,
polarizationGroup
,
tj
);
float
m
=
computeMScaleFactor
(
covalent
,
tj
);
computeOneInteraction
(
data
,
localData
[
tbx
+
tj
],
true
,
d
,
p
,
m
,
1
,
energy
,
periodicBoxSize
,
invPeriodicBoxSize
);
}
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
data
.
force
*=
-
ENERGY_SCALE_FACTOR
;
data
.
torque
*=
ENERGY_SCALE_FACTOR
;
localData
[
threadIdx
.
x
].
force
*=
-
ENERGY_SCALE_FACTOR
;
localData
[
threadIdx
.
x
].
torque
*=
ENERGY_SCALE_FACTOR
;
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
z
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
torque
.
x
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
torque
.
y
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
torque
.
z
*
0x100000000
)));
offset
=
y
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
z
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
torque
.
x
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
torque
.
y
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
torque
.
z
*
0x100000000
)));
}
}
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
// of them (no cutoff).
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
const
unsigned
int
numTiles
=
interactionCount
[
0
];
const
unsigned
int
numTiles
=
interactionCount
[
0
];
unsigned
int
pos
=
(
numTiles
>
maxTiles
?
startTileIndex
+
warp
*
numTileIndices
/
totalWarps
:
warp
*
numTiles
/
totalWarps
);
int
pos
=
(
numTiles
>
maxTiles
?
startTileIndex
+
warp
*
numTileIndices
/
totalWarps
:
warp
*
numTiles
/
totalWarps
);
unsigned
int
end
=
(
numTiles
>
maxTiles
?
startTileIndex
+
(
warp
+
1
)
*
numTileIndices
/
totalWarps
:
(
warp
+
1
)
*
numTiles
/
totalWarps
);
int
end
=
(
numTiles
>
maxTiles
?
startTileIndex
+
(
warp
+
1
)
*
numTileIndices
/
totalWarps
:
(
warp
+
1
)
*
numTiles
/
totalWarps
);
#else
#else
const
unsigned
int
numTiles
=
numTileIndices
;
const
unsigned
int
numTiles
=
numTileIndices
;
unsigned
int
pos
=
startTileIndex
+
warp
*
numTiles
/
totalWarps
;
int
pos
=
startTileIndex
+
warp
*
numTiles
/
totalWarps
;
unsigned
int
end
=
startTileIndex
+
(
warp
+
1
)
*
numTiles
/
totalWarps
;
int
end
=
startTileIndex
+
(
warp
+
1
)
*
numTiles
/
totalWarps
;
#endif
real
energy
=
0
;
__shared__
AtomData
localData
[
THREAD_BLOCK_SIZE
];
__shared__
unsigned
int
exclusionRange
[
2
*
WARPS_PER_GROUP
];
__shared__
int
exclusionIndex
[
WARPS_PER_GROUP
];
#ifndef ENABLE_SHUFFLE
__shared__
real
tempBuffer
[
3
*
THREAD_BLOCK_SIZE
];
#endif
#endif
int
skipBase
=
0
;
int
currentSkipIndex
=
tbx
;
__shared__
int
atomIndices
[
THREAD_BLOCK_SIZE
];
__shared__
int
skipTiles
[
THREAD_BLOCK_SIZE
];
skipTiles
[
threadIdx
.
x
]
=
-
1
;
do
{
while
(
pos
<
end
)
{
// Extract the coordinates of this tile
bool
includeTile
=
true
;
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
// Extract the coordinates of this tile.
const
unsigned
int
localGroupIndex
=
threadIdx
.
x
/
TILE_SIZE
;
unsigned
int
x
,
y
;
unsigned
int
x
,
y
;
AtomData
data
;
if
(
pos
<
end
)
{
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
if
(
numTiles
<=
maxTiles
)
{
if
(
numTiles
<=
maxTiles
)
{
ushort2
tileIndices
=
tiles
[
pos
];
ushort2
tileIndices
=
tiles
[
pos
];
x
=
tileIndices
.
x
;
x
=
tileIndices
.
x
;
y
=
tileIndices
.
y
;
}
}
else
else
#endif
#endif
{
{
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS
+
0.5
f
-
SQRT
((
NUM_BLOCKS
+
0.5
f
)
*
(
NUM_BLOCKS
+
0.5
f
)
-
2
*
pos
));
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS
+
0.5
f
-
SQRT
((
NUM_BLOCKS
+
0.5
f
)
*
(
NUM_BLOCKS
+
0.5
f
)
-
2
*
pos
));
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
if
(
x
<
y
||
x
>=
NUM_BLOCKS
)
{
// Occasionally happens due to roundoff error.
y
+=
(
x
<
y
?
-
1
:
1
);
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
if
(
x
<
y
||
x
>=
NUM_BLOCKS
)
{
// Occasionally happens due to roundoff error.
}
y
+=
(
x
<
y
?
-
1
:
1
);
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
// Skip over tiles that have exclusions, since they were already processed.
while
(
skipTiles
[
tbx
+
TILE_SIZE
-
1
]
<
pos
)
{
if
(
skipBase
+
tgx
<
NUM_TILES_WITH_EXCLUSIONS
)
{
ushort2
tile
=
exclusionTiles
[
skipBase
+
tgx
];
skipTiles
[
threadIdx
.
x
]
=
tile
.
x
+
tile
.
y
*
NUM_BLOCKS
-
tile
.
y
*
(
tile
.
y
+
1
)
/
2
;
}
}
else
skipTiles
[
threadIdx
.
x
]
=
end
;
skipBase
+=
TILE_SIZE
;
currentSkipIndex
=
tbx
;
}
}
while
(
skipTiles
[
currentSkipIndex
]
<
pos
)
currentSkipIndex
++
;
includeTile
=
(
skipTiles
[
currentSkipIndex
]
!=
pos
);
}
if
(
includeTile
)
{
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
// Load atom data for this tile.
AtomData
data
;
loadAtomData
(
data
,
atom1
,
posq
,
labFrameDipole
,
labFrameQuadrupole
,
inducedDipole
,
inducedDipolePolar
,
dampingAndThole
);
loadAtomData
(
data
,
atom1
,
posq
,
labFrameDipole
,
labFrameQuadrupole
,
inducedDipole
,
inducedDipolePolar
,
dampingAndThole
);
data
.
force
=
make_real3
(
0
);
data
.
force
=
make_real3
(
0
);
data
.
torque
=
make_real3
(
0
);
data
.
torque
=
make_real3
(
0
);
// Locate the exclusion data for this tile.
if
(
tgx
<
2
)
exclusionRange
[
2
*
localGroupIndex
+
tgx
]
=
exclusionRowIndices
[
x
+
tgx
];
if
(
tgx
==
0
)
exclusionIndex
[
localGroupIndex
]
=
-
1
;
for
(
unsigned
int
i
=
exclusionRange
[
2
*
localGroupIndex
]
+
tgx
;
i
<
exclusionRange
[
2
*
localGroupIndex
+
1
];
i
+=
TILE_SIZE
)
if
(
exclusionIndices
[
i
]
==
y
)
exclusionIndex
[
localGroupIndex
]
=
i
*
TILE_SIZE
;
bool
hasExclusions
=
(
exclusionIndex
[
localGroupIndex
]
>
-
1
);
if
(
pos
>=
end
)
;
// This warp is done.
else
if
(
x
==
y
)
{
// This tile is on the diagonal.
localData
[
threadIdx
.
x
].
pos
=
data
.
pos
;
localData
[
threadIdx
.
x
].
q
=
data
.
q
;
localData
[
threadIdx
.
x
].
dipole
=
data
.
dipole
;
localData
[
threadIdx
.
x
].
quadrupoleXX
=
data
.
quadrupoleXX
;
localData
[
threadIdx
.
x
].
quadrupoleXY
=
data
.
quadrupoleXY
;
localData
[
threadIdx
.
x
].
quadrupoleXZ
=
data
.
quadrupoleXZ
;
localData
[
threadIdx
.
x
].
quadrupoleYY
=
data
.
quadrupoleYY
;
localData
[
threadIdx
.
x
].
quadrupoleYZ
=
data
.
quadrupoleYZ
;
localData
[
threadIdx
.
x
].
inducedDipole
=
data
.
inducedDipole
;
localData
[
threadIdx
.
x
].
inducedDipolePolar
=
data
.
inducedDipolePolar
;
localData
[
threadIdx
.
x
].
thole
=
data
.
thole
;
localData
[
threadIdx
.
x
].
damp
=
data
.
damp
;
uint2
covalent
=
covalentFlags
[
exclusionIndex
[
localGroupIndex
]
+
tgx
];
unsigned
int
polarizationGroup
=
polarizationGroupFlags
[
exclusionIndex
[
localGroupIndex
]
+
tgx
];
// Compute forces.
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
y
*
TILE_SIZE
+
j
;
if
(
atom1
!=
atom2
&&
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
float
d
=
computeDScaleFactor
(
polarizationGroup
,
j
);
float
p
=
computePScaleFactor
(
covalent
,
polarizationGroup
,
j
);
float
m
=
computeMScaleFactor
(
covalent
,
j
);
computeOneInteraction
(
data
,
localData
[
tbx
+
j
],
hasExclusions
,
d
,
p
,
m
,
0.5
f
,
energy
,
periodicBoxSize
,
invPeriodicBoxSize
);
}
}
if
(
atom1
<
NUM_ATOMS
)
computeSelfEnergyAndTorque
(
data
,
energy
);
data
.
force
*=
-
ENERGY_SCALE_FACTOR
;
data
.
torque
*=
ENERGY_SCALE_FACTOR
;
atomicAdd
(
&
forceBuffers
[
atom1
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom1
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom1
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
z
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
atom1
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
torque
.
x
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
atom1
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
torque
.
y
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
atom1
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
torque
.
z
*
0x100000000
)));
}
else
{
// This is an off-diagonal tile.
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
loadAtomData
(
localData
[
threadIdx
.
x
],
j
,
posq
,
labFrameDipole
,
labFrameQuadrupole
,
inducedDipole
,
inducedDipolePolar
,
dampingAndThole
);
localData
[
threadIdx
.
x
].
force
=
make_real3
(
0
);
localData
[
threadIdx
.
x
].
torque
=
make_real3
(
0
);
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
unsigned
int
flags
=
(
numTiles
<=
maxTiles
?
interactionFlags
[
pos
]
:
0xFFFFFFFF
);
unsigned
int
j
=
(
numTiles
<=
maxTiles
?
interactingAtoms
[
pos
*
TILE_SIZE
+
tgx
]
:
y
*
TILE_SIZE
+
tgx
);
if
(
!
hasExclusions
&&
flags
==
0
)
{
// TODO: Why doesn't the flags != 0 block work?
// if (!hasExclusions && flags != 0xFFFFFFFF) {
if
(
flags
==
0
)
{
// No interactions in this tile.
}
else
{
// Compute only a subset of the interactions in this tile.
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
if
((
flags
&
(
1
<<
j
))
!=
0
)
{
int
atom2
=
tbx
+
j
;
real3
oldForce
=
localData
[
atom2
].
force
;
real3
oldTorque
=
localData
[
atom2
].
torque
;
localData
[
atom2
].
force
=
make_real3
(
0
);
localData
[
atom2
].
torque
=
make_real3
(
0
);
computeOneInteraction
(
data
,
localData
[
tbx
+
j
],
false
,
1
,
1
,
1
,
1
,
energy
,
periodicBoxSize
,
invPeriodicBoxSize
);
real3
newForce
=
localData
[
atom2
].
force
;
real3
newTorque
=
localData
[
atom2
].
torque
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
#ifdef ENABLE_SHUFFLE
for
(
int
i
=
16
;
i
>=
1
;
i
/=
2
)
{
newForce
.
x
+=
__shfl_xor
(
newForce
.
x
,
i
,
32
);
newForce
.
y
+=
__shfl_xor
(
newForce
.
y
,
i
,
32
);
newForce
.
z
+=
__shfl_xor
(
newForce
.
z
,
i
,
32
);
newTorque
.
x
+=
__shfl_xor
(
newTorque
.
x
,
i
,
32
);
newTorque
.
y
+=
__shfl_xor
(
newTorque
.
y
,
i
,
32
);
newTorque
.
z
+=
__shfl_xor
(
newTorque
.
z
,
i
,
32
);
}
if
(
tgx
==
0
)
{
localData
[
atom2
].
force
-=
newForce
;
localData
[
atom2
].
torque
-=
newTorque
;
}
#else
#else
int
bufferIndex
=
3
*
threadIdx
.
x
;
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
tempBuffer
[
bufferIndex
]
=
newForce
.
x
;
tempBuffer
[
bufferIndex
+
1
]
=
newForce
.
y
;
tempBuffer
[
bufferIndex
+
2
]
=
newForce
.
z
;
if
(
tgx
%
4
==
0
)
{
tempBuffer
[
bufferIndex
]
+=
tempBuffer
[
bufferIndex
+
3
]
+
tempBuffer
[
bufferIndex
+
6
]
+
tempBuffer
[
bufferIndex
+
9
];
tempBuffer
[
bufferIndex
+
1
]
+=
tempBuffer
[
bufferIndex
+
4
]
+
tempBuffer
[
bufferIndex
+
7
]
+
tempBuffer
[
bufferIndex
+
10
];
tempBuffer
[
bufferIndex
+
2
]
+=
tempBuffer
[
bufferIndex
+
5
]
+
tempBuffer
[
bufferIndex
+
8
]
+
tempBuffer
[
bufferIndex
+
11
];
}
if
(
tgx
==
0
)
{
localData
[
atom2
].
force
.
x
-=
tempBuffer
[
bufferIndex
]
+
tempBuffer
[
bufferIndex
+
12
]
+
tempBuffer
[
bufferIndex
+
24
]
+
tempBuffer
[
bufferIndex
+
36
]
+
tempBuffer
[
bufferIndex
+
48
]
+
tempBuffer
[
bufferIndex
+
60
]
+
tempBuffer
[
bufferIndex
+
72
]
+
tempBuffer
[
bufferIndex
+
84
];
localData
[
atom2
].
force
.
y
-=
tempBuffer
[
bufferIndex
+
1
]
+
tempBuffer
[
bufferIndex
+
13
]
+
tempBuffer
[
bufferIndex
+
25
]
+
tempBuffer
[
bufferIndex
+
37
]
+
tempBuffer
[
bufferIndex
+
49
]
+
tempBuffer
[
bufferIndex
+
61
]
+
tempBuffer
[
bufferIndex
+
73
]
+
tempBuffer
[
bufferIndex
+
85
];
localData
[
atom2
].
force
.
z
-=
tempBuffer
[
bufferIndex
+
2
]
+
tempBuffer
[
bufferIndex
+
14
]
+
tempBuffer
[
bufferIndex
+
26
]
+
tempBuffer
[
bufferIndex
+
38
]
+
tempBuffer
[
bufferIndex
+
50
]
+
tempBuffer
[
bufferIndex
+
62
]
+
tempBuffer
[
bufferIndex
+
74
]
+
tempBuffer
[
bufferIndex
+
86
];
}
tempBuffer
[
bufferIndex
]
=
newTorque
.
x
;
tempBuffer
[
bufferIndex
+
1
]
=
newTorque
.
y
;
tempBuffer
[
bufferIndex
+
2
]
=
newTorque
.
z
;
if
(
tgx
%
4
==
0
)
{
tempBuffer
[
bufferIndex
]
+=
tempBuffer
[
bufferIndex
+
3
]
+
tempBuffer
[
bufferIndex
+
6
]
+
tempBuffer
[
bufferIndex
+
9
];
tempBuffer
[
bufferIndex
+
1
]
+=
tempBuffer
[
bufferIndex
+
4
]
+
tempBuffer
[
bufferIndex
+
7
]
+
tempBuffer
[
bufferIndex
+
10
];
tempBuffer
[
bufferIndex
+
2
]
+=
tempBuffer
[
bufferIndex
+
5
]
+
tempBuffer
[
bufferIndex
+
8
]
+
tempBuffer
[
bufferIndex
+
11
];
}
if
(
tgx
==
0
)
{
localData
[
atom2
].
torque
.
x
-=
tempBuffer
[
bufferIndex
]
+
tempBuffer
[
bufferIndex
+
12
]
+
tempBuffer
[
bufferIndex
+
24
]
+
tempBuffer
[
bufferIndex
+
36
]
+
tempBuffer
[
bufferIndex
+
48
]
+
tempBuffer
[
bufferIndex
+
60
]
+
tempBuffer
[
bufferIndex
+
72
]
+
tempBuffer
[
bufferIndex
+
84
];
localData
[
atom2
].
torque
.
y
-=
tempBuffer
[
bufferIndex
+
1
]
+
tempBuffer
[
bufferIndex
+
13
]
+
tempBuffer
[
bufferIndex
+
25
]
+
tempBuffer
[
bufferIndex
+
37
]
+
tempBuffer
[
bufferIndex
+
49
]
+
tempBuffer
[
bufferIndex
+
61
]
+
tempBuffer
[
bufferIndex
+
73
]
+
tempBuffer
[
bufferIndex
+
85
];
localData
[
atom2
].
torque
.
z
-=
tempBuffer
[
bufferIndex
+
2
]
+
tempBuffer
[
bufferIndex
+
14
]
+
tempBuffer
[
bufferIndex
+
26
]
+
tempBuffer
[
bufferIndex
+
38
]
+
tempBuffer
[
bufferIndex
+
50
]
+
tempBuffer
[
bufferIndex
+
62
]
+
tempBuffer
[
bufferIndex
+
74
]
+
tempBuffer
[
bufferIndex
+
86
];
}
#endif
}
}
}
data
.
force
*=
-
ENERGY_SCALE_FACTOR
;
data
.
torque
*=
-
ENERGY_SCALE_FACTOR
;
localData
[
threadIdx
.
x
].
force
*=
-
ENERGY_SCALE_FACTOR
;
localData
[
threadIdx
.
x
].
torque
*=
-
ENERGY_SCALE_FACTOR
;
if
(
pos
<
end
)
{
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
z
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
torque
.
x
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
torque
.
y
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
torque
.
z
*
0x100000000
)));
offset
=
y
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
z
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
torque
.
x
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
torque
.
y
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
torque
.
z
*
0x100000000
)));
}
}
}
else
#endif
#endif
{
atomIndices
[
threadIdx
.
x
]
=
j
;
// Compute the full set of interactions in this tile.
loadAtomData
(
localData
[
threadIdx
.
x
],
j
,
posq
,
labFrameDipole
,
labFrameQuadrupole
,
inducedDipole
,
inducedDipolePolar
,
dampingAndThole
);
localData
[
threadIdx
.
x
].
force
=
make_real3
(
0
);
uint2
covalent
=
(
hasExclusions
?
covalentFlags
[
exclusionIndex
[
localGroupIndex
]
+
tgx
]
:
make_uint2
(
0
,
0
));
localData
[
threadIdx
.
x
].
torque
=
make_real3
(
0
);
unsigned
int
polarizationGroup
=
(
hasExclusions
?
polarizationGroupFlags
[
exclusionIndex
[
localGroupIndex
]
+
tgx
]
:
0
);
// Compute forces.
// Compute forces.
unsigned
int
tj
=
tgx
;
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
atomIndices
[
tbx
+
tj
];
int
atom2
=
y
*
TILE_SIZE
+
tj
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
computeOneInteraction
(
data
,
localData
[
tbx
+
tj
],
false
,
1
,
1
,
1
,
1
,
energy
,
periodicBoxSize
,
invPeriodicBoxSize
);
float
d
=
computeDScaleFactor
(
polarizationGroup
,
tj
);
float
p
=
computePScaleFactor
(
covalent
,
polarizationGroup
,
tj
);
float
m
=
computeMScaleFactor
(
covalent
,
tj
);
computeOneInteraction
(
data
,
localData
[
tbx
+
tj
],
hasExclusions
,
d
,
p
,
m
,
1
,
energy
,
periodicBoxSize
,
invPeriodicBoxSize
);
}
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
data
.
force
*=
-
ENERGY_SCALE_FACTOR
;
data
.
torque
*=
ENERGY_SCALE_FACTOR
;
localData
[
threadIdx
.
x
].
force
*=
-
ENERGY_SCALE_FACTOR
;
localData
[
threadIdx
.
x
].
torque
*=
ENERGY_SCALE_FACTOR
;
if
(
pos
<
end
)
{
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
z
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
torque
.
x
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
torque
.
y
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
torque
.
z
*
0x100000000
)));
offset
=
y
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
z
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
torque
.
x
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
torque
.
y
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
torque
.
z
*
0x100000000
)));
}
}
}
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
data
.
force
*=
-
ENERGY_SCALE_FACTOR
;
data
.
torque
*=
ENERGY_SCALE_FACTOR
;
localData
[
threadIdx
.
x
].
force
*=
-
ENERGY_SCALE_FACTOR
;
localData
[
threadIdx
.
x
].
torque
*=
ENERGY_SCALE_FACTOR
;
// Write results.
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
force
.
z
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
torque
.
x
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
torque
.
y
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
data
.
torque
.
z
*
0x100000000
)));
#ifdef USE_CUTOFF
offset
=
atomIndices
[
threadIdx
.
x
];
#else
offset
=
y
*
TILE_SIZE
+
tgx
;
#endif
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
z
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
torque
.
x
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
torque
.
y
*
0x100000000
)));
atomicAdd
(
&
torqueBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
torque
.
z
*
0x100000000
)));
}
}
pos
++
;
pos
++
;
}
while
(
pos
<
end
);
}
energyBuffer
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
energy
*
ENERGY_SCALE_FACTOR
;
energyBuffer
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
energy
*
ENERGY_SCALE_FACTOR
;
}
}
Prev
1
2
3
4
5
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment