Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
b591d011
Commit
b591d011
authored
Aug 21, 2015
by
Peter Eastman
Browse files
Optimizations to AMOEBA PME
parent
9da84a77
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
128 additions
and
129 deletions
+128
-129
plugins/amoeba/platforms/cuda/src/kernels/multipolePme.cu
plugins/amoeba/platforms/cuda/src/kernels/multipolePme.cu
+128
-129
No files found.
plugins/amoeba/platforms/cuda/src/kernels/multipolePme.cu
View file @
b591d011
...
...
@@ -196,14 +196,14 @@ extern "C" __global__ void transformPotentialToCartesianCoordinates(const real*
// Transform the potential.
for
(
int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
i
<
NUM_ATOMS
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
cphi
[
10
*
i
]
=
fphi
[
20
*
i
];
cphi
[
10
*
i
+
1
]
=
a
[
0
][
0
]
*
fphi
[
20
*
i
+
1
]
+
a
[
0
][
1
]
*
fphi
[
20
*
i
+
2
]
+
a
[
0
][
2
]
*
fphi
[
20
*
i
+
3
];
cphi
[
10
*
i
+
2
]
=
a
[
1
][
0
]
*
fphi
[
20
*
i
+
1
]
+
a
[
1
][
1
]
*
fphi
[
20
*
i
+
2
]
+
a
[
1
][
2
]
*
fphi
[
20
*
i
+
3
];
cphi
[
10
*
i
+
3
]
=
a
[
2
][
0
]
*
fphi
[
20
*
i
+
1
]
+
a
[
2
][
1
]
*
fphi
[
20
*
i
+
2
]
+
a
[
2
][
2
]
*
fphi
[
20
*
i
+
3
];
cphi
[
10
*
i
]
=
fphi
[
i
];
cphi
[
10
*
i
+
1
]
=
a
[
0
][
0
]
*
fphi
[
i
+
NUM_ATOMS
*
1
]
+
a
[
0
][
1
]
*
fphi
[
i
+
NUM_ATOMS
*
2
]
+
a
[
0
][
2
]
*
fphi
[
i
+
NUM_ATOMS
*
3
];
cphi
[
10
*
i
+
2
]
=
a
[
1
][
0
]
*
fphi
[
i
+
NUM_ATOMS
*
1
]
+
a
[
1
][
1
]
*
fphi
[
i
+
NUM_ATOMS
*
2
]
+
a
[
1
][
2
]
*
fphi
[
i
+
NUM_ATOMS
*
3
];
cphi
[
10
*
i
+
3
]
=
a
[
2
][
0
]
*
fphi
[
i
+
NUM_ATOMS
*
1
]
+
a
[
2
][
1
]
*
fphi
[
i
+
NUM_ATOMS
*
2
]
+
a
[
2
][
2
]
*
fphi
[
i
+
NUM_ATOMS
*
3
];
for
(
int
j
=
0
;
j
<
6
;
j
++
)
{
cphi
[
10
*
i
+
4
+
j
]
=
0
;
for
(
int
k
=
0
;
k
<
6
;
k
++
)
cphi
[
10
*
i
+
4
+
j
]
+=
b
[
j
][
k
]
*
fphi
[
20
*
i
+
4
+
k
];
cphi
[
10
*
i
+
4
+
j
]
+=
b
[
j
][
k
]
*
fphi
[
i
+
NUM_ATOMS
*
(
4
+
k
)
];
}
}
}
...
...
@@ -211,20 +211,29 @@ extern "C" __global__ void transformPotentialToCartesianCoordinates(const real*
extern
"C"
__global__
void
gridSpreadFixedMultipoles
(
const
real4
*
__restrict__
posq
,
const
real
*
__restrict__
fracDipole
,
const
real
*
__restrict__
fracQuadrupole
,
real2
*
__restrict__
pmeGrid
,
int2
*
__restrict__
pmeAtomGridIndex
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
)
{
real
array
[
PME_ORDER
*
PME_ORDER
];
// The workspace array doesn't really need to be shared, but we have shared memory to spare, and putting it there
// reduces the load on L2 cache.
__shared__
real
sharedArray
[
PME_ORDER
*
PME_ORDER
*
64
];
real
*
array
=
&
sharedArray
[
PME_ORDER
*
PME_ORDER
*
threadIdx
.
x
];
real4
theta1
[
PME_ORDER
];
real4
theta2
[
PME_ORDER
];
real4
theta3
[
PME_ORDER
];
// Process the atoms in spatially sorted order. This improves cache performance when loading
// the grid values.
for
(
int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
i
<
NUM_ATOMS
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
m
=
pmeAtomGridIndex
[
i
].
x
;
for
(
int
m
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
m
<
NUM_ATOMS
;
m
+=
blockDim
.
x
*
gridDim
.
x
)
{
real4
pos
=
posq
[
m
];
pos
-=
periodicBoxVecZ
*
floor
(
pos
.
z
*
recipBoxVecZ
.
z
+
0.5
f
);
pos
-=
periodicBoxVecY
*
floor
(
pos
.
y
*
recipBoxVecY
.
z
+
0.5
f
);
pos
-=
periodicBoxVecX
*
floor
(
pos
.
x
*
recipBoxVecX
.
z
+
0.5
f
);
real
atomCharge
=
pos
.
w
;
real
atomDipoleX
=
fracDipole
[
m
*
3
];
real
atomDipoleY
=
fracDipole
[
m
*
3
+
1
];
real
atomDipoleZ
=
fracDipole
[
m
*
3
+
2
];
real
atomQuadrupoleXX
=
fracQuadrupole
[
m
*
6
];
real
atomQuadrupoleXY
=
fracQuadrupole
[
m
*
6
+
1
];
real
atomQuadrupoleXZ
=
fracQuadrupole
[
m
*
6
+
2
];
real
atomQuadrupoleYY
=
fracQuadrupole
[
m
*
6
+
3
];
real
atomQuadrupoleYZ
=
fracQuadrupole
[
m
*
6
+
4
];
real
atomQuadrupoleZZ
=
fracQuadrupole
[
m
*
6
+
5
];
// Since we need the full set of thetas, it's faster to compute them here than load them
// from global memory.
...
...
@@ -271,16 +280,6 @@ extern "C" __global__ void gridSpreadFixedMultipoles(const real4* __restrict__ p
int
index
=
ybase
+
zindex
;
real4
v
=
theta3
[
iz
];
real
atomCharge
=
pos
.
w
;
real
atomDipoleX
=
fracDipole
[
m
*
3
];
real
atomDipoleY
=
fracDipole
[
m
*
3
+
1
];
real
atomDipoleZ
=
fracDipole
[
m
*
3
+
2
];
real
atomQuadrupoleXX
=
fracQuadrupole
[
m
*
6
];
real
atomQuadrupoleXY
=
fracQuadrupole
[
m
*
6
+
1
];
real
atomQuadrupoleXZ
=
fracQuadrupole
[
m
*
6
+
2
];
real
atomQuadrupoleYY
=
fracQuadrupole
[
m
*
6
+
3
];
real
atomQuadrupoleYZ
=
fracQuadrupole
[
m
*
6
+
4
];
real
atomQuadrupoleZZ
=
fracQuadrupole
[
m
*
6
+
5
];
real
term0
=
atomCharge
*
u
.
x
*
v
.
x
+
atomDipoleY
*
u
.
y
*
v
.
x
+
atomDipoleZ
*
u
.
x
*
v
.
y
+
atomQuadrupoleYY
*
u
.
z
*
v
.
x
+
atomQuadrupoleZZ
*
u
.
x
*
v
.
z
+
atomQuadrupoleYZ
*
u
.
y
*
v
.
y
;
real
term1
=
atomDipoleX
*
u
.
x
*
v
.
x
+
atomQuadrupoleXY
*
u
.
y
*
v
.
x
+
atomQuadrupoleXZ
*
u
.
x
*
v
.
y
;
real
term2
=
atomQuadrupoleXX
*
u
.
x
*
v
.
x
;
...
...
@@ -300,7 +299,10 @@ extern "C" __global__ void gridSpreadFixedMultipoles(const real4* __restrict__ p
extern
"C"
__global__
void
gridSpreadInducedDipoles
(
const
real4
*
__restrict__
posq
,
const
real
*
__restrict__
inducedDipole
,
const
real
*
__restrict__
inducedDipolePolar
,
real2
*
__restrict__
pmeGrid
,
int2
*
__restrict__
pmeAtomGridIndex
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
)
{
real
array
[
PME_ORDER
*
PME_ORDER
];
// The workspace array doesn't really need to be shared, but we have shared memory to spare, and putting it there
// reduces the load on L2 cache.
__shared__
real
sharedArray
[
PME_ORDER
*
PME_ORDER
*
64
];
real
*
array
=
&
sharedArray
[
PME_ORDER
*
PME_ORDER
*
threadIdx
.
x
];
real4
theta1
[
PME_ORDER
];
real4
theta2
[
PME_ORDER
];
real4
theta3
[
PME_ORDER
];
...
...
@@ -318,15 +320,19 @@ extern "C" __global__ void gridSpreadInducedDipoles(const real4* __restrict__ po
}
__syncthreads
();
// Process the atoms in spatially sorted order. This improves cache performance when loading
// the grid values.
for
(
int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
i
<
NUM_ATOMS
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
m
=
pmeAtomGridIndex
[
i
].
x
;
for
(
int
m
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
m
<
NUM_ATOMS
;
m
+=
blockDim
.
x
*
gridDim
.
x
)
{
real4
pos
=
posq
[
m
];
pos
-=
periodicBoxVecZ
*
floor
(
pos
.
z
*
recipBoxVecZ
.
z
+
0.5
f
);
pos
-=
periodicBoxVecY
*
floor
(
pos
.
y
*
recipBoxVecY
.
z
+
0.5
f
);
pos
-=
periodicBoxVecX
*
floor
(
pos
.
x
*
recipBoxVecX
.
z
+
0.5
f
);
real3
cinducedDipole
=
((
const
real3
*
)
inducedDipole
)[
m
];
real3
cinducedDipolePolar
=
((
const
real3
*
)
inducedDipolePolar
)[
m
];
real3
finducedDipole
=
make_real3
(
cinducedDipole
.
x
*
cartToFrac
[
0
][
0
]
+
cinducedDipole
.
y
*
cartToFrac
[
0
][
1
]
+
cinducedDipole
.
z
*
cartToFrac
[
0
][
2
],
cinducedDipole
.
x
*
cartToFrac
[
1
][
0
]
+
cinducedDipole
.
y
*
cartToFrac
[
1
][
1
]
+
cinducedDipole
.
z
*
cartToFrac
[
1
][
2
],
cinducedDipole
.
x
*
cartToFrac
[
2
][
0
]
+
cinducedDipole
.
y
*
cartToFrac
[
2
][
1
]
+
cinducedDipole
.
z
*
cartToFrac
[
2
][
2
]);
real3
finducedDipolePolar
=
make_real3
(
cinducedDipolePolar
.
x
*
cartToFrac
[
0
][
0
]
+
cinducedDipolePolar
.
y
*
cartToFrac
[
0
][
1
]
+
cinducedDipolePolar
.
z
*
cartToFrac
[
0
][
2
],
cinducedDipolePolar
.
x
*
cartToFrac
[
1
][
0
]
+
cinducedDipolePolar
.
y
*
cartToFrac
[
1
][
1
]
+
cinducedDipolePolar
.
z
*
cartToFrac
[
1
][
2
],
cinducedDipolePolar
.
x
*
cartToFrac
[
2
][
0
]
+
cinducedDipolePolar
.
y
*
cartToFrac
[
2
][
1
]
+
cinducedDipolePolar
.
z
*
cartToFrac
[
2
][
2
]);
// Since we need the full set of thetas, it's faster to compute them here than load them
// from global memory.
...
...
@@ -373,14 +379,6 @@ extern "C" __global__ void gridSpreadInducedDipoles(const real4* __restrict__ po
int
index
=
ybase
+
zindex
;
real4
v
=
theta3
[
iz
];
real3
cinducedDipole
=
make_real3
(
inducedDipole
[
m
*
3
],
inducedDipole
[
m
*
3
+
1
],
inducedDipole
[
m
*
3
+
2
]);
real3
cinducedDipolePolar
=
make_real3
(
inducedDipolePolar
[
m
*
3
],
inducedDipolePolar
[
m
*
3
+
1
],
inducedDipolePolar
[
m
*
3
+
2
]);
real3
finducedDipole
=
make_real3
(
cinducedDipole
.
x
*
cartToFrac
[
0
][
0
]
+
cinducedDipole
.
y
*
cartToFrac
[
0
][
1
]
+
cinducedDipole
.
z
*
cartToFrac
[
0
][
2
],
cinducedDipole
.
x
*
cartToFrac
[
1
][
0
]
+
cinducedDipole
.
y
*
cartToFrac
[
1
][
1
]
+
cinducedDipole
.
z
*
cartToFrac
[
1
][
2
],
cinducedDipole
.
x
*
cartToFrac
[
2
][
0
]
+
cinducedDipole
.
y
*
cartToFrac
[
2
][
1
]
+
cinducedDipole
.
z
*
cartToFrac
[
2
][
2
]);
real3
finducedDipolePolar
=
make_real3
(
cinducedDipolePolar
.
x
*
cartToFrac
[
0
][
0
]
+
cinducedDipolePolar
.
y
*
cartToFrac
[
0
][
1
]
+
cinducedDipolePolar
.
z
*
cartToFrac
[
0
][
2
],
cinducedDipolePolar
.
x
*
cartToFrac
[
1
][
0
]
+
cinducedDipolePolar
.
y
*
cartToFrac
[
1
][
1
]
+
cinducedDipolePolar
.
z
*
cartToFrac
[
1
][
2
],
cinducedDipolePolar
.
x
*
cartToFrac
[
2
][
0
]
+
cinducedDipolePolar
.
y
*
cartToFrac
[
2
][
1
]
+
cinducedDipolePolar
.
z
*
cartToFrac
[
2
][
2
]);
real
term01
=
finducedDipole
.
y
*
u
.
y
*
v
.
x
+
finducedDipole
.
z
*
u
.
x
*
v
.
y
;
real
term11
=
finducedDipole
.
x
*
u
.
x
*
v
.
x
;
real
term02
=
finducedDipolePolar
.
y
*
u
.
y
*
v
.
x
+
finducedDipolePolar
.
z
*
u
.
x
*
v
.
y
;
...
...
@@ -448,7 +446,10 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict
long
long
*
__restrict__
fieldBuffers
,
long
long
*
__restrict__
fieldPolarBuffers
,
const
real4
*
__restrict__
posq
,
const
real
*
__restrict__
labFrameDipole
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
,
int2
*
__restrict__
pmeAtomGridIndex
)
{
real
array
[
PME_ORDER
*
PME_ORDER
];
// The workspace array doesn't really need to be shared, but we have shared memory to spare, and putting it there
// reduces the load on L2 cache.
__shared__
real
sharedArray
[
PME_ORDER
*
PME_ORDER
*
64
];
real
*
array
=
&
sharedArray
[
PME_ORDER
*
PME_ORDER
*
threadIdx
.
x
];
real4
theta1
[
PME_ORDER
];
real4
theta2
[
PME_ORDER
];
real4
theta3
[
PME_ORDER
];
...
...
@@ -582,26 +583,26 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict
tuv012
+=
tu01
*
v
.
z
;
tuv111
+=
tu11
*
v
.
y
;
}
phi
[
20
*
m
]
=
tuv000
;
phi
[
20
*
m
+
1
]
=
tuv100
;
phi
[
20
*
m
+
2
]
=
tuv010
;
phi
[
20
*
m
+
3
]
=
tuv001
;
phi
[
20
*
m
+
4
]
=
tuv200
;
phi
[
20
*
m
+
5
]
=
tuv020
;
phi
[
20
*
m
+
6
]
=
tuv002
;
phi
[
20
*
m
+
7
]
=
tuv110
;
phi
[
20
*
m
+
8
]
=
tuv101
;
phi
[
20
*
m
+
9
]
=
tuv011
;
phi
[
20
*
m
+
10
]
=
tuv300
;
phi
[
20
*
m
+
11
]
=
tuv030
;
phi
[
20
*
m
+
12
]
=
tuv003
;
phi
[
20
*
m
+
13
]
=
tuv210
;
phi
[
20
*
m
+
14
]
=
tuv201
;
phi
[
20
*
m
+
15
]
=
tuv120
;
phi
[
20
*
m
+
16
]
=
tuv021
;
phi
[
20
*
m
+
17
]
=
tuv102
;
phi
[
20
*
m
+
18
]
=
tuv012
;
phi
[
20
*
m
+
19
]
=
tuv111
;
phi
[
m
]
=
tuv000
;
phi
[
m
+
NUM_ATOMS
]
=
tuv100
;
phi
[
m
+
NUM_ATOMS
*
2
]
=
tuv010
;
phi
[
m
+
NUM_ATOMS
*
3
]
=
tuv001
;
phi
[
m
+
NUM_ATOMS
*
4
]
=
tuv200
;
phi
[
m
+
NUM_ATOMS
*
5
]
=
tuv020
;
phi
[
m
+
NUM_ATOMS
*
6
]
=
tuv002
;
phi
[
m
+
NUM_ATOMS
*
7
]
=
tuv110
;
phi
[
m
+
NUM_ATOMS
*
8
]
=
tuv101
;
phi
[
m
+
NUM_ATOMS
*
9
]
=
tuv011
;
phi
[
m
+
NUM_ATOMS
*
10
]
=
tuv300
;
phi
[
m
+
NUM_ATOMS
*
11
]
=
tuv030
;
phi
[
m
+
NUM_ATOMS
*
12
]
=
tuv003
;
phi
[
m
+
NUM_ATOMS
*
13
]
=
tuv210
;
phi
[
m
+
NUM_ATOMS
*
14
]
=
tuv201
;
phi
[
m
+
NUM_ATOMS
*
15
]
=
tuv120
;
phi
[
m
+
NUM_ATOMS
*
16
]
=
tuv021
;
phi
[
m
+
NUM_ATOMS
*
17
]
=
tuv102
;
phi
[
m
+
NUM_ATOMS
*
18
]
=
tuv012
;
phi
[
m
+
NUM_ATOMS
*
19
]
=
tuv111
;
real
dipoleScale
=
(
4
/
(
real
)
3
)
*
(
EWALD_ALPHA
*
EWALD_ALPHA
*
EWALD_ALPHA
)
/
SQRT_PI
;
long
long
fieldx
=
(
long
long
)
((
dipoleScale
*
labFrameDipole
[
m
*
3
]
-
tuv100
*
fracToCart
[
0
][
0
]
-
tuv010
*
fracToCart
[
0
][
1
]
-
tuv001
*
fracToCart
[
0
][
2
])
*
0x100000000
);
fieldBuffers
[
m
]
=
fieldx
;
...
...
@@ -619,7 +620,10 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri
real
*
__restrict__
phip
,
real
*
__restrict__
phidp
,
const
real4
*
__restrict__
posq
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
,
int2
*
__restrict__
pmeAtomGridIndex
)
{
real
array
[
PME_ORDER
*
PME_ORDER
];
// The workspace array doesn't really need to be shared, but we have shared memory to spare, and putting it there
// reduces the load on L2 cache.
__shared__
real
sharedArray
[
PME_ORDER
*
PME_ORDER
*
64
];
real
*
array
=
&
sharedArray
[
PME_ORDER
*
PME_ORDER
*
threadIdx
.
x
];
real4
theta1
[
PME_ORDER
];
real4
theta2
[
PME_ORDER
];
real4
theta3
[
PME_ORDER
];
...
...
@@ -812,55 +816,55 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri
tuv012
+=
tu01
*
v
.
z
;
tuv111
+=
tu11
*
v
.
y
;
}
phid
[
10
*
m
]
=
0
;
phid
[
10
*
m
+
1
]
=
tuv100_1
;
phid
[
10
*
m
+
2
]
=
tuv010_1
;
phid
[
10
*
m
+
3
]
=
tuv001_1
;
phid
[
10
*
m
+
4
]
=
tuv200_1
;
phid
[
10
*
m
+
5
]
=
tuv020_1
;
phid
[
10
*
m
+
6
]
=
tuv002_1
;
phid
[
10
*
m
+
7
]
=
tuv110_1
;
phid
[
10
*
m
+
8
]
=
tuv101_1
;
phid
[
10
*
m
+
9
]
=
tuv011_1
;
phip
[
10
*
m
]
=
0
;
phip
[
10
*
m
+
1
]
=
tuv100_2
;
phip
[
10
*
m
+
2
]
=
tuv010_2
;
phip
[
10
*
m
+
3
]
=
tuv001_2
;
phip
[
10
*
m
+
4
]
=
tuv200_2
;
phip
[
10
*
m
+
5
]
=
tuv020_2
;
phip
[
10
*
m
+
6
]
=
tuv002_2
;
phip
[
10
*
m
+
7
]
=
tuv110_2
;
phip
[
10
*
m
+
8
]
=
tuv101_2
;
phip
[
10
*
m
+
9
]
=
tuv011_2
;
phidp
[
20
*
m
]
=
tuv000
;
phidp
[
20
*
m
+
1
]
=
tuv100
;
phidp
[
20
*
m
+
2
]
=
tuv010
;
phidp
[
20
*
m
+
3
]
=
tuv001
;
phidp
[
20
*
m
+
4
]
=
tuv200
;
phidp
[
20
*
m
+
5
]
=
tuv020
;
phidp
[
20
*
m
+
6
]
=
tuv002
;
phidp
[
20
*
m
+
7
]
=
tuv110
;
phidp
[
20
*
m
+
8
]
=
tuv101
;
phidp
[
20
*
m
+
9
]
=
tuv011
;
phidp
[
20
*
m
+
10
]
=
tuv300
;
phidp
[
20
*
m
+
11
]
=
tuv030
;
phidp
[
20
*
m
+
12
]
=
tuv003
;
phidp
[
20
*
m
+
13
]
=
tuv210
;
phidp
[
20
*
m
+
14
]
=
tuv201
;
phidp
[
20
*
m
+
15
]
=
tuv120
;
phidp
[
20
*
m
+
16
]
=
tuv021
;
phidp
[
20
*
m
+
17
]
=
tuv102
;
phidp
[
20
*
m
+
18
]
=
tuv012
;
phidp
[
20
*
m
+
19
]
=
tuv111
;
phid
[
m
]
=
0
;
phid
[
m
+
NUM_ATOMS
]
=
tuv100_1
;
phid
[
m
+
NUM_ATOMS
*
2
]
=
tuv010_1
;
phid
[
m
+
NUM_ATOMS
*
3
]
=
tuv001_1
;
phid
[
m
+
NUM_ATOMS
*
4
]
=
tuv200_1
;
phid
[
m
+
NUM_ATOMS
*
5
]
=
tuv020_1
;
phid
[
m
+
NUM_ATOMS
*
6
]
=
tuv002_1
;
phid
[
m
+
NUM_ATOMS
*
7
]
=
tuv110_1
;
phid
[
m
+
NUM_ATOMS
*
8
]
=
tuv101_1
;
phid
[
m
+
NUM_ATOMS
*
9
]
=
tuv011_1
;
phip
[
m
]
=
0
;
phip
[
m
+
NUM_ATOMS
]
=
tuv100_2
;
phip
[
m
+
NUM_ATOMS
*
2
]
=
tuv010_2
;
phip
[
m
+
NUM_ATOMS
*
3
]
=
tuv001_2
;
phip
[
m
+
NUM_ATOMS
*
4
]
=
tuv200_2
;
phip
[
m
+
NUM_ATOMS
*
5
]
=
tuv020_2
;
phip
[
m
+
NUM_ATOMS
*
6
]
=
tuv002_2
;
phip
[
m
+
NUM_ATOMS
*
7
]
=
tuv110_2
;
phip
[
m
+
NUM_ATOMS
*
8
]
=
tuv101_2
;
phip
[
m
+
NUM_ATOMS
*
9
]
=
tuv011_2
;
phidp
[
m
]
=
tuv000
;
phidp
[
m
+
NUM_ATOMS
*
1
]
=
tuv100
;
phidp
[
m
+
NUM_ATOMS
*
2
]
=
tuv010
;
phidp
[
m
+
NUM_ATOMS
*
3
]
=
tuv001
;
phidp
[
m
+
NUM_ATOMS
*
4
]
=
tuv200
;
phidp
[
m
+
NUM_ATOMS
*
5
]
=
tuv020
;
phidp
[
m
+
NUM_ATOMS
*
6
]
=
tuv002
;
phidp
[
m
+
NUM_ATOMS
*
7
]
=
tuv110
;
phidp
[
m
+
NUM_ATOMS
*
8
]
=
tuv101
;
phidp
[
m
+
NUM_ATOMS
*
9
]
=
tuv011
;
phidp
[
m
+
NUM_ATOMS
*
10
]
=
tuv300
;
phidp
[
m
+
NUM_ATOMS
*
11
]
=
tuv030
;
phidp
[
m
+
NUM_ATOMS
*
12
]
=
tuv003
;
phidp
[
m
+
NUM_ATOMS
*
13
]
=
tuv210
;
phidp
[
m
+
NUM_ATOMS
*
14
]
=
tuv201
;
phidp
[
m
+
NUM_ATOMS
*
15
]
=
tuv120
;
phidp
[
m
+
NUM_ATOMS
*
16
]
=
tuv021
;
phidp
[
m
+
NUM_ATOMS
*
17
]
=
tuv102
;
phidp
[
m
+
NUM_ATOMS
*
18
]
=
tuv012
;
phidp
[
m
+
NUM_ATOMS
*
19
]
=
tuv111
;
}
}
extern
"C"
__global__
void
computeFixedMultipoleForceAndEnergy
(
real4
*
__restrict__
posq
,
unsigned
long
long
*
__restrict__
forceBuffers
,
long
long
*
__restrict__
torqueBuffers
,
real
*
__restrict__
energyBuffer
,
const
real
*
__restrict__
labFrameDipole
,
const
real
*
__restrict__
labFrameQuadrupole
,
const
real
*
__restrict__
fracDipole
,
const
real
*
__restrict__
fracQuadrupole
,
const
real
*
__restrict__
phi
_global
,
const
real
*
__restrict__
cphi_global
,
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
)
{
const
real
*
__restrict__
phi
,
const
real
*
__restrict__
cphi_global
,
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
)
{
real
multipole
[
10
];
const
int
deriv1
[]
=
{
1
,
4
,
7
,
8
,
10
,
15
,
17
,
13
,
14
,
19
};
const
int
deriv2
[]
=
{
2
,
7
,
5
,
9
,
13
,
11
,
18
,
15
,
19
,
16
};
...
...
@@ -922,13 +926,12 @@ extern "C" __global__ void computeFixedMultipoleForceAndEnergy(real4* __restrict
multipole
[
8
]
=
fracQuadrupole
[
i
*
6
+
2
];
multipole
[
9
]
=
fracQuadrupole
[
i
*
6
+
4
];
const
real
*
phi
=
&
phi_global
[
20
*
i
];
real4
f
=
make_real4
(
0
,
0
,
0
,
0
);
for
(
int
k
=
0
;
k
<
10
;
k
++
)
{
energy
+=
multipole
[
k
]
*
phi
[
k
];
f
.
x
+=
multipole
[
k
]
*
phi
[
deriv1
[
k
]];
f
.
y
+=
multipole
[
k
]
*
phi
[
deriv2
[
k
]];
f
.
z
+=
multipole
[
k
]
*
phi
[
deriv3
[
k
]];
energy
+=
multipole
[
k
]
*
phi
[
i
+
NUM_ATOMS
*
k
];
f
.
x
+=
multipole
[
k
]
*
phi
[
i
+
NUM_ATOMS
*
deriv1
[
k
]];
f
.
y
+=
multipole
[
k
]
*
phi
[
i
+
NUM_ATOMS
*
deriv2
[
k
]];
f
.
z
+=
multipole
[
k
]
*
phi
[
i
+
NUM_ATOMS
*
deriv3
[
k
]];
}
f
=
make_real4
(
EPSILON_FACTOR
*
(
f
.
x
*
fracToCart
[
0
][
0
]
+
f
.
y
*
fracToCart
[
0
][
1
]
+
f
.
z
*
fracToCart
[
0
][
2
]),
EPSILON_FACTOR
*
(
f
.
x
*
fracToCart
[
1
][
0
]
+
f
.
y
*
fracToCart
[
1
][
1
]
+
f
.
z
*
fracToCart
[
1
][
2
]),
...
...
@@ -944,8 +947,8 @@ extern "C" __global__ void computeInducedDipoleForceAndEnergy(real4* __restrict_
long
long
*
__restrict__
torqueBuffers
,
real
*
__restrict__
energyBuffer
,
const
real
*
__restrict__
labFrameDipole
,
const
real
*
__restrict__
labFrameQuadrupole
,
const
real
*
__restrict__
fracDipole
,
const
real
*
__restrict__
fracQuadrupole
,
const
real
*
__restrict__
inducedDipole_global
,
const
real
*
__restrict__
inducedDipolePolar_global
,
const
real
*
__restrict__
phi
_global
,
const
real
*
__restrict__
phid
_global
,
const
real
*
__restrict__
phip
_global
,
const
real
*
__restrict__
phidp
_global
,
const
real
*
__restrict__
cphi_global
,
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
)
{
const
real
*
__restrict__
phi
,
const
real
*
__restrict__
phid
,
const
real
*
__restrict__
phip
,
const
real
*
__restrict__
phidp
,
const
real
*
__restrict__
cphi_global
,
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
)
{
real
multipole
[
10
];
real
cinducedDipole
[
3
],
inducedDipole
[
3
];
real
cinducedDipolePolar
[
3
],
inducedDipolePolar
[
3
];
...
...
@@ -1023,34 +1026,30 @@ extern "C" __global__ void computeInducedDipoleForceAndEnergy(real4* __restrict_
inducedDipolePolar
[
0
]
=
cinducedDipolePolar
[
0
]
*
fracToCart
[
0
][
0
]
+
cinducedDipolePolar
[
1
]
*
fracToCart
[
1
][
0
]
+
cinducedDipolePolar
[
2
]
*
fracToCart
[
2
][
0
];
inducedDipolePolar
[
1
]
=
cinducedDipolePolar
[
0
]
*
fracToCart
[
0
][
1
]
+
cinducedDipolePolar
[
1
]
*
fracToCart
[
1
][
1
]
+
cinducedDipolePolar
[
2
]
*
fracToCart
[
2
][
1
];
inducedDipolePolar
[
2
]
=
cinducedDipolePolar
[
0
]
*
fracToCart
[
0
][
2
]
+
cinducedDipolePolar
[
1
]
*
fracToCart
[
1
][
2
]
+
cinducedDipolePolar
[
2
]
*
fracToCart
[
2
][
2
];
const
real
*
phi
=
&
phi_global
[
20
*
i
];
const
real
*
phip
=
&
phip_global
[
10
*
i
];
const
real
*
phid
=
&
phid_global
[
10
*
i
];
real4
f
=
make_real4
(
0
,
0
,
0
,
0
);
energy
+=
inducedDipole
[
0
]
*
phi
[
1
];
energy
+=
inducedDipole
[
1
]
*
phi
[
2
];
energy
+=
inducedDipole
[
2
]
*
phi
[
3
];
energy
+=
inducedDipole
[
0
]
*
phi
[
i
+
NUM_ATOMS
];
energy
+=
inducedDipole
[
1
]
*
phi
[
i
+
NUM_ATOMS
*
2
];
energy
+=
inducedDipole
[
2
]
*
phi
[
i
+
NUM_ATOMS
*
3
];
for
(
int
k
=
0
;
k
<
3
;
k
++
)
{
int
j1
=
deriv1
[
k
+
1
];
int
j2
=
deriv2
[
k
+
1
];
int
j3
=
deriv3
[
k
+
1
];
f
.
x
+=
(
inducedDipole
[
k
]
+
inducedDipolePolar
[
k
])
*
phi
[
j1
];
f
.
y
+=
(
inducedDipole
[
k
]
+
inducedDipolePolar
[
k
])
*
phi
[
j2
];
f
.
z
+=
(
inducedDipole
[
k
]
+
inducedDipolePolar
[
k
])
*
phi
[
j3
];
f
.
x
+=
(
inducedDipole
[
k
]
+
inducedDipolePolar
[
k
])
*
phi
[
i
+
NUM_ATOMS
*
j1
];
f
.
y
+=
(
inducedDipole
[
k
]
+
inducedDipolePolar
[
k
])
*
phi
[
i
+
NUM_ATOMS
*
j2
];
f
.
z
+=
(
inducedDipole
[
k
]
+
inducedDipolePolar
[
k
])
*
phi
[
i
+
NUM_ATOMS
*
j3
];
#ifndef DIRECT_POLARIZATION
f
.
x
+=
(
inducedDipole
[
k
]
*
phip
[
j1
]
+
inducedDipolePolar
[
k
]
*
phid
[
j1
]);
f
.
y
+=
(
inducedDipole
[
k
]
*
phip
[
j2
]
+
inducedDipolePolar
[
k
]
*
phid
[
j2
]);
f
.
z
+=
(
inducedDipole
[
k
]
*
phip
[
j3
]
+
inducedDipolePolar
[
k
]
*
phid
[
j3
]);
f
.
x
+=
(
inducedDipole
[
k
]
*
phip
[
i
+
NUM_ATOMS
*
j1
]
+
inducedDipolePolar
[
k
]
*
phid
[
i
+
NUM_ATOMS
*
j1
]);
f
.
y
+=
(
inducedDipole
[
k
]
*
phip
[
i
+
NUM_ATOMS
*
j2
]
+
inducedDipolePolar
[
k
]
*
phid
[
i
+
NUM_ATOMS
*
j2
]);
f
.
z
+=
(
inducedDipole
[
k
]
*
phip
[
i
+
NUM_ATOMS
*
j3
]
+
inducedDipolePolar
[
k
]
*
phid
[
i
+
NUM_ATOMS
*
j3
]);
#endif
}
const
real
*
phidp
=
&
phidp_global
[
20
*
i
];
for
(
int
k
=
0
;
k
<
10
;
k
++
)
{
f
.
x
+=
multipole
[
k
]
*
phidp
[
deriv1
[
k
]];
f
.
y
+=
multipole
[
k
]
*
phidp
[
deriv2
[
k
]];
f
.
z
+=
multipole
[
k
]
*
phidp
[
deriv3
[
k
]];
f
.
x
+=
multipole
[
k
]
*
phidp
[
i
+
NUM_ATOMS
*
deriv1
[
k
]];
f
.
y
+=
multipole
[
k
]
*
phidp
[
i
+
NUM_ATOMS
*
deriv2
[
k
]];
f
.
z
+=
multipole
[
k
]
*
phidp
[
i
+
NUM_ATOMS
*
deriv3
[
k
]];
}
f
=
make_real4
(
0.5
f
*
EPSILON_FACTOR
*
(
f
.
x
*
fracToCart
[
0
][
0
]
+
f
.
y
*
fracToCart
[
0
][
1
]
+
f
.
z
*
fracToCart
[
0
][
2
]),
0.5
f
*
EPSILON_FACTOR
*
(
f
.
x
*
fracToCart
[
1
][
0
]
+
f
.
y
*
fracToCart
[
1
][
1
]
+
f
.
z
*
fracToCart
[
1
][
2
]),
...
...
@@ -1078,11 +1077,11 @@ extern "C" __global__ void recordInducedFieldDipoles(const real* __restrict__ ph
}
__syncthreads
();
for
(
int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
i
<
NUM_ATOMS
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
inducedField
[
i
]
-=
(
long
long
)
(
0x100000000
*
(
phid
[
10
*
i
+
1
]
*
fracToCart
[
0
][
0
]
+
phid
[
10
*
i
+
2
]
*
fracToCart
[
0
][
1
]
+
phid
[
10
*
i
+
3
]
*
fracToCart
[
0
][
2
]));
inducedField
[
i
+
PADDED_NUM_ATOMS
]
-=
(
long
long
)
(
0x100000000
*
(
phid
[
10
*
i
+
1
]
*
fracToCart
[
1
][
0
]
+
phid
[
10
*
i
+
2
]
*
fracToCart
[
1
][
1
]
+
phid
[
10
*
i
+
3
]
*
fracToCart
[
1
][
2
]));
inducedField
[
i
+
PADDED_NUM_ATOMS
*
2
]
-=
(
long
long
)
(
0x100000000
*
(
phid
[
10
*
i
+
1
]
*
fracToCart
[
2
][
0
]
+
phid
[
10
*
i
+
2
]
*
fracToCart
[
2
][
1
]
+
phid
[
10
*
i
+
3
]
*
fracToCart
[
2
][
2
]));
inducedFieldPolar
[
i
]
-=
(
long
long
)
(
0x100000000
*
(
phip
[
10
*
i
+
1
]
*
fracToCart
[
0
][
0
]
+
phip
[
10
*
i
+
2
]
*
fracToCart
[
0
][
1
]
+
phip
[
10
*
i
+
3
]
*
fracToCart
[
0
][
2
]));
inducedFieldPolar
[
i
+
PADDED_NUM_ATOMS
]
-=
(
long
long
)
(
0x100000000
*
(
phip
[
10
*
i
+
1
]
*
fracToCart
[
1
][
0
]
+
phip
[
10
*
i
+
2
]
*
fracToCart
[
1
][
1
]
+
phip
[
10
*
i
+
3
]
*
fracToCart
[
1
][
2
]));
inducedFieldPolar
[
i
+
PADDED_NUM_ATOMS
*
2
]
-=
(
long
long
)
(
0x100000000
*
(
phip
[
10
*
i
+
1
]
*
fracToCart
[
2
][
0
]
+
phip
[
10
*
i
+
2
]
*
fracToCart
[
2
][
1
]
+
phip
[
10
*
i
+
3
]
*
fracToCart
[
2
][
2
]));
inducedField
[
i
]
-=
(
long
long
)
(
0x100000000
*
(
phid
[
i
+
NUM_ATOMS
]
*
fracToCart
[
0
][
0
]
+
phid
[
i
+
NUM_ATOMS
*
2
]
*
fracToCart
[
0
][
1
]
+
phid
[
i
+
NUM_ATOMS
*
3
]
*
fracToCart
[
0
][
2
]));
inducedField
[
i
+
PADDED_NUM_ATOMS
]
-=
(
long
long
)
(
0x100000000
*
(
phid
[
i
+
NUM_ATOMS
]
*
fracToCart
[
1
][
0
]
+
phid
[
i
+
NUM_ATOMS
*
2
]
*
fracToCart
[
1
][
1
]
+
phid
[
i
+
NUM_ATOMS
*
3
]
*
fracToCart
[
1
][
2
]));
inducedField
[
i
+
PADDED_NUM_ATOMS
*
2
]
-=
(
long
long
)
(
0x100000000
*
(
phid
[
i
+
NUM_ATOMS
]
*
fracToCart
[
2
][
0
]
+
phid
[
i
+
NUM_ATOMS
*
2
]
*
fracToCart
[
2
][
1
]
+
phid
[
i
+
NUM_ATOMS
*
3
]
*
fracToCart
[
2
][
2
]));
inducedFieldPolar
[
i
]
-=
(
long
long
)
(
0x100000000
*
(
phip
[
i
+
NUM_ATOMS
]
*
fracToCart
[
0
][
0
]
+
phip
[
i
+
NUM_ATOMS
*
2
]
*
fracToCart
[
0
][
1
]
+
phip
[
i
+
NUM_ATOMS
*
3
]
*
fracToCart
[
0
][
2
]));
inducedFieldPolar
[
i
+
PADDED_NUM_ATOMS
]
-=
(
long
long
)
(
0x100000000
*
(
phip
[
i
+
NUM_ATOMS
]
*
fracToCart
[
1
][
0
]
+
phip
[
i
+
NUM_ATOMS
*
2
]
*
fracToCart
[
1
][
1
]
+
phip
[
i
+
NUM_ATOMS
*
3
]
*
fracToCart
[
1
][
2
]));
inducedFieldPolar
[
i
+
PADDED_NUM_ATOMS
*
2
]
-=
(
long
long
)
(
0x100000000
*
(
phip
[
i
+
NUM_ATOMS
]
*
fracToCart
[
2
][
0
]
+
phip
[
i
+
NUM_ATOMS
*
2
]
*
fracToCart
[
2
][
1
]
+
phip
[
i
+
NUM_ATOMS
*
3
]
*
fracToCart
[
2
][
2
]));
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment