Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
3b91c945
Commit
3b91c945
authored
Feb 05, 2015
by
Peter Eastman
Browse files
Continuing CUDA implementation of triclinic boxes for AmoebaMultipoleForce
parent
c83f2a12
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
176 additions
and
90 deletions
+176
-90
plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
+18
-7
plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.h
plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.h
+3
-2
plugins/amoeba/platforms/cuda/src/kernels/multipolePme.cu
plugins/amoeba/platforms/cuda/src/kernels/multipolePme.cu
+155
-81
No files found.
plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
View file @
3b91c945
...
...
@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008-201
3
Stanford University and the Authors. *
* Portions copyright (c) 2008-201
5
Stanford University and the Authors. *
* Authors: Peter Eastman, Mark Friedrichs *
* Contributors: *
* *
...
...
@@ -801,7 +801,7 @@ CudaCalcAmoebaMultipoleForceKernel::CudaCalcAmoebaMultipoleForceKernel(std::stri
diisCoefficients
(
NULL
),
inducedDipolePolar
(
NULL
),
inducedDipoleErrors
(
NULL
),
prevDipoles
(
NULL
),
prevDipolesPolar
(
NULL
),
prevDipolesGk
(
NULL
),
prevDipolesGkPolar
(
NULL
),
prevErrors
(
NULL
),
diisMatrix
(
NULL
),
polarizability
(
NULL
),
covalentFlags
(
NULL
),
polarizationGroupFlags
(
NULL
),
pmeGrid
(
NULL
),
pmeBsplineModuliX
(
NULL
),
pmeBsplineModuliY
(
NULL
),
pmeBsplineModuliZ
(
NULL
),
pmeIgrid
(
NULL
),
pmePhi
(
NULL
),
pmePhid
(
NULL
),
pmePhip
(
NULL
),
pmePhidp
(
NULL
),
pmeAtomGridIndex
(
NULL
),
lastPositions
(
NULL
),
sort
(
NULL
),
gkKernel
(
NULL
)
{
pmePhid
(
NULL
),
pmePhip
(
NULL
),
pmePhidp
(
NULL
),
pmeCphi
(
NULL
),
pmeAtomGridIndex
(
NULL
),
lastPositions
(
NULL
),
sort
(
NULL
),
gkKernel
(
NULL
)
{
}
CudaCalcAmoebaMultipoleForceKernel
::~
CudaCalcAmoebaMultipoleForceKernel
()
{
...
...
@@ -876,6 +876,8 @@ CudaCalcAmoebaMultipoleForceKernel::~CudaCalcAmoebaMultipoleForceKernel() {
delete
pmePhip
;
if
(
pmePhidp
!=
NULL
)
delete
pmePhidp
;
if
(
pmeCphi
!=
NULL
)
delete
pmeCphi
;
if
(
pmeAtomGridIndex
!=
NULL
)
delete
pmeAtomGridIndex
;
if
(
lastPositions
!=
NULL
)
...
...
@@ -1192,6 +1194,7 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
CUmodule
module
=
cu
.
createModule
(
CudaKernelSources
::
vectorOps
+
CudaAmoebaKernelSources
::
multipolePme
,
pmeDefines
);
pmeGridIndexKernel
=
cu
.
getKernel
(
module
,
"findAtomGridIndex"
);
pmeTransformMultipolesKernel
=
cu
.
getKernel
(
module
,
"transformMultipolesToFractionalCoordinates"
);
pmeTransformPotentialKernel
=
cu
.
getKernel
(
module
,
"transformPotentialToCartesianCoordinates"
);
pmeSpreadFixedMultipolesKernel
=
cu
.
getKernel
(
module
,
"gridSpreadFixedMultipoles"
);
pmeSpreadInducedDipolesKernel
=
cu
.
getKernel
(
module
,
"gridSpreadInducedDipoles"
);
pmeFinishSpreadChargeKernel
=
cu
.
getKernel
(
module
,
"finishSpreadCharge"
);
...
...
@@ -1219,6 +1222,7 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
pmePhid
=
new
CudaArray
(
cu
,
10
*
numMultipoles
,
elementSize
,
"pmePhid"
);
pmePhip
=
new
CudaArray
(
cu
,
10
*
numMultipoles
,
elementSize
,
"pmePhip"
);
pmePhidp
=
new
CudaArray
(
cu
,
20
*
numMultipoles
,
elementSize
,
"pmePhidp"
);
pmeCphi
=
new
CudaArray
(
cu
,
10
*
numMultipoles
,
elementSize
,
"pmeCphi"
);
pmeAtomRange
=
CudaArray
::
create
<
int
>
(
cu
,
gridSizeX
*
gridSizeY
*
gridSizeZ
+
1
,
"pmeAtomRange"
);
pmeAtomGridIndex
=
CudaArray
::
create
<
int2
>
(
cu
,
numMultipoles
,
"pmeAtomGridIndex"
);
sort
=
new
CudaSort
(
cu
,
new
SortTrait
(),
cu
.
getNumAtoms
());
...
...
@@ -1520,14 +1524,16 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
unsigned
int
maxTiles
=
nb
.
getInteractingTiles
().
getSize
();
void
*
gridIndexArgs
[]
=
{
&
cu
.
getPosq
().
getDevicePointer
(),
&
pmeAtomGridIndex
->
getDevicePointer
(),
cu
.
getPeriodicBoxSizePointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
]};
cu
.
getPeriodicBoxVecXPointer
(),
cu
.
getPeriodicBoxVecYPointer
(),
cu
.
getPeriodicBoxVecZPointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
]};
cu
.
executeKernel
(
pmeGridIndexKernel
,
gridIndexArgs
,
cu
.
getNumAtoms
(),
cu
.
ThreadBlockSize
,
cu
.
ThreadBlockSize
*
PmeOrder
*
PmeOrder
*
elementSize
);
sort
->
sort
(
*
pmeAtomGridIndex
);
void
*
pmeTransformMultipolesArgs
[]
=
{
&
labFrameDipoles
->
getDevicePointer
(),
&
labFrameQuadrupoles
->
getDevicePointer
(),
&
fracDipoles
->
getDevicePointer
(),
&
fracQuadrupoles
->
getDevicePointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
]};
cu
.
executeKernel
(
pmeTransformMultipolesKernel
,
pmeTransformMultipolesArgs
,
cu
.
getNumAtoms
());
void
*
pmeSpreadFixedMultipolesArgs
[]
=
{
&
cu
.
getPosq
().
getDevicePointer
(),
&
fracDipoles
->
getDevicePointer
(),
&
fracQuadrupoles
->
getDevicePointer
(),
&
pmeGrid
->
getDevicePointer
(),
&
pmeAtomGridIndex
->
getDevicePointer
(),
cu
.
getPeriodicBoxSizePointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
]};
&
pmeGrid
->
getDevicePointer
(),
&
pmeAtomGridIndex
->
getDevicePointer
(),
cu
.
getPeriodicBoxVecXPointer
(),
cu
.
getPeriodicBoxVecYPointer
(),
cu
.
getPeriodicBoxVecZPointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
]};
cu
.
executeKernel
(
pmeSpreadFixedMultipolesKernel
,
pmeSpreadFixedMultipolesArgs
,
cu
.
getNumAtoms
());
void
*
finishSpreadArgs
[]
=
{
&
pmeGrid
->
getDevicePointer
()};
if
(
cu
.
getUseDoublePrecision
())
...
...
@@ -1547,9 +1553,11 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
&
fieldPolar
->
getDevicePointer
(),
&
cu
.
getPosq
().
getDevicePointer
(),
&
labFrameDipoles
->
getDevicePointer
(),
cu
.
getPeriodicBoxSizePointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
],
&
pmeAtomGridIndex
->
getDevicePointer
()};
cu
.
executeKernel
(
pmeFixedPotentialKernel
,
pmeFixedPotentialArgs
,
cu
.
getNumAtoms
());
void
*
pmeTransformFixedPotentialArgs
[]
=
{
&
pmePhi
->
getDevicePointer
(),
&
pmeCphi
->
getDevicePointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
]};
cu
.
executeKernel
(
pmeTransformPotentialKernel
,
pmeTransformFixedPotentialArgs
,
cu
.
getNumAtoms
());
void
*
pmeFixedForceArgs
[]
=
{
&
cu
.
getPosq
().
getDevicePointer
(),
&
cu
.
getForce
().
getDevicePointer
(),
&
torque
->
getDevicePointer
(),
&
cu
.
getEnergyBuffer
().
getDevicePointer
(),
&
labFrameDipoles
->
getDevicePointer
(),
&
labFrameQuadrupoles
->
getDevicePointer
(),
&
fracDipoles
->
getDevicePointer
(),
&
fracQuadrupoles
->
getDevicePointer
(),
&
pmePhi
->
getDevicePointer
(),
&
fracDipoles
->
getDevicePointer
(),
&
fracQuadrupoles
->
getDevicePointer
(),
&
pmePhi
->
getDevicePointer
(),
&
pmeCphi
->
getDevicePointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
]};
cu
.
executeKernel
(
pmeFixedForceKernel
,
pmeFixedForceArgs
,
cu
.
getNumAtoms
());
...
...
@@ -1570,7 +1578,8 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
cu
.
clearBuffer
(
*
pmeGrid
);
void
*
pmeSpreadInducedDipolesArgs
[]
=
{
&
cu
.
getPosq
().
getDevicePointer
(),
&
inducedDipole
->
getDevicePointer
(),
&
inducedDipolePolar
->
getDevicePointer
(),
&
pmeGrid
->
getDevicePointer
(),
&
pmeAtomGridIndex
->
getDevicePointer
(),
cu
.
getPeriodicBoxSizePointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
]};
&
pmeGrid
->
getDevicePointer
(),
&
pmeAtomGridIndex
->
getDevicePointer
(),
cu
.
getPeriodicBoxVecXPointer
(),
cu
.
getPeriodicBoxVecYPointer
(),
cu
.
getPeriodicBoxVecZPointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
]};
cu
.
executeKernel
(
pmeSpreadInducedDipolesKernel
,
pmeSpreadInducedDipolesArgs
,
cu
.
getNumAtoms
());
if
(
cu
.
getUseDoublePrecision
())
cu
.
executeKernel
(
pmeFinishSpreadChargeKernel
,
finishSpreadArgs
,
pmeGrid
->
getSize
());
...
...
@@ -1634,11 +1643,13 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
&
labFrameDipoles
->
getDevicePointer
(),
&
labFrameQuadrupoles
->
getDevicePointer
(),
&
inducedDipole
->
getDevicePointer
(),
&
inducedDipolePolar
->
getDevicePointer
(),
&
dampingAndThole
->
getDevicePointer
()};
cu
.
executeKernel
(
electrostaticsKernel
,
electrostaticsArgs
,
numForceThreadBlocks
*
electrostaticsThreads
,
electrostaticsThreads
);
void
*
pmeTransformInducedPotentialArgs
[]
=
{
&
pmePhidp
->
getDevicePointer
(),
&
pmeCphi
->
getDevicePointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
]};
cu
.
executeKernel
(
pmeTransformPotentialKernel
,
pmeTransformInducedPotentialArgs
,
cu
.
getNumAtoms
());
void
*
pmeInducedForceArgs
[]
=
{
&
cu
.
getPosq
().
getDevicePointer
(),
&
cu
.
getForce
().
getDevicePointer
(),
&
torque
->
getDevicePointer
(),
&
cu
.
getEnergyBuffer
().
getDevicePointer
(),
&
labFrameDipoles
->
getDevicePointer
(),
&
labFrameQuadrupoles
->
getDevicePointer
(),
&
fracDipoles
->
getDevicePointer
(),
&
fracQuadrupoles
->
getDevicePointer
(),
&
inducedDipole
->
getDevicePointer
(),
&
inducedDipolePolar
->
getDevicePointer
(),
&
pmePhi
->
getDevicePointer
(),
&
pmePhid
->
getDevicePointer
(),
&
pmePhip
->
getDevicePointer
(),
&
pmePhidp
->
getDevicePointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
]};
&
pmePhip
->
getDevicePointer
(),
&
pmePhidp
->
getDevicePointer
(),
&
pmeCphi
->
getDevicePointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
]};
cu
.
executeKernel
(
pmeInducedForceKernel
,
pmeInducedForceArgs
,
cu
.
getNumAtoms
());
}
...
...
plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.h
View file @
3b91c945
...
...
@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008-201
3
Stanford University and the Authors. *
* Portions copyright (c) 2008-201
5
Stanford University and the Authors. *
* Authors: Mark Friedrichs, Peter Eastman *
* Contributors: *
* *
...
...
@@ -421,6 +421,7 @@ private:
CudaArray
*
pmePhid
;
CudaArray
*
pmePhip
;
CudaArray
*
pmePhidp
;
CudaArray
*
pmeCphi
;
CudaArray
*
pmeAtomRange
;
CudaArray
*
pmeAtomGridIndex
;
CudaArray
*
lastPositions
;
...
...
@@ -430,7 +431,7 @@ private:
CUfunction
pmeGridIndexKernel
,
pmeSpreadFixedMultipolesKernel
,
pmeSpreadInducedDipolesKernel
,
pmeFinishSpreadChargeKernel
,
pmeConvolutionKernel
;
CUfunction
pmeFixedPotentialKernel
,
pmeInducedPotentialKernel
,
pmeFixedForceKernel
,
pmeInducedForceKernel
,
pmeRecordInducedFieldDipolesKernel
,
computePotentialKernel
;
CUfunction
recordDIISDipolesKernel
,
buildMatrixKernel
;
CUfunction
pmeTransformMultipolesKernel
;
CUfunction
pmeTransformMultipolesKernel
,
pmeTransformPotentialKernel
;
CudaCalcAmoebaGeneralizedKirkwoodForceKernel
*
gkKernel
;
static
const
int
PmeOrder
=
5
;
static
const
int
MaxPrevDIISDipoles
=
20
;
...
...
plugins/amoeba/platforms/cuda/src/kernels/multipolePme.cu
View file @
3b91c945
...
...
@@ -73,12 +73,12 @@ __device__ void computeBSplinePoint(real4* thetai, real w, real* array) {
* Compute the index of the grid point each atom is associated with.
*/
extern
"C"
__global__
void
findAtomGridIndex
(
const
real4
*
__restrict__
posq
,
int2
*
__restrict__
pmeAtomGridIndex
,
real4
periodicBox
Size
,
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
)
{
real4
periodicBox
VecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
)
{
for
(
int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
i
<
NUM_ATOMS
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
real4
pos
=
posq
[
i
];
pos
.
x
-=
floor
(
pos
.
x
*
recipBoxVec
X
.
x
)
*
periodicBoxSize
.
x
;
pos
.
y
-=
floor
(
pos
.
y
*
recipBoxVecY
.
y
)
*
periodicBoxSize
.
y
;
pos
.
z
-=
floor
(
pos
.
z
*
recipBoxVec
Z
.
z
)
*
periodicBoxSize
.
z
;
pos
-=
periodicBoxVecZ
*
floor
(
pos
.
z
*
recipBoxVec
Z
.
z
+
0.5
f
)
;
pos
-=
periodicBoxVecY
*
floor
(
pos
.
y
*
recipBoxVecY
.
z
+
0.5
f
)
;
pos
-=
periodicBoxVecX
*
floor
(
pos
.
x
*
recipBoxVec
X
.
z
+
0.5
f
)
;
// First axis.
...
...
@@ -109,6 +109,7 @@ extern "C" __global__ void findAtomGridIndex(const real4* __restrict__ posq, int
pmeAtomGridIndex
[
i
]
=
make_int2
(
i
,
igrid1
*
GRID_SIZE_Y
*
GRID_SIZE_Z
+
igrid2
*
GRID_SIZE_Z
+
igrid3
);
}
}
/**
* Convert the fixed multipoles from Cartesian to fractional coordinates.
*/
...
...
@@ -161,9 +162,55 @@ extern "C" __global__ void transformMultipolesToFractionalCoordinates(const real
}
}
/**
 * Convert the potential from fractional to Cartesian coordinates.
 *
 * For every atom i this reads fphi[20*i .. 20*i+9] and writes cphi[10*i .. 10*i+9]:
 *   - element 0 is copied unchanged;
 *   - elements 1-3 are transformed as a vector using the 3x3 matrix a;
 *   - elements 4-9 are transformed as a symmetric second-rank tensor, stored in the
 *     order (xx, yy, zz, xy, xz, yz), using the 6x6 matrix b built from a.
 *
 * a[c][f] is the derivative of fractional grid coordinate f with respect to Cartesian
 * coordinate c, i.e. the grid size along f times component c of the f'th column formed
 * by the reciprocal box vectors.
 *
 * @param fphi          input values in fractional coordinates, 20 values per atom
 *                      (only the first 10 are read here)
 * @param cphi          output values in Cartesian coordinates, 10 values per atom
 * @param recipBoxVecX  first reciprocal box vector
 * @param recipBoxVecY  second reciprocal box vector
 * @param recipBoxVecZ  third reciprocal box vector
 *
 * Threads 0..35 of each block each fill one entry of b, so the kernel must be launched
 * with at least 36 threads per block.
 */
extern "C" __global__ void transformPotentialToCartesianCoordinates(const real* __restrict__ fphi, real* __restrict__ cphi, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
    // Build matrices for transforming the potential.

    __shared__ real a[3][3];
    if (threadIdx.x == 0) {
        a[0][0] = GRID_SIZE_X*recipBoxVecX.x;
        a[1][0] = GRID_SIZE_X*recipBoxVecY.x;
        a[2][0] = GRID_SIZE_X*recipBoxVecZ.x;
        a[0][1] = GRID_SIZE_Y*recipBoxVecX.y;
        a[1][1] = GRID_SIZE_Y*recipBoxVecY.y;
        a[2][1] = GRID_SIZE_Y*recipBoxVecZ.y;
        a[0][2] = GRID_SIZE_Z*recipBoxVecX.z;
        a[1][2] = GRID_SIZE_Z*recipBoxVecY.z;
        a[2][2] = GRID_SIZE_Z*recipBoxVecZ.z;
    }
    __syncthreads();

    // (index1[n], index2[n]) is the pair of axes of the n'th stored second derivative.
    int index1[] = {0, 1, 2, 0, 0, 1};
    int index2[] = {0, 1, 2, 1, 2, 2};
    __shared__ real b[6][6];
    if (threadIdx.x < 36) {
        int i = threadIdx.x/6;      // output (Cartesian) component
        int j = threadIdx.x-6*i;    // input (fractional) component
        real coeff = a[index1[i]][index1[j]]*a[index2[i]][index2[j]];

        // A mixed input derivative (index1[j] != index2[j]) is stored only once but
        // appears twice in the full double sum over both axes, with its axes swapped.
        // The test therefore has to be on the *input* component j. Testing the output
        // component i instead (as was done previously) double counts pure inputs in
        // mixed outputs and drops half of the mixed inputs in pure outputs whenever
        // a has off-diagonal elements, i.e. for triclinic boxes.
        if (index1[j] != index2[j])
            coeff += a[index1[i]][index2[j]]*a[index2[i]][index1[j]];
        b[i][j] = coeff;
    }
    __syncthreads();

    // Transform the potential.

    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
        const real* in = &fphi[20*i];
        real* out = &cphi[10*i];
        out[0] = in[0];
        out[1] = a[0][0]*in[1] + a[0][1]*in[2] + a[0][2]*in[3];
        out[2] = a[1][0]*in[1] + a[1][1]*in[2] + a[1][2]*in[3];
        out[3] = a[2][0]*in[1] + a[2][1]*in[2] + a[2][2]*in[3];
        for (int j = 0; j < 6; j++) {
            real sum = 0;
            for (int k = 0; k < 6; k++)
                sum += b[j][k]*in[4+k];
            out[4+j] = sum;
        }
    }
}
extern
"C"
__global__
void
gridSpreadFixedMultipoles
(
const
real4
*
__restrict__
posq
,
const
real
*
__restrict__
fracDipole
,
const
real
*
__restrict__
fracQuadrupole
,
real2
*
__restrict__
pmeGrid
,
int2
*
__restrict__
pmeAtomGridIndex
,
real4
periodicBox
Size
,
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
)
{
real4
periodicBox
VecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
)
{
real
array
[
PME_ORDER
*
PME_ORDER
];
real4
theta1
[
PME_ORDER
];
real4
theta2
[
PME_ORDER
];
...
...
@@ -175,28 +222,28 @@ extern "C" __global__ void gridSpreadFixedMultipoles(const real4* __restrict__ p
for
(
int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
i
<
NUM_ATOMS
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
m
=
pmeAtomGridIndex
[
i
].
x
;
real4
pos
=
posq
[
m
];
pos
.
x
-=
floor
(
pos
.
x
*
recipBoxVec
X
.
x
)
*
periodicBoxSize
.
x
;
pos
.
y
-=
floor
(
pos
.
y
*
recipBoxVecY
.
y
)
*
periodicBoxSize
.
y
;
pos
.
z
-=
floor
(
pos
.
z
*
recipBoxVec
Z
.
z
)
*
periodicBoxSize
.
z
;
pos
-=
periodicBoxVecZ
*
floor
(
pos
.
z
*
recipBoxVec
Z
.
z
+
0.5
f
)
;
pos
-=
periodicBoxVecY
*
floor
(
pos
.
y
*
recipBoxVecY
.
z
+
0.5
f
)
;
pos
-=
periodicBoxVecX
*
floor
(
pos
.
x
*
recipBoxVec
X
.
z
+
0.5
f
)
;
// Since we need the full set of thetas, it's faster to compute them here than load them
// from global memory.
real
w
=
pos
.
x
*
recipBoxVecX
.
x
+
pos
.
y
*
recipBoxVecY
.
x
+
pos
.
z
*
recipBoxVecZ
.
x
;
real
fr
=
GRID_SIZE_X
*
(
w
-
(
int
)(
w
+
0.5
f
)
+
0.5
f
);
int
ifr
=
(
int
)
f
r
;
int
ifr
=
(
int
)
f
loor
(
fr
)
;
w
=
fr
-
ifr
;
int
igrid1
=
ifr
-
PME_ORDER
+
1
;
computeBSplinePoint
(
theta1
,
w
,
array
);
w
=
pos
.
y
*
recipBoxVecY
.
y
+
pos
.
z
*
recipBoxVecZ
.
y
;
fr
=
GRID_SIZE_Y
*
(
w
-
(
int
)(
w
+
0.5
f
)
+
0.5
f
);
ifr
=
(
int
)
f
r
;
ifr
=
(
int
)
f
loor
(
fr
)
;
w
=
fr
-
ifr
;
int
igrid2
=
ifr
-
PME_ORDER
+
1
;
computeBSplinePoint
(
theta2
,
w
,
array
);
w
=
pos
.
z
*
recipBoxVecZ
.
z
;
fr
=
GRID_SIZE_Z
*
(
w
-
(
int
)(
w
+
0.5
f
)
+
0.5
f
);
ifr
=
(
int
)
f
r
;
ifr
=
(
int
)
f
loor
(
fr
)
;
w
=
fr
-
ifr
;
int
igrid3
=
ifr
-
PME_ORDER
+
1
;
computeBSplinePoint
(
theta3
,
w
,
array
);
...
...
@@ -252,14 +299,24 @@ extern "C" __global__ void gridSpreadFixedMultipoles(const real4* __restrict__ p
extern
"C"
__global__
void
gridSpreadInducedDipoles
(
const
real4
*
__restrict__
posq
,
const
real
*
__restrict__
inducedDipole
,
const
real
*
__restrict__
inducedDipolePolar
,
real2
*
__restrict__
pmeGrid
,
int2
*
__restrict__
pmeAtomGridIndex
,
real4
periodicBoxSize
,
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
)
{
const
real
xscale
=
GRID_SIZE_X
*
recipBoxVecX
.
x
;
const
real
yscale
=
GRID_SIZE_Y
*
recipBoxVecY
.
y
;
const
real
zscale
=
GRID_SIZE_Z
*
recipBoxVecZ
.
z
;
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
)
{
real
array
[
PME_ORDER
*
PME_ORDER
];
real4
theta1
[
PME_ORDER
];
real4
theta2
[
PME_ORDER
];
real4
theta3
[
PME_ORDER
];
__shared__
real
cartToFrac
[
3
][
3
];
if
(
threadIdx
.
x
==
0
)
{
cartToFrac
[
0
][
0
]
=
GRID_SIZE_X
*
recipBoxVecX
.
x
;
cartToFrac
[
0
][
1
]
=
GRID_SIZE_X
*
recipBoxVecY
.
x
;
cartToFrac
[
0
][
2
]
=
GRID_SIZE_X
*
recipBoxVecZ
.
x
;
cartToFrac
[
1
][
0
]
=
GRID_SIZE_Y
*
recipBoxVecX
.
y
;
cartToFrac
[
1
][
1
]
=
GRID_SIZE_Y
*
recipBoxVecY
.
y
;
cartToFrac
[
1
][
2
]
=
GRID_SIZE_Y
*
recipBoxVecZ
.
y
;
cartToFrac
[
2
][
0
]
=
GRID_SIZE_Z
*
recipBoxVecX
.
z
;
cartToFrac
[
2
][
1
]
=
GRID_SIZE_Z
*
recipBoxVecY
.
z
;
cartToFrac
[
2
][
2
]
=
GRID_SIZE_Z
*
recipBoxVecZ
.
z
;
}
__syncthreads
();
// Process the atoms in spatially sorted order. This improves cache performance when loading
// the grid values.
...
...
@@ -267,28 +324,28 @@ extern "C" __global__ void gridSpreadInducedDipoles(const real4* __restrict__ po
for
(
int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
i
<
NUM_ATOMS
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
m
=
pmeAtomGridIndex
[
i
].
x
;
real4
pos
=
posq
[
m
];
pos
.
x
-=
floor
(
pos
.
x
*
recipBoxVec
X
.
x
)
*
periodicBoxSize
.
x
;
pos
.
y
-=
floor
(
pos
.
y
*
recipBoxVecY
.
y
)
*
periodicBoxSize
.
y
;
pos
.
z
-=
floor
(
pos
.
z
*
recipBoxVec
Z
.
z
)
*
periodicBoxSize
.
z
;
pos
-=
periodicBoxVecZ
*
floor
(
pos
.
z
*
recipBoxVec
Z
.
z
+
0.5
f
)
;
pos
-=
periodicBoxVecY
*
floor
(
pos
.
y
*
recipBoxVecY
.
z
+
0.5
f
)
;
pos
-=
periodicBoxVecX
*
floor
(
pos
.
x
*
recipBoxVec
X
.
z
+
0.5
f
)
;
// Since we need the full set of thetas, it's faster to compute them here than load them
// from global memory.
real
w
=
pos
.
x
*
recipBoxVecX
.
x
+
pos
.
y
*
recipBoxVecY
.
x
+
pos
.
z
*
recipBoxVecZ
.
x
;
real
fr
=
GRID_SIZE_X
*
(
w
-
(
int
)(
w
+
0.5
f
)
+
0.5
f
);
int
ifr
=
(
int
)
f
r
;
int
ifr
=
(
int
)
f
loor
(
fr
)
;
w
=
fr
-
ifr
;
int
igrid1
=
ifr
-
PME_ORDER
+
1
;
computeBSplinePoint
(
theta1
,
w
,
array
);
w
=
pos
.
y
*
recipBoxVecY
.
y
+
pos
.
z
*
recipBoxVecZ
.
y
;
fr
=
GRID_SIZE_Y
*
(
w
-
(
int
)(
w
+
0.5
f
)
+
0.5
f
);
ifr
=
(
int
)
f
r
;
ifr
=
(
int
)
f
loor
(
fr
)
;
w
=
fr
-
ifr
;
int
igrid2
=
ifr
-
PME_ORDER
+
1
;
computeBSplinePoint
(
theta2
,
w
,
array
);
w
=
pos
.
z
*
recipBoxVecZ
.
z
;
fr
=
GRID_SIZE_Z
*
(
w
-
(
int
)(
w
+
0.5
f
)
+
0.5
f
);
ifr
=
(
int
)
f
r
;
ifr
=
(
int
)
f
loor
(
fr
)
;
w
=
fr
-
ifr
;
int
igrid3
=
ifr
-
PME_ORDER
+
1
;
computeBSplinePoint
(
theta3
,
w
,
array
);
...
...
@@ -316,16 +373,18 @@ extern "C" __global__ void gridSpreadInducedDipoles(const real4* __restrict__ po
int
index
=
ybase
+
zindex
;
real4
v
=
theta3
[
iz
];
real
inducedDipoleX
=
xscale
*
inducedDipole
[
m
*
3
];
real
inducedDipoleY
=
yscale
*
inducedDipole
[
m
*
3
+
1
];
real
inducedDipoleZ
=
zscale
*
inducedDipole
[
m
*
3
+
2
];
real
inducedDipolePolarX
=
xscale
*
inducedDipolePolar
[
m
*
3
];
real
inducedDipolePolarY
=
yscale
*
inducedDipolePolar
[
m
*
3
+
1
];
real
inducedDipolePolarZ
=
zscale
*
inducedDipolePolar
[
m
*
3
+
2
];
real
term01
=
inducedDipoleY
*
u
.
y
*
v
.
x
+
inducedDipoleZ
*
u
.
x
*
v
.
y
;
real
term11
=
inducedDipoleX
*
u
.
x
*
v
.
x
;
real
term02
=
inducedDipolePolarY
*
u
.
y
*
v
.
x
+
inducedDipolePolarZ
*
u
.
x
*
v
.
y
;
real
term12
=
inducedDipolePolarX
*
u
.
x
*
v
.
x
;
real3
cinducedDipole
=
make_real3
(
inducedDipole
[
m
*
3
],
inducedDipole
[
m
*
3
+
1
],
inducedDipole
[
m
*
3
+
2
]);
real3
cinducedDipolePolar
=
make_real3
(
inducedDipolePolar
[
m
*
3
],
inducedDipolePolar
[
m
*
3
+
1
],
inducedDipolePolar
[
m
*
3
+
2
]);
real3
finducedDipole
=
make_real3
(
cinducedDipole
.
x
*
cartToFrac
[
0
][
0
]
+
cinducedDipole
.
y
*
cartToFrac
[
0
][
1
]
+
cinducedDipole
.
z
*
cartToFrac
[
0
][
2
],
cinducedDipole
.
x
*
cartToFrac
[
1
][
0
]
+
cinducedDipole
.
y
*
cartToFrac
[
1
][
1
]
+
cinducedDipole
.
z
*
cartToFrac
[
1
][
2
],
cinducedDipole
.
x
*
cartToFrac
[
2
][
0
]
+
cinducedDipole
.
y
*
cartToFrac
[
2
][
1
]
+
cinducedDipole
.
z
*
cartToFrac
[
2
][
2
]);
real3
finducedDipolePolar
=
make_real3
(
cinducedDipolePolar
.
x
*
cartToFrac
[
0
][
0
]
+
cinducedDipolePolar
.
y
*
cartToFrac
[
0
][
1
]
+
cinducedDipolePolar
.
z
*
cartToFrac
[
0
][
2
],
cinducedDipolePolar
.
x
*
cartToFrac
[
1
][
0
]
+
cinducedDipolePolar
.
y
*
cartToFrac
[
1
][
1
]
+
cinducedDipolePolar
.
z
*
cartToFrac
[
1
][
2
],
cinducedDipolePolar
.
x
*
cartToFrac
[
2
][
0
]
+
cinducedDipolePolar
.
y
*
cartToFrac
[
2
][
1
]
+
cinducedDipolePolar
.
z
*
cartToFrac
[
2
][
2
]);
real
term01
=
finducedDipole
.
y
*
u
.
y
*
v
.
x
+
finducedDipole
.
z
*
u
.
x
*
v
.
y
;
real
term11
=
finducedDipole
.
x
*
u
.
x
*
v
.
x
;
real
term02
=
finducedDipolePolar
.
y
*
u
.
y
*
v
.
x
+
finducedDipolePolar
.
z
*
u
.
x
*
v
.
y
;
real
term12
=
finducedDipolePolar
.
x
*
u
.
x
*
v
.
x
;
real
add1
=
term01
*
t
.
x
+
term11
*
t
.
y
;
real
add2
=
term02
*
t
.
x
+
term12
*
t
.
y
;
#ifdef USE_DOUBLE_PRECISION
...
...
@@ -392,6 +451,19 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict
real4
theta1
[
PME_ORDER
];
real4
theta2
[
PME_ORDER
];
real4
theta3
[
PME_ORDER
];
__shared__
real
fracToCart
[
3
][
3
];
if
(
threadIdx
.
x
==
0
)
{
fracToCart
[
0
][
0
]
=
GRID_SIZE_X
*
recipBoxVecX
.
x
;
fracToCart
[
1
][
0
]
=
GRID_SIZE_X
*
recipBoxVecY
.
x
;
fracToCart
[
2
][
0
]
=
GRID_SIZE_X
*
recipBoxVecZ
.
x
;
fracToCart
[
0
][
1
]
=
GRID_SIZE_Y
*
recipBoxVecX
.
y
;
fracToCart
[
1
][
1
]
=
GRID_SIZE_Y
*
recipBoxVecY
.
y
;
fracToCart
[
2
][
1
]
=
GRID_SIZE_Y
*
recipBoxVecZ
.
y
;
fracToCart
[
0
][
2
]
=
GRID_SIZE_Z
*
recipBoxVecX
.
z
;
fracToCart
[
1
][
2
]
=
GRID_SIZE_Z
*
recipBoxVecY
.
z
;
fracToCart
[
2
][
2
]
=
GRID_SIZE_Z
*
recipBoxVecZ
.
z
;
}
__syncthreads
();
// Process the atoms in spatially sorted order. This improves cache performance when loading
// the grid values.
...
...
@@ -530,13 +602,13 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict
phi
[
20
*
m
+
18
]
=
tuv012
;
phi
[
20
*
m
+
19
]
=
tuv111
;
real
dipoleScale
=
(
4
/
(
real
)
3
)
*
(
EWALD_ALPHA
*
EWALD_ALPHA
*
EWALD_ALPHA
)
/
SQRT_PI
;
long
long
fieldx
=
(
long
long
)
((
dipoleScale
*
labFrameDipole
[
m
*
3
]
-
GRID_SIZE_X
*
recipBoxVecX
.
x
*
tuv10
0
)
*
0x100000000
);
long
long
fieldx
=
(
long
long
)
((
dipoleScale
*
labFrameDipole
[
m
*
3
]
-
tuv100
*
fracToCart
[
0
][
0
]
-
tuv
0
10
*
fracToCart
[
0
][
1
]
-
tuv001
*
fracToCart
[
0
][
2
]
)
*
0x100000000
);
fieldBuffers
[
m
]
=
fieldx
;
fieldPolarBuffers
[
m
]
=
fieldx
;
long
long
fieldy
=
(
long
long
)
((
dipoleScale
*
labFrameDipole
[
m
*
3
+
1
]
-
GRID_SIZE_Y
*
recipBoxVecY
.
y
*
tuv010
)
*
0x100000000
);
long
long
fieldy
=
(
long
long
)
((
dipoleScale
*
labFrameDipole
[
m
*
3
+
1
]
-
tuv100
*
fracToCart
[
1
][
0
]
-
tuv010
*
fracToCart
[
1
][
1
]
-
tuv001
*
fracToCart
[
1
][
2
]
)
*
0x100000000
);
fieldBuffers
[
m
+
PADDED_NUM_ATOMS
]
=
fieldy
;
fieldPolarBuffers
[
m
+
PADDED_NUM_ATOMS
]
=
fieldy
;
long
long
fieldz
=
(
long
long
)
((
dipoleScale
*
labFrameDipole
[
m
*
3
+
2
]
-
GRID_SIZE_Z
*
recipBoxVecZ
.
z
*
tuv001
)
*
0x100000000
);
long
long
fieldz
=
(
long
long
)
((
dipoleScale
*
labFrameDipole
[
m
*
3
+
2
]
-
tuv100
*
fracToCart
[
2
][
0
]
-
tuv010
*
fracToCart
[
2
][
1
]
-
tuv001
*
fracToCart
[
2
][
2
]
)
*
0x100000000
);
fieldBuffers
[
m
+
2
*
PADDED_NUM_ATOMS
]
=
fieldz
;
fieldPolarBuffers
[
m
+
2
*
PADDED_NUM_ATOMS
]
=
fieldz
;
}
...
...
@@ -786,14 +858,11 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri
extern
"C"
__global__
void
computeFixedMultipoleForceAndEnergy
(
real4
*
__restrict__
posq
,
unsigned
long
long
*
__restrict__
forceBuffers
,
long
long
*
__restrict__
torqueBuffers
,
real
*
__restrict__
energyBuffer
,
const
real
*
__restrict__
labFrameDipole
,
const
real
*
__restrict__
labFrameQuadrupole
,
const
real
*
__restrict__
fracDipole
,
const
real
*
__restrict__
fracQuadrupole
,
const
real
*
__restrict__
phi_global
,
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
)
{
const
real
*
__restrict__
phi_global
,
const
real
*
__restrict__
cphi_global
,
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
)
{
real
multipole
[
10
];
const
int
deriv1
[]
=
{
1
,
4
,
7
,
8
,
10
,
15
,
17
,
13
,
14
,
19
};
const
int
deriv2
[]
=
{
2
,
7
,
5
,
9
,
13
,
11
,
18
,
15
,
19
,
16
};
const
int
deriv3
[]
=
{
3
,
8
,
9
,
6
,
14
,
16
,
12
,
19
,
17
,
18
};
const
real
xscale
=
GRID_SIZE_X
*
recipBoxVecX
.
x
;
const
real
yscale
=
GRID_SIZE_Y
*
recipBoxVecY
.
y
;
const
real
zscale
=
GRID_SIZE_Z
*
recipBoxVecZ
.
z
;
real
energy
=
0
;
__shared__
real
fracToCart
[
3
][
3
];
if
(
threadIdx
.
x
==
0
)
{
...
...
@@ -822,22 +891,22 @@ extern "C" __global__ void computeFixedMultipoleForceAndEnergy(real4* __restrict
multipole
[
8
]
=
2
*
labFrameQuadrupole
[
i
*
5
+
2
];
multipole
[
9
]
=
2
*
labFrameQuadrupole
[
i
*
5
+
4
];
const
real
*
phi
=
&
phi_global
[
2
0
*
i
];
const
real
*
c
phi
=
&
c
phi_global
[
1
0
*
i
];
torqueBuffers
[
i
]
=
(
long
long
)
(
EPSILON_FACTOR
*
(
multipole
[
3
]
*
yscale
*
phi
[
2
]
-
multipole
[
2
]
*
zscale
*
phi
[
3
]
+
2
*
(
multipole
[
6
]
-
multipole
[
5
])
*
yscale
*
zscale
*
phi
[
9
]
+
multipole
[
8
]
*
xscale
*
yscale
*
phi
[
7
]
+
multipole
[
9
]
*
yscale
*
yscale
*
phi
[
5
]
-
multipole
[
7
]
*
xscale
*
zscale
*
phi
[
8
]
-
multipole
[
9
]
*
zscale
*
zscale
*
phi
[
6
])
*
0x100000000
);
torqueBuffers
[
i
]
=
(
long
long
)
(
EPSILON_FACTOR
*
(
multipole
[
3
]
*
c
phi
[
2
]
-
multipole
[
2
]
*
c
phi
[
3
]
+
2
*
(
multipole
[
6
]
-
multipole
[
5
])
*
c
phi
[
9
]
+
multipole
[
8
]
*
c
phi
[
7
]
+
multipole
[
9
]
*
c
phi
[
5
]
-
multipole
[
7
]
*
c
phi
[
8
]
-
multipole
[
9
]
*
c
phi
[
6
])
*
0x100000000
);
torqueBuffers
[
i
+
PADDED_NUM_ATOMS
]
=
(
long
long
)
(
EPSILON_FACTOR
*
(
multipole
[
1
]
*
zscale
*
phi
[
3
]
-
multipole
[
3
]
*
xscale
*
phi
[
1
]
+
2
*
(
multipole
[
4
]
-
multipole
[
6
])
*
xscale
*
zscale
*
phi
[
8
]
+
multipole
[
7
]
*
yscale
*
zscale
*
phi
[
9
]
+
multipole
[
8
]
*
zscale
*
zscale
*
phi
[
6
]
-
multipole
[
8
]
*
xscale
*
xscale
*
phi
[
4
]
-
multipole
[
9
]
*
xscale
*
yscale
*
phi
[
7
])
*
0x100000000
);
torqueBuffers
[
i
+
PADDED_NUM_ATOMS
]
=
(
long
long
)
(
EPSILON_FACTOR
*
(
multipole
[
1
]
*
c
phi
[
3
]
-
multipole
[
3
]
*
c
phi
[
1
]
+
2
*
(
multipole
[
4
]
-
multipole
[
6
])
*
c
phi
[
8
]
+
multipole
[
7
]
*
c
phi
[
9
]
+
multipole
[
8
]
*
c
phi
[
6
]
-
multipole
[
8
]
*
c
phi
[
4
]
-
multipole
[
9
]
*
c
phi
[
7
])
*
0x100000000
);
torqueBuffers
[
i
+
PADDED_NUM_ATOMS
*
2
]
=
(
long
long
)
(
EPSILON_FACTOR
*
(
multipole
[
2
]
*
xscale
*
phi
[
1
]
-
multipole
[
1
]
*
yscale
*
phi
[
2
]
+
2
*
(
multipole
[
5
]
-
multipole
[
4
])
*
xscale
*
yscale
*
phi
[
7
]
+
multipole
[
7
]
*
xscale
*
xscale
*
phi
[
4
]
+
multipole
[
9
]
*
xscale
*
zscale
*
phi
[
8
]
-
multipole
[
7
]
*
yscale
*
yscale
*
phi
[
5
]
-
multipole
[
8
]
*
yscale
*
zscale
*
phi
[
9
])
*
0x100000000
);
torqueBuffers
[
i
+
PADDED_NUM_ATOMS
*
2
]
=
(
long
long
)
(
EPSILON_FACTOR
*
(
multipole
[
2
]
*
c
phi
[
1
]
-
multipole
[
1
]
*
c
phi
[
2
]
+
2
*
(
multipole
[
5
]
-
multipole
[
4
])
*
c
phi
[
7
]
+
multipole
[
7
]
*
c
phi
[
4
]
+
multipole
[
9
]
*
c
phi
[
8
]
-
multipole
[
7
]
*
c
phi
[
5
]
-
multipole
[
8
]
*
c
phi
[
9
])
*
0x100000000
);
// Compute the force and energy.
...
...
@@ -851,6 +920,7 @@ extern "C" __global__ void computeFixedMultipoleForceAndEnergy(real4* __restrict
multipole
[
8
]
=
fracQuadrupole
[
i
*
6
+
2
];
multipole
[
9
]
=
fracQuadrupole
[
i
*
6
+
4
];
const
real
*
phi
=
&
phi_global
[
20
*
i
];
real4
f
=
make_real4
(
0
,
0
,
0
,
0
);
for
(
int
k
=
0
;
k
<
10
;
k
++
)
{
energy
+=
multipole
[
k
]
*
phi
[
k
];
...
...
@@ -873,20 +943,13 @@ extern "C" __global__ void computeInducedDipoleForceAndEnergy(real4* __restrict_
const
real
*
__restrict__
labFrameQuadrupole
,
const
real
*
__restrict__
fracDipole
,
const
real
*
__restrict__
fracQuadrupole
,
const
real
*
__restrict__
inducedDipole_global
,
const
real
*
__restrict__
inducedDipolePolar_global
,
const
real
*
__restrict__
phi_global
,
const
real
*
__restrict__
phid_global
,
const
real
*
__restrict__
phip_global
,
const
real
*
__restrict__
phidp_global
,
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
)
{
const
real
*
__restrict__
phidp_global
,
const
real
*
__restrict__
cphi_global
,
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
)
{
real
multipole
[
10
];
real
cinducedDipole
[
3
],
inducedDipole
[
3
];
real
cinducedDipolePolar
[
3
],
inducedDipolePolar
[
3
];
real
scales
[
3
];
const
int
deriv1
[]
=
{
1
,
4
,
7
,
8
,
10
,
15
,
17
,
13
,
14
,
19
};
const
int
deriv2
[]
=
{
2
,
7
,
5
,
9
,
13
,
11
,
18
,
15
,
19
,
16
};
const
int
deriv3
[]
=
{
3
,
8
,
9
,
6
,
14
,
16
,
12
,
19
,
17
,
18
};
const
real
xscale
=
GRID_SIZE_X
*
recipBoxVecX
.
x
;
const
real
yscale
=
GRID_SIZE_Y
*
recipBoxVecY
.
y
;
const
real
zscale
=
GRID_SIZE_Z
*
recipBoxVecZ
.
z
;
scales
[
0
]
=
xscale
;
scales
[
1
]
=
yscale
;
scales
[
2
]
=
zscale
;
real
energy
=
0
;
__shared__
real
fracToCart
[
3
][
3
];
if
(
threadIdx
.
x
==
0
)
{
...
...
@@ -914,22 +977,22 @@ extern "C" __global__ void computeInducedDipoleForceAndEnergy(real4* __restrict_
multipole
[
7
]
=
2
*
labFrameQuadrupole
[
i
*
5
+
1
];
multipole
[
8
]
=
2
*
labFrameQuadrupole
[
i
*
5
+
2
];
multipole
[
9
]
=
2
*
labFrameQuadrupole
[
i
*
5
+
4
];
const
real
*
phi
dp
=
&
phi
dp
_global
[
2
0
*
i
];
const
real
*
c
phi
=
&
c
phi_global
[
1
0
*
i
];
torqueBuffers
[
i
]
+=
(
long
long
)
(
0.5
f
*
EPSILON_FACTOR
*
(
multipole
[
3
]
*
yscale
*
phi
dp
[
2
]
-
multipole
[
2
]
*
zscale
*
phi
dp
[
3
]
+
2
*
(
multipole
[
6
]
-
multipole
[
5
])
*
yscale
*
zscale
*
phi
dp
[
9
]
+
multipole
[
8
]
*
xscale
*
yscale
*
phi
dp
[
7
]
+
multipole
[
9
]
*
yscale
*
yscale
*
phi
dp
[
5
]
-
multipole
[
7
]
*
xscale
*
zscale
*
phi
dp
[
8
]
-
multipole
[
9
]
*
zscale
*
zscale
*
phi
dp
[
6
])
*
0x100000000
);
torqueBuffers
[
i
]
+=
(
long
long
)
(
0.5
f
*
EPSILON_FACTOR
*
(
multipole
[
3
]
*
c
phi
[
2
]
-
multipole
[
2
]
*
c
phi
[
3
]
+
2
*
(
multipole
[
6
]
-
multipole
[
5
])
*
c
phi
[
9
]
+
multipole
[
8
]
*
c
phi
[
7
]
+
multipole
[
9
]
*
c
phi
[
5
]
-
multipole
[
7
]
*
c
phi
[
8
]
-
multipole
[
9
]
*
c
phi
[
6
])
*
0x100000000
);
torqueBuffers
[
i
+
PADDED_NUM_ATOMS
]
+=
(
long
long
)
(
0.5
f
*
EPSILON_FACTOR
*
(
multipole
[
1
]
*
zscale
*
phi
dp
[
3
]
-
multipole
[
3
]
*
xscale
*
phi
dp
[
1
]
+
2
*
(
multipole
[
4
]
-
multipole
[
6
])
*
xscale
*
zscale
*
phi
dp
[
8
]
+
multipole
[
7
]
*
yscale
*
zscale
*
phi
dp
[
9
]
+
multipole
[
8
]
*
zscale
*
zscale
*
phi
dp
[
6
]
-
multipole
[
8
]
*
xscale
*
xscale
*
phi
dp
[
4
]
-
multipole
[
9
]
*
xscale
*
yscale
*
phi
dp
[
7
])
*
0x100000000
);
torqueBuffers
[
i
+
PADDED_NUM_ATOMS
]
+=
(
long
long
)
(
0.5
f
*
EPSILON_FACTOR
*
(
multipole
[
1
]
*
c
phi
[
3
]
-
multipole
[
3
]
*
c
phi
[
1
]
+
2
*
(
multipole
[
4
]
-
multipole
[
6
])
*
c
phi
[
8
]
+
multipole
[
7
]
*
c
phi
[
9
]
+
multipole
[
8
]
*
c
phi
[
6
]
-
multipole
[
8
]
*
c
phi
[
4
]
-
multipole
[
9
]
*
c
phi
[
7
])
*
0x100000000
);
torqueBuffers
[
i
+
PADDED_NUM_ATOMS
*
2
]
+=
(
long
long
)
(
0.5
f
*
EPSILON_FACTOR
*
(
multipole
[
2
]
*
xscale
*
phi
dp
[
1
]
-
multipole
[
1
]
*
yscale
*
phi
dp
[
2
]
+
2
*
(
multipole
[
5
]
-
multipole
[
4
])
*
xscale
*
yscale
*
phi
dp
[
7
]
+
multipole
[
7
]
*
xscale
*
xscale
*
phi
dp
[
4
]
+
multipole
[
9
]
*
xscale
*
zscale
*
phi
dp
[
8
]
-
multipole
[
7
]
*
yscale
*
yscale
*
phi
dp
[
5
]
-
multipole
[
8
]
*
yscale
*
zscale
*
phi
dp
[
9
])
*
0x100000000
);
torqueBuffers
[
i
+
PADDED_NUM_ATOMS
*
2
]
+=
(
long
long
)
(
0.5
f
*
EPSILON_FACTOR
*
(
multipole
[
2
]
*
c
phi
[
1
]
-
multipole
[
1
]
*
c
phi
[
2
]
+
2
*
(
multipole
[
5
]
-
multipole
[
4
])
*
c
phi
[
7
]
+
multipole
[
7
]
*
c
phi
[
4
]
+
multipole
[
9
]
*
c
phi
[
8
]
-
multipole
[
7
]
*
c
phi
[
5
]
-
multipole
[
8
]
*
c
phi
[
9
])
*
0x100000000
);
// Compute the force and energy.
...
...
@@ -981,6 +1044,7 @@ extern "C" __global__ void computeInducedDipoleForceAndEnergy(real4* __restrict_
#endif
}
const
real
*
phidp
=
&
phidp_global
[
20
*
i
];
for
(
int
k
=
0
;
k
<
10
;
k
++
)
{
f
.
x
+=
multipole
[
k
]
*
phidp
[
deriv1
[
k
]];
f
.
y
+=
multipole
[
k
]
*
phidp
[
deriv2
[
k
]];
...
...
@@ -998,15 +1062,25 @@ extern "C" __global__ void computeInducedDipoleForceAndEnergy(real4* __restrict_
extern
"C"
__global__
void
recordInducedFieldDipoles
(
const
real
*
__restrict__
phid
,
real
*
const
__restrict__
phip
,
long
long
*
__restrict__
inducedField
,
long
long
*
__restrict__
inducedFieldPolar
,
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
)
{
real
xscale
=
GRID_SIZE_X
*
recipBoxVecX
.
x
*
0x100000000
;
real
yscale
=
GRID_SIZE_Y
*
recipBoxVecY
.
y
*
0x100000000
;
real
zscale
=
GRID_SIZE_Z
*
recipBoxVecZ
.
z
*
0x100000000
;
__shared__
real
fracToCart
[
3
][
3
];
if
(
threadIdx
.
x
==
0
)
{
fracToCart
[
0
][
0
]
=
GRID_SIZE_X
*
recipBoxVecX
.
x
;
fracToCart
[
1
][
0
]
=
GRID_SIZE_X
*
recipBoxVecY
.
x
;
fracToCart
[
2
][
0
]
=
GRID_SIZE_X
*
recipBoxVecZ
.
x
;
fracToCart
[
0
][
1
]
=
GRID_SIZE_Y
*
recipBoxVecX
.
y
;
fracToCart
[
1
][
1
]
=
GRID_SIZE_Y
*
recipBoxVecY
.
y
;
fracToCart
[
2
][
1
]
=
GRID_SIZE_Y
*
recipBoxVecZ
.
y
;
fracToCart
[
0
][
2
]
=
GRID_SIZE_Z
*
recipBoxVecX
.
z
;
fracToCart
[
1
][
2
]
=
GRID_SIZE_Z
*
recipBoxVecY
.
z
;
fracToCart
[
2
][
2
]
=
GRID_SIZE_Z
*
recipBoxVecZ
.
z
;
}
__syncthreads
();
for
(
int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
i
<
NUM_ATOMS
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
inducedField
[
i
]
-=
(
long
long
)
(
xscale
*
phid
[
10
*
i
+
1
]
);
inducedField
[
i
+
PADDED_NUM_ATOMS
]
-=
(
long
long
)
(
yscale
*
phid
[
10
*
i
+
2
]);
inducedField
[
i
+
PADDED_NUM_ATOMS
*
2
]
-=
(
long
long
)
(
zscale
*
phid
[
10
*
i
+
3
]
);
inducedFieldPolar
[
i
]
-=
(
long
long
)
(
xscale
*
phip
[
10
*
i
+
1
]
);
inducedFieldPolar
[
i
+
PADDED_NUM_ATOMS
]
-=
(
long
long
)
(
yscale
*
phip
[
10
*
i
+
2
]);
inducedFieldPolar
[
i
+
PADDED_NUM_ATOMS
*
2
]
-=
(
long
long
)
(
zscale
*
phip
[
10
*
i
+
3
]
);
inducedField
[
i
]
-=
(
long
long
)
(
0x100000000
*
(
phid
[
10
*
i
+
1
]
*
fracToCart
[
0
][
0
]
+
phid
[
10
*
i
+
2
]
*
fracToCart
[
0
][
1
]
+
phid
[
10
*
i
+
3
]
*
fracToCart
[
0
][
2
])
);
inducedField
[
i
+
PADDED_NUM_ATOMS
]
-=
(
long
long
)
(
0x100000000
*
(
phid
[
10
*
i
+
1
]
*
fracToCart
[
1
][
0
]
+
phid
[
10
*
i
+
2
]
*
fracToCart
[
1
][
1
]
+
phid
[
10
*
i
+
3
]
*
fracToCart
[
1
][
2
])
)
;
inducedField
[
i
+
PADDED_NUM_ATOMS
*
2
]
-=
(
long
long
)
(
0x100000000
*
(
phid
[
10
*
i
+
1
]
*
fracToCart
[
2
][
0
]
+
phid
[
10
*
i
+
2
]
*
fracToCart
[
2
][
1
]
+
phid
[
10
*
i
+
3
]
*
fracToCart
[
2
][
2
])
);
inducedFieldPolar
[
i
]
-=
(
long
long
)
(
0x100000000
*
(
phip
[
10
*
i
+
1
]
*
fracToCart
[
0
][
0
]
+
phip
[
10
*
i
+
2
]
*
fracToCart
[
0
][
1
]
+
phip
[
10
*
i
+
3
]
*
fracToCart
[
0
][
2
])
);
inducedFieldPolar
[
i
+
PADDED_NUM_ATOMS
]
-=
(
long
long
)
(
0x100000000
*
(
phip
[
10
*
i
+
1
]
*
fracToCart
[
1
][
0
]
+
phip
[
10
*
i
+
2
]
*
fracToCart
[
1
][
1
]
+
phip
[
10
*
i
+
3
]
*
fracToCart
[
1
][
2
])
)
;
inducedFieldPolar
[
i
+
PADDED_NUM_ATOMS
*
2
]
-=
(
long
long
)
(
0x100000000
*
(
phip
[
10
*
i
+
1
]
*
fracToCart
[
2
][
0
]
+
phip
[
10
*
i
+
2
]
*
fracToCart
[
2
][
1
]
+
phip
[
10
*
i
+
3
]
*
fracToCart
[
2
][
2
])
);
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment