Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
9e34d39e
Commit
9e34d39e
authored
Sep 08, 2016
by
peastman
Committed by
GitHub
Sep 08, 2016
Browse files
Merge pull request #1583 from peastman/amoebapme
Optimizations to AMOEBA PME
parents
e38c6907
47fe6d40
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
121 additions
and
182 deletions
+121
-182
plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
+12
-23
plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.h
plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.h
+1
-2
plugins/amoeba/platforms/cuda/src/kernels/multipolePme.cu
plugins/amoeba/platforms/cuda/src/kernels/multipolePme.cu
+108
-157
No files found.
plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
View file @
9e34d39e
...
@@ -822,7 +822,7 @@ CudaCalcAmoebaMultipoleForceKernel::CudaCalcAmoebaMultipoleForceKernel(std::stri
...
@@ -822,7 +822,7 @@ CudaCalcAmoebaMultipoleForceKernel::CudaCalcAmoebaMultipoleForceKernel(std::stri
inducedDipoleFieldGradientGk
(
NULL
),
inducedDipoleFieldGradientGkPolar
(
NULL
),
extrapolatedDipoleFieldGradient
(
NULL
),
extrapolatedDipoleFieldGradientPolar
(
NULL
),
inducedDipoleFieldGradientGk
(
NULL
),
inducedDipoleFieldGradientGkPolar
(
NULL
),
extrapolatedDipoleFieldGradient
(
NULL
),
extrapolatedDipoleFieldGradientPolar
(
NULL
),
extrapolatedDipoleFieldGradientGk
(
NULL
),
extrapolatedDipoleFieldGradientGkPolar
(
NULL
),
covalentFlags
(
NULL
),
polarizationGroupFlags
(
NULL
),
extrapolatedDipoleFieldGradientGk
(
NULL
),
extrapolatedDipoleFieldGradientGkPolar
(
NULL
),
covalentFlags
(
NULL
),
polarizationGroupFlags
(
NULL
),
pmeGrid
(
NULL
),
pmeBsplineModuliX
(
NULL
),
pmeBsplineModuliY
(
NULL
),
pmeBsplineModuliZ
(
NULL
),
pmeIgrid
(
NULL
),
pmePhi
(
NULL
),
pmeGrid
(
NULL
),
pmeBsplineModuliX
(
NULL
),
pmeBsplineModuliY
(
NULL
),
pmeBsplineModuliZ
(
NULL
),
pmeIgrid
(
NULL
),
pmePhi
(
NULL
),
pmePhid
(
NULL
),
pmePhip
(
NULL
),
pmePhidp
(
NULL
),
pmeCphi
(
NULL
),
pmeAtomGridIndex
(
NULL
),
lastPositions
(
NULL
),
sort
(
NULL
),
gkKernel
(
NULL
)
{
pmePhid
(
NULL
),
pmePhip
(
NULL
),
pmePhidp
(
NULL
),
pmeCphi
(
NULL
),
lastPositions
(
NULL
),
sort
(
NULL
),
gkKernel
(
NULL
)
{
}
}
CudaCalcAmoebaMultipoleForceKernel
::~
CudaCalcAmoebaMultipoleForceKernel
()
{
CudaCalcAmoebaMultipoleForceKernel
::~
CudaCalcAmoebaMultipoleForceKernel
()
{
...
@@ -927,8 +927,6 @@ CudaCalcAmoebaMultipoleForceKernel::~CudaCalcAmoebaMultipoleForceKernel() {
...
@@ -927,8 +927,6 @@ CudaCalcAmoebaMultipoleForceKernel::~CudaCalcAmoebaMultipoleForceKernel() {
delete
pmePhidp
;
delete
pmePhidp
;
if
(
pmeCphi
!=
NULL
)
if
(
pmeCphi
!=
NULL
)
delete
pmeCphi
;
delete
pmeCphi
;
if
(
pmeAtomGridIndex
!=
NULL
)
delete
pmeAtomGridIndex
;
if
(
lastPositions
!=
NULL
)
if
(
lastPositions
!=
NULL
)
delete
lastPositions
;
delete
lastPositions
;
if
(
sort
!=
NULL
)
if
(
sort
!=
NULL
)
...
@@ -1253,7 +1251,6 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
...
@@ -1253,7 +1251,6 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
else
if
(
polarizationType
==
AmoebaMultipoleForce
::
Extrapolated
)
else
if
(
polarizationType
==
AmoebaMultipoleForce
::
Extrapolated
)
pmeDefines
[
"EXTRAPOLATED_POLARIZATION"
]
=
""
;
pmeDefines
[
"EXTRAPOLATED_POLARIZATION"
]
=
""
;
CUmodule
module
=
cu
.
createModule
(
CudaKernelSources
::
vectorOps
+
CudaAmoebaKernelSources
::
multipolePme
,
pmeDefines
);
CUmodule
module
=
cu
.
createModule
(
CudaKernelSources
::
vectorOps
+
CudaAmoebaKernelSources
::
multipolePme
,
pmeDefines
);
pmeGridIndexKernel
=
cu
.
getKernel
(
module
,
"findAtomGridIndex"
);
pmeTransformMultipolesKernel
=
cu
.
getKernel
(
module
,
"transformMultipolesToFractionalCoordinates"
);
pmeTransformMultipolesKernel
=
cu
.
getKernel
(
module
,
"transformMultipolesToFractionalCoordinates"
);
pmeTransformPotentialKernel
=
cu
.
getKernel
(
module
,
"transformPotentialToCartesianCoordinates"
);
pmeTransformPotentialKernel
=
cu
.
getKernel
(
module
,
"transformPotentialToCartesianCoordinates"
);
pmeSpreadFixedMultipolesKernel
=
cu
.
getKernel
(
module
,
"gridSpreadFixedMultipoles"
);
pmeSpreadFixedMultipolesKernel
=
cu
.
getKernel
(
module
,
"gridSpreadFixedMultipoles"
);
...
@@ -1285,7 +1282,6 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
...
@@ -1285,7 +1282,6 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
pmePhidp
=
new
CudaArray
(
cu
,
20
*
numMultipoles
,
elementSize
,
"pmePhidp"
);
pmePhidp
=
new
CudaArray
(
cu
,
20
*
numMultipoles
,
elementSize
,
"pmePhidp"
);
pmeCphi
=
new
CudaArray
(
cu
,
10
*
numMultipoles
,
elementSize
,
"pmeCphi"
);
pmeCphi
=
new
CudaArray
(
cu
,
10
*
numMultipoles
,
elementSize
,
"pmeCphi"
);
pmeAtomRange
=
CudaArray
::
create
<
int
>
(
cu
,
gridSizeX
*
gridSizeY
*
gridSizeZ
+
1
,
"pmeAtomRange"
);
pmeAtomRange
=
CudaArray
::
create
<
int
>
(
cu
,
gridSizeX
*
gridSizeY
*
gridSizeZ
+
1
,
"pmeAtomRange"
);
pmeAtomGridIndex
=
CudaArray
::
create
<
int2
>
(
cu
,
numMultipoles
,
"pmeAtomGridIndex"
);
sort
=
new
CudaSort
(
cu
,
new
SortTrait
(),
cu
.
getNumAtoms
());
sort
=
new
CudaSort
(
cu
,
new
SortTrait
(),
cu
.
getNumAtoms
());
cufftResult
result
=
cufftPlan3d
(
&
fft
,
gridSizeX
,
gridSizeY
,
gridSizeZ
,
cu
.
getUseDoublePrecision
()
?
CUFFT_Z2Z
:
CUFFT_C2C
);
cufftResult
result
=
cufftPlan3d
(
&
fft
,
gridSizeX
,
gridSizeY
,
gridSizeZ
,
cu
.
getUseDoublePrecision
()
?
CUFFT_Z2Z
:
CUFFT_C2C
);
if
(
result
!=
CUFFT_SUCCESS
)
if
(
result
!=
CUFFT_SUCCESS
)
...
@@ -1569,16 +1565,11 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
...
@@ -1569,16 +1565,11 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
// Reciprocal space calculation.
// Reciprocal space calculation.
unsigned
int
maxTiles
=
nb
.
getInteractingTiles
().
getSize
();
unsigned
int
maxTiles
=
nb
.
getInteractingTiles
().
getSize
();
void
*
gridIndexArgs
[]
=
{
&
cu
.
getPosq
().
getDevicePointer
(),
&
pmeAtomGridIndex
->
getDevicePointer
(),
cu
.
getPeriodicBoxVecXPointer
(),
cu
.
getPeriodicBoxVecYPointer
(),
cu
.
getPeriodicBoxVecZPointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
]};
cu
.
executeKernel
(
pmeGridIndexKernel
,
gridIndexArgs
,
cu
.
getNumAtoms
(),
cu
.
ThreadBlockSize
,
cu
.
ThreadBlockSize
*
PmeOrder
*
PmeOrder
*
elementSize
);
sort
->
sort
(
*
pmeAtomGridIndex
);
void
*
pmeTransformMultipolesArgs
[]
=
{
&
labFrameDipoles
->
getDevicePointer
(),
&
labFrameQuadrupoles
->
getDevicePointer
(),
void
*
pmeTransformMultipolesArgs
[]
=
{
&
labFrameDipoles
->
getDevicePointer
(),
&
labFrameQuadrupoles
->
getDevicePointer
(),
&
fracDipoles
->
getDevicePointer
(),
&
fracQuadrupoles
->
getDevicePointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
]};
&
fracDipoles
->
getDevicePointer
(),
&
fracQuadrupoles
->
getDevicePointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
]};
cu
.
executeKernel
(
pmeTransformMultipolesKernel
,
pmeTransformMultipolesArgs
,
cu
.
getNumAtoms
());
cu
.
executeKernel
(
pmeTransformMultipolesKernel
,
pmeTransformMultipolesArgs
,
cu
.
getNumAtoms
());
void
*
pmeSpreadFixedMultipolesArgs
[]
=
{
&
cu
.
getPosq
().
getDevicePointer
(),
&
fracDipoles
->
getDevicePointer
(),
&
fracQuadrupoles
->
getDevicePointer
(),
void
*
pmeSpreadFixedMultipolesArgs
[]
=
{
&
cu
.
getPosq
().
getDevicePointer
(),
&
fracDipoles
->
getDevicePointer
(),
&
fracQuadrupoles
->
getDevicePointer
(),
&
pmeGrid
->
getDevicePointer
(),
&
pmeAtomGridIndex
->
getDevicePointer
(),
cu
.
getPeriodicBoxVecXPointer
(),
cu
.
getPeriodicBoxVecYPointer
(),
cu
.
getPeriodicBoxVecZPointer
(),
&
pmeGrid
->
getDevicePointer
(),
cu
.
getPeriodicBoxVecXPointer
(),
cu
.
getPeriodicBoxVecYPointer
(),
cu
.
getPeriodicBoxVecZPointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
]};
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
]};
cu
.
executeKernel
(
pmeSpreadFixedMultipolesKernel
,
pmeSpreadFixedMultipolesArgs
,
cu
.
getNumAtoms
());
cu
.
executeKernel
(
pmeSpreadFixedMultipolesKernel
,
pmeSpreadFixedMultipolesArgs
,
cu
.
getNumAtoms
());
void
*
finishSpreadArgs
[]
=
{
&
pmeGrid
->
getDevicePointer
()};
void
*
finishSpreadArgs
[]
=
{
&
pmeGrid
->
getDevicePointer
()};
...
@@ -1590,7 +1581,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
...
@@ -1590,7 +1581,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
cufftExecC2C
(
fft
,
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
CUFFT_FORWARD
);
cufftExecC2C
(
fft
,
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
CUFFT_FORWARD
);
void
*
pmeConvolutionArgs
[]
=
{
&
pmeGrid
->
getDevicePointer
(),
&
pmeBsplineModuliX
->
getDevicePointer
(),
&
pmeBsplineModuliY
->
getDevicePointer
(),
void
*
pmeConvolutionArgs
[]
=
{
&
pmeGrid
->
getDevicePointer
(),
&
pmeBsplineModuliX
->
getDevicePointer
(),
&
pmeBsplineModuliY
->
getDevicePointer
(),
&
pmeBsplineModuliZ
->
getDevicePointer
(),
cu
.
getPeriodicBoxSizePointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
]};
&
pmeBsplineModuliZ
->
getDevicePointer
(),
cu
.
getPeriodicBoxSizePointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
]};
cu
.
executeKernel
(
pmeConvolutionKernel
,
pmeConvolutionArgs
,
cu
.
getNumAtoms
()
);
cu
.
executeKernel
(
pmeConvolutionKernel
,
pmeConvolutionArgs
,
gridSizeX
*
gridSizeY
*
gridSizeZ
,
256
);
if
(
cu
.
getUseDoublePrecision
())
if
(
cu
.
getUseDoublePrecision
())
cufftExecZ2Z
(
fft
,
(
double2
*
)
pmeGrid
->
getDevicePointer
(),
(
double2
*
)
pmeGrid
->
getDevicePointer
(),
CUFFT_INVERSE
);
cufftExecZ2Z
(
fft
,
(
double2
*
)
pmeGrid
->
getDevicePointer
(),
(
double2
*
)
pmeGrid
->
getDevicePointer
(),
CUFFT_INVERSE
);
else
else
...
@@ -1598,7 +1589,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
...
@@ -1598,7 +1589,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
void
*
pmeFixedPotentialArgs
[]
=
{
&
pmeGrid
->
getDevicePointer
(),
&
pmePhi
->
getDevicePointer
(),
&
field
->
getDevicePointer
(),
void
*
pmeFixedPotentialArgs
[]
=
{
&
pmeGrid
->
getDevicePointer
(),
&
pmePhi
->
getDevicePointer
(),
&
field
->
getDevicePointer
(),
&
fieldPolar
->
getDevicePointer
(),
&
cu
.
getPosq
().
getDevicePointer
(),
&
labFrameDipoles
->
getDevicePointer
(),
&
fieldPolar
->
getDevicePointer
(),
&
cu
.
getPosq
().
getDevicePointer
(),
&
labFrameDipoles
->
getDevicePointer
(),
cu
.
getPeriodicBoxVecXPointer
(),
cu
.
getPeriodicBoxVecYPointer
(),
cu
.
getPeriodicBoxVecZPointer
(),
cu
.
getPeriodicBoxVecXPointer
(),
cu
.
getPeriodicBoxVecYPointer
(),
cu
.
getPeriodicBoxVecZPointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
]
,
&
pmeAtomGridIndex
->
getDevicePointer
()
};
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
]};
cu
.
executeKernel
(
pmeFixedPotentialKernel
,
pmeFixedPotentialArgs
,
cu
.
getNumAtoms
());
cu
.
executeKernel
(
pmeFixedPotentialKernel
,
pmeFixedPotentialArgs
,
cu
.
getNumAtoms
());
void
*
pmeTransformFixedPotentialArgs
[]
=
{
&
pmePhi
->
getDevicePointer
(),
&
pmeCphi
->
getDevicePointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
]};
void
*
pmeTransformFixedPotentialArgs
[]
=
{
&
pmePhi
->
getDevicePointer
(),
&
pmeCphi
->
getDevicePointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
]};
cu
.
executeKernel
(
pmeTransformPotentialKernel
,
pmeTransformFixedPotentialArgs
,
cu
.
getNumAtoms
());
cu
.
executeKernel
(
pmeTransformPotentialKernel
,
pmeTransformFixedPotentialArgs
,
cu
.
getNumAtoms
());
...
@@ -1625,7 +1616,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
...
@@ -1625,7 +1616,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
cu
.
clearBuffer
(
*
pmeGrid
);
cu
.
clearBuffer
(
*
pmeGrid
);
void
*
pmeSpreadInducedDipolesArgs
[]
=
{
&
cu
.
getPosq
().
getDevicePointer
(),
&
inducedDipole
->
getDevicePointer
(),
&
inducedDipolePolar
->
getDevicePointer
(),
void
*
pmeSpreadInducedDipolesArgs
[]
=
{
&
cu
.
getPosq
().
getDevicePointer
(),
&
inducedDipole
->
getDevicePointer
(),
&
inducedDipolePolar
->
getDevicePointer
(),
&
pmeGrid
->
getDevicePointer
(),
&
pmeAtomGridIndex
->
getDevicePointer
(),
cu
.
getPeriodicBoxVecXPointer
(),
cu
.
getPeriodicBoxVecYPointer
(),
cu
.
getPeriodicBoxVecZPointer
(),
&
pmeGrid
->
getDevicePointer
(),
cu
.
getPeriodicBoxVecXPointer
(),
cu
.
getPeriodicBoxVecYPointer
(),
cu
.
getPeriodicBoxVecZPointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
]};
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
]};
cu
.
executeKernel
(
pmeSpreadInducedDipolesKernel
,
pmeSpreadInducedDipolesArgs
,
cu
.
getNumAtoms
());
cu
.
executeKernel
(
pmeSpreadInducedDipolesKernel
,
pmeSpreadInducedDipolesArgs
,
cu
.
getNumAtoms
());
if
(
cu
.
getUseDoublePrecision
())
if
(
cu
.
getUseDoublePrecision
())
...
@@ -1634,15 +1625,14 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
...
@@ -1634,15 +1625,14 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
cufftExecZ2Z
(
fft
,
(
double2
*
)
pmeGrid
->
getDevicePointer
(),
(
double2
*
)
pmeGrid
->
getDevicePointer
(),
CUFFT_FORWARD
);
cufftExecZ2Z
(
fft
,
(
double2
*
)
pmeGrid
->
getDevicePointer
(),
(
double2
*
)
pmeGrid
->
getDevicePointer
(),
CUFFT_FORWARD
);
else
else
cufftExecC2C
(
fft
,
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
CUFFT_FORWARD
);
cufftExecC2C
(
fft
,
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
CUFFT_FORWARD
);
cu
.
executeKernel
(
pmeConvolutionKernel
,
pmeConvolutionArgs
,
cu
.
getNumAtoms
()
);
cu
.
executeKernel
(
pmeConvolutionKernel
,
pmeConvolutionArgs
,
gridSizeX
*
gridSizeY
*
gridSizeZ
,
256
);
if
(
cu
.
getUseDoublePrecision
())
if
(
cu
.
getUseDoublePrecision
())
cufftExecZ2Z
(
fft
,
(
double2
*
)
pmeGrid
->
getDevicePointer
(),
(
double2
*
)
pmeGrid
->
getDevicePointer
(),
CUFFT_INVERSE
);
cufftExecZ2Z
(
fft
,
(
double2
*
)
pmeGrid
->
getDevicePointer
(),
(
double2
*
)
pmeGrid
->
getDevicePointer
(),
CUFFT_INVERSE
);
else
else
cufftExecC2C
(
fft
,
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
CUFFT_INVERSE
);
cufftExecC2C
(
fft
,
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
CUFFT_INVERSE
);
void
*
pmeInducedPotentialArgs
[]
=
{
&
pmeGrid
->
getDevicePointer
(),
&
pmePhid
->
getDevicePointer
(),
&
pmePhip
->
getDevicePointer
(),
void
*
pmeInducedPotentialArgs
[]
=
{
&
pmeGrid
->
getDevicePointer
(),
&
pmePhid
->
getDevicePointer
(),
&
pmePhip
->
getDevicePointer
(),
&
pmePhidp
->
getDevicePointer
(),
&
cu
.
getPosq
().
getDevicePointer
(),
cu
.
getPeriodicBoxVecXPointer
(),
cu
.
getPeriodicBoxVecYPointer
(),
&
pmePhidp
->
getDevicePointer
(),
&
cu
.
getPosq
().
getDevicePointer
(),
cu
.
getPeriodicBoxVecXPointer
(),
cu
.
getPeriodicBoxVecYPointer
(),
cu
.
getPeriodicBoxVecZPointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
],
cu
.
getPeriodicBoxVecZPointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
]};
&
pmeAtomGridIndex
->
getDevicePointer
()};
cu
.
executeKernel
(
pmeInducedPotentialKernel
,
pmeInducedPotentialArgs
,
cu
.
getNumAtoms
());
cu
.
executeKernel
(
pmeInducedPotentialKernel
,
pmeInducedPotentialArgs
,
cu
.
getNumAtoms
());
// Iterate until the dipoles converge.
// Iterate until the dipoles converge.
...
@@ -1771,7 +1761,7 @@ void CudaCalcAmoebaMultipoleForceKernel::computeInducedField(void** recipBoxVect
...
@@ -1771,7 +1761,7 @@ void CudaCalcAmoebaMultipoleForceKernel::computeInducedField(void** recipBoxVect
cu
.
executeKernel
(
computeInducedFieldKernel
,
&
computeInducedFieldArgs
[
0
],
numForceThreadBlocks
*
inducedFieldThreads
,
inducedFieldThreads
);
cu
.
executeKernel
(
computeInducedFieldKernel
,
&
computeInducedFieldArgs
[
0
],
numForceThreadBlocks
*
inducedFieldThreads
,
inducedFieldThreads
);
cu
.
clearBuffer
(
*
pmeGrid
);
cu
.
clearBuffer
(
*
pmeGrid
);
void
*
pmeSpreadInducedDipolesArgs
[]
=
{
&
cu
.
getPosq
().
getDevicePointer
(),
&
inducedDipole
->
getDevicePointer
(),
&
inducedDipolePolar
->
getDevicePointer
(),
void
*
pmeSpreadInducedDipolesArgs
[]
=
{
&
cu
.
getPosq
().
getDevicePointer
(),
&
inducedDipole
->
getDevicePointer
(),
&
inducedDipolePolar
->
getDevicePointer
(),
&
pmeGrid
->
getDevicePointer
(),
&
pmeAtomGridIndex
->
getDevicePointer
(),
cu
.
getPeriodicBoxVecXPointer
(),
cu
.
getPeriodicBoxVecYPointer
(),
cu
.
getPeriodicBoxVecZPointer
(),
&
pmeGrid
->
getDevicePointer
(),
cu
.
getPeriodicBoxVecXPointer
(),
cu
.
getPeriodicBoxVecYPointer
(),
cu
.
getPeriodicBoxVecZPointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
]};
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
]};
cu
.
executeKernel
(
pmeSpreadInducedDipolesKernel
,
pmeSpreadInducedDipolesArgs
,
cu
.
getNumAtoms
());
cu
.
executeKernel
(
pmeSpreadInducedDipolesKernel
,
pmeSpreadInducedDipolesArgs
,
cu
.
getNumAtoms
());
if
(
cu
.
getUseDoublePrecision
())
{
if
(
cu
.
getUseDoublePrecision
())
{
...
@@ -1784,15 +1774,14 @@ void CudaCalcAmoebaMultipoleForceKernel::computeInducedField(void** recipBoxVect
...
@@ -1784,15 +1774,14 @@ void CudaCalcAmoebaMultipoleForceKernel::computeInducedField(void** recipBoxVect
cufftExecC2C
(
fft
,
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
CUFFT_FORWARD
);
cufftExecC2C
(
fft
,
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
CUFFT_FORWARD
);
void
*
pmeConvolutionArgs
[]
=
{
&
pmeGrid
->
getDevicePointer
(),
&
pmeBsplineModuliX
->
getDevicePointer
(),
&
pmeBsplineModuliY
->
getDevicePointer
(),
void
*
pmeConvolutionArgs
[]
=
{
&
pmeGrid
->
getDevicePointer
(),
&
pmeBsplineModuliX
->
getDevicePointer
(),
&
pmeBsplineModuliY
->
getDevicePointer
(),
&
pmeBsplineModuliZ
->
getDevicePointer
(),
cu
.
getPeriodicBoxSizePointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
]};
&
pmeBsplineModuliZ
->
getDevicePointer
(),
cu
.
getPeriodicBoxSizePointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
]};
cu
.
executeKernel
(
pmeConvolutionKernel
,
pmeConvolutionArgs
,
cu
.
getNumAtoms
()
);
cu
.
executeKernel
(
pmeConvolutionKernel
,
pmeConvolutionArgs
,
gridSizeX
*
gridSizeY
*
gridSizeZ
,
256
);
if
(
cu
.
getUseDoublePrecision
())
if
(
cu
.
getUseDoublePrecision
())
cufftExecZ2Z
(
fft
,
(
double2
*
)
pmeGrid
->
getDevicePointer
(),
(
double2
*
)
pmeGrid
->
getDevicePointer
(),
CUFFT_INVERSE
);
cufftExecZ2Z
(
fft
,
(
double2
*
)
pmeGrid
->
getDevicePointer
(),
(
double2
*
)
pmeGrid
->
getDevicePointer
(),
CUFFT_INVERSE
);
else
else
cufftExecC2C
(
fft
,
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
CUFFT_INVERSE
);
cufftExecC2C
(
fft
,
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
CUFFT_INVERSE
);
void
*
pmeInducedPotentialArgs
[]
=
{
&
pmeGrid
->
getDevicePointer
(),
&
pmePhid
->
getDevicePointer
(),
&
pmePhip
->
getDevicePointer
(),
void
*
pmeInducedPotentialArgs
[]
=
{
&
pmeGrid
->
getDevicePointer
(),
&
pmePhid
->
getDevicePointer
(),
&
pmePhip
->
getDevicePointer
(),
&
pmePhidp
->
getDevicePointer
(),
&
cu
.
getPosq
().
getDevicePointer
(),
cu
.
getPeriodicBoxVecXPointer
(),
cu
.
getPeriodicBoxVecYPointer
(),
&
pmePhidp
->
getDevicePointer
(),
&
cu
.
getPosq
().
getDevicePointer
(),
cu
.
getPeriodicBoxVecXPointer
(),
cu
.
getPeriodicBoxVecYPointer
(),
cu
.
getPeriodicBoxVecZPointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
],
cu
.
getPeriodicBoxVecZPointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
]};
&
pmeAtomGridIndex
->
getDevicePointer
()};
cu
.
executeKernel
(
pmeInducedPotentialKernel
,
pmeInducedPotentialArgs
,
cu
.
getNumAtoms
());
cu
.
executeKernel
(
pmeInducedPotentialKernel
,
pmeInducedPotentialArgs
,
cu
.
getNumAtoms
());
if
(
polarizationType
==
AmoebaMultipoleForce
::
Extrapolated
)
{
if
(
polarizationType
==
AmoebaMultipoleForce
::
Extrapolated
)
{
void
*
pmeRecordInducedFieldDipolesArgs
[]
=
{
&
pmePhid
->
getDevicePointer
(),
&
pmePhip
->
getDevicePointer
(),
void
*
pmeRecordInducedFieldDipolesArgs
[]
=
{
&
pmePhid
->
getDevicePointer
(),
&
pmePhip
->
getDevicePointer
(),
...
@@ -1896,11 +1885,11 @@ bool CudaCalcAmoebaMultipoleForceKernel::iterateDipolesByDIIS(int iteration) {
...
@@ -1896,11 +1885,11 @@ bool CudaCalcAmoebaMultipoleForceKernel::iterateDipolesByDIIS(int iteration) {
void
*
updateInducedFieldArgs
[]
=
{
&
inducedDipole
->
getDevicePointer
(),
&
inducedDipolePolar
->
getDevicePointer
(),
void
*
updateInducedFieldArgs
[]
=
{
&
inducedDipole
->
getDevicePointer
(),
&
inducedDipolePolar
->
getDevicePointer
(),
&
prevDipoles
->
getDevicePointer
(),
&
prevDipolesPolar
->
getDevicePointer
(),
&
diisCoefficients
->
getDevicePointer
(),
&
numPrev
};
&
prevDipoles
->
getDevicePointer
(),
&
prevDipolesPolar
->
getDevicePointer
(),
&
diisCoefficients
->
getDevicePointer
(),
&
numPrev
};
cu
.
executeKernel
(
updateInducedFieldKernel
,
updateInducedFieldArgs
,
cu
.
getNum
ThreadBlocks
()
*
cu
.
ThreadBlockSize
);
cu
.
executeKernel
(
updateInducedFieldKernel
,
updateInducedFieldArgs
,
3
*
cu
.
getNum
Atoms
(),
256
);
if
(
gkKernel
!=
NULL
)
{
if
(
gkKernel
!=
NULL
)
{
void
*
updateInducedFieldGkArgs
[]
=
{
&
gkKernel
->
getInducedDipoles
()
->
getDevicePointer
(),
&
gkKernel
->
getInducedDipolesPolar
()
->
getDevicePointer
(),
void
*
updateInducedFieldGkArgs
[]
=
{
&
gkKernel
->
getInducedDipoles
()
->
getDevicePointer
(),
&
gkKernel
->
getInducedDipolesPolar
()
->
getDevicePointer
(),
&
prevDipolesGk
->
getDevicePointer
(),
&
prevDipolesGkPolar
->
getDevicePointer
(),
&
diisCoefficients
->
getDevicePointer
(),
&
numPrev
};
&
prevDipolesGk
->
getDevicePointer
(),
&
prevDipolesGkPolar
->
getDevicePointer
(),
&
diisCoefficients
->
getDevicePointer
(),
&
numPrev
};
cu
.
executeKernel
(
updateInducedFieldKernel
,
updateInducedFieldGkArgs
,
cu
.
getNum
ThreadBlocks
()
*
cu
.
ThreadBlockSize
);
cu
.
executeKernel
(
updateInducedFieldKernel
,
updateInducedFieldGkArgs
,
3
*
cu
.
getNum
Atoms
(),
256
);
}
}
return
false
;
return
false
;
}
}
...
...
plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.h
View file @
9e34d39e
...
@@ -465,12 +465,11 @@ private:
...
@@ -465,12 +465,11 @@ private:
CudaArray
*
pmePhidp
;
CudaArray
*
pmePhidp
;
CudaArray
*
pmeCphi
;
CudaArray
*
pmeCphi
;
CudaArray
*
pmeAtomRange
;
CudaArray
*
pmeAtomRange
;
CudaArray
*
pmeAtomGridIndex
;
CudaArray
*
lastPositions
;
CudaArray
*
lastPositions
;
CudaSort
*
sort
;
CudaSort
*
sort
;
cufftHandle
fft
;
cufftHandle
fft
;
CUfunction
computeMomentsKernel
,
recordInducedDipolesKernel
,
computeFixedFieldKernel
,
computeInducedFieldKernel
,
updateInducedFieldKernel
,
electrostaticsKernel
,
mapTorqueKernel
;
CUfunction
computeMomentsKernel
,
recordInducedDipolesKernel
,
computeFixedFieldKernel
,
computeInducedFieldKernel
,
updateInducedFieldKernel
,
electrostaticsKernel
,
mapTorqueKernel
;
CUfunction
pmeGridIndexKernel
,
pmeSpreadFixedMultipolesKernel
,
pmeSpreadInducedDipolesKernel
,
pmeFinishSpreadChargeKernel
,
pmeConvolutionKernel
;
CUfunction
pmeSpreadFixedMultipolesKernel
,
pmeSpreadInducedDipolesKernel
,
pmeFinishSpreadChargeKernel
,
pmeConvolutionKernel
;
CUfunction
pmeFixedPotentialKernel
,
pmeInducedPotentialKernel
,
pmeFixedForceKernel
,
pmeInducedForceKernel
,
pmeRecordInducedFieldDipolesKernel
,
computePotentialKernel
;
CUfunction
pmeFixedPotentialKernel
,
pmeInducedPotentialKernel
,
pmeFixedForceKernel
,
pmeInducedForceKernel
,
pmeRecordInducedFieldDipolesKernel
,
computePotentialKernel
;
CUfunction
recordDIISDipolesKernel
,
buildMatrixKernel
;
CUfunction
recordDIISDipolesKernel
,
buildMatrixKernel
;
CUfunction
initExtrapolatedKernel
,
iterateExtrapolatedKernel
,
computeExtrapolatedKernel
,
addExtrapolatedGradientKernel
;
CUfunction
initExtrapolatedKernel
,
iterateExtrapolatedKernel
,
computeExtrapolatedKernel
,
addExtrapolatedGradientKernel
;
...
...
plugins/amoeba/platforms/cuda/src/kernels/multipolePme.cu
View file @
9e34d39e
...
@@ -69,47 +69,6 @@ __device__ void computeBSplinePoint(real4* thetai, real w, real* array) {
...
@@ -69,47 +69,6 @@ __device__ void computeBSplinePoint(real4* thetai, real w, real* array) {
thetai
[
i
-
1
]
=
make_real4
(
ARRAY
(
PME_ORDER
,
i
),
ARRAY
(
PME_ORDER
-
1
,
i
),
ARRAY
(
PME_ORDER
-
2
,
i
),
ARRAY
(
PME_ORDER
-
3
,
i
));
thetai
[
i
-
1
]
=
make_real4
(
ARRAY
(
PME_ORDER
,
i
),
ARRAY
(
PME_ORDER
-
1
,
i
),
ARRAY
(
PME_ORDER
-
2
,
i
),
ARRAY
(
PME_ORDER
-
3
,
i
));
}
}
/**
* Compute the index of the grid point each atom is associated with.
*/
extern
"C"
__global__
void
findAtomGridIndex
(
const
real4
*
__restrict__
posq
,
int2
*
__restrict__
pmeAtomGridIndex
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
)
{
for
(
int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
i
<
NUM_ATOMS
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
real4
pos
=
posq
[
i
];
pos
-=
periodicBoxVecZ
*
floor
(
pos
.
z
*
recipBoxVecZ
.
z
+
0.5
f
);
pos
-=
periodicBoxVecY
*
floor
(
pos
.
y
*
recipBoxVecY
.
z
+
0.5
f
);
pos
-=
periodicBoxVecX
*
floor
(
pos
.
x
*
recipBoxVecX
.
z
+
0.5
f
);
// First axis.
real
w
=
pos
.
x
*
recipBoxVecX
.
x
+
pos
.
y
*
recipBoxVecY
.
x
+
pos
.
z
*
recipBoxVecZ
.
x
;
real
fr
=
GRID_SIZE_X
*
(
w
-
(
int
)(
w
+
0.5
f
)
+
0.5
f
);
int
ifr
=
(
int
)
fr
;
int
igrid1
=
ifr
-
PME_ORDER
+
1
;
// Second axis.
w
=
pos
.
y
*
recipBoxVecY
.
y
+
pos
.
z
*
recipBoxVecZ
.
y
;
fr
=
GRID_SIZE_Y
*
(
w
-
(
int
)(
w
+
0.5
f
)
+
0.5
f
);
ifr
=
(
int
)
fr
;
int
igrid2
=
ifr
-
PME_ORDER
+
1
;
// Third axis.
w
=
pos
.
z
*
recipBoxVecZ
.
z
;
fr
=
GRID_SIZE_Z
*
(
w
-
(
int
)(
w
+
0.5
f
)
+
0.5
f
);
ifr
=
(
int
)
fr
;
int
igrid3
=
ifr
-
PME_ORDER
+
1
;
// Record the grid point.
igrid1
+=
(
igrid1
<
0
?
GRID_SIZE_X
:
0
);
igrid2
+=
(
igrid2
<
0
?
GRID_SIZE_Y
:
0
);
igrid3
+=
(
igrid3
<
0
?
GRID_SIZE_Z
:
0
);
pmeAtomGridIndex
[
i
]
=
make_int2
(
i
,
igrid1
*
GRID_SIZE_Y
*
GRID_SIZE_Z
+
igrid2
*
GRID_SIZE_Z
+
igrid3
);
}
}
/**
/**
* Convert the fixed multipoles from Cartesian to fractional coordinates.
* Convert the fixed multipoles from Cartesian to fractional coordinates.
*/
*/
...
@@ -209,7 +168,7 @@ extern "C" __global__ void transformPotentialToCartesianCoordinates(const real*
...
@@ -209,7 +168,7 @@ extern "C" __global__ void transformPotentialToCartesianCoordinates(const real*
}
}
extern
"C"
__global__
void
gridSpreadFixedMultipoles
(
const
real4
*
__restrict__
posq
,
const
real
*
__restrict__
fracDipole
,
extern
"C"
__global__
void
gridSpreadFixedMultipoles
(
const
real4
*
__restrict__
posq
,
const
real
*
__restrict__
fracDipole
,
const
real
*
__restrict__
fracQuadrupole
,
real2
*
__restrict__
pmeGrid
,
int2
*
__restrict__
pmeAtomGridIndex
,
const
real
*
__restrict__
fracQuadrupole
,
real2
*
__restrict__
pmeGrid
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
)
{
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
)
{
#if __CUDA_ARCH__ < 500
#if __CUDA_ARCH__ < 500
real
array
[
PME_ORDER
*
PME_ORDER
];
real
array
[
PME_ORDER
*
PME_ORDER
];
...
@@ -300,7 +259,7 @@ extern "C" __global__ void gridSpreadFixedMultipoles(const real4* __restrict__ p
...
@@ -300,7 +259,7 @@ extern "C" __global__ void gridSpreadFixedMultipoles(const real4* __restrict__ p
}
}
extern
"C"
__global__
void
gridSpreadInducedDipoles
(
const
real4
*
__restrict__
posq
,
const
real
*
__restrict__
inducedDipole
,
extern
"C"
__global__
void
gridSpreadInducedDipoles
(
const
real4
*
__restrict__
posq
,
const
real
*
__restrict__
inducedDipole
,
const
real
*
__restrict__
inducedDipolePolar
,
real2
*
__restrict__
pmeGrid
,
int2
*
__restrict__
pmeAtomGridIndex
,
const
real
*
__restrict__
inducedDipolePolar
,
real2
*
__restrict__
pmeGrid
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
)
{
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
)
{
#if __CUDA_ARCH__ < 500
#if __CUDA_ARCH__ < 500
real
array
[
PME_ORDER
*
PME_ORDER
];
real
array
[
PME_ORDER
*
PME_ORDER
];
...
@@ -451,7 +410,7 @@ extern "C" __global__ void reciprocalConvolution(real2* __restrict__ pmeGrid, co
...
@@ -451,7 +410,7 @@ extern "C" __global__ void reciprocalConvolution(real2* __restrict__ pmeGrid, co
extern
"C"
__global__
void
computeFixedPotentialFromGrid
(
const
real2
*
__restrict__
pmeGrid
,
real
*
__restrict__
phi
,
extern
"C"
__global__
void
computeFixedPotentialFromGrid
(
const
real2
*
__restrict__
pmeGrid
,
real
*
__restrict__
phi
,
long
long
*
__restrict__
fieldBuffers
,
long
long
*
__restrict__
fieldPolarBuffers
,
const
real4
*
__restrict__
posq
,
long
long
*
__restrict__
fieldBuffers
,
long
long
*
__restrict__
fieldPolarBuffers
,
const
real4
*
__restrict__
posq
,
const
real
*
__restrict__
labFrameDipole
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
const
real
*
__restrict__
labFrameDipole
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
,
int2
*
__restrict__
pmeAtomGridIndex
)
{
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
)
{
#if __CUDA_ARCH__ < 500
#if __CUDA_ARCH__ < 500
real
array
[
PME_ORDER
*
PME_ORDER
];
real
array
[
PME_ORDER
*
PME_ORDER
];
#else
#else
...
@@ -476,11 +435,7 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict
...
@@ -476,11 +435,7 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict
}
}
__syncthreads
();
__syncthreads
();
// Process the atoms in spatially sorted order. This improves cache performance when loading
for
(
int
m
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
m
<
NUM_ATOMS
;
m
+=
blockDim
.
x
*
gridDim
.
x
)
{
// the grid values.
for
(
int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
i
<
NUM_ATOMS
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
m
=
pmeAtomGridIndex
[
i
].
x
;
real4
pos
=
posq
[
m
];
real4
pos
=
posq
[
m
];
pos
-=
periodicBoxVecZ
*
floor
(
pos
.
z
*
recipBoxVecZ
.
z
+
0.5
f
);
pos
-=
periodicBoxVecZ
*
floor
(
pos
.
z
*
recipBoxVecZ
.
z
+
0.5
f
);
pos
-=
periodicBoxVecY
*
floor
(
pos
.
y
*
recipBoxVecY
.
z
+
0.5
f
);
pos
-=
periodicBoxVecY
*
floor
(
pos
.
y
*
recipBoxVecY
.
z
+
0.5
f
);
...
@@ -533,9 +488,9 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict
...
@@ -533,9 +488,9 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict
real
tuv102
=
0
;
real
tuv102
=
0
;
real
tuv012
=
0
;
real
tuv012
=
0
;
real
tuv111
=
0
;
real
tuv111
=
0
;
for
(
int
i
z
=
0
;
i
z
<
PME_ORDER
;
i
z
++
)
{
for
(
int
i
x
=
0
;
i
x
<
PME_ORDER
;
i
x
++
)
{
int
k
=
igrid
3
+
i
z
-
(
igrid
3
+
i
z
>=
GRID_SIZE_
Z
?
GRID_SIZE_
Z
:
0
);
int
i
=
igrid
1
+
i
x
-
(
igrid
1
+
i
x
>=
GRID_SIZE_
X
?
GRID_SIZE_
X
:
0
);
real4
v
=
theta
3
[
i
z
];
real4
v
=
theta
1
[
i
x
];
real
tu00
=
0
;
real
tu00
=
0
;
real
tu10
=
0
;
real
tu10
=
0
;
real
tu01
=
0
;
real
tu01
=
0
;
...
@@ -550,47 +505,47 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict
...
@@ -550,47 +505,47 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict
int
j
=
igrid2
+
iy
-
(
igrid2
+
iy
>=
GRID_SIZE_Y
?
GRID_SIZE_Y
:
0
);
int
j
=
igrid2
+
iy
-
(
igrid2
+
iy
>=
GRID_SIZE_Y
?
GRID_SIZE_Y
:
0
);
real4
u
=
theta2
[
iy
];
real4
u
=
theta2
[
iy
];
real4
t
=
make_real4
(
0
,
0
,
0
,
0
);
real4
t
=
make_real4
(
0
,
0
,
0
,
0
);
for
(
int
i
x
=
0
;
i
x
<
PME_ORDER
;
i
x
++
)
{
for
(
int
i
z
=
0
;
i
z
<
PME_ORDER
;
i
z
++
)
{
int
i
=
igrid
1
+
i
x
-
(
igrid
1
+
i
x
>=
GRID_SIZE_
X
?
GRID_SIZE_
X
:
0
);
int
k
=
igrid
3
+
i
z
-
(
igrid
3
+
i
z
>=
GRID_SIZE_
Z
?
GRID_SIZE_
Z
:
0
);
int
gridIndex
=
i
*
GRID_SIZE_Y
*
GRID_SIZE_Z
+
j
*
GRID_SIZE_Z
+
k
;
int
gridIndex
=
i
*
GRID_SIZE_Y
*
GRID_SIZE_Z
+
j
*
GRID_SIZE_Z
+
k
;
real
tq
=
pmeGrid
[
gridIndex
].
x
;
real
tq
=
pmeGrid
[
gridIndex
].
x
;
real4
tadd
=
theta
1
[
i
x
];
real4
tadd
=
theta
3
[
i
z
];
t
.
x
+=
tq
*
tadd
.
x
;
t
.
x
+=
tq
*
tadd
.
x
;
t
.
y
+=
tq
*
tadd
.
y
;
t
.
y
+=
tq
*
tadd
.
y
;
t
.
z
+=
tq
*
tadd
.
z
;
t
.
z
+=
tq
*
tadd
.
z
;
t
.
w
+=
tq
*
tadd
.
w
;
t
.
w
+=
tq
*
tadd
.
w
;
}
}
tu00
+=
t
.
x
*
u
.
x
;
tu00
+=
u
.
x
*
t
.
x
;
tu10
+=
t
.
y
*
u
.
x
;
tu10
+=
u
.
y
*
t
.
x
;
tu01
+=
t
.
x
*
u
.
y
;
tu01
+=
u
.
x
*
t
.
y
;
tu20
+=
t
.
z
*
u
.
x
;
tu20
+=
u
.
z
*
t
.
x
;
tu11
+=
t
.
y
*
u
.
y
;
tu11
+=
u
.
y
*
t
.
y
;
tu02
+=
t
.
x
*
u
.
z
;
tu02
+=
u
.
x
*
t
.
z
;
tu30
+=
t
.
w
*
u
.
x
;
tu30
+=
u
.
w
*
t
.
x
;
tu21
+=
t
.
z
*
u
.
y
;
tu21
+=
u
.
z
*
t
.
y
;
tu12
+=
t
.
y
*
u
.
z
;
tu12
+=
u
.
y
*
t
.
z
;
tu03
+=
t
.
x
*
u
.
w
;
tu03
+=
u
.
x
*
t
.
w
;
}
}
tuv000
+=
tu00
*
v
.
x
;
tuv000
+=
v
.
x
*
tu00
;
tuv100
+=
tu10
*
v
.
x
;
tuv100
+=
v
.
y
*
tu00
;
tuv010
+=
tu01
*
v
.
x
;
tuv010
+=
v
.
x
*
tu10
;
tuv001
+=
tu00
*
v
.
y
;
tuv001
+=
v
.
x
*
tu01
;
tuv200
+=
tu20
*
v
.
x
;
tuv200
+=
v
.
z
*
tu00
;
tuv020
+=
tu02
*
v
.
x
;
tuv020
+=
v
.
x
*
tu20
;
tuv002
+=
tu00
*
v
.
z
;
tuv002
+=
v
.
x
*
tu02
;
tuv110
+=
tu11
*
v
.
x
;
tuv110
+=
v
.
y
*
tu10
;
tuv101
+=
tu10
*
v
.
y
;
tuv101
+=
v
.
y
*
tu01
;
tuv011
+=
tu01
*
v
.
y
;
tuv011
+=
v
.
x
*
tu11
;
tuv300
+=
tu30
*
v
.
x
;
tuv300
+=
v
.
w
*
tu00
;
tuv030
+=
tu03
*
v
.
x
;
tuv030
+=
v
.
x
*
tu30
;
tuv003
+=
tu00
*
v
.
w
;
tuv003
+=
v
.
x
*
tu03
;
tuv210
+=
tu21
*
v
.
x
;
tuv210
+=
v
.
z
*
tu10
;
tuv201
+=
tu20
*
v
.
y
;
tuv201
+=
v
.
z
*
tu01
;
tuv120
+=
tu12
*
v
.
x
;
tuv120
+=
v
.
y
*
tu20
;
tuv021
+=
tu02
*
v
.
y
;
tuv021
+=
v
.
x
*
tu21
;
tuv102
+=
tu10
*
v
.
z
;
tuv102
+=
v
.
y
*
tu02
;
tuv012
+=
tu01
*
v
.
z
;
tuv012
+=
v
.
x
*
tu12
;
tuv111
+=
tu11
*
v
.
y
;
tuv111
+=
v
.
y
*
tu11
;
}
}
phi
[
m
]
=
tuv000
;
phi
[
m
]
=
tuv000
;
phi
[
m
+
NUM_ATOMS
]
=
tuv100
;
phi
[
m
+
NUM_ATOMS
]
=
tuv100
;
...
@@ -628,7 +583,7 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict
...
@@ -628,7 +583,7 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict
extern
"C"
__global__
void
computeInducedPotentialFromGrid
(
const
real2
*
__restrict__
pmeGrid
,
real
*
__restrict__
phid
,
extern
"C"
__global__
void
computeInducedPotentialFromGrid
(
const
real2
*
__restrict__
pmeGrid
,
real
*
__restrict__
phid
,
real
*
__restrict__
phip
,
real
*
__restrict__
phidp
,
const
real4
*
__restrict__
posq
,
real
*
__restrict__
phip
,
real
*
__restrict__
phidp
,
const
real4
*
__restrict__
posq
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
real3
recipBoxVecX
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
,
int2
*
__restrict__
pmeAtomGridIndex
)
{
real3
recipBoxVecY
,
real3
recipBoxVecZ
)
{
#if __CUDA_ARCH__ < 500
#if __CUDA_ARCH__ < 500
real
array
[
PME_ORDER
*
PME_ORDER
];
real
array
[
PME_ORDER
*
PME_ORDER
];
#else
#else
...
@@ -640,11 +595,7 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri
...
@@ -640,11 +595,7 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri
real4
theta2
[
PME_ORDER
];
real4
theta2
[
PME_ORDER
];
real4
theta3
[
PME_ORDER
];
real4
theta3
[
PME_ORDER
];
// Process the atoms in spatially sorted order. This improves cache performance when loading
for
(
int
m
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
m
<
NUM_ATOMS
;
m
+=
blockDim
.
x
*
gridDim
.
x
)
{
// the grid values.
for
(
int
atom
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
atom
<
NUM_ATOMS
;
atom
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
m
=
pmeAtomGridIndex
[
atom
].
x
;
real4
pos
=
posq
[
m
];
real4
pos
=
posq
[
m
];
pos
-=
periodicBoxVecZ
*
floor
(
pos
.
z
*
recipBoxVecZ
.
z
+
0.5
f
);
pos
-=
periodicBoxVecZ
*
floor
(
pos
.
z
*
recipBoxVecZ
.
z
+
0.5
f
);
pos
-=
periodicBoxVecY
*
floor
(
pos
.
y
*
recipBoxVecY
.
z
+
0.5
f
);
pos
-=
periodicBoxVecY
*
floor
(
pos
.
y
*
recipBoxVecY
.
z
+
0.5
f
);
...
@@ -715,9 +666,9 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri
...
@@ -715,9 +666,9 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri
real
tuv102
=
0
;
real
tuv102
=
0
;
real
tuv012
=
0
;
real
tuv012
=
0
;
real
tuv111
=
0
;
real
tuv111
=
0
;
for
(
int
i
z
=
0
;
i
z
<
PME_ORDER
;
i
z
++
)
{
for
(
int
i
x
=
0
;
i
x
<
PME_ORDER
;
i
x
++
)
{
int
k
=
igrid
3
+
i
z
-
(
igrid
3
+
i
z
>=
GRID_SIZE_
Z
?
GRID_SIZE_
Z
:
0
);
int
i
=
igrid
1
+
i
x
-
(
igrid
1
+
i
x
>=
GRID_SIZE_
X
?
GRID_SIZE_
X
:
0
);
real4
v
=
theta
3
[
i
z
];
real4
v
=
theta
1
[
i
x
];
real
tu00_1
=
0
;
real
tu00_1
=
0
;
real
tu01_1
=
0
;
real
tu01_1
=
0
;
real
tu10_1
=
0
;
real
tu10_1
=
0
;
...
@@ -750,11 +701,11 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri
...
@@ -750,11 +701,11 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri
real
t1_2
=
0
;
real
t1_2
=
0
;
real
t2_2
=
0
;
real
t2_2
=
0
;
real
t3
=
0
;
real
t3
=
0
;
for
(
int
i
x
=
0
;
i
x
<
PME_ORDER
;
i
x
++
)
{
for
(
int
i
z
=
0
;
i
z
<
PME_ORDER
;
i
z
++
)
{
int
i
=
igrid
1
+
i
x
-
(
igrid
1
+
i
x
>=
GRID_SIZE_
X
?
GRID_SIZE_
X
:
0
);
int
k
=
igrid
3
+
i
z
-
(
igrid
3
+
i
z
>=
GRID_SIZE_
Z
?
GRID_SIZE_
Z
:
0
);
int
gridIndex
=
i
*
GRID_SIZE_Y
*
GRID_SIZE_Z
+
j
*
GRID_SIZE_Z
+
k
;
int
gridIndex
=
i
*
GRID_SIZE_Y
*
GRID_SIZE_Z
+
j
*
GRID_SIZE_Z
+
k
;
real2
tq
=
pmeGrid
[
gridIndex
];
real2
tq
=
pmeGrid
[
gridIndex
];
real4
tadd
=
theta
1
[
i
x
];
real4
tadd
=
theta
3
[
i
z
];
t0_1
+=
tq
.
x
*
tadd
.
x
;
t0_1
+=
tq
.
x
*
tadd
.
x
;
t1_1
+=
tq
.
x
*
tadd
.
y
;
t1_1
+=
tq
.
x
*
tadd
.
y
;
t2_1
+=
tq
.
x
*
tadd
.
z
;
t2_1
+=
tq
.
x
*
tadd
.
z
;
...
@@ -763,70 +714,70 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri
...
@@ -763,70 +714,70 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri
t2_2
+=
tq
.
y
*
tadd
.
z
;
t2_2
+=
tq
.
y
*
tadd
.
z
;
t3
+=
(
tq
.
x
+
tq
.
y
)
*
tadd
.
w
;
t3
+=
(
tq
.
x
+
tq
.
y
)
*
tadd
.
w
;
}
}
tu00_1
+=
t0_1
*
u
.
x
;
tu00_1
+=
u
.
x
*
t0_1
;
tu10_1
+=
t1_1
*
u
.
x
;
tu10_1
+=
u
.
y
*
t0_1
;
tu01_1
+=
t0_1
*
u
.
y
;
tu01_1
+=
u
.
x
*
t1_1
;
tu20_1
+=
t2_1
*
u
.
x
;
tu20_1
+=
u
.
z
*
t0_1
;
tu11_1
+=
t1_1
*
u
.
y
;
tu11_1
+=
u
.
y
*
t1_1
;
tu02_1
+=
t0_1
*
u
.
z
;
tu02_1
+=
u
.
x
*
t2_1
;
tu00_2
+=
t0_2
*
u
.
x
;
tu00_2
+=
u
.
x
*
t0_2
;
tu10_2
+=
t1_2
*
u
.
x
;
tu10_2
+=
u
.
y
*
t0_2
;
tu01_2
+=
t0_2
*
u
.
y
;
tu01_2
+=
u
.
x
*
t1_2
;
tu20_2
+=
t2_2
*
u
.
x
;
tu20_2
+=
u
.
z
*
t0_2
;
tu11_2
+=
t1_2
*
u
.
y
;
tu11_2
+=
u
.
y
*
t1_2
;
tu02_2
+=
t0_2
*
u
.
z
;
tu02_2
+=
u
.
x
*
t2_2
;
real
t0
=
t0_1
+
t0_2
;
real
t0
=
t0_1
+
t0_2
;
real
t1
=
t1_1
+
t1_2
;
real
t1
=
t1_1
+
t1_2
;
real
t2
=
t2_1
+
t2_2
;
real
t2
=
t2_1
+
t2_2
;
tu00
+=
t0
*
u
.
x
;
tu00
+=
u
.
x
*
t0
;
tu10
+=
t1
*
u
.
x
;
tu10
+=
u
.
y
*
t0
;
tu01
+=
t0
*
u
.
y
;
tu01
+=
u
.
x
*
t1
;
tu20
+=
t2
*
u
.
x
;
tu20
+=
u
.
z
*
t0
;
tu11
+=
t1
*
u
.
y
;
tu11
+=
u
.
y
*
t1
;
tu02
+=
t0
*
u
.
z
;
tu02
+=
u
.
x
*
t2
;
tu30
+=
t3
*
u
.
x
;
tu30
+=
u
.
w
*
t0
;
tu21
+=
t2
*
u
.
y
;
tu21
+=
u
.
z
*
t1
;
tu12
+=
t1
*
u
.
z
;
tu12
+=
u
.
y
*
t2
;
tu03
+=
t0
*
u
.
w
;
tu03
+=
u
.
x
*
t3
;
}
}
tuv100_1
+=
tu
1
0_1
*
v
.
x
;
tuv100_1
+=
v
.
y
*
tu
0
0_1
;
tuv010_1
+=
tu01_1
*
v
.
x
;
tuv010_1
+=
v
.
x
*
tu10_1
;
tuv001_1
+=
tu0
0
_1
*
v
.
y
;
tuv001_1
+=
v
.
x
*
tu0
1
_1
;
tuv200_1
+=
tu
2
0_1
*
v
.
x
;
tuv200_1
+=
v
.
z
*
tu
0
0_1
;
tuv020_1
+=
tu02_1
*
v
.
x
;
tuv020_1
+=
v
.
x
*
tu20_1
;
tuv002_1
+=
tu0
0
_1
*
v
.
z
;
tuv002_1
+=
v
.
x
*
tu0
2
_1
;
tuv110_1
+=
tu1
1
_1
*
v
.
x
;
tuv110_1
+=
v
.
y
*
tu1
0
_1
;
tuv101_1
+=
tu10_1
*
v
.
y
;
tuv101_1
+=
v
.
y
*
tu01_1
;
tuv011_1
+=
tu
0
1_1
*
v
.
y
;
tuv011_1
+=
v
.
x
*
tu
1
1_1
;
tuv100_2
+=
tu
1
0_2
*
v
.
x
;
tuv100_2
+=
v
.
y
*
tu
0
0_2
;
tuv010_2
+=
tu01_2
*
v
.
x
;
tuv010_2
+=
v
.
x
*
tu10_2
;
tuv001_2
+=
tu0
0
_2
*
v
.
y
;
tuv001_2
+=
v
.
x
*
tu0
1
_2
;
tuv200_2
+=
tu
2
0_2
*
v
.
x
;
tuv200_2
+=
v
.
z
*
tu
0
0_2
;
tuv020_2
+=
tu02_2
*
v
.
x
;
tuv020_2
+=
v
.
x
*
tu20_2
;
tuv002_2
+=
tu0
0
_2
*
v
.
z
;
tuv002_2
+=
v
.
x
*
tu0
2
_2
;
tuv110_2
+=
tu1
1
_2
*
v
.
x
;
tuv110_2
+=
v
.
y
*
tu1
0
_2
;
tuv101_2
+=
tu10_2
*
v
.
y
;
tuv101_2
+=
v
.
y
*
tu01_2
;
tuv011_2
+=
tu
0
1_2
*
v
.
y
;
tuv011_2
+=
v
.
x
*
tu
1
1_2
;
tuv000
+=
tu00
*
v
.
x
;
tuv000
+=
v
.
x
*
tu00
;
tuv100
+=
tu10
*
v
.
x
;
tuv100
+=
v
.
y
*
tu00
;
tuv010
+=
tu01
*
v
.
x
;
tuv010
+=
v
.
x
*
tu10
;
tuv001
+=
tu00
*
v
.
y
;
tuv001
+=
v
.
x
*
tu01
;
tuv200
+=
tu20
*
v
.
x
;
tuv200
+=
v
.
z
*
tu00
;
tuv020
+=
tu02
*
v
.
x
;
tuv020
+=
v
.
x
*
tu20
;
tuv002
+=
tu00
*
v
.
z
;
tuv002
+=
v
.
x
*
tu02
;
tuv110
+=
tu11
*
v
.
x
;
tuv110
+=
v
.
y
*
tu10
;
tuv101
+=
tu10
*
v
.
y
;
tuv101
+=
v
.
y
*
tu01
;
tuv011
+=
tu01
*
v
.
y
;
tuv011
+=
v
.
x
*
tu11
;
tuv300
+=
tu30
*
v
.
x
;
tuv300
+=
v
.
w
*
tu00
;
tuv030
+=
tu03
*
v
.
x
;
tuv030
+=
v
.
x
*
tu30
;
tuv003
+=
tu00
*
v
.
w
;
tuv003
+=
v
.
x
*
tu03
;
tuv210
+=
tu21
*
v
.
x
;
tuv210
+=
v
.
z
*
tu10
;
tuv201
+=
tu20
*
v
.
y
;
tuv201
+=
v
.
z
*
tu01
;
tuv120
+=
tu12
*
v
.
x
;
tuv120
+=
v
.
y
*
tu20
;
tuv021
+=
tu02
*
v
.
y
;
tuv021
+=
v
.
x
*
tu21
;
tuv102
+=
tu10
*
v
.
z
;
tuv102
+=
v
.
y
*
tu02
;
tuv012
+=
tu01
*
v
.
z
;
tuv012
+=
v
.
x
*
tu12
;
tuv111
+=
tu11
*
v
.
y
;
tuv111
+=
v
.
y
*
tu11
;
}
}
phid
[
m
]
=
0
;
phid
[
m
]
=
0
;
phid
[
m
+
NUM_ATOMS
]
=
tuv100_1
;
phid
[
m
+
NUM_ATOMS
]
=
tuv100_1
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment