Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
47fe6d40
"...src/ssh:/git@developer.sourcefind.cn:2222/tsoc/openmm.git" did not exist on "792b89eea7566204b476f0c7ef4a9b7e0cce8817"
Commit
47fe6d40
authored
Sep 07, 2016
by
Peter Eastman
Browse files
Optimizations to AMOEBA PME
parent
e38c6907
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
121 additions
and
182 deletions
+121
-182
plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
+12
-23
plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.h
plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.h
+1
-2
plugins/amoeba/platforms/cuda/src/kernels/multipolePme.cu
plugins/amoeba/platforms/cuda/src/kernels/multipolePme.cu
+108
-157
No files found.
plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
View file @
47fe6d40
...
@@ -822,7 +822,7 @@ CudaCalcAmoebaMultipoleForceKernel::CudaCalcAmoebaMultipoleForceKernel(std::stri
...
@@ -822,7 +822,7 @@ CudaCalcAmoebaMultipoleForceKernel::CudaCalcAmoebaMultipoleForceKernel(std::stri
inducedDipoleFieldGradientGk
(
NULL
),
inducedDipoleFieldGradientGkPolar
(
NULL
),
extrapolatedDipoleFieldGradient
(
NULL
),
extrapolatedDipoleFieldGradientPolar
(
NULL
),
inducedDipoleFieldGradientGk
(
NULL
),
inducedDipoleFieldGradientGkPolar
(
NULL
),
extrapolatedDipoleFieldGradient
(
NULL
),
extrapolatedDipoleFieldGradientPolar
(
NULL
),
extrapolatedDipoleFieldGradientGk
(
NULL
),
extrapolatedDipoleFieldGradientGkPolar
(
NULL
),
covalentFlags
(
NULL
),
polarizationGroupFlags
(
NULL
),
extrapolatedDipoleFieldGradientGk
(
NULL
),
extrapolatedDipoleFieldGradientGkPolar
(
NULL
),
covalentFlags
(
NULL
),
polarizationGroupFlags
(
NULL
),
pmeGrid
(
NULL
),
pmeBsplineModuliX
(
NULL
),
pmeBsplineModuliY
(
NULL
),
pmeBsplineModuliZ
(
NULL
),
pmeIgrid
(
NULL
),
pmePhi
(
NULL
),
pmeGrid
(
NULL
),
pmeBsplineModuliX
(
NULL
),
pmeBsplineModuliY
(
NULL
),
pmeBsplineModuliZ
(
NULL
),
pmeIgrid
(
NULL
),
pmePhi
(
NULL
),
pmePhid
(
NULL
),
pmePhip
(
NULL
),
pmePhidp
(
NULL
),
pmeCphi
(
NULL
),
pmeAtomGridIndex
(
NULL
),
lastPositions
(
NULL
),
sort
(
NULL
),
gkKernel
(
NULL
)
{
pmePhid
(
NULL
),
pmePhip
(
NULL
),
pmePhidp
(
NULL
),
pmeCphi
(
NULL
),
lastPositions
(
NULL
),
sort
(
NULL
),
gkKernel
(
NULL
)
{
}
}
CudaCalcAmoebaMultipoleForceKernel
::~
CudaCalcAmoebaMultipoleForceKernel
()
{
CudaCalcAmoebaMultipoleForceKernel
::~
CudaCalcAmoebaMultipoleForceKernel
()
{
...
@@ -927,8 +927,6 @@ CudaCalcAmoebaMultipoleForceKernel::~CudaCalcAmoebaMultipoleForceKernel() {
...
@@ -927,8 +927,6 @@ CudaCalcAmoebaMultipoleForceKernel::~CudaCalcAmoebaMultipoleForceKernel() {
delete
pmePhidp
;
delete
pmePhidp
;
if
(
pmeCphi
!=
NULL
)
if
(
pmeCphi
!=
NULL
)
delete
pmeCphi
;
delete
pmeCphi
;
if
(
pmeAtomGridIndex
!=
NULL
)
delete
pmeAtomGridIndex
;
if
(
lastPositions
!=
NULL
)
if
(
lastPositions
!=
NULL
)
delete
lastPositions
;
delete
lastPositions
;
if
(
sort
!=
NULL
)
if
(
sort
!=
NULL
)
...
@@ -1253,7 +1251,6 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
...
@@ -1253,7 +1251,6 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
else
if
(
polarizationType
==
AmoebaMultipoleForce
::
Extrapolated
)
else
if
(
polarizationType
==
AmoebaMultipoleForce
::
Extrapolated
)
pmeDefines
[
"EXTRAPOLATED_POLARIZATION"
]
=
""
;
pmeDefines
[
"EXTRAPOLATED_POLARIZATION"
]
=
""
;
CUmodule
module
=
cu
.
createModule
(
CudaKernelSources
::
vectorOps
+
CudaAmoebaKernelSources
::
multipolePme
,
pmeDefines
);
CUmodule
module
=
cu
.
createModule
(
CudaKernelSources
::
vectorOps
+
CudaAmoebaKernelSources
::
multipolePme
,
pmeDefines
);
pmeGridIndexKernel
=
cu
.
getKernel
(
module
,
"findAtomGridIndex"
);
pmeTransformMultipolesKernel
=
cu
.
getKernel
(
module
,
"transformMultipolesToFractionalCoordinates"
);
pmeTransformMultipolesKernel
=
cu
.
getKernel
(
module
,
"transformMultipolesToFractionalCoordinates"
);
pmeTransformPotentialKernel
=
cu
.
getKernel
(
module
,
"transformPotentialToCartesianCoordinates"
);
pmeTransformPotentialKernel
=
cu
.
getKernel
(
module
,
"transformPotentialToCartesianCoordinates"
);
pmeSpreadFixedMultipolesKernel
=
cu
.
getKernel
(
module
,
"gridSpreadFixedMultipoles"
);
pmeSpreadFixedMultipolesKernel
=
cu
.
getKernel
(
module
,
"gridSpreadFixedMultipoles"
);
...
@@ -1285,7 +1282,6 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
...
@@ -1285,7 +1282,6 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
pmePhidp
=
new
CudaArray
(
cu
,
20
*
numMultipoles
,
elementSize
,
"pmePhidp"
);
pmePhidp
=
new
CudaArray
(
cu
,
20
*
numMultipoles
,
elementSize
,
"pmePhidp"
);
pmeCphi
=
new
CudaArray
(
cu
,
10
*
numMultipoles
,
elementSize
,
"pmeCphi"
);
pmeCphi
=
new
CudaArray
(
cu
,
10
*
numMultipoles
,
elementSize
,
"pmeCphi"
);
pmeAtomRange
=
CudaArray
::
create
<
int
>
(
cu
,
gridSizeX
*
gridSizeY
*
gridSizeZ
+
1
,
"pmeAtomRange"
);
pmeAtomRange
=
CudaArray
::
create
<
int
>
(
cu
,
gridSizeX
*
gridSizeY
*
gridSizeZ
+
1
,
"pmeAtomRange"
);
pmeAtomGridIndex
=
CudaArray
::
create
<
int2
>
(
cu
,
numMultipoles
,
"pmeAtomGridIndex"
);
sort
=
new
CudaSort
(
cu
,
new
SortTrait
(),
cu
.
getNumAtoms
());
sort
=
new
CudaSort
(
cu
,
new
SortTrait
(),
cu
.
getNumAtoms
());
cufftResult
result
=
cufftPlan3d
(
&
fft
,
gridSizeX
,
gridSizeY
,
gridSizeZ
,
cu
.
getUseDoublePrecision
()
?
CUFFT_Z2Z
:
CUFFT_C2C
);
cufftResult
result
=
cufftPlan3d
(
&
fft
,
gridSizeX
,
gridSizeY
,
gridSizeZ
,
cu
.
getUseDoublePrecision
()
?
CUFFT_Z2Z
:
CUFFT_C2C
);
if
(
result
!=
CUFFT_SUCCESS
)
if
(
result
!=
CUFFT_SUCCESS
)
...
@@ -1569,16 +1565,11 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
...
@@ -1569,16 +1565,11 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
// Reciprocal space calculation.
// Reciprocal space calculation.
unsigned
int
maxTiles
=
nb
.
getInteractingTiles
().
getSize
();
unsigned
int
maxTiles
=
nb
.
getInteractingTiles
().
getSize
();
void
*
gridIndexArgs
[]
=
{
&
cu
.
getPosq
().
getDevicePointer
(),
&
pmeAtomGridIndex
->
getDevicePointer
(),
cu
.
getPeriodicBoxVecXPointer
(),
cu
.
getPeriodicBoxVecYPointer
(),
cu
.
getPeriodicBoxVecZPointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
]};
cu
.
executeKernel
(
pmeGridIndexKernel
,
gridIndexArgs
,
cu
.
getNumAtoms
(),
cu
.
ThreadBlockSize
,
cu
.
ThreadBlockSize
*
PmeOrder
*
PmeOrder
*
elementSize
);
sort
->
sort
(
*
pmeAtomGridIndex
);
void
*
pmeTransformMultipolesArgs
[]
=
{
&
labFrameDipoles
->
getDevicePointer
(),
&
labFrameQuadrupoles
->
getDevicePointer
(),
void
*
pmeTransformMultipolesArgs
[]
=
{
&
labFrameDipoles
->
getDevicePointer
(),
&
labFrameQuadrupoles
->
getDevicePointer
(),
&
fracDipoles
->
getDevicePointer
(),
&
fracQuadrupoles
->
getDevicePointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
]};
&
fracDipoles
->
getDevicePointer
(),
&
fracQuadrupoles
->
getDevicePointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
]};
cu
.
executeKernel
(
pmeTransformMultipolesKernel
,
pmeTransformMultipolesArgs
,
cu
.
getNumAtoms
());
cu
.
executeKernel
(
pmeTransformMultipolesKernel
,
pmeTransformMultipolesArgs
,
cu
.
getNumAtoms
());
void
*
pmeSpreadFixedMultipolesArgs
[]
=
{
&
cu
.
getPosq
().
getDevicePointer
(),
&
fracDipoles
->
getDevicePointer
(),
&
fracQuadrupoles
->
getDevicePointer
(),
void
*
pmeSpreadFixedMultipolesArgs
[]
=
{
&
cu
.
getPosq
().
getDevicePointer
(),
&
fracDipoles
->
getDevicePointer
(),
&
fracQuadrupoles
->
getDevicePointer
(),
&
pmeGrid
->
getDevicePointer
(),
&
pmeAtomGridIndex
->
getDevicePointer
(),
cu
.
getPeriodicBoxVecXPointer
(),
cu
.
getPeriodicBoxVecYPointer
(),
cu
.
getPeriodicBoxVecZPointer
(),
&
pmeGrid
->
getDevicePointer
(),
cu
.
getPeriodicBoxVecXPointer
(),
cu
.
getPeriodicBoxVecYPointer
(),
cu
.
getPeriodicBoxVecZPointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
]};
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
]};
cu
.
executeKernel
(
pmeSpreadFixedMultipolesKernel
,
pmeSpreadFixedMultipolesArgs
,
cu
.
getNumAtoms
());
cu
.
executeKernel
(
pmeSpreadFixedMultipolesKernel
,
pmeSpreadFixedMultipolesArgs
,
cu
.
getNumAtoms
());
void
*
finishSpreadArgs
[]
=
{
&
pmeGrid
->
getDevicePointer
()};
void
*
finishSpreadArgs
[]
=
{
&
pmeGrid
->
getDevicePointer
()};
...
@@ -1590,7 +1581,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
...
@@ -1590,7 +1581,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
cufftExecC2C
(
fft
,
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
CUFFT_FORWARD
);
cufftExecC2C
(
fft
,
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
CUFFT_FORWARD
);
void
*
pmeConvolutionArgs
[]
=
{
&
pmeGrid
->
getDevicePointer
(),
&
pmeBsplineModuliX
->
getDevicePointer
(),
&
pmeBsplineModuliY
->
getDevicePointer
(),
void
*
pmeConvolutionArgs
[]
=
{
&
pmeGrid
->
getDevicePointer
(),
&
pmeBsplineModuliX
->
getDevicePointer
(),
&
pmeBsplineModuliY
->
getDevicePointer
(),
&
pmeBsplineModuliZ
->
getDevicePointer
(),
cu
.
getPeriodicBoxSizePointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
]};
&
pmeBsplineModuliZ
->
getDevicePointer
(),
cu
.
getPeriodicBoxSizePointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
]};
cu
.
executeKernel
(
pmeConvolutionKernel
,
pmeConvolutionArgs
,
cu
.
getNumAtoms
()
);
cu
.
executeKernel
(
pmeConvolutionKernel
,
pmeConvolutionArgs
,
gridSizeX
*
gridSizeY
*
gridSizeZ
,
256
);
if
(
cu
.
getUseDoublePrecision
())
if
(
cu
.
getUseDoublePrecision
())
cufftExecZ2Z
(
fft
,
(
double2
*
)
pmeGrid
->
getDevicePointer
(),
(
double2
*
)
pmeGrid
->
getDevicePointer
(),
CUFFT_INVERSE
);
cufftExecZ2Z
(
fft
,
(
double2
*
)
pmeGrid
->
getDevicePointer
(),
(
double2
*
)
pmeGrid
->
getDevicePointer
(),
CUFFT_INVERSE
);
else
else
...
@@ -1598,7 +1589,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
...
@@ -1598,7 +1589,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
void
*
pmeFixedPotentialArgs
[]
=
{
&
pmeGrid
->
getDevicePointer
(),
&
pmePhi
->
getDevicePointer
(),
&
field
->
getDevicePointer
(),
void
*
pmeFixedPotentialArgs
[]
=
{
&
pmeGrid
->
getDevicePointer
(),
&
pmePhi
->
getDevicePointer
(),
&
field
->
getDevicePointer
(),
&
fieldPolar
->
getDevicePointer
(),
&
cu
.
getPosq
().
getDevicePointer
(),
&
labFrameDipoles
->
getDevicePointer
(),
&
fieldPolar
->
getDevicePointer
(),
&
cu
.
getPosq
().
getDevicePointer
(),
&
labFrameDipoles
->
getDevicePointer
(),
cu
.
getPeriodicBoxVecXPointer
(),
cu
.
getPeriodicBoxVecYPointer
(),
cu
.
getPeriodicBoxVecZPointer
(),
cu
.
getPeriodicBoxVecXPointer
(),
cu
.
getPeriodicBoxVecYPointer
(),
cu
.
getPeriodicBoxVecZPointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
]
,
&
pmeAtomGridIndex
->
getDevicePointer
()
};
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
]};
cu
.
executeKernel
(
pmeFixedPotentialKernel
,
pmeFixedPotentialArgs
,
cu
.
getNumAtoms
());
cu
.
executeKernel
(
pmeFixedPotentialKernel
,
pmeFixedPotentialArgs
,
cu
.
getNumAtoms
());
void
*
pmeTransformFixedPotentialArgs
[]
=
{
&
pmePhi
->
getDevicePointer
(),
&
pmeCphi
->
getDevicePointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
]};
void
*
pmeTransformFixedPotentialArgs
[]
=
{
&
pmePhi
->
getDevicePointer
(),
&
pmeCphi
->
getDevicePointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
]};
cu
.
executeKernel
(
pmeTransformPotentialKernel
,
pmeTransformFixedPotentialArgs
,
cu
.
getNumAtoms
());
cu
.
executeKernel
(
pmeTransformPotentialKernel
,
pmeTransformFixedPotentialArgs
,
cu
.
getNumAtoms
());
...
@@ -1625,7 +1616,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
...
@@ -1625,7 +1616,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
cu
.
clearBuffer
(
*
pmeGrid
);
cu
.
clearBuffer
(
*
pmeGrid
);
void
*
pmeSpreadInducedDipolesArgs
[]
=
{
&
cu
.
getPosq
().
getDevicePointer
(),
&
inducedDipole
->
getDevicePointer
(),
&
inducedDipolePolar
->
getDevicePointer
(),
void
*
pmeSpreadInducedDipolesArgs
[]
=
{
&
cu
.
getPosq
().
getDevicePointer
(),
&
inducedDipole
->
getDevicePointer
(),
&
inducedDipolePolar
->
getDevicePointer
(),
&
pmeGrid
->
getDevicePointer
(),
&
pmeAtomGridIndex
->
getDevicePointer
(),
cu
.
getPeriodicBoxVecXPointer
(),
cu
.
getPeriodicBoxVecYPointer
(),
cu
.
getPeriodicBoxVecZPointer
(),
&
pmeGrid
->
getDevicePointer
(),
cu
.
getPeriodicBoxVecXPointer
(),
cu
.
getPeriodicBoxVecYPointer
(),
cu
.
getPeriodicBoxVecZPointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
]};
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
]};
cu
.
executeKernel
(
pmeSpreadInducedDipolesKernel
,
pmeSpreadInducedDipolesArgs
,
cu
.
getNumAtoms
());
cu
.
executeKernel
(
pmeSpreadInducedDipolesKernel
,
pmeSpreadInducedDipolesArgs
,
cu
.
getNumAtoms
());
if
(
cu
.
getUseDoublePrecision
())
if
(
cu
.
getUseDoublePrecision
())
...
@@ -1634,15 +1625,14 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
...
@@ -1634,15 +1625,14 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
cufftExecZ2Z
(
fft
,
(
double2
*
)
pmeGrid
->
getDevicePointer
(),
(
double2
*
)
pmeGrid
->
getDevicePointer
(),
CUFFT_FORWARD
);
cufftExecZ2Z
(
fft
,
(
double2
*
)
pmeGrid
->
getDevicePointer
(),
(
double2
*
)
pmeGrid
->
getDevicePointer
(),
CUFFT_FORWARD
);
else
else
cufftExecC2C
(
fft
,
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
CUFFT_FORWARD
);
cufftExecC2C
(
fft
,
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
CUFFT_FORWARD
);
cu
.
executeKernel
(
pmeConvolutionKernel
,
pmeConvolutionArgs
,
cu
.
getNumAtoms
()
);
cu
.
executeKernel
(
pmeConvolutionKernel
,
pmeConvolutionArgs
,
gridSizeX
*
gridSizeY
*
gridSizeZ
,
256
);
if
(
cu
.
getUseDoublePrecision
())
if
(
cu
.
getUseDoublePrecision
())
cufftExecZ2Z
(
fft
,
(
double2
*
)
pmeGrid
->
getDevicePointer
(),
(
double2
*
)
pmeGrid
->
getDevicePointer
(),
CUFFT_INVERSE
);
cufftExecZ2Z
(
fft
,
(
double2
*
)
pmeGrid
->
getDevicePointer
(),
(
double2
*
)
pmeGrid
->
getDevicePointer
(),
CUFFT_INVERSE
);
else
else
cufftExecC2C
(
fft
,
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
CUFFT_INVERSE
);
cufftExecC2C
(
fft
,
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
CUFFT_INVERSE
);
void
*
pmeInducedPotentialArgs
[]
=
{
&
pmeGrid
->
getDevicePointer
(),
&
pmePhid
->
getDevicePointer
(),
&
pmePhip
->
getDevicePointer
(),
void
*
pmeInducedPotentialArgs
[]
=
{
&
pmeGrid
->
getDevicePointer
(),
&
pmePhid
->
getDevicePointer
(),
&
pmePhip
->
getDevicePointer
(),
&
pmePhidp
->
getDevicePointer
(),
&
cu
.
getPosq
().
getDevicePointer
(),
cu
.
getPeriodicBoxVecXPointer
(),
cu
.
getPeriodicBoxVecYPointer
(),
&
pmePhidp
->
getDevicePointer
(),
&
cu
.
getPosq
().
getDevicePointer
(),
cu
.
getPeriodicBoxVecXPointer
(),
cu
.
getPeriodicBoxVecYPointer
(),
cu
.
getPeriodicBoxVecZPointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
],
cu
.
getPeriodicBoxVecZPointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
]};
&
pmeAtomGridIndex
->
getDevicePointer
()};
cu
.
executeKernel
(
pmeInducedPotentialKernel
,
pmeInducedPotentialArgs
,
cu
.
getNumAtoms
());
cu
.
executeKernel
(
pmeInducedPotentialKernel
,
pmeInducedPotentialArgs
,
cu
.
getNumAtoms
());
// Iterate until the dipoles converge.
// Iterate until the dipoles converge.
...
@@ -1771,7 +1761,7 @@ void CudaCalcAmoebaMultipoleForceKernel::computeInducedField(void** recipBoxVect
...
@@ -1771,7 +1761,7 @@ void CudaCalcAmoebaMultipoleForceKernel::computeInducedField(void** recipBoxVect
cu
.
executeKernel
(
computeInducedFieldKernel
,
&
computeInducedFieldArgs
[
0
],
numForceThreadBlocks
*
inducedFieldThreads
,
inducedFieldThreads
);
cu
.
executeKernel
(
computeInducedFieldKernel
,
&
computeInducedFieldArgs
[
0
],
numForceThreadBlocks
*
inducedFieldThreads
,
inducedFieldThreads
);
cu
.
clearBuffer
(
*
pmeGrid
);
cu
.
clearBuffer
(
*
pmeGrid
);
void
*
pmeSpreadInducedDipolesArgs
[]
=
{
&
cu
.
getPosq
().
getDevicePointer
(),
&
inducedDipole
->
getDevicePointer
(),
&
inducedDipolePolar
->
getDevicePointer
(),
void
*
pmeSpreadInducedDipolesArgs
[]
=
{
&
cu
.
getPosq
().
getDevicePointer
(),
&
inducedDipole
->
getDevicePointer
(),
&
inducedDipolePolar
->
getDevicePointer
(),
&
pmeGrid
->
getDevicePointer
(),
&
pmeAtomGridIndex
->
getDevicePointer
(),
cu
.
getPeriodicBoxVecXPointer
(),
cu
.
getPeriodicBoxVecYPointer
(),
cu
.
getPeriodicBoxVecZPointer
(),
&
pmeGrid
->
getDevicePointer
(),
cu
.
getPeriodicBoxVecXPointer
(),
cu
.
getPeriodicBoxVecYPointer
(),
cu
.
getPeriodicBoxVecZPointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
]};
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
]};
cu
.
executeKernel
(
pmeSpreadInducedDipolesKernel
,
pmeSpreadInducedDipolesArgs
,
cu
.
getNumAtoms
());
cu
.
executeKernel
(
pmeSpreadInducedDipolesKernel
,
pmeSpreadInducedDipolesArgs
,
cu
.
getNumAtoms
());
if
(
cu
.
getUseDoublePrecision
())
{
if
(
cu
.
getUseDoublePrecision
())
{
...
@@ -1784,15 +1774,14 @@ void CudaCalcAmoebaMultipoleForceKernel::computeInducedField(void** recipBoxVect
...
@@ -1784,15 +1774,14 @@ void CudaCalcAmoebaMultipoleForceKernel::computeInducedField(void** recipBoxVect
cufftExecC2C
(
fft
,
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
CUFFT_FORWARD
);
cufftExecC2C
(
fft
,
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
CUFFT_FORWARD
);
void
*
pmeConvolutionArgs
[]
=
{
&
pmeGrid
->
getDevicePointer
(),
&
pmeBsplineModuliX
->
getDevicePointer
(),
&
pmeBsplineModuliY
->
getDevicePointer
(),
void
*
pmeConvolutionArgs
[]
=
{
&
pmeGrid
->
getDevicePointer
(),
&
pmeBsplineModuliX
->
getDevicePointer
(),
&
pmeBsplineModuliY
->
getDevicePointer
(),
&
pmeBsplineModuliZ
->
getDevicePointer
(),
cu
.
getPeriodicBoxSizePointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
]};
&
pmeBsplineModuliZ
->
getDevicePointer
(),
cu
.
getPeriodicBoxSizePointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
]};
cu
.
executeKernel
(
pmeConvolutionKernel
,
pmeConvolutionArgs
,
cu
.
getNumAtoms
()
);
cu
.
executeKernel
(
pmeConvolutionKernel
,
pmeConvolutionArgs
,
gridSizeX
*
gridSizeY
*
gridSizeZ
,
256
);
if
(
cu
.
getUseDoublePrecision
())
if
(
cu
.
getUseDoublePrecision
())
cufftExecZ2Z
(
fft
,
(
double2
*
)
pmeGrid
->
getDevicePointer
(),
(
double2
*
)
pmeGrid
->
getDevicePointer
(),
CUFFT_INVERSE
);
cufftExecZ2Z
(
fft
,
(
double2
*
)
pmeGrid
->
getDevicePointer
(),
(
double2
*
)
pmeGrid
->
getDevicePointer
(),
CUFFT_INVERSE
);
else
else
cufftExecC2C
(
fft
,
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
CUFFT_INVERSE
);
cufftExecC2C
(
fft
,
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
CUFFT_INVERSE
);
void
*
pmeInducedPotentialArgs
[]
=
{
&
pmeGrid
->
getDevicePointer
(),
&
pmePhid
->
getDevicePointer
(),
&
pmePhip
->
getDevicePointer
(),
void
*
pmeInducedPotentialArgs
[]
=
{
&
pmeGrid
->
getDevicePointer
(),
&
pmePhid
->
getDevicePointer
(),
&
pmePhip
->
getDevicePointer
(),
&
pmePhidp
->
getDevicePointer
(),
&
cu
.
getPosq
().
getDevicePointer
(),
cu
.
getPeriodicBoxVecXPointer
(),
cu
.
getPeriodicBoxVecYPointer
(),
&
pmePhidp
->
getDevicePointer
(),
&
cu
.
getPosq
().
getDevicePointer
(),
cu
.
getPeriodicBoxVecXPointer
(),
cu
.
getPeriodicBoxVecYPointer
(),
cu
.
getPeriodicBoxVecZPointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
],
cu
.
getPeriodicBoxVecZPointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
]};
&
pmeAtomGridIndex
->
getDevicePointer
()};
cu
.
executeKernel
(
pmeInducedPotentialKernel
,
pmeInducedPotentialArgs
,
cu
.
getNumAtoms
());
cu
.
executeKernel
(
pmeInducedPotentialKernel
,
pmeInducedPotentialArgs
,
cu
.
getNumAtoms
());
if
(
polarizationType
==
AmoebaMultipoleForce
::
Extrapolated
)
{
if
(
polarizationType
==
AmoebaMultipoleForce
::
Extrapolated
)
{
void
*
pmeRecordInducedFieldDipolesArgs
[]
=
{
&
pmePhid
->
getDevicePointer
(),
&
pmePhip
->
getDevicePointer
(),
void
*
pmeRecordInducedFieldDipolesArgs
[]
=
{
&
pmePhid
->
getDevicePointer
(),
&
pmePhip
->
getDevicePointer
(),
...
@@ -1896,11 +1885,11 @@ bool CudaCalcAmoebaMultipoleForceKernel::iterateDipolesByDIIS(int iteration) {
...
@@ -1896,11 +1885,11 @@ bool CudaCalcAmoebaMultipoleForceKernel::iterateDipolesByDIIS(int iteration) {
void
*
updateInducedFieldArgs
[]
=
{
&
inducedDipole
->
getDevicePointer
(),
&
inducedDipolePolar
->
getDevicePointer
(),
void
*
updateInducedFieldArgs
[]
=
{
&
inducedDipole
->
getDevicePointer
(),
&
inducedDipolePolar
->
getDevicePointer
(),
&
prevDipoles
->
getDevicePointer
(),
&
prevDipolesPolar
->
getDevicePointer
(),
&
diisCoefficients
->
getDevicePointer
(),
&
numPrev
};
&
prevDipoles
->
getDevicePointer
(),
&
prevDipolesPolar
->
getDevicePointer
(),
&
diisCoefficients
->
getDevicePointer
(),
&
numPrev
};
cu
.
executeKernel
(
updateInducedFieldKernel
,
updateInducedFieldArgs
,
cu
.
getNum
ThreadBlocks
()
*
cu
.
ThreadBlockSize
);
cu
.
executeKernel
(
updateInducedFieldKernel
,
updateInducedFieldArgs
,
3
*
cu
.
getNum
Atoms
(),
256
);
if
(
gkKernel
!=
NULL
)
{
if
(
gkKernel
!=
NULL
)
{
void
*
updateInducedFieldGkArgs
[]
=
{
&
gkKernel
->
getInducedDipoles
()
->
getDevicePointer
(),
&
gkKernel
->
getInducedDipolesPolar
()
->
getDevicePointer
(),
void
*
updateInducedFieldGkArgs
[]
=
{
&
gkKernel
->
getInducedDipoles
()
->
getDevicePointer
(),
&
gkKernel
->
getInducedDipolesPolar
()
->
getDevicePointer
(),
&
prevDipolesGk
->
getDevicePointer
(),
&
prevDipolesGkPolar
->
getDevicePointer
(),
&
diisCoefficients
->
getDevicePointer
(),
&
numPrev
};
&
prevDipolesGk
->
getDevicePointer
(),
&
prevDipolesGkPolar
->
getDevicePointer
(),
&
diisCoefficients
->
getDevicePointer
(),
&
numPrev
};
cu
.
executeKernel
(
updateInducedFieldKernel
,
updateInducedFieldGkArgs
,
cu
.
getNum
ThreadBlocks
()
*
cu
.
ThreadBlockSize
);
cu
.
executeKernel
(
updateInducedFieldKernel
,
updateInducedFieldGkArgs
,
3
*
cu
.
getNum
Atoms
(),
256
);
}
}
return
false
;
return
false
;
}
}
...
...
plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.h
View file @
47fe6d40
...
@@ -465,12 +465,11 @@ private:
...
@@ -465,12 +465,11 @@ private:
CudaArray
*
pmePhidp
;
CudaArray
*
pmePhidp
;
CudaArray
*
pmeCphi
;
CudaArray
*
pmeCphi
;
CudaArray
*
pmeAtomRange
;
CudaArray
*
pmeAtomRange
;
CudaArray
*
pmeAtomGridIndex
;
CudaArray
*
lastPositions
;
CudaArray
*
lastPositions
;
CudaSort
*
sort
;
CudaSort
*
sort
;
cufftHandle
fft
;
cufftHandle
fft
;
CUfunction
computeMomentsKernel
,
recordInducedDipolesKernel
,
computeFixedFieldKernel
,
computeInducedFieldKernel
,
updateInducedFieldKernel
,
electrostaticsKernel
,
mapTorqueKernel
;
CUfunction
computeMomentsKernel
,
recordInducedDipolesKernel
,
computeFixedFieldKernel
,
computeInducedFieldKernel
,
updateInducedFieldKernel
,
electrostaticsKernel
,
mapTorqueKernel
;
CUfunction
pmeGridIndexKernel
,
pmeSpreadFixedMultipolesKernel
,
pmeSpreadInducedDipolesKernel
,
pmeFinishSpreadChargeKernel
,
pmeConvolutionKernel
;
CUfunction
pmeSpreadFixedMultipolesKernel
,
pmeSpreadInducedDipolesKernel
,
pmeFinishSpreadChargeKernel
,
pmeConvolutionKernel
;
CUfunction
pmeFixedPotentialKernel
,
pmeInducedPotentialKernel
,
pmeFixedForceKernel
,
pmeInducedForceKernel
,
pmeRecordInducedFieldDipolesKernel
,
computePotentialKernel
;
CUfunction
pmeFixedPotentialKernel
,
pmeInducedPotentialKernel
,
pmeFixedForceKernel
,
pmeInducedForceKernel
,
pmeRecordInducedFieldDipolesKernel
,
computePotentialKernel
;
CUfunction
recordDIISDipolesKernel
,
buildMatrixKernel
;
CUfunction
recordDIISDipolesKernel
,
buildMatrixKernel
;
CUfunction
initExtrapolatedKernel
,
iterateExtrapolatedKernel
,
computeExtrapolatedKernel
,
addExtrapolatedGradientKernel
;
CUfunction
initExtrapolatedKernel
,
iterateExtrapolatedKernel
,
computeExtrapolatedKernel
,
addExtrapolatedGradientKernel
;
...
...
plugins/amoeba/platforms/cuda/src/kernels/multipolePme.cu
View file @
47fe6d40
...
@@ -69,47 +69,6 @@ __device__ void computeBSplinePoint(real4* thetai, real w, real* array) {
...
@@ -69,47 +69,6 @@ __device__ void computeBSplinePoint(real4* thetai, real w, real* array) {
thetai
[
i
-
1
]
=
make_real4
(
ARRAY
(
PME_ORDER
,
i
),
ARRAY
(
PME_ORDER
-
1
,
i
),
ARRAY
(
PME_ORDER
-
2
,
i
),
ARRAY
(
PME_ORDER
-
3
,
i
));
thetai
[
i
-
1
]
=
make_real4
(
ARRAY
(
PME_ORDER
,
i
),
ARRAY
(
PME_ORDER
-
1
,
i
),
ARRAY
(
PME_ORDER
-
2
,
i
),
ARRAY
(
PME_ORDER
-
3
,
i
));
}
}
/**
* Compute the index of the grid point each atom is associated with.
*/
extern
"C"
__global__
void
findAtomGridIndex
(
const
real4
*
__restrict__
posq
,
int2
*
__restrict__
pmeAtomGridIndex
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
)
{
for
(
int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
i
<
NUM_ATOMS
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
real4
pos
=
posq
[
i
];
pos
-=
periodicBoxVecZ
*
floor
(
pos
.
z
*
recipBoxVecZ
.
z
+
0.5
f
);
pos
-=
periodicBoxVecY
*
floor
(
pos
.
y
*
recipBoxVecY
.
z
+
0.5
f
);
pos
-=
periodicBoxVecX
*
floor
(
pos
.
x
*
recipBoxVecX
.
z
+
0.5
f
);
// First axis.
real
w
=
pos
.
x
*
recipBoxVecX
.
x
+
pos
.
y
*
recipBoxVecY
.
x
+
pos
.
z
*
recipBoxVecZ
.
x
;
real
fr
=
GRID_SIZE_X
*
(
w
-
(
int
)(
w
+
0.5
f
)
+
0.5
f
);
int
ifr
=
(
int
)
fr
;
int
igrid1
=
ifr
-
PME_ORDER
+
1
;
// Second axis.
w
=
pos
.
y
*
recipBoxVecY
.
y
+
pos
.
z
*
recipBoxVecZ
.
y
;
fr
=
GRID_SIZE_Y
*
(
w
-
(
int
)(
w
+
0.5
f
)
+
0.5
f
);
ifr
=
(
int
)
fr
;
int
igrid2
=
ifr
-
PME_ORDER
+
1
;
// Third axis.
w
=
pos
.
z
*
recipBoxVecZ
.
z
;
fr
=
GRID_SIZE_Z
*
(
w
-
(
int
)(
w
+
0.5
f
)
+
0.5
f
);
ifr
=
(
int
)
fr
;
int
igrid3
=
ifr
-
PME_ORDER
+
1
;
// Record the grid point.
igrid1
+=
(
igrid1
<
0
?
GRID_SIZE_X
:
0
);
igrid2
+=
(
igrid2
<
0
?
GRID_SIZE_Y
:
0
);
igrid3
+=
(
igrid3
<
0
?
GRID_SIZE_Z
:
0
);
pmeAtomGridIndex
[
i
]
=
make_int2
(
i
,
igrid1
*
GRID_SIZE_Y
*
GRID_SIZE_Z
+
igrid2
*
GRID_SIZE_Z
+
igrid3
);
}
}
/**
/**
* Convert the fixed multipoles from Cartesian to fractional coordinates.
* Convert the fixed multipoles from Cartesian to fractional coordinates.
*/
*/
...
@@ -209,7 +168,7 @@ extern "C" __global__ void transformPotentialToCartesianCoordinates(const real*
...
@@ -209,7 +168,7 @@ extern "C" __global__ void transformPotentialToCartesianCoordinates(const real*
}
}
extern
"C"
__global__
void
gridSpreadFixedMultipoles
(
const
real4
*
__restrict__
posq
,
const
real
*
__restrict__
fracDipole
,
extern
"C"
__global__
void
gridSpreadFixedMultipoles
(
const
real4
*
__restrict__
posq
,
const
real
*
__restrict__
fracDipole
,
const
real
*
__restrict__
fracQuadrupole
,
real2
*
__restrict__
pmeGrid
,
int2
*
__restrict__
pmeAtomGridIndex
,
const
real
*
__restrict__
fracQuadrupole
,
real2
*
__restrict__
pmeGrid
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
)
{
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
)
{
#if __CUDA_ARCH__ < 500
#if __CUDA_ARCH__ < 500
real
array
[
PME_ORDER
*
PME_ORDER
];
real
array
[
PME_ORDER
*
PME_ORDER
];
...
@@ -300,7 +259,7 @@ extern "C" __global__ void gridSpreadFixedMultipoles(const real4* __restrict__ p
...
@@ -300,7 +259,7 @@ extern "C" __global__ void gridSpreadFixedMultipoles(const real4* __restrict__ p
}
}
extern
"C"
__global__
void
gridSpreadInducedDipoles
(
const
real4
*
__restrict__
posq
,
const
real
*
__restrict__
inducedDipole
,
extern
"C"
__global__
void
gridSpreadInducedDipoles
(
const
real4
*
__restrict__
posq
,
const
real
*
__restrict__
inducedDipole
,
const
real
*
__restrict__
inducedDipolePolar
,
real2
*
__restrict__
pmeGrid
,
int2
*
__restrict__
pmeAtomGridIndex
,
const
real
*
__restrict__
inducedDipolePolar
,
real2
*
__restrict__
pmeGrid
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
)
{
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
)
{
#if __CUDA_ARCH__ < 500
#if __CUDA_ARCH__ < 500
real
array
[
PME_ORDER
*
PME_ORDER
];
real
array
[
PME_ORDER
*
PME_ORDER
];
...
@@ -451,7 +410,7 @@ extern "C" __global__ void reciprocalConvolution(real2* __restrict__ pmeGrid, co
...
@@ -451,7 +410,7 @@ extern "C" __global__ void reciprocalConvolution(real2* __restrict__ pmeGrid, co
extern
"C"
__global__
void
computeFixedPotentialFromGrid
(
const
real2
*
__restrict__
pmeGrid
,
real
*
__restrict__
phi
,
extern
"C"
__global__
void
computeFixedPotentialFromGrid
(
const
real2
*
__restrict__
pmeGrid
,
real
*
__restrict__
phi
,
long
long
*
__restrict__
fieldBuffers
,
long
long
*
__restrict__
fieldPolarBuffers
,
const
real4
*
__restrict__
posq
,
long
long
*
__restrict__
fieldBuffers
,
long
long
*
__restrict__
fieldPolarBuffers
,
const
real4
*
__restrict__
posq
,
const
real
*
__restrict__
labFrameDipole
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
const
real
*
__restrict__
labFrameDipole
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
,
int2
*
__restrict__
pmeAtomGridIndex
)
{
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
)
{
#if __CUDA_ARCH__ < 500
#if __CUDA_ARCH__ < 500
real
array
[
PME_ORDER
*
PME_ORDER
];
real
array
[
PME_ORDER
*
PME_ORDER
];
#else
#else
...
@@ -476,11 +435,7 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict
...
@@ -476,11 +435,7 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict
}
}
__syncthreads
();
__syncthreads
();
// Process the atoms in spatially sorted order. This improves cache performance when loading
for
(
int
m
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
m
<
NUM_ATOMS
;
m
+=
blockDim
.
x
*
gridDim
.
x
)
{
// the grid values.
for
(
int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
i
<
NUM_ATOMS
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
m
=
pmeAtomGridIndex
[
i
].
x
;
real4
pos
=
posq
[
m
];
real4
pos
=
posq
[
m
];
pos
-=
periodicBoxVecZ
*
floor
(
pos
.
z
*
recipBoxVecZ
.
z
+
0.5
f
);
pos
-=
periodicBoxVecZ
*
floor
(
pos
.
z
*
recipBoxVecZ
.
z
+
0.5
f
);
pos
-=
periodicBoxVecY
*
floor
(
pos
.
y
*
recipBoxVecY
.
z
+
0.5
f
);
pos
-=
periodicBoxVecY
*
floor
(
pos
.
y
*
recipBoxVecY
.
z
+
0.5
f
);
...
@@ -533,9 +488,9 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict
...
@@ -533,9 +488,9 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict
real
tuv102
=
0
;
real
tuv102
=
0
;
real
tuv012
=
0
;
real
tuv012
=
0
;
real
tuv111
=
0
;
real
tuv111
=
0
;
for
(
int
i
z
=
0
;
i
z
<
PME_ORDER
;
i
z
++
)
{
for
(
int
i
x
=
0
;
i
x
<
PME_ORDER
;
i
x
++
)
{
int
k
=
igrid
3
+
i
z
-
(
igrid
3
+
i
z
>=
GRID_SIZE_
Z
?
GRID_SIZE_
Z
:
0
);
int
i
=
igrid
1
+
i
x
-
(
igrid
1
+
i
x
>=
GRID_SIZE_
X
?
GRID_SIZE_
X
:
0
);
real4
v
=
theta
3
[
i
z
];
real4
v
=
theta
1
[
i
x
];
real
tu00
=
0
;
real
tu00
=
0
;
real
tu10
=
0
;
real
tu10
=
0
;
real
tu01
=
0
;
real
tu01
=
0
;
...
@@ -550,47 +505,47 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict
...
@@ -550,47 +505,47 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict
int
j
=
igrid2
+
iy
-
(
igrid2
+
iy
>=
GRID_SIZE_Y
?
GRID_SIZE_Y
:
0
);
int
j
=
igrid2
+
iy
-
(
igrid2
+
iy
>=
GRID_SIZE_Y
?
GRID_SIZE_Y
:
0
);
real4
u
=
theta2
[
iy
];
real4
u
=
theta2
[
iy
];
real4
t
=
make_real4
(
0
,
0
,
0
,
0
);
real4
t
=
make_real4
(
0
,
0
,
0
,
0
);
for
(
int
i
x
=
0
;
i
x
<
PME_ORDER
;
i
x
++
)
{
for
(
int
i
z
=
0
;
i
z
<
PME_ORDER
;
i
z
++
)
{
int
i
=
igrid
1
+
i
x
-
(
igrid
1
+
i
x
>=
GRID_SIZE_
X
?
GRID_SIZE_
X
:
0
);
int
k
=
igrid
3
+
i
z
-
(
igrid
3
+
i
z
>=
GRID_SIZE_
Z
?
GRID_SIZE_
Z
:
0
);
int
gridIndex
=
i
*
GRID_SIZE_Y
*
GRID_SIZE_Z
+
j
*
GRID_SIZE_Z
+
k
;
int
gridIndex
=
i
*
GRID_SIZE_Y
*
GRID_SIZE_Z
+
j
*
GRID_SIZE_Z
+
k
;
real
tq
=
pmeGrid
[
gridIndex
].
x
;
real
tq
=
pmeGrid
[
gridIndex
].
x
;
real4
tadd
=
theta
1
[
i
x
];
real4
tadd
=
theta
3
[
i
z
];
t
.
x
+=
tq
*
tadd
.
x
;
t
.
x
+=
tq
*
tadd
.
x
;
t
.
y
+=
tq
*
tadd
.
y
;
t
.
y
+=
tq
*
tadd
.
y
;
t
.
z
+=
tq
*
tadd
.
z
;
t
.
z
+=
tq
*
tadd
.
z
;
t
.
w
+=
tq
*
tadd
.
w
;
t
.
w
+=
tq
*
tadd
.
w
;
}
}
tu00
+=
t
.
x
*
u
.
x
;
tu00
+=
u
.
x
*
t
.
x
;
tu10
+=
t
.
y
*
u
.
x
;
tu10
+=
u
.
y
*
t
.
x
;
tu01
+=
t
.
x
*
u
.
y
;
tu01
+=
u
.
x
*
t
.
y
;
tu20
+=
t
.
z
*
u
.
x
;
tu20
+=
u
.
z
*
t
.
x
;
tu11
+=
t
.
y
*
u
.
y
;
tu11
+=
u
.
y
*
t
.
y
;
tu02
+=
t
.
x
*
u
.
z
;
tu02
+=
u
.
x
*
t
.
z
;
tu30
+=
t
.
w
*
u
.
x
;
tu30
+=
u
.
w
*
t
.
x
;
tu21
+=
t
.
z
*
u
.
y
;
tu21
+=
u
.
z
*
t
.
y
;
tu12
+=
t
.
y
*
u
.
z
;
tu12
+=
u
.
y
*
t
.
z
;
tu03
+=
t
.
x
*
u
.
w
;
tu03
+=
u
.
x
*
t
.
w
;
}
}
tuv000
+=
tu00
*
v
.
x
;
tuv000
+=
v
.
x
*
tu00
;
tuv100
+=
tu10
*
v
.
x
;
tuv100
+=
v
.
y
*
tu00
;
tuv010
+=
tu01
*
v
.
x
;
tuv010
+=
v
.
x
*
tu10
;
tuv001
+=
tu00
*
v
.
y
;
tuv001
+=
v
.
x
*
tu01
;
tuv200
+=
tu20
*
v
.
x
;
tuv200
+=
v
.
z
*
tu00
;
tuv020
+=
tu02
*
v
.
x
;
tuv020
+=
v
.
x
*
tu20
;
tuv002
+=
tu00
*
v
.
z
;
tuv002
+=
v
.
x
*
tu02
;
tuv110
+=
tu11
*
v
.
x
;
tuv110
+=
v
.
y
*
tu10
;
tuv101
+=
tu10
*
v
.
y
;
tuv101
+=
v
.
y
*
tu01
;
tuv011
+=
tu01
*
v
.
y
;
tuv011
+=
v
.
x
*
tu11
;
tuv300
+=
tu30
*
v
.
x
;
tuv300
+=
v
.
w
*
tu00
;
tuv030
+=
tu03
*
v
.
x
;
tuv030
+=
v
.
x
*
tu30
;
tuv003
+=
tu00
*
v
.
w
;
tuv003
+=
v
.
x
*
tu03
;
tuv210
+=
tu21
*
v
.
x
;
tuv210
+=
v
.
z
*
tu10
;
tuv201
+=
tu20
*
v
.
y
;
tuv201
+=
v
.
z
*
tu01
;
tuv120
+=
tu12
*
v
.
x
;
tuv120
+=
v
.
y
*
tu20
;
tuv021
+=
tu02
*
v
.
y
;
tuv021
+=
v
.
x
*
tu21
;
tuv102
+=
tu10
*
v
.
z
;
tuv102
+=
v
.
y
*
tu02
;
tuv012
+=
tu01
*
v
.
z
;
tuv012
+=
v
.
x
*
tu12
;
tuv111
+=
tu11
*
v
.
y
;
tuv111
+=
v
.
y
*
tu11
;
}
}
phi
[
m
]
=
tuv000
;
phi
[
m
]
=
tuv000
;
phi
[
m
+
NUM_ATOMS
]
=
tuv100
;
phi
[
m
+
NUM_ATOMS
]
=
tuv100
;
...
@@ -628,7 +583,7 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict
...
@@ -628,7 +583,7 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict
extern
"C"
__global__
void
computeInducedPotentialFromGrid
(
const
real2
*
__restrict__
pmeGrid
,
real
*
__restrict__
phid
,
extern
"C"
__global__
void
computeInducedPotentialFromGrid
(
const
real2
*
__restrict__
pmeGrid
,
real
*
__restrict__
phid
,
real
*
__restrict__
phip
,
real
*
__restrict__
phidp
,
const
real4
*
__restrict__
posq
,
real
*
__restrict__
phip
,
real
*
__restrict__
phidp
,
const
real4
*
__restrict__
posq
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
real3
recipBoxVecX
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
,
int2
*
__restrict__
pmeAtomGridIndex
)
{
real3
recipBoxVecY
,
real3
recipBoxVecZ
)
{
#if __CUDA_ARCH__ < 500
#if __CUDA_ARCH__ < 500
real
array
[
PME_ORDER
*
PME_ORDER
];
real
array
[
PME_ORDER
*
PME_ORDER
];
#else
#else
...
@@ -640,11 +595,7 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri
...
@@ -640,11 +595,7 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri
real4
theta2
[
PME_ORDER
];
real4
theta2
[
PME_ORDER
];
real4
theta3
[
PME_ORDER
];
real4
theta3
[
PME_ORDER
];
// Process the atoms in spatially sorted order. This improves cache performance when loading
for
(
int
m
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
m
<
NUM_ATOMS
;
m
+=
blockDim
.
x
*
gridDim
.
x
)
{
// the grid values.
for
(
int
atom
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
atom
<
NUM_ATOMS
;
atom
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
m
=
pmeAtomGridIndex
[
atom
].
x
;
real4
pos
=
posq
[
m
];
real4
pos
=
posq
[
m
];
pos
-=
periodicBoxVecZ
*
floor
(
pos
.
z
*
recipBoxVecZ
.
z
+
0.5
f
);
pos
-=
periodicBoxVecZ
*
floor
(
pos
.
z
*
recipBoxVecZ
.
z
+
0.5
f
);
pos
-=
periodicBoxVecY
*
floor
(
pos
.
y
*
recipBoxVecY
.
z
+
0.5
f
);
pos
-=
periodicBoxVecY
*
floor
(
pos
.
y
*
recipBoxVecY
.
z
+
0.5
f
);
...
@@ -715,9 +666,9 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri
...
@@ -715,9 +666,9 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri
real
tuv102
=
0
;
real
tuv102
=
0
;
real
tuv012
=
0
;
real
tuv012
=
0
;
real
tuv111
=
0
;
real
tuv111
=
0
;
for
(
int
i
z
=
0
;
i
z
<
PME_ORDER
;
i
z
++
)
{
for
(
int
i
x
=
0
;
i
x
<
PME_ORDER
;
i
x
++
)
{
int
k
=
igrid
3
+
i
z
-
(
igrid
3
+
i
z
>=
GRID_SIZE_
Z
?
GRID_SIZE_
Z
:
0
);
int
i
=
igrid
1
+
i
x
-
(
igrid
1
+
i
x
>=
GRID_SIZE_
X
?
GRID_SIZE_
X
:
0
);
real4
v
=
theta
3
[
i
z
];
real4
v
=
theta
1
[
i
x
];
real
tu00_1
=
0
;
real
tu00_1
=
0
;
real
tu01_1
=
0
;
real
tu01_1
=
0
;
real
tu10_1
=
0
;
real
tu10_1
=
0
;
...
@@ -750,11 +701,11 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri
...
@@ -750,11 +701,11 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri
real
t1_2
=
0
;
real
t1_2
=
0
;
real
t2_2
=
0
;
real
t2_2
=
0
;
real
t3
=
0
;
real
t3
=
0
;
for
(
int
i
x
=
0
;
i
x
<
PME_ORDER
;
i
x
++
)
{
for
(
int
i
z
=
0
;
i
z
<
PME_ORDER
;
i
z
++
)
{
int
i
=
igrid
1
+
i
x
-
(
igrid
1
+
i
x
>=
GRID_SIZE_
X
?
GRID_SIZE_
X
:
0
);
int
k
=
igrid
3
+
i
z
-
(
igrid
3
+
i
z
>=
GRID_SIZE_
Z
?
GRID_SIZE_
Z
:
0
);
int
gridIndex
=
i
*
GRID_SIZE_Y
*
GRID_SIZE_Z
+
j
*
GRID_SIZE_Z
+
k
;
int
gridIndex
=
i
*
GRID_SIZE_Y
*
GRID_SIZE_Z
+
j
*
GRID_SIZE_Z
+
k
;
real2
tq
=
pmeGrid
[
gridIndex
];
real2
tq
=
pmeGrid
[
gridIndex
];
real4
tadd
=
theta
1
[
i
x
];
real4
tadd
=
theta
3
[
i
z
];
t0_1
+=
tq
.
x
*
tadd
.
x
;
t0_1
+=
tq
.
x
*
tadd
.
x
;
t1_1
+=
tq
.
x
*
tadd
.
y
;
t1_1
+=
tq
.
x
*
tadd
.
y
;
t2_1
+=
tq
.
x
*
tadd
.
z
;
t2_1
+=
tq
.
x
*
tadd
.
z
;
...
@@ -763,70 +714,70 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri
...
@@ -763,70 +714,70 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri
t2_2
+=
tq
.
y
*
tadd
.
z
;
t2_2
+=
tq
.
y
*
tadd
.
z
;
t3
+=
(
tq
.
x
+
tq
.
y
)
*
tadd
.
w
;
t3
+=
(
tq
.
x
+
tq
.
y
)
*
tadd
.
w
;
}
}
tu00_1
+=
t0_1
*
u
.
x
;
tu00_1
+=
u
.
x
*
t0_1
;
tu10_1
+=
t1_1
*
u
.
x
;
tu10_1
+=
u
.
y
*
t0_1
;
tu01_1
+=
t0_1
*
u
.
y
;
tu01_1
+=
u
.
x
*
t1_1
;
tu20_1
+=
t2_1
*
u
.
x
;
tu20_1
+=
u
.
z
*
t0_1
;
tu11_1
+=
t1_1
*
u
.
y
;
tu11_1
+=
u
.
y
*
t1_1
;
tu02_1
+=
t0_1
*
u
.
z
;
tu02_1
+=
u
.
x
*
t2_1
;
tu00_2
+=
t0_2
*
u
.
x
;
tu00_2
+=
u
.
x
*
t0_2
;
tu10_2
+=
t1_2
*
u
.
x
;
tu10_2
+=
u
.
y
*
t0_2
;
tu01_2
+=
t0_2
*
u
.
y
;
tu01_2
+=
u
.
x
*
t1_2
;
tu20_2
+=
t2_2
*
u
.
x
;
tu20_2
+=
u
.
z
*
t0_2
;
tu11_2
+=
t1_2
*
u
.
y
;
tu11_2
+=
u
.
y
*
t1_2
;
tu02_2
+=
t0_2
*
u
.
z
;
tu02_2
+=
u
.
x
*
t2_2
;
real
t0
=
t0_1
+
t0_2
;
real
t0
=
t0_1
+
t0_2
;
real
t1
=
t1_1
+
t1_2
;
real
t1
=
t1_1
+
t1_2
;
real
t2
=
t2_1
+
t2_2
;
real
t2
=
t2_1
+
t2_2
;
tu00
+=
t0
*
u
.
x
;
tu00
+=
u
.
x
*
t0
;
tu10
+=
t1
*
u
.
x
;
tu10
+=
u
.
y
*
t0
;
tu01
+=
t0
*
u
.
y
;
tu01
+=
u
.
x
*
t1
;
tu20
+=
t2
*
u
.
x
;
tu20
+=
u
.
z
*
t0
;
tu11
+=
t1
*
u
.
y
;
tu11
+=
u
.
y
*
t1
;
tu02
+=
t0
*
u
.
z
;
tu02
+=
u
.
x
*
t2
;
tu30
+=
t3
*
u
.
x
;
tu30
+=
u
.
w
*
t0
;
tu21
+=
t2
*
u
.
y
;
tu21
+=
u
.
z
*
t1
;
tu12
+=
t1
*
u
.
z
;
tu12
+=
u
.
y
*
t2
;
tu03
+=
t0
*
u
.
w
;
tu03
+=
u
.
x
*
t3
;
}
}
tuv100_1
+=
tu
1
0_1
*
v
.
x
;
tuv100_1
+=
v
.
y
*
tu
0
0_1
;
tuv010_1
+=
tu01_1
*
v
.
x
;
tuv010_1
+=
v
.
x
*
tu10_1
;
tuv001_1
+=
tu0
0
_1
*
v
.
y
;
tuv001_1
+=
v
.
x
*
tu0
1
_1
;
tuv200_1
+=
tu
2
0_1
*
v
.
x
;
tuv200_1
+=
v
.
z
*
tu
0
0_1
;
tuv020_1
+=
tu02_1
*
v
.
x
;
tuv020_1
+=
v
.
x
*
tu20_1
;
tuv002_1
+=
tu0
0
_1
*
v
.
z
;
tuv002_1
+=
v
.
x
*
tu0
2
_1
;
tuv110_1
+=
tu1
1
_1
*
v
.
x
;
tuv110_1
+=
v
.
y
*
tu1
0
_1
;
tuv101_1
+=
tu10_1
*
v
.
y
;
tuv101_1
+=
v
.
y
*
tu01_1
;
tuv011_1
+=
tu
0
1_1
*
v
.
y
;
tuv011_1
+=
v
.
x
*
tu
1
1_1
;
tuv100_2
+=
tu
1
0_2
*
v
.
x
;
tuv100_2
+=
v
.
y
*
tu
0
0_2
;
tuv010_2
+=
tu01_2
*
v
.
x
;
tuv010_2
+=
v
.
x
*
tu10_2
;
tuv001_2
+=
tu0
0
_2
*
v
.
y
;
tuv001_2
+=
v
.
x
*
tu0
1
_2
;
tuv200_2
+=
tu
2
0_2
*
v
.
x
;
tuv200_2
+=
v
.
z
*
tu
0
0_2
;
tuv020_2
+=
tu02_2
*
v
.
x
;
tuv020_2
+=
v
.
x
*
tu20_2
;
tuv002_2
+=
tu0
0
_2
*
v
.
z
;
tuv002_2
+=
v
.
x
*
tu0
2
_2
;
tuv110_2
+=
tu1
1
_2
*
v
.
x
;
tuv110_2
+=
v
.
y
*
tu1
0
_2
;
tuv101_2
+=
tu10_2
*
v
.
y
;
tuv101_2
+=
v
.
y
*
tu01_2
;
tuv011_2
+=
tu
0
1_2
*
v
.
y
;
tuv011_2
+=
v
.
x
*
tu
1
1_2
;
tuv000
+=
tu00
*
v
.
x
;
tuv000
+=
v
.
x
*
tu00
;
tuv100
+=
tu10
*
v
.
x
;
tuv100
+=
v
.
y
*
tu00
;
tuv010
+=
tu01
*
v
.
x
;
tuv010
+=
v
.
x
*
tu10
;
tuv001
+=
tu00
*
v
.
y
;
tuv001
+=
v
.
x
*
tu01
;
tuv200
+=
tu20
*
v
.
x
;
tuv200
+=
v
.
z
*
tu00
;
tuv020
+=
tu02
*
v
.
x
;
tuv020
+=
v
.
x
*
tu20
;
tuv002
+=
tu00
*
v
.
z
;
tuv002
+=
v
.
x
*
tu02
;
tuv110
+=
tu11
*
v
.
x
;
tuv110
+=
v
.
y
*
tu10
;
tuv101
+=
tu10
*
v
.
y
;
tuv101
+=
v
.
y
*
tu01
;
tuv011
+=
tu01
*
v
.
y
;
tuv011
+=
v
.
x
*
tu11
;
tuv300
+=
tu30
*
v
.
x
;
tuv300
+=
v
.
w
*
tu00
;
tuv030
+=
tu03
*
v
.
x
;
tuv030
+=
v
.
x
*
tu30
;
tuv003
+=
tu00
*
v
.
w
;
tuv003
+=
v
.
x
*
tu03
;
tuv210
+=
tu21
*
v
.
x
;
tuv210
+=
v
.
z
*
tu10
;
tuv201
+=
tu20
*
v
.
y
;
tuv201
+=
v
.
z
*
tu01
;
tuv120
+=
tu12
*
v
.
x
;
tuv120
+=
v
.
y
*
tu20
;
tuv021
+=
tu02
*
v
.
y
;
tuv021
+=
v
.
x
*
tu21
;
tuv102
+=
tu10
*
v
.
z
;
tuv102
+=
v
.
y
*
tu02
;
tuv012
+=
tu01
*
v
.
z
;
tuv012
+=
v
.
x
*
tu12
;
tuv111
+=
tu11
*
v
.
y
;
tuv111
+=
v
.
y
*
tu11
;
}
}
phid
[
m
]
=
0
;
phid
[
m
]
=
0
;
phid
[
m
+
NUM_ATOMS
]
=
tuv100_1
;
phid
[
m
+
NUM_ATOMS
]
=
tuv100_1
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment