Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
f352d116
Commit
f352d116
authored
Aug 18, 2012
by
Peter Eastman
Browse files
Continuing to convert AmoebaMultipoleForce: PME with direct polarization now works
parent
7a60fd73
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
486 additions
and
501 deletions
+486
-501
plugins/amoeba/platforms/cuda2/src/AmoebaCudaKernels.cpp
plugins/amoeba/platforms/cuda2/src/AmoebaCudaKernels.cpp
+47
-56
plugins/amoeba/platforms/cuda2/src/AmoebaCudaKernels.h
plugins/amoeba/platforms/cuda2/src/AmoebaCudaKernels.h
+2
-1
plugins/amoeba/platforms/cuda2/src/kernels/multipolePme.cu
plugins/amoeba/platforms/cuda2/src/kernels/multipolePme.cu
+437
-444
No files found.
plugins/amoeba/platforms/cuda2/src/AmoebaCudaKernels.cpp
View file @
f352d116
...
@@ -1166,11 +1166,13 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
...
@@ -1166,11 +1166,13 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
pmeUpdateBsplinesKernel
=
cu
.
getKernel
(
module
,
"updateBsplines"
);
pmeUpdateBsplinesKernel
=
cu
.
getKernel
(
module
,
"updateBsplines"
);
pmeAtomRangeKernel
=
cu
.
getKernel
(
module
,
"findAtomRangeForGrid"
);
pmeAtomRangeKernel
=
cu
.
getKernel
(
module
,
"findAtomRangeForGrid"
);
pmeSpreadFixedMultipolesKernel
=
cu
.
getKernel
(
module
,
"gridSpreadFixedMultipoles"
);
pmeSpreadFixedMultipolesKernel
=
cu
.
getKernel
(
module
,
"gridSpreadFixedMultipoles"
);
pmeSpreadInducedDipolesKernel
=
cu
.
getKernel
(
module
,
"gridSpreadInducedDipoles"
);
pmeConvolutionKernel
=
cu
.
getKernel
(
module
,
"reciprocalConvolution"
);
pmeConvolutionKernel
=
cu
.
getKernel
(
module
,
"reciprocalConvolution"
);
pmeFixedPotentialKernel
=
cu
.
getKernel
(
module
,
"computeFixedPotentialFromGrid"
);
pmeFixedPotentialKernel
=
cu
.
getKernel
(
module
,
"computeFixedPotentialFromGrid"
);
pmeInducedPotentialKernel
=
cu
.
getKernel
(
module
,
"computeInducedPotentialFromGrid"
);
pmeFixedForceKernel
=
cu
.
getKernel
(
module
,
"computeFixedMultipoleForceAndEnergy"
);
pmeFixedForceKernel
=
cu
.
getKernel
(
module
,
"computeFixedMultipoleForceAndEnergy"
);
//
pmeIn
terpolate
ForceKernel = cu.getKernel(module, "
gridInter
pol
at
eForce");
pmeIn
duced
ForceKernel
=
cu
.
getKernel
(
module
,
"
computeInducedDi
poleForce
AndEnergy
"
);
//
pme
FinishSpreadCharge
Kernel = cu.getKernel(module, "
finishSpreadCharge
");
pme
RecordInducedFieldDipoles
Kernel
=
cu
.
getKernel
(
module
,
"
recordInducedFieldDipoles
"
);
// cuFuncSetCacheConfig(pmeInterpolateForceKernel, CU_FUNC_CACHE_PREFER_L1);
// cuFuncSetCacheConfig(pmeInterpolateForceKernel, CU_FUNC_CACHE_PREFER_L1);
// Create required data structures.
// Create required data structures.
...
@@ -1415,12 +1417,15 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
...
@@ -1415,12 +1417,15 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
&
labFrameDipoles
->
getDevicePointer
(),
&
labFrameQuadrupoles
->
getDevicePointer
(),
&
inducedDipole
->
getDevicePointer
(),
&
labFrameDipoles
->
getDevicePointer
(),
&
labFrameQuadrupoles
->
getDevicePointer
(),
&
inducedDipole
->
getDevicePointer
(),
&
inducedDipolePolar
->
getDevicePointer
(),
&
dampingAndThole
->
getDevicePointer
()};
&
inducedDipolePolar
->
getDevicePointer
(),
&
dampingAndThole
->
getDevicePointer
()};
cu
.
executeKernel
(
electrostaticsKernel
,
electrostaticsArgs
,
numForceThreadBlocks
*
forceThreadBlockSize
,
forceThreadBlockSize
);
cu
.
executeKernel
(
electrostaticsKernel
,
electrostaticsArgs
,
numForceThreadBlocks
*
forceThreadBlockSize
,
forceThreadBlockSize
);
// Map torques to force.
void
*
mapTorqueArgs
[]
=
{
&
cu
.
getForce
().
getDevicePointer
(),
&
torque
->
getDevicePointer
(),
void
*
mapTorqueArgs
[]
=
{
&
cu
.
getForce
().
getDevicePointer
(),
&
torque
->
getDevicePointer
(),
&
cu
.
getPosq
().
getDevicePointer
(),
&
multipoleParticles
->
getDevicePointer
()};
&
cu
.
getPosq
().
getDevicePointer
(),
&
multipoleParticles
->
getDevicePointer
()};
cu
.
executeKernel
(
mapTorqueKernel
,
mapTorqueArgs
,
cu
.
getNumAtoms
());
cu
.
executeKernel
(
mapTorqueKernel
,
mapTorqueArgs
,
cu
.
getNumAtoms
());
}
}
else
{
else
{
//
Compute induced dipoles
.
//
Reciprocal space calculation
.
unsigned
int
maxTiles
=
nb
.
getInteractingTiles
().
getSize
();
unsigned
int
maxTiles
=
nb
.
getInteractingTiles
().
getSize
();
void
*
pmeUpdateBsplinesArgs
[]
=
{
&
cu
.
getPosq
().
getDevicePointer
(),
&
pmeIgrid
->
getDevicePointer
(),
&
pmeAtomGridIndex
->
getDevicePointer
(),
void
*
pmeUpdateBsplinesArgs
[]
=
{
&
cu
.
getPosq
().
getDevicePointer
(),
&
pmeIgrid
->
getDevicePointer
(),
&
pmeAtomGridIndex
->
getDevicePointer
(),
...
@@ -1433,9 +1438,8 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
...
@@ -1433,9 +1438,8 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
cu
.
executeKernel
(
pmeAtomRangeKernel
,
pmeAtomRangeArgs
,
cu
.
getNumAtoms
(),
cu
.
ThreadBlockSize
,
cu
.
ThreadBlockSize
*
PmeOrder
*
PmeOrder
*
elementSize
);
cu
.
executeKernel
(
pmeAtomRangeKernel
,
pmeAtomRangeArgs
,
cu
.
getNumAtoms
(),
cu
.
ThreadBlockSize
,
cu
.
ThreadBlockSize
*
PmeOrder
*
PmeOrder
*
elementSize
);
void
*
pmeSpreadFixedMultipolesArgs
[]
=
{
&
cu
.
getPosq
().
getDevicePointer
(),
&
labFrameDipoles
->
getDevicePointer
(),
&
labFrameQuadrupoles
->
getDevicePointer
(),
void
*
pmeSpreadFixedMultipolesArgs
[]
=
{
&
cu
.
getPosq
().
getDevicePointer
(),
&
labFrameDipoles
->
getDevicePointer
(),
&
labFrameQuadrupoles
->
getDevicePointer
(),
&
pmeGrid
->
getDevicePointer
(),
&
pmeAtomGridIndex
->
getDevicePointer
(),
&
pmeAtomRange
->
getDevicePointer
(),
&
pmeGrid
->
getDevicePointer
(),
&
pmeAtomGridIndex
->
getDevicePointer
(),
&
pmeAtomRange
->
getDevicePointer
(),
&
pmeTheta1
->
getDevicePointer
(),
&
pmeTheta2
->
getDevicePointer
(),
&
pmeTheta3
->
getDevicePointer
(),
cu
.
getPeriodicBoxSizePointer
(),
&
pmeTheta1
->
getDevicePointer
(),
&
pmeTheta2
->
getDevicePointer
(),
&
pmeTheta3
->
getDevicePointer
(),
cu
.
getInvPeriodicBoxSizePointer
()};
cu
.
getInvPeriodicBoxSizePointer
()};
cu
.
executeKernel
(
pmeSpreadFixedMultipolesKernel
,
pmeSpreadFixedMultipolesArgs
,
cu
.
getNumAtoms
());
cu
.
executeKernel
(
pmeSpreadFixedMultipolesKernel
,
pmeSpreadFixedMultipolesArgs
,
cu
.
getNumAtoms
(),
cu
.
ThreadBlockSize
,
cu
.
ThreadBlockSize
*
PmeOrder
*
PmeOrder
*
elementSize
);
if
(
cu
.
getUseDoublePrecision
())
if
(
cu
.
getUseDoublePrecision
())
cufftExecZ2Z
(
fft
,
(
double2
*
)
pmeGrid
->
getDevicePointer
(),
(
double2
*
)
pmeGrid
->
getDevicePointer
(),
CUFFT_FORWARD
);
cufftExecZ2Z
(
fft
,
(
double2
*
)
pmeGrid
->
getDevicePointer
(),
(
double2
*
)
pmeGrid
->
getDevicePointer
(),
CUFFT_FORWARD
);
else
else
...
@@ -1448,23 +1452,16 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
...
@@ -1448,23 +1452,16 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
else
else
cufftExecC2C
(
fft
,
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
CUFFT_INVERSE
);
cufftExecC2C
(
fft
,
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
CUFFT_INVERSE
);
void
*
pmeFixedPotentialArgs
[]
=
{
&
pmeGrid
->
getDevicePointer
(),
&
pmePhi
->
getDevicePointer
(),
&
field
->
getDevicePointer
(),
void
*
pmeFixedPotentialArgs
[]
=
{
&
pmeGrid
->
getDevicePointer
(),
&
pmePhi
->
getDevicePointer
(),
&
field
->
getDevicePointer
(),
&
pmeIgrid
->
getDevicePointer
(),
&
pmeTheta1
->
getDevicePointer
(),
&
pmeTheta2
->
getDevicePointer
(),
&
pmeTheta3
->
getDevicePointer
(),
&
fieldPolar
->
getDevicePointer
(),
&
pmeIgrid
->
getDevicePointer
(),
&
pmeTheta1
->
getDevicePointer
(),
&
pmeTheta2
->
getDevicePointer
(),
&
labFrameDipoles
->
getDevicePointer
(),
cu
.
getInvPeriodicBoxSizePointer
()};
&
pmeTheta3
->
getDevicePointer
(),
&
labFrameDipoles
->
getDevicePointer
(),
cu
.
getInvPeriodicBoxSizePointer
()};
cu
.
executeKernel
(
pmeFixedPotentialKernel
,
pmeFixedPotentialArgs
,
cu
.
getNumAtoms
());
cu
.
executeKernel
(
pmeFixedPotentialKernel
,
pmeFixedPotentialArgs
,
cu
.
getNumAtoms
());
void
*
pmeFixedForceArgs
[]
=
{
&
cu
.
getPosq
().
getDevicePointer
(),
&
cu
.
getForce
().
getDevicePointer
(),
&
torque
->
getDevicePointer
(),
void
*
pmeFixedForceArgs
[]
=
{
&
cu
.
getPosq
().
getDevicePointer
(),
&
cu
.
getForce
().
getDevicePointer
(),
&
torque
->
getDevicePointer
(),
&
cu
.
getEnergyBuffer
().
getDevicePointer
(),
&
labFrameDipoles
->
getDevicePointer
(),
&
labFrameQuadrupoles
->
getDevicePointer
(),
&
cu
.
getEnergyBuffer
().
getDevicePointer
(),
&
labFrameDipoles
->
getDevicePointer
(),
&
labFrameQuadrupoles
->
getDevicePointer
(),
&
pmePhi
->
getDevicePointer
(),
cu
.
getPeriodicBoxSizePointer
(),
cu
.
getInvPeriodicBoxSizePointer
()};
&
pmePhi
->
getDevicePointer
(),
cu
.
getInvPeriodicBoxSizePointer
()};
cu
.
executeKernel
(
pmeFixedForceKernel
,
pmeFixedForceArgs
,
cu
.
getNumAtoms
());
cu
.
executeKernel
(
pmeFixedForceKernel
,
pmeFixedForceArgs
,
cu
.
getNumAtoms
());
printf
(
"reciprocal:
\n
"
);
vector
<
long
long
>
f
;
// Direct space calculation.
printf
(
"force
\n
"
);
cu
.
getForce
().
download
(
f
);
for
(
int
i
=
0
;
i
<
cu
.
getNumAtoms
();
i
++
)
printf
(
"%d: %g %g %g
\n
"
,
i
,
f
[
i
]
/
(
double
)
0xFFFFFFFF
,
f
[
i
+
cu
.
getPaddedNumAtoms
()]
/
(
double
)
0xFFFFFFFF
,
f
[
i
+
cu
.
getPaddedNumAtoms
()
*
2
]
/
(
double
)
0xFFFFFFFF
);
// printf("torque\n");
// torque->download(f);
// for (int i = 0; i < cu.getNumAtoms(); i++)
// printf("%d: %g %g %g\n", i, f[i]/(double) 0xFFFFFFFF, f[i+cu.getPaddedNumAtoms()]/(double) 0xFFFFFFFF, f[i+cu.getPaddedNumAtoms()*2]/(double) 0xFFFFFFFF);
void
*
computeFixedFieldArgs
[]
=
{
&
field
->
getDevicePointer
(),
&
fieldPolar
->
getDevicePointer
(),
&
cu
.
getPosq
().
getDevicePointer
(),
void
*
computeFixedFieldArgs
[]
=
{
&
field
->
getDevicePointer
(),
&
fieldPolar
->
getDevicePointer
(),
&
cu
.
getPosq
().
getDevicePointer
(),
&
nb
.
getExclusionIndices
().
getDevicePointer
(),
&
nb
.
getExclusionRowIndices
().
getDevicePointer
(),
&
nb
.
getExclusionIndices
().
getDevicePointer
(),
&
nb
.
getExclusionRowIndices
().
getDevicePointer
(),
&
covalentFlags
->
getDevicePointer
(),
&
polarizationGroupFlags
->
getDevicePointer
(),
&
startTileIndex
,
&
numTileIndices
,
&
covalentFlags
->
getDevicePointer
(),
&
polarizationGroupFlags
->
getDevicePointer
(),
&
startTileIndex
,
&
numTileIndices
,
...
@@ -1475,38 +1472,29 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
...
@@ -1475,38 +1472,29 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
void
*
recordInducedDipolesArgs
[]
=
{
&
field
->
getDevicePointer
(),
&
fieldPolar
->
getDevicePointer
(),
void
*
recordInducedDipolesArgs
[]
=
{
&
field
->
getDevicePointer
(),
&
fieldPolar
->
getDevicePointer
(),
&
inducedDipole
->
getDevicePointer
(),
&
inducedDipolePolar
->
getDevicePointer
(),
&
polarizability
->
getDevicePointer
()};
&
inducedDipole
->
getDevicePointer
(),
&
inducedDipolePolar
->
getDevicePointer
(),
&
polarizability
->
getDevicePointer
()};
cu
.
executeKernel
(
recordInducedDipolesKernel
,
recordInducedDipolesArgs
,
cu
.
getNumAtoms
());
cu
.
executeKernel
(
recordInducedDipolesKernel
,
recordInducedDipolesArgs
,
cu
.
getNumAtoms
());
printf
(
"direct:
\n
"
);
printf
(
"force
\n
"
);
// Reciprocal space calculation for the induced dipoles.
cu
.
getForce
().
download
(
f
);
for
(
int
i
=
0
;
i
<
cu
.
getNumAtoms
();
i
++
)
void
*
pmeSpreadInducedDipolesArgs
[]
=
{
&
cu
.
getPosq
().
getDevicePointer
(),
&
inducedDipole
->
getDevicePointer
(),
&
inducedDipolePolar
->
getDevicePointer
(),
printf
(
"%d: %g %g %g
\n
"
,
i
,
f
[
i
]
/
(
double
)
0xFFFFFFFF
,
f
[
i
+
cu
.
getPaddedNumAtoms
()]
/
(
double
)
0xFFFFFFFF
,
f
[
i
+
cu
.
getPaddedNumAtoms
()
*
2
]
/
(
double
)
0xFFFFFFFF
);
&
pmeGrid
->
getDevicePointer
(),
&
pmeAtomGridIndex
->
getDevicePointer
(),
&
pmeAtomRange
->
getDevicePointer
(),
// printf("torque\n");
&
pmeTheta1
->
getDevicePointer
(),
&
pmeTheta2
->
getDevicePointer
(),
&
pmeTheta3
->
getDevicePointer
(),
cu
.
getInvPeriodicBoxSizePointer
()};
// torque->download(f);
cu
.
executeKernel
(
pmeSpreadInducedDipolesKernel
,
pmeSpreadInducedDipolesArgs
,
cu
.
getNumAtoms
());
// for (int i = 0; i < cu.getNumAtoms(); i++)
if
(
cu
.
getUseDoublePrecision
())
// printf("%d: %g %g %g\n", i, f[i]/(double) 0xFFFFFFFF, f[i+cu.getPaddedNumAtoms()]/(double) 0xFFFFFFFF, f[i+cu.getPaddedNumAtoms()*2]/(double) 0xFFFFFFFF);
cufftExecZ2Z
(
fft
,
(
double2
*
)
pmeGrid
->
getDevicePointer
(),
(
double2
*
)
pmeGrid
->
getDevicePointer
(),
CUFFT_FORWARD
);
// vector<float> d, dp;
else
// printf("phi\n");
cufftExecC2C
(
fft
,
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
CUFFT_FORWARD
);
// pmePhi->download(d);
cu
.
executeKernel
(
pmeConvolutionKernel
,
pmeConvolutionArgs
,
cu
.
getNumAtoms
());
// for (int i = 0; i < d.size(); i++)
if
(
cu
.
getUseDoublePrecision
())
// printf("%d: %g\n", i, d[i]);
cufftExecZ2Z
(
fft
,
(
double2
*
)
pmeGrid
->
getDevicePointer
(),
(
double2
*
)
pmeGrid
->
getDevicePointer
(),
CUFFT_INVERSE
);
// printf("dipoles\n");
else
// labFrameDipoles->download(d);
cufftExecC2C
(
fft
,
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
CUFFT_INVERSE
);
// for (int i = 0; i < cu.getNumAtoms(); i++)
void
*
pmeInducedPotentialArgs
[]
=
{
&
pmeGrid
->
getDevicePointer
(),
&
pmePhid
->
getDevicePointer
(),
&
pmePhip
->
getDevicePointer
(),
// printf("%d: %g %g %g\n", i, d[3*i], d[3*i+1], d[3*i+2]);
&
pmePhidp
->
getDevicePointer
(),
&
pmeIgrid
->
getDevicePointer
(),
&
pmeTheta1
->
getDevicePointer
(),
&
pmeTheta2
->
getDevicePointer
(),
// printf("quadrupoles\n");
&
pmeTheta3
->
getDevicePointer
(),
cu
.
getInvPeriodicBoxSizePointer
()};
// labFrameQuadrupoles->download(d);
cu
.
executeKernel
(
pmeInducedPotentialKernel
,
pmeInducedPotentialArgs
,
cu
.
getNumAtoms
());
// for (int i = 0; i < cu.getNumAtoms(); i++)
// void* pmeRecordInducedFieldDipolesArgs[] = {&pmePhid->getDevicePointer(), &pmePhip->getDevicePointer(),
// printf("%d: %g %g %g %g %g %g\n", i, d[5*i], d[5*i+1], d[5*i+2], d[5*i+3], d[5*i+4], -(d[5*i]+d[5*i+3]));
// &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), cu.getInvPeriodicBoxSizePointer()};
// printf("induced dipoles\n");
// cu.executeKernel(pmeRecordInducedFieldDipolesKernel, pmeRecordInducedFieldDipolesArgs, cu.getNumAtoms());
// inducedDipole->download(d);
// inducedDipolePolar->download(dp);
// for (int i = 0; i < cu.getNumAtoms(); i++)
// printf("%d: %g %g %g, %g %g %g\n", i, d[3*i], d[3*i+1], d[3*i+2], dp[3*i], dp[3*i+1], dp[3*i+2]);
// printf("positions\n");
// vector<float4> p;
// cu.getPosq().download(p);
// for (int i = 0; i < cu.getNumAtoms(); i++)
// printf("%d: %g %g %g %g\n", i, p[i].x, p[i].y, p[i].z, p[i].w);
// vector<float2> errors;
// vector<float2> errors;
...
@@ -1541,11 +1529,14 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
...
@@ -1541,11 +1529,14 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
&
labFrameDipoles
->
getDevicePointer
(),
&
labFrameQuadrupoles
->
getDevicePointer
(),
&
inducedDipole
->
getDevicePointer
(),
&
labFrameDipoles
->
getDevicePointer
(),
&
labFrameQuadrupoles
->
getDevicePointer
(),
&
inducedDipole
->
getDevicePointer
(),
&
inducedDipolePolar
->
getDevicePointer
(),
&
dampingAndThole
->
getDevicePointer
()};
&
inducedDipolePolar
->
getDevicePointer
(),
&
dampingAndThole
->
getDevicePointer
()};
cu
.
executeKernel
(
electrostaticsKernel
,
electrostaticsArgs
,
numForceThreadBlocks
*
forceThreadBlockSize
,
forceThreadBlockSize
);
cu
.
executeKernel
(
electrostaticsKernel
,
electrostaticsArgs
,
numForceThreadBlocks
*
forceThreadBlockSize
,
forceThreadBlockSize
);
printf
(
"electrostatic:
\n
"
);
void
*
pmeInducedForceArgs
[]
=
{
&
cu
.
getPosq
().
getDevicePointer
(),
&
cu
.
getForce
().
getDevicePointer
(),
&
torque
->
getDevicePointer
(),
printf
(
"force
\n
"
);
&
cu
.
getEnergyBuffer
().
getDevicePointer
(),
&
labFrameDipoles
->
getDevicePointer
(),
&
labFrameQuadrupoles
->
getDevicePointer
(),
cu
.
getForce
().
download
(
f
);
&
inducedDipole
->
getDevicePointer
(),
&
inducedDipolePolar
->
getDevicePointer
(),
&
pmePhi
->
getDevicePointer
(),
&
pmePhid
->
getDevicePointer
(),
for
(
int
i
=
0
;
i
<
cu
.
getNumAtoms
();
i
++
)
&
pmePhip
->
getDevicePointer
(),
&
pmePhidp
->
getDevicePointer
(),
cu
.
getInvPeriodicBoxSizePointer
()};
printf
(
"%d: %g %g %g
\n
"
,
i
,
f
[
i
]
/
(
double
)
0xFFFFFFFF
,
f
[
i
+
cu
.
getPaddedNumAtoms
()]
/
(
double
)
0xFFFFFFFF
,
f
[
i
+
cu
.
getPaddedNumAtoms
()
*
2
]
/
(
double
)
0xFFFFFFFF
);
cu
.
executeKernel
(
pmeInducedForceKernel
,
pmeInducedForceArgs
,
cu
.
getNumAtoms
());
// Map torques to force.
void
*
mapTorqueArgs
[]
=
{
&
cu
.
getForce
().
getDevicePointer
(),
&
torque
->
getDevicePointer
(),
void
*
mapTorqueArgs
[]
=
{
&
cu
.
getForce
().
getDevicePointer
(),
&
torque
->
getDevicePointer
(),
&
cu
.
getPosq
().
getDevicePointer
(),
&
multipoleParticles
->
getDevicePointer
()};
&
cu
.
getPosq
().
getDevicePointer
(),
&
multipoleParticles
->
getDevicePointer
()};
cu
.
executeKernel
(
mapTorqueKernel
,
mapTorqueArgs
,
cu
.
getNumAtoms
());
cu
.
executeKernel
(
mapTorqueKernel
,
mapTorqueArgs
,
cu
.
getNumAtoms
());
...
...
plugins/amoeba/platforms/cuda2/src/AmoebaCudaKernels.h
View file @
f352d116
...
@@ -425,7 +425,8 @@ private:
...
@@ -425,7 +425,8 @@ private:
CudaSort
*
sort
;
CudaSort
*
sort
;
cufftHandle
fft
;
cufftHandle
fft
;
CUfunction
computeMomentsKernel
,
recordInducedDipolesKernel
,
computeFixedFieldKernel
,
computeInducedFieldKernel
,
updateInducedFieldKernel
,
electrostaticsKernel
,
mapTorqueKernel
;
CUfunction
computeMomentsKernel
,
recordInducedDipolesKernel
,
computeFixedFieldKernel
,
computeInducedFieldKernel
,
updateInducedFieldKernel
,
electrostaticsKernel
,
mapTorqueKernel
;
CUfunction
pmeUpdateBsplinesKernel
,
pmeAtomRangeKernel
,
pmeSpreadFixedMultipolesKernel
,
pmeConvolutionKernel
,
pmeFixedPotentialKernel
,
pmeFixedForceKernel
;
CUfunction
pmeUpdateBsplinesKernel
,
pmeAtomRangeKernel
,
pmeSpreadFixedMultipolesKernel
,
pmeSpreadInducedDipolesKernel
,
pmeConvolutionKernel
,
pmeFixedPotentialKernel
,
pmeInducedPotentialKernel
;
CUfunction
pmeFixedForceKernel
,
pmeInducedForceKernel
,
pmeRecordInducedFieldDipolesKernel
;
static
const
int
PmeOrder
=
5
;
static
const
int
PmeOrder
=
5
;
};
};
...
...
plugins/amoeba/platforms/cuda2/src/kernels/multipolePme.cu
View file @
f352d116
...
@@ -161,8 +161,7 @@ extern "C" __global__ void findAtomRangeForGrid(int2* __restrict__ pmeAtomGridIn
...
@@ -161,8 +161,7 @@ extern "C" __global__ void findAtomRangeForGrid(int2* __restrict__ pmeAtomGridIn
}
}
extern
"C"
__global__
void
gridSpreadFixedMultipoles
(
const
real4
*
__restrict__
posq
,
const
real
*
__restrict__
labFrameDipole
,
extern
"C"
__global__
void
gridSpreadFixedMultipoles
(
const
real4
*
__restrict__
posq
,
const
real
*
__restrict__
labFrameDipole
,
const
real
*
__restrict__
labFrameQuadrupole
,
real2
*
__restrict__
pmeGrid
,
int2
*
__restrict__
pmeAtomGridIndex
,
int
*
__restrict__
pmeAtomRange
,
const
real
*
__restrict__
labFrameQuadrupole
,
real2
*
__restrict__
pmeGrid
,
int2
*
__restrict__
pmeAtomGridIndex
,
int
*
__restrict__
pmeAtomRange
,
const
real4
*
__restrict__
theta1
,
const
real4
*
__restrict__
theta2
,
const
real4
*
__restrict__
theta3
,
const
real4
*
__restrict__
theta1
,
const
real4
*
__restrict__
theta2
,
const
real4
*
__restrict__
theta3
,
real4
invPeriodicBoxSize
)
{
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
)
{
const
real
xscale
=
GRID_SIZE_X
*
invPeriodicBoxSize
.
x
;
const
real
xscale
=
GRID_SIZE_X
*
invPeriodicBoxSize
.
x
;
const
real
yscale
=
GRID_SIZE_Y
*
invPeriodicBoxSize
.
y
;
const
real
yscale
=
GRID_SIZE_Y
*
invPeriodicBoxSize
.
y
;
const
real
zscale
=
GRID_SIZE_Z
*
invPeriodicBoxSize
.
z
;
const
real
zscale
=
GRID_SIZE_Z
*
invPeriodicBoxSize
.
z
;
...
@@ -248,88 +247,90 @@ extern "C" __global__ void gridSpreadFixedMultipoles(const real4* __restrict__ p
...
@@ -248,88 +247,90 @@ extern "C" __global__ void gridSpreadFixedMultipoles(const real4* __restrict__ p
}
}
}
}
//extern "C" __global__ void kGridSpreadInducedDipoles_kernel() {
extern
"C"
__global__
void
gridSpreadInducedDipoles
(
const
real4
*
__restrict__
posq
,
const
real
*
__restrict__
inducedDipole
,
// const real xscale = GRID_SIZE_X*invPeriodicBoxSize.x;
const
real
*
__restrict__
inducedDipolePolar
,
real2
*
__restrict__
pmeGrid
,
int2
*
__restrict__
pmeAtomGridIndex
,
int
*
__restrict__
pmeAtomRange
,
// const real yscale = GRID_SIZE_Y*invPeriodicBoxSize.y;
const
real4
*
__restrict__
theta1
,
const
real4
*
__restrict__
theta2
,
const
real4
*
__restrict__
theta3
,
real4
invPeriodicBoxSize
)
{
// const real zscale = GRID_SIZE_Z*invPeriodicBoxSize.z;
const
real
xscale
=
GRID_SIZE_X
*
invPeriodicBoxSize
.
x
;
// unsigned int numGridPoints = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
const
real
yscale
=
GRID_SIZE_Y
*
invPeriodicBoxSize
.
y
;
// unsigned int numThreads = gridDim.x*blockDim.x;
const
real
zscale
=
GRID_SIZE_Z
*
invPeriodicBoxSize
.
z
;
// for (int gridIndex = blockIdx.x*blockDim.x+threadIdx.x; gridIndex < numGridPoints; gridIndex += numThreads) {
unsigned
int
numGridPoints
=
GRID_SIZE_X
*
GRID_SIZE_Y
*
GRID_SIZE_Z
;
// int3 gridPoint;
unsigned
int
numThreads
=
gridDim
.
x
*
blockDim
.
x
;
// gridPoint.x = gridIndex/(GRID_SIZE_Y*GRID_SIZE_Z);
for
(
int
gridIndex
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
gridIndex
<
numGridPoints
;
gridIndex
+=
numThreads
)
{
// int remainder = gridIndex-gridPoint.x*GRID_SIZE_Y*GRID_SIZE_Z;
int3
gridPoint
;
// gridPoint.y = remainder/GRID_SIZE_Z;
gridPoint
.
x
=
gridIndex
/
(
GRID_SIZE_Y
*
GRID_SIZE_Z
);
// gridPoint.z = remainder-gridPoint.y*GRID_SIZE_Z;
int
remainder
=
gridIndex
-
gridPoint
.
x
*
GRID_SIZE_Y
*
GRID_SIZE_Z
;
// real2 result = make_real2(0, 0);
gridPoint
.
y
=
remainder
/
GRID_SIZE_Z
;
// for (int ix = 0; ix < PME_ORDER; ++ix) {
gridPoint
.
z
=
remainder
-
gridPoint
.
y
*
GRID_SIZE_Z
;
// int x = gridPoint.x-ix+(gridPoint.x >= ix ? 0 : GRID_SIZE_X);
real2
result
=
make_real2
(
0
,
0
);
// for (int iy = 0; iy < PME_ORDER; ++iy) {
for
(
int
ix
=
0
;
ix
<
PME_ORDER
;
++
ix
)
{
// int y = gridPoint.y-iy+(gridPoint.y >= iy ? 0 : GRID_SIZE_Y);
int
x
=
gridPoint
.
x
-
ix
+
(
gridPoint
.
x
>=
ix
?
0
:
GRID_SIZE_X
);
// int z1 = gridPoint.z-PME_ORDER+1;
for
(
int
iy
=
0
;
iy
<
PME_ORDER
;
++
iy
)
{
// z1 += (z1 >= 0 ? 0 : GRID_SIZE_Z);
int
y
=
gridPoint
.
y
-
iy
+
(
gridPoint
.
y
>=
iy
?
0
:
GRID_SIZE_Y
);
// int z2 = (z1 < gridPoint.z ? gridPoint.z : GRID_SIZE_Z-1);
int
z1
=
gridPoint
.
z
-
PME_ORDER
+
1
;
// int gridIndex1 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z1;
z1
+=
(
z1
>=
0
?
0
:
GRID_SIZE_Z
);
// int gridIndex2 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z2;
int
z2
=
(
z1
<
gridPoint
.
z
?
gridPoint
.
z
:
GRID_SIZE_Z
-
1
);
// int firstAtom = pmeAtomRange[gridIndex1];
int
gridIndex1
=
x
*
GRID_SIZE_Y
*
GRID_SIZE_Z
+
y
*
GRID_SIZE_Z
+
z1
;
// int lastAtom = pmeAtomRange[gridIndex2+1];
int
gridIndex2
=
x
*
GRID_SIZE_Y
*
GRID_SIZE_Z
+
y
*
GRID_SIZE_Z
+
z2
;
// for (int i = firstAtom; i < lastAtom; ++i) {
int
firstAtom
=
pmeAtomRange
[
gridIndex1
];
// int2 atomData = pmeAtomGridIndex[i];
int
lastAtom
=
pmeAtomRange
[
gridIndex2
+
1
];
// int atomIndex = atomData.x;
for
(
int
i
=
firstAtom
;
i
<
lastAtom
;
++
i
)
{
// int z = atomData.y;
int2
atomData
=
pmeAtomGridIndex
[
i
];
// int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : GRID_SIZE_Z);
int
atomIndex
=
atomData
.
x
;
// if (iz >= GRID_SIZE_Z)
int
z
=
atomData
.
y
;
// iz -= GRID_SIZE_Z;
int
iz
=
gridPoint
.
z
-
z
+
(
gridPoint
.
z
>=
z
?
0
:
GRID_SIZE_Z
);
// real inducedDipoleX = xscale*cAmoebaSim.pInducedDipole[atomIndex*3];
if
(
iz
>=
GRID_SIZE_Z
)
// real inducedDipoleY = yscale*cAmoebaSim.pInducedDipole[atomIndex*3+1];
iz
-=
GRID_SIZE_Z
;
// real inducedDipoleZ = zscale*cAmoebaSim.pInducedDipole[atomIndex*3+2];
real
inducedDipoleX
=
xscale
*
inducedDipole
[
atomIndex
*
3
];
// real inducedDipolePolarX = xscale*cAmoebaSim.pInducedDipolePolar[atomIndex*3];
real
inducedDipoleY
=
yscale
*
inducedDipole
[
atomIndex
*
3
+
1
];
// real inducedDipolePolarY = yscale*cAmoebaSim.pInducedDipolePolar[atomIndex*3+1];
real
inducedDipoleZ
=
zscale
*
inducedDipole
[
atomIndex
*
3
+
2
];
// real inducedDipolePolarZ = zscale*cAmoebaSim.pInducedDipolePolar[atomIndex*3+2];
real
inducedDipolePolarX
=
xscale
*
inducedDipolePolar
[
atomIndex
*
3
];
// real4 t = theta1[atomIndex*PME_ORDER+ix];
real
inducedDipolePolarY
=
yscale
*
inducedDipolePolar
[
atomIndex
*
3
+
1
];
// real4 u = theta2[atomIndex*PME_ORDER+iy];
real
inducedDipolePolarZ
=
zscale
*
inducedDipolePolar
[
atomIndex
*
3
+
2
];
// real4 v = theta3[atomIndex*PME_ORDER+iz];
real4
t
=
theta1
[
atomIndex
*
PME_ORDER
+
ix
];
// real term01 = inducedDipoleY*u.y*v.x + inducedDipoleZ*u.x*v.y;
real4
u
=
theta2
[
atomIndex
*
PME_ORDER
+
iy
];
// real term11 = inducedDipoleX*u.x*v.x;
real4
v
=
theta3
[
atomIndex
*
PME_ORDER
+
iz
];
// real term02 = inducedDipolePolarY*u.y*v.x + inducedDipolePolarZ*u.x*v.y;
real
term01
=
inducedDipoleY
*
u
.
y
*
v
.
x
+
inducedDipoleZ
*
u
.
x
*
v
.
y
;
// real term12 = inducedDipolePolarX*u.x*v.x;
real
term11
=
inducedDipoleX
*
u
.
x
*
v
.
x
;
// result.x += term01*t.x + term11*t.y;
real
term02
=
inducedDipolePolarY
*
u
.
y
*
v
.
x
+
inducedDipolePolarZ
*
u
.
x
*
v
.
y
;
// result.y += term02*t.x + term12*t.y;
real
term12
=
inducedDipolePolarX
*
u
.
x
*
v
.
x
;
// }
result
.
x
+=
term01
*
t
.
x
+
term11
*
t
.
y
;
// if (z1 > gridPoint.z) {
result
.
y
+=
term02
*
t
.
x
+
term12
*
t
.
y
;
// gridIndex1 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z;
}
// gridIndex2 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+gridPoint.z;
if
(
z1
>
gridPoint
.
z
)
{
// firstAtom = pmeAtomRange[gridIndex1];
gridIndex1
=
x
*
GRID_SIZE_Y
*
GRID_SIZE_Z
+
y
*
GRID_SIZE_Z
;
// lastAtom = pmeAtomRange[gridIndex2+1];
gridIndex2
=
x
*
GRID_SIZE_Y
*
GRID_SIZE_Z
+
y
*
GRID_SIZE_Z
+
gridPoint
.
z
;
// for (int i = firstAtom; i < lastAtom; ++i) {
firstAtom
=
pmeAtomRange
[
gridIndex1
];
// int2 atomData = pmeAtomGridIndex[i];
lastAtom
=
pmeAtomRange
[
gridIndex2
+
1
];
// int atomIndex = atomData.x;
for
(
int
i
=
firstAtom
;
i
<
lastAtom
;
++
i
)
{
// int z = atomData.y;
int2
atomData
=
pmeAtomGridIndex
[
i
];
// int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : GRID_SIZE_Z);
int
atomIndex
=
atomData
.
x
;
// if (iz >= GRID_SIZE_Z)
int
z
=
atomData
.
y
;
// iz -= GRID_SIZE_Z;
int
iz
=
gridPoint
.
z
-
z
+
(
gridPoint
.
z
>=
z
?
0
:
GRID_SIZE_Z
);
// real inducedDipoleX = xscale*cAmoebaSim.pInducedDipole[atomIndex*3];
if
(
iz
>=
GRID_SIZE_Z
)
// real inducedDipoleY = yscale*cAmoebaSim.pInducedDipole[atomIndex*3+1];
iz
-=
GRID_SIZE_Z
;
// real inducedDipoleZ = zscale*cAmoebaSim.pInducedDipole[atomIndex*3+2];
real
inducedDipoleX
=
xscale
*
inducedDipole
[
atomIndex
*
3
];
// real inducedDipolePolarX = xscale*cAmoebaSim.pInducedDipolePolar[atomIndex*3];
real
inducedDipoleY
=
yscale
*
inducedDipole
[
atomIndex
*
3
+
1
];
// real inducedDipolePolarY = yscale*cAmoebaSim.pInducedDipolePolar[atomIndex*3+1];
real
inducedDipoleZ
=
zscale
*
inducedDipole
[
atomIndex
*
3
+
2
];
// real inducedDipolePolarZ = zscale*cAmoebaSim.pInducedDipolePolar[atomIndex*3+2];
real
inducedDipolePolarX
=
xscale
*
inducedDipolePolar
[
atomIndex
*
3
];
// real4 t = theta1[atomIndex*PME_ORDER+ix];
real
inducedDipolePolarY
=
yscale
*
inducedDipolePolar
[
atomIndex
*
3
+
1
];
// real4 u = theta2[atomIndex*PME_ORDER+iy];
real
inducedDipolePolarZ
=
zscale
*
inducedDipolePolar
[
atomIndex
*
3
+
2
];
// real4 v = theta3[atomIndex*PME_ORDER+iz];
real4
t
=
theta1
[
atomIndex
*
PME_ORDER
+
ix
];
// real term01 = inducedDipoleY*u.y*v.x + inducedDipoleZ*u.x*v.y;
real4
u
=
theta2
[
atomIndex
*
PME_ORDER
+
iy
];
// real term11 = inducedDipoleX*u.x*v.x;
real4
v
=
theta3
[
atomIndex
*
PME_ORDER
+
iz
];
// real term02 = inducedDipolePolarY*u.y*v.x + inducedDipolePolarZ*u.x*v.y;
real
term01
=
inducedDipoleY
*
u
.
y
*
v
.
x
+
inducedDipoleZ
*
u
.
x
*
v
.
y
;
// real term12 = inducedDipolePolarX*u.x*v.x;
real
term11
=
inducedDipoleX
*
u
.
x
*
v
.
x
;
// result.x += term01*t.x + term11*t.y;
real
term02
=
inducedDipolePolarY
*
u
.
y
*
v
.
x
+
inducedDipolePolarZ
*
u
.
x
*
v
.
y
;
// result.y += term02*t.x + term12*t.y;
real
term12
=
inducedDipolePolarX
*
u
.
x
*
v
.
x
;
// }
result
.
x
+=
term01
*
t
.
x
+
term11
*
t
.
y
;
// }
result
.
y
+=
term02
*
t
.
x
+
term12
*
t
.
y
;
// }
}
// }
}
// pmeGrid[gridIndex] = result;
}
// }
}
//}
pmeGrid
[
gridIndex
]
=
result
;
//
}
}
extern
"C"
__global__
void
reciprocalConvolution
(
real2
*
__restrict__
pmeGrid
,
const
real
*
__restrict__
pmeBsplineModuliX
,
extern
"C"
__global__
void
reciprocalConvolution
(
real2
*
__restrict__
pmeGrid
,
const
real
*
__restrict__
pmeBsplineModuliX
,
const
real
*
__restrict__
pmeBsplineModuliY
,
const
real
*
__restrict__
pmeBsplineModuliZ
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
)
{
const
real
*
__restrict__
pmeBsplineModuliY
,
const
real
*
__restrict__
pmeBsplineModuliZ
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
)
{
const
unsigned
int
gridSize
=
GRID_SIZE_X
*
GRID_SIZE_Y
*
GRID_SIZE_Z
;
const
unsigned
int
gridSize
=
GRID_SIZE_X
*
GRID_SIZE_Y
*
GRID_SIZE_Z
;
...
@@ -362,7 +363,7 @@ extern "C" __global__ void reciprocalConvolution(real2* __restrict__ pmeGrid, co
...
@@ -362,7 +363,7 @@ extern "C" __global__ void reciprocalConvolution(real2* __restrict__ pmeGrid, co
}
}
extern
"C"
__global__
void
computeFixedPotentialFromGrid
(
const
real2
*
__restrict__
pmeGrid
,
real
*
__restrict__
phi
,
extern
"C"
__global__
void
computeFixedPotentialFromGrid
(
const
real2
*
__restrict__
pmeGrid
,
real
*
__restrict__
phi
,
long
long
*
__restrict__
fieldBuffers
,
const
int4
*
__restrict__
igrid
,
const
real4
*
__restrict__
theta1
,
long
long
*
__restrict__
fieldBuffers
,
long
long
*
__restrict__
fieldPolarBuffers
,
const
int4
*
__restrict__
igrid
,
const
real4
*
__restrict__
theta1
,
const
real4
*
__restrict__
theta2
,
const
real4
*
__restrict__
theta3
,
const
real
*
__restrict__
labFrameDipole
,
real4
invPeriodicBoxSize
)
{
const
real4
*
__restrict__
theta2
,
const
real4
*
__restrict__
theta3
,
const
real
*
__restrict__
labFrameDipole
,
real4
invPeriodicBoxSize
)
{
// extract the permanent multipole field at each site
// extract the permanent multipole field at each site
...
@@ -468,216 +469,224 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict
...
@@ -468,216 +469,224 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict
phi
[
20
*
m
+
18
]
=
tuv012
;
phi
[
20
*
m
+
18
]
=
tuv012
;
phi
[
20
*
m
+
19
]
=
tuv111
;
phi
[
20
*
m
+
19
]
=
tuv111
;
real
dipoleScale
=
(
4
/
(
real
)
3
)
*
(
EWALD_ALPHA
*
EWALD_ALPHA
*
EWALD_ALPHA
)
/
SQRT
(
M_PI
);
real
dipoleScale
=
(
4
/
(
real
)
3
)
*
(
EWALD_ALPHA
*
EWALD_ALPHA
*
EWALD_ALPHA
)
/
SQRT
(
M_PI
);
fieldBuffers
[
m
]
=
(
long
long
)
((
dipoleScale
*
labFrameDipole
[
m
*
3
]
-
GRID_SIZE_X
*
invPeriodicBoxSize
.
x
*
tuv100
)
*
0xFFFFFFFF
);
long
long
fieldx
=
(
long
long
)
((
dipoleScale
*
labFrameDipole
[
m
*
3
]
-
GRID_SIZE_X
*
invPeriodicBoxSize
.
x
*
tuv100
)
*
0xFFFFFFFF
);
fieldBuffers
[
m
+
PADDED_NUM_ATOMS
]
=
(
long
long
)
((
dipoleScale
*
labFrameDipole
[
m
*
3
+
1
]
-
GRID_SIZE_Y
*
invPeriodicBoxSize
.
y
*
tuv010
)
*
0xFFFFFFFF
);
fieldBuffers
[
m
]
=
fieldx
;
fieldBuffers
[
m
+
2
*
PADDED_NUM_ATOMS
]
=
(
long
long
)
((
dipoleScale
*
labFrameDipole
[
m
*
3
+
2
]
-
GRID_SIZE_Z
*
invPeriodicBoxSize
.
z
*
tuv001
)
*
0xFFFFFFFF
);
fieldPolarBuffers
[
m
]
=
fieldx
;
long
long
fieldy
=
(
long
long
)
((
dipoleScale
*
labFrameDipole
[
m
*
3
+
1
]
-
GRID_SIZE_Y
*
invPeriodicBoxSize
.
y
*
tuv010
)
*
0xFFFFFFFF
);
fieldBuffers
[
m
+
PADDED_NUM_ATOMS
]
=
fieldy
;
fieldPolarBuffers
[
m
+
PADDED_NUM_ATOMS
]
=
fieldy
;
long
long
fieldz
=
(
long
long
)
((
dipoleScale
*
labFrameDipole
[
m
*
3
+
2
]
-
GRID_SIZE_Z
*
invPeriodicBoxSize
.
z
*
tuv001
)
*
0xFFFFFFFF
);
fieldBuffers
[
m
+
2
*
PADDED_NUM_ATOMS
]
=
fieldz
;
fieldPolarBuffers
[
m
+
2
*
PADDED_NUM_ATOMS
]
=
fieldz
;
}
}
}
}
//extern "C" __global__ void kComputeInducedPotentialFromGrid_kernel() {
extern
"C"
__global__
void
computeInducedPotentialFromGrid
(
const
real2
*
__restrict__
pmeGrid
,
real
*
__restrict__
phid
,
// // extract the induced dipole field at each site
real
*
__restrict__
phip
,
real
*
__restrict__
phidp
,
const
int4
*
__restrict__
igrid
,
const
real4
*
__restrict__
theta1
,
//
const
real4
*
__restrict__
theta2
,
const
real4
*
__restrict__
theta3
,
real4
invPeriodicBoxSize
)
{
// for (int m = blockIdx.x*blockDim.x+threadIdx.x; m < NUM_ATOMS; m += blockDim.x*gridDim.x) {
// extract the induced dipole field at each site
// int4 gridPoint = igrid[m];
// real tuv100_1 = 0;
for
(
int
m
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
m
<
NUM_ATOMS
;
m
+=
blockDim
.
x
*
gridDim
.
x
)
{
// real tuv010_1 = 0;
int4
gridPoint
=
igrid
[
m
];
// real tuv001_1 = 0;
real
tuv100_1
=
0
;
// real tuv200_1 = 0;
real
tuv010_1
=
0
;
// real tuv020_1 = 0;
real
tuv001_1
=
0
;
// real tuv002_1 = 0;
real
tuv200_1
=
0
;
// real tuv110_1 = 0;
real
tuv020_1
=
0
;
// real tuv101_1 = 0;
real
tuv002_1
=
0
;
// real tuv011_1 = 0;
real
tuv110_1
=
0
;
// real tuv100_2 = 0;
real
tuv101_1
=
0
;
// real tuv010_2 = 0;
real
tuv011_1
=
0
;
// real tuv001_2 = 0;
real
tuv100_2
=
0
;
// real tuv200_2 = 0;
real
tuv010_2
=
0
;
// real tuv020_2 = 0;
real
tuv001_2
=
0
;
// real tuv002_2 = 0;
real
tuv200_2
=
0
;
// real tuv110_2 = 0;
real
tuv020_2
=
0
;
// real tuv101_2 = 0;
real
tuv002_2
=
0
;
// real tuv011_2 = 0;
real
tuv110_2
=
0
;
// real tuv000 = 0;
real
tuv101_2
=
0
;
// real tuv001 = 0;
real
tuv011_2
=
0
;
// real tuv010 = 0;
real
tuv000
=
0
;
// real tuv100 = 0;
real
tuv001
=
0
;
// real tuv200 = 0;
real
tuv010
=
0
;
// real tuv020 = 0;
real
tuv100
=
0
;
// real tuv002 = 0;
real
tuv200
=
0
;
// real tuv110 = 0;
real
tuv020
=
0
;
// real tuv101 = 0;
real
tuv002
=
0
;
// real tuv011 = 0;
real
tuv110
=
0
;
// real tuv300 = 0;
real
tuv101
=
0
;
// real tuv030 = 0;
real
tuv011
=
0
;
// real tuv003 = 0;
real
tuv300
=
0
;
// real tuv210 = 0;
real
tuv030
=
0
;
// real tuv201 = 0;
real
tuv003
=
0
;
// real tuv120 = 0;
real
tuv210
=
0
;
// real tuv021 = 0;
real
tuv201
=
0
;
// real tuv102 = 0;
real
tuv120
=
0
;
// real tuv012 = 0;
real
tuv021
=
0
;
// real tuv111 = 0;
real
tuv102
=
0
;
// for (int iz = 0; iz < PME_ORDER; iz++) {
real
tuv012
=
0
;
// int k = gridPoint.z+iz-(gridPoint.z+iz >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
real
tuv111
=
0
;
// real4 v = theta3[m*PME_ORDER+iz];
for
(
int
iz
=
0
;
iz
<
PME_ORDER
;
iz
++
)
{
// real tu00_1 = 0;
int
k
=
gridPoint
.
z
+
iz
-
(
gridPoint
.
z
+
iz
>=
GRID_SIZE_Z
?
GRID_SIZE_Z
:
0
);
// real tu01_1 = 0;
real4
v
=
theta3
[
m
*
PME_ORDER
+
iz
];
// real tu10_1 = 0;
real
tu00_1
=
0
;
// real tu20_1 = 0;
real
tu01_1
=
0
;
// real tu11_1 = 0;
real
tu10_1
=
0
;
// real tu02_1 = 0;
real
tu20_1
=
0
;
// real tu00_2 = 0;
real
tu11_1
=
0
;
// real tu01_2 = 0;
real
tu02_1
=
0
;
// real tu10_2 = 0;
real
tu00_2
=
0
;
// real tu20_2 = 0;
real
tu01_2
=
0
;
// real tu11_2 = 0;
real
tu10_2
=
0
;
// real tu02_2 = 0;
real
tu20_2
=
0
;
// real tu00 = 0;
real
tu11_2
=
0
;
// real tu10 = 0;
real
tu02_2
=
0
;
// real tu01 = 0;
real
tu00
=
0
;
// real tu20 = 0;
real
tu10
=
0
;
// real tu11 = 0;
real
tu01
=
0
;
// real tu02 = 0;
real
tu20
=
0
;
// real tu30 = 0;
real
tu11
=
0
;
// real tu21 = 0;
real
tu02
=
0
;
// real tu12 = 0;
real
tu30
=
0
;
// real tu03 = 0;
real
tu21
=
0
;
// for (int iy = 0; iy < PME_ORDER; iy++) {
real
tu12
=
0
;
// int j = gridPoint.y+iy-(gridPoint.y+iy >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
real
tu03
=
0
;
// real4 u = theta2[m*PME_ORDER+iy];
for
(
int
iy
=
0
;
iy
<
PME_ORDER
;
iy
++
)
{
// real t0_1 = 0;
int
j
=
gridPoint
.
y
+
iy
-
(
gridPoint
.
y
+
iy
>=
GRID_SIZE_Y
?
GRID_SIZE_Y
:
0
);
// real t1_1 = 0;
real4
u
=
theta2
[
m
*
PME_ORDER
+
iy
];
// real t2_1 = 0;
real
t0_1
=
0
;
// real t0_2 = 0;
real
t1_1
=
0
;
// real t1_2 = 0;
real
t2_1
=
0
;
// real t2_2 = 0;
real
t0_2
=
0
;
// real t3 = 0;
real
t1_2
=
0
;
// for (int ix = 0; ix < PME_ORDER; ix++) {
real
t2_2
=
0
;
// int i = gridPoint.x+ix-(gridPoint.x+ix >= GRID_SIZE_X ? GRID_SIZE_X : 0);
real
t3
=
0
;
// int gridIndex = i*GRID_SIZE_Y*GRID_SIZE_Z + j*GRID_SIZE_Z + k;
for
(
int
ix
=
0
;
ix
<
PME_ORDER
;
ix
++
)
{
// real2 tq = pmeGrid[gridIndex];
int
i
=
gridPoint
.
x
+
ix
-
(
gridPoint
.
x
+
ix
>=
GRID_SIZE_X
?
GRID_SIZE_X
:
0
);
// real4 tadd = theta1[m*PME_ORDER+ix];
int
gridIndex
=
i
*
GRID_SIZE_Y
*
GRID_SIZE_Z
+
j
*
GRID_SIZE_Z
+
k
;
// t0_1 += tq.x*tadd.x;
real2
tq
=
pmeGrid
[
gridIndex
];
// t1_1 += tq.x*tadd.y;
real4
tadd
=
theta1
[
m
*
PME_ORDER
+
ix
];
// t2_1 += tq.x*tadd.z;
t0_1
+=
tq
.
x
*
tadd
.
x
;
// t0_2 += tq.y*tadd.x;
t1_1
+=
tq
.
x
*
tadd
.
y
;
// t1_2 += tq.y*tadd.y;
t2_1
+=
tq
.
x
*
tadd
.
z
;
// t2_2 += tq.y*tadd.z;
t0_2
+=
tq
.
y
*
tadd
.
x
;
// t3 += (tq.x+tq.y)*tadd.w;
t1_2
+=
tq
.
y
*
tadd
.
y
;
// }
t2_2
+=
tq
.
y
*
tadd
.
z
;
// tu00_1 += t0_1*u.x;
t3
+=
(
tq
.
x
+
tq
.
y
)
*
tadd
.
w
;
// tu10_1 += t1_1*u.x;
}
// tu01_1 += t0_1*u.y;
tu00_1
+=
t0_1
*
u
.
x
;
// tu20_1 += t2_1*u.x;
tu10_1
+=
t1_1
*
u
.
x
;
// tu11_1 += t1_1*u.y;
tu01_1
+=
t0_1
*
u
.
y
;
// tu02_1 += t0_1*u.z;
tu20_1
+=
t2_1
*
u
.
x
;
// tu00_2 += t0_2*u.x;
tu11_1
+=
t1_1
*
u
.
y
;
// tu10_2 += t1_2*u.x;
tu02_1
+=
t0_1
*
u
.
z
;
// tu01_2 += t0_2*u.y;
tu00_2
+=
t0_2
*
u
.
x
;
// tu20_2 += t2_2*u.x;
tu10_2
+=
t1_2
*
u
.
x
;
// tu11_2 += t1_2*u.y;
tu01_2
+=
t0_2
*
u
.
y
;
// tu02_2 += t0_2*u.z;
tu20_2
+=
t2_2
*
u
.
x
;
// real t0 = t0_1 + t0_2;
tu11_2
+=
t1_2
*
u
.
y
;
// real t1 = t1_1 + t1_2;
tu02_2
+=
t0_2
*
u
.
z
;
// real t2 = t2_1 + t2_2;
real
t0
=
t0_1
+
t0_2
;
// tu00 += t0*u.x;
real
t1
=
t1_1
+
t1_2
;
// tu10 += t1*u.x;
real
t2
=
t2_1
+
t2_2
;
// tu01 += t0*u.y;
tu00
+=
t0
*
u
.
x
;
// tu20 += t2*u.x;
tu10
+=
t1
*
u
.
x
;
// tu11 += t1*u.y;
tu01
+=
t0
*
u
.
y
;
// tu02 += t0*u.z;
tu20
+=
t2
*
u
.
x
;
// tu30 += t3*u.x;
tu11
+=
t1
*
u
.
y
;
// tu21 += t2*u.y;
tu02
+=
t0
*
u
.
z
;
// tu12 += t1*u.z;
tu30
+=
t3
*
u
.
x
;
// tu03 += t0*u.w;
tu21
+=
t2
*
u
.
y
;
// }
tu12
+=
t1
*
u
.
z
;
// tuv100_1 += tu10_1*v.x;
tu03
+=
t0
*
u
.
w
;
// tuv010_1 += tu01_1*v.x;
}
// tuv001_1 += tu00_1*v.y;
tuv100_1
+=
tu10_1
*
v
.
x
;
// tuv200_1 += tu20_1*v.x;
tuv010_1
+=
tu01_1
*
v
.
x
;
// tuv020_1 += tu02_1*v.x;
tuv001_1
+=
tu00_1
*
v
.
y
;
// tuv002_1 += tu00_1*v.z;
tuv200_1
+=
tu20_1
*
v
.
x
;
// tuv110_1 += tu11_1*v.x;
tuv020_1
+=
tu02_1
*
v
.
x
;
// tuv101_1 += tu10_1*v.y;
tuv002_1
+=
tu00_1
*
v
.
z
;
// tuv011_1 += tu01_1*v.y;
tuv110_1
+=
tu11_1
*
v
.
x
;
// tuv100_2 += tu10_2*v.x;
tuv101_1
+=
tu10_1
*
v
.
y
;
// tuv010_2 += tu01_2*v.x;
tuv011_1
+=
tu01_1
*
v
.
y
;
// tuv001_2 += tu00_2*v.y;
tuv100_2
+=
tu10_2
*
v
.
x
;
// tuv200_2 += tu20_2*v.x;
tuv010_2
+=
tu01_2
*
v
.
x
;
// tuv020_2 += tu02_2*v.x;
tuv001_2
+=
tu00_2
*
v
.
y
;
// tuv002_2 += tu00_2*v.z;
tuv200_2
+=
tu20_2
*
v
.
x
;
// tuv110_2 += tu11_2*v.x;
tuv020_2
+=
tu02_2
*
v
.
x
;
// tuv101_2 += tu10_2*v.y;
tuv002_2
+=
tu00_2
*
v
.
z
;
// tuv011_2 += tu01_2*v.y;
tuv110_2
+=
tu11_2
*
v
.
x
;
// tuv000 += tu00*v.x;
tuv101_2
+=
tu10_2
*
v
.
y
;
// tuv100 += tu10*v.x;
tuv011_2
+=
tu01_2
*
v
.
y
;
// tuv010 += tu01*v.x;
tuv000
+=
tu00
*
v
.
x
;
// tuv001 += tu00*v.y;
tuv100
+=
tu10
*
v
.
x
;
// tuv200 += tu20*v.x;
tuv010
+=
tu01
*
v
.
x
;
// tuv020 += tu02*v.x;
tuv001
+=
tu00
*
v
.
y
;
// tuv002 += tu00*v.z;
tuv200
+=
tu20
*
v
.
x
;
// tuv110 += tu11*v.x;
tuv020
+=
tu02
*
v
.
x
;
// tuv101 += tu10*v.y;
tuv002
+=
tu00
*
v
.
z
;
// tuv011 += tu01*v.y;
tuv110
+=
tu11
*
v
.
x
;
// tuv300 += tu30*v.x;
tuv101
+=
tu10
*
v
.
y
;
// tuv030 += tu03*v.x;
tuv011
+=
tu01
*
v
.
y
;
// tuv003 += tu00*v.w;
tuv300
+=
tu30
*
v
.
x
;
// tuv210 += tu21*v.x;
tuv030
+=
tu03
*
v
.
x
;
// tuv201 += tu20*v.y;
tuv003
+=
tu00
*
v
.
w
;
// tuv120 += tu12*v.x;
tuv210
+=
tu21
*
v
.
x
;
// tuv021 += tu02*v.y;
tuv201
+=
tu20
*
v
.
y
;
// tuv102 += tu10*v.z;
tuv120
+=
tu12
*
v
.
x
;
// tuv012 += tu01*v.z;
tuv021
+=
tu02
*
v
.
y
;
// tuv111 += tu11*v.y;
tuv102
+=
tu10
*
v
.
z
;
// }
tuv012
+=
tu01
*
v
.
z
;
// phid[10*m] = 0;
tuv111
+=
tu11
*
v
.
y
;
// phid[10*m+1] = tuv100_1;
}
// phid[10*m+2] = tuv010_1;
phid
[
10
*
m
]
=
0
;
// phid[10*m+3] = tuv001_1;
phid
[
10
*
m
+
1
]
=
tuv100_1
;
// phid[10*m+4] = tuv200_1;
phid
[
10
*
m
+
2
]
=
tuv010_1
;
// phid[10*m+5] = tuv020_1;
phid
[
10
*
m
+
3
]
=
tuv001_1
;
// phid[10*m+6] = tuv002_1;
phid
[
10
*
m
+
4
]
=
tuv200_1
;
// phid[10*m+7] = tuv110_1;
phid
[
10
*
m
+
5
]
=
tuv020_1
;
// phid[10*m+8] = tuv101_1;
phid
[
10
*
m
+
6
]
=
tuv002_1
;
// phid[10*m+9] = tuv011_1;
phid
[
10
*
m
+
7
]
=
tuv110_1
;
//
phid
[
10
*
m
+
8
]
=
tuv101_1
;
// phip[10*m] = 0;
phid
[
10
*
m
+
9
]
=
tuv011_1
;
// phip[10*m+1] = tuv100_2;
// phip[10*m+2] = tuv010_2;
phip
[
10
*
m
]
=
0
;
// phip[10*m+3] = tuv001_2;
phip
[
10
*
m
+
1
]
=
tuv100_2
;
// phip[10*m+4] = tuv200_2;
phip
[
10
*
m
+
2
]
=
tuv010_2
;
// phip[10*m+5] = tuv020_2;
phip
[
10
*
m
+
3
]
=
tuv001_2
;
// phip[10*m+6] = tuv002_2;
phip
[
10
*
m
+
4
]
=
tuv200_2
;
// phip[10*m+7] = tuv110_2;
phip
[
10
*
m
+
5
]
=
tuv020_2
;
// phip[10*m+8] = tuv101_2;
phip
[
10
*
m
+
6
]
=
tuv002_2
;
// phip[10*m+9] = tuv011_2;
phip
[
10
*
m
+
7
]
=
tuv110_2
;
//
phip
[
10
*
m
+
8
]
=
tuv101_2
;
// phidp[20*m] = tuv000;
phip
[
10
*
m
+
9
]
=
tuv011_2
;
// phidp[20*m+1] = tuv100;
// phidp[20*m+2] = tuv010;
phidp
[
20
*
m
]
=
tuv000
;
// phidp[20*m+3] = tuv001;
phidp
[
20
*
m
+
1
]
=
tuv100
;
// phidp[20*m+4] = tuv200;
phidp
[
20
*
m
+
2
]
=
tuv010
;
// phidp[20*m+5] = tuv020;
phidp
[
20
*
m
+
3
]
=
tuv001
;
// phidp[20*m+6] = tuv002;
phidp
[
20
*
m
+
4
]
=
tuv200
;
// phidp[20*m+7] = tuv110;
phidp
[
20
*
m
+
5
]
=
tuv020
;
// phidp[20*m+8] = tuv101;
phidp
[
20
*
m
+
6
]
=
tuv002
;
// phidp[20*m+9] = tuv011;
phidp
[
20
*
m
+
7
]
=
tuv110
;
// phidp[20*m+10] = tuv300;
phidp
[
20
*
m
+
8
]
=
tuv101
;
// phidp[20*m+11] = tuv030;
phidp
[
20
*
m
+
9
]
=
tuv011
;
// phidp[20*m+12] = tuv003;
phidp
[
20
*
m
+
10
]
=
tuv300
;
// phidp[20*m+13] = tuv210;
phidp
[
20
*
m
+
11
]
=
tuv030
;
// phidp[20*m+14] = tuv201;
phidp
[
20
*
m
+
12
]
=
tuv003
;
// phidp[20*m+15] = tuv120;
phidp
[
20
*
m
+
13
]
=
tuv210
;
// phidp[20*m+16] = tuv021;
phidp
[
20
*
m
+
14
]
=
tuv201
;
// phidp[20*m+17] = tuv102;
phidp
[
20
*
m
+
15
]
=
tuv120
;
// phidp[20*m+18] = tuv012;
phidp
[
20
*
m
+
16
]
=
tuv021
;
// phidp[20*m+19] = tuv111;
phidp
[
20
*
m
+
17
]
=
tuv102
;
// }
phidp
[
20
*
m
+
18
]
=
tuv012
;
//}
phidp
[
20
*
m
+
19
]
=
tuv111
;
}
}
extern
"C"
__global__
void
computeFixedMultipoleForceAndEnergy
(
real4
*
__restrict__
posq
,
unsigned
long
long
*
__restrict__
forceBuffers
,
extern
"C"
__global__
void
computeFixedMultipoleForceAndEnergy
(
real4
*
__restrict__
posq
,
unsigned
long
long
*
__restrict__
forceBuffers
,
long
long
*
__restrict__
torqueBuffers
,
real
*
__restrict__
energyBuffer
,
const
real
*
__restrict__
labFrameDipole
,
long
long
*
__restrict__
torqueBuffers
,
real
*
__restrict__
energyBuffer
,
const
real
*
__restrict__
labFrameDipole
,
const
real
*
__restrict__
labFrameQuadrupole
,
const
real
*
__restrict__
phi
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
)
{
const
real
*
__restrict__
labFrameQuadrupole
,
const
real
*
__restrict__
phi
_global
,
real4
invPeriodicBoxSize
)
{
real
multipole
[
10
];
real
multipole
[
10
];
const
int
deriv1
[]
=
{
1
,
4
,
7
,
8
,
10
,
15
,
17
,
13
,
14
,
19
};
const
int
deriv1
[]
=
{
1
,
4
,
7
,
8
,
10
,
15
,
17
,
13
,
14
,
19
};
const
int
deriv2
[]
=
{
2
,
7
,
5
,
9
,
13
,
11
,
18
,
15
,
19
,
16
};
const
int
deriv2
[]
=
{
2
,
7
,
5
,
9
,
13
,
11
,
18
,
15
,
19
,
16
};
...
@@ -700,22 +709,22 @@ extern "C" __global__ void computeFixedMultipoleForceAndEnergy(real4* __restrict
...
@@ -700,22 +709,22 @@ extern "C" __global__ void computeFixedMultipoleForceAndEnergy(real4* __restrict
multipole
[
8
]
=
2
*
labFrameQuadrupole
[
i
*
5
+
2
];
multipole
[
8
]
=
2
*
labFrameQuadrupole
[
i
*
5
+
2
];
multipole
[
9
]
=
2
*
labFrameQuadrupole
[
i
*
5
+
4
];
multipole
[
9
]
=
2
*
labFrameQuadrupole
[
i
*
5
+
4
];
const
real
*
atomP
hi
=
&
phi
[
20
*
i
];
const
real
*
p
hi
=
&
phi
_global
[
20
*
i
];
torqueBuffers
[
i
]
=
(
long
long
)
(
EPSILON_FACTOR
*
(
multipole
[
3
]
*
yscale
*
atomP
hi
[
2
]
-
multipole
[
2
]
*
zscale
*
atomP
hi
[
3
]
torqueBuffers
[
i
]
=
(
long
long
)
(
EPSILON_FACTOR
*
(
multipole
[
3
]
*
yscale
*
p
hi
[
2
]
-
multipole
[
2
]
*
zscale
*
p
hi
[
3
]
+
2
*
(
multipole
[
6
]
-
multipole
[
5
])
*
yscale
*
zscale
*
atomP
hi
[
9
]
+
2
*
(
multipole
[
6
]
-
multipole
[
5
])
*
yscale
*
zscale
*
p
hi
[
9
]
+
multipole
[
8
]
*
xscale
*
yscale
*
atomP
hi
[
7
]
+
multipole
[
9
]
*
yscale
*
yscale
*
atomP
hi
[
5
]
+
multipole
[
8
]
*
xscale
*
yscale
*
p
hi
[
7
]
+
multipole
[
9
]
*
yscale
*
yscale
*
p
hi
[
5
]
-
multipole
[
7
]
*
xscale
*
zscale
*
atomP
hi
[
8
]
-
multipole
[
9
]
*
zscale
*
zscale
*
atomP
hi
[
6
])
*
0xFFFFFFFF
);
-
multipole
[
7
]
*
xscale
*
zscale
*
p
hi
[
8
]
-
multipole
[
9
]
*
zscale
*
zscale
*
p
hi
[
6
])
*
0xFFFFFFFF
);
torqueBuffers
[
i
+
PADDED_NUM_ATOMS
]
=
(
long
long
)
(
EPSILON_FACTOR
*
(
multipole
[
1
]
*
zscale
*
atomP
hi
[
3
]
-
multipole
[
3
]
*
xscale
*
atomP
hi
[
1
]
torqueBuffers
[
i
+
PADDED_NUM_ATOMS
]
=
(
long
long
)
(
EPSILON_FACTOR
*
(
multipole
[
1
]
*
zscale
*
p
hi
[
3
]
-
multipole
[
3
]
*
xscale
*
p
hi
[
1
]
+
2
*
(
multipole
[
4
]
-
multipole
[
6
])
*
xscale
*
zscale
*
atomP
hi
[
8
]
+
2
*
(
multipole
[
4
]
-
multipole
[
6
])
*
xscale
*
zscale
*
p
hi
[
8
]
+
multipole
[
7
]
*
yscale
*
zscale
*
atomP
hi
[
9
]
+
multipole
[
8
]
*
zscale
*
zscale
*
atomP
hi
[
6
]
+
multipole
[
7
]
*
yscale
*
zscale
*
p
hi
[
9
]
+
multipole
[
8
]
*
zscale
*
zscale
*
p
hi
[
6
]
-
multipole
[
8
]
*
xscale
*
xscale
*
atomP
hi
[
4
]
-
multipole
[
9
]
*
xscale
*
yscale
*
atomP
hi
[
7
])
*
0xFFFFFFFF
);
-
multipole
[
8
]
*
xscale
*
xscale
*
p
hi
[
4
]
-
multipole
[
9
]
*
xscale
*
yscale
*
p
hi
[
7
])
*
0xFFFFFFFF
);
torqueBuffers
[
i
+
PADDED_NUM_ATOMS
*
2
]
=
(
long
long
)
(
EPSILON_FACTOR
*
(
multipole
[
2
]
*
xscale
*
atomP
hi
[
1
]
-
multipole
[
1
]
*
yscale
*
atomP
hi
[
2
]
torqueBuffers
[
i
+
PADDED_NUM_ATOMS
*
2
]
=
(
long
long
)
(
EPSILON_FACTOR
*
(
multipole
[
2
]
*
xscale
*
p
hi
[
1
]
-
multipole
[
1
]
*
yscale
*
p
hi
[
2
]
+
2
*
(
multipole
[
5
]
-
multipole
[
4
])
*
xscale
*
yscale
*
atomP
hi
[
7
]
+
2
*
(
multipole
[
5
]
-
multipole
[
4
])
*
xscale
*
yscale
*
p
hi
[
7
]
+
multipole
[
7
]
*
xscale
*
xscale
*
atomP
hi
[
4
]
+
multipole
[
9
]
*
xscale
*
zscale
*
atomP
hi
[
8
]
+
multipole
[
7
]
*
xscale
*
xscale
*
p
hi
[
4
]
+
multipole
[
9
]
*
xscale
*
zscale
*
p
hi
[
8
]
-
multipole
[
7
]
*
yscale
*
yscale
*
atomP
hi
[
5
]
-
multipole
[
8
]
*
yscale
*
zscale
*
atomP
hi
[
9
])
*
0xFFFFFFFF
);
-
multipole
[
7
]
*
yscale
*
yscale
*
p
hi
[
5
]
-
multipole
[
8
]
*
yscale
*
zscale
*
p
hi
[
9
])
*
0xFFFFFFFF
);
// Compute the force and energy.
// Compute the force and energy.
...
@@ -731,10 +740,10 @@ extern "C" __global__ void computeFixedMultipoleForceAndEnergy(real4* __restrict
...
@@ -731,10 +740,10 @@ extern "C" __global__ void computeFixedMultipoleForceAndEnergy(real4* __restrict
real4
f
=
make_real4
(
0
,
0
,
0
,
0
);
real4
f
=
make_real4
(
0
,
0
,
0
,
0
);
for
(
int
k
=
0
;
k
<
10
;
k
++
)
{
for
(
int
k
=
0
;
k
<
10
;
k
++
)
{
energy
+=
multipole
[
k
]
*
atomP
hi
[
k
];
energy
+=
multipole
[
k
]
*
p
hi
[
k
];
f
.
x
+=
multipole
[
k
]
*
atomP
hi
[
deriv1
[
k
]];
f
.
x
+=
multipole
[
k
]
*
p
hi
[
deriv1
[
k
]];
f
.
y
+=
multipole
[
k
]
*
atomP
hi
[
deriv2
[
k
]];
f
.
y
+=
multipole
[
k
]
*
p
hi
[
deriv2
[
k
]];
f
.
z
+=
multipole
[
k
]
*
atomP
hi
[
deriv3
[
k
]];
f
.
z
+=
multipole
[
k
]
*
p
hi
[
deriv3
[
k
]];
}
}
f
.
x
*=
EPSILON_FACTOR
*
xscale
;
f
.
x
*=
EPSILON_FACTOR
*
xscale
;
f
.
y
*=
EPSILON_FACTOR
*
yscale
;
f
.
y
*=
EPSILON_FACTOR
*
yscale
;
...
@@ -746,141 +755,125 @@ extern "C" __global__ void computeFixedMultipoleForceAndEnergy(real4* __restrict
...
@@ -746,141 +755,125 @@ extern "C" __global__ void computeFixedMultipoleForceAndEnergy(real4* __restrict
energyBuffer
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
0.5
f
*
EPSILON_FACTOR
*
energy
;
energyBuffer
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
0.5
f
*
EPSILON_FACTOR
*
energy
;
}
}
//extern "C" __global__ void kComputeInducedDipoleForceAndEnergy_kernel() {
extern
"C"
__global__
void
computeInducedDipoleForceAndEnergy
(
real4
*
__restrict__
posq
,
unsigned
long
long
*
__restrict__
forceBuffers
,
// real multipole[10];
long
long
*
__restrict__
torqueBuffers
,
real
*
__restrict__
energyBuffer
,
const
real
*
__restrict__
labFrameDipole
,
// real inducedDipole[3];
const
real
*
__restrict__
labFrameQuadrupole
,
const
real
*
__restrict__
inducedDipole_global
,
const
real
*
__restrict__
inducedDipolePolar_global
,
// real inducedDipolePolar[3];
const
real
*
__restrict__
phi_global
,
const
real
*
__restrict__
phid_global
,
const
real
*
__restrict__
phip_global
,
// real scales[3];
const
real
*
__restrict__
phidp_global
,
real4
invPeriodicBoxSize
)
{
// const int deriv1[] = {1, 4, 7, 8, 10, 15, 17, 13, 14, 19};
real
multipole
[
10
];
// const int deriv2[] = {2, 7, 5, 9, 13, 11, 18, 15, 19, 16};
real
inducedDipole
[
3
];
// const int deriv3[] = {3, 8, 9, 6, 14, 16, 12, 19, 17, 18};
real
inducedDipolePolar
[
3
];
// const real xscale = GRID_SIZE_X*invPeriodicBoxSize.x;
real
scales
[
3
];
// const real yscale = GRID_SIZE_Y*invPeriodicBoxSize.y;
const
int
deriv1
[]
=
{
1
,
4
,
7
,
8
,
10
,
15
,
17
,
13
,
14
,
19
};
// const real zscale = GRID_SIZE_Z*invPeriodicBoxSize.z;
const
int
deriv2
[]
=
{
2
,
7
,
5
,
9
,
13
,
11
,
18
,
15
,
19
,
16
};
// scales[0] = xscale;
const
int
deriv3
[]
=
{
3
,
8
,
9
,
6
,
14
,
16
,
12
,
19
,
17
,
18
};
// scales[1] = yscale;
const
real
xscale
=
GRID_SIZE_X
*
invPeriodicBoxSize
.
x
;
// scales[2] = zscale;
const
real
yscale
=
GRID_SIZE_Y
*
invPeriodicBoxSize
.
y
;
// real energy = 0;
const
real
zscale
=
GRID_SIZE_Z
*
invPeriodicBoxSize
.
z
;
// for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
scales
[
0
]
=
xscale
;
// // Compute the torque.
scales
[
1
]
=
yscale
;
//
scales
[
2
]
=
zscale
;
// multipole[0] = posq[i].w;
real
energy
=
0
;
// multipole[1] = labFrameDipole[i*3];
for
(
int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
i
<
NUM_ATOMS
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
// multipole[2] = labFrameDipole[i*3+1];
// Compute the torque.
// multipole[3] = labFrameDipole[i*3+2];
// multipole[4] = labFrameQuadrupole[i*5];
multipole
[
0
]
=
posq
[
i
].
w
;
// multipole[5] = labFrameQuadrupole[i*5+3];
multipole
[
1
]
=
labFrameDipole
[
i
*
3
];
// multipole[6] = -(multipole[4]+multipole[5]);
multipole
[
2
]
=
labFrameDipole
[
i
*
3
+
1
];
// multipole[7] = 2*labFrameQuadrupole[i*5+1];
multipole
[
3
]
=
labFrameDipole
[
i
*
3
+
2
];
// multipole[8] = 2*labFrameQuadrupole[i*5+2];
multipole
[
4
]
=
labFrameQuadrupole
[
i
*
5
];
// multipole[9] = 2*labFrameQuadrupole[i*5+4];
multipole
[
5
]
=
labFrameQuadrupole
[
i
*
5
+
3
];
// real* phidp = &cAmoebaSim.pPhidp[20*i];
multipole
[
6
]
=
-
(
multipole
[
4
]
+
multipole
[
5
]);
//
multipole
[
7
]
=
2
*
labFrameQuadrupole
[
i
*
5
+
1
];
// cAmoebaSim.pTorque[3*i] += 0.5f*EPSILON_FACTOR*(multipole[3]*yscale*phidp[2] - multipole[2]*zscale*phidp[3]
multipole
[
8
]
=
2
*
labFrameQuadrupole
[
i
*
5
+
2
];
// + 2*(multipole[6]-multipole[5])*yscale*zscale*phidp[9]
multipole
[
9
]
=
2
*
labFrameQuadrupole
[
i
*
5
+
4
];
// + multipole[8]*xscale*yscale*phidp[7] + multipole[9]*yscale*yscale*phidp[5]
const
real
*
phidp
=
&
phidp_global
[
20
*
i
];
// - multipole[7]*xscale*zscale*phidp[8] - multipole[9]*zscale*zscale*phidp[6]);
//
torqueBuffers
[
i
]
+=
(
long
long
)
(
0.5
f
*
EPSILON_FACTOR
*
(
multipole
[
3
]
*
yscale
*
phidp
[
2
]
-
multipole
[
2
]
*
zscale
*
phidp
[
3
]
// cAmoebaSim.pTorque[3*i+1] += 0.5f*EPSILON_FACTOR*(multipole[1]*zscale*phidp[3] - multipole[3]*xscale*phidp[1]
+
2
*
(
multipole
[
6
]
-
multipole
[
5
])
*
yscale
*
zscale
*
phidp
[
9
]
// + 2*(multipole[4]-multipole[6])*xscale*zscale*phidp[8]
+
multipole
[
8
]
*
xscale
*
yscale
*
phidp
[
7
]
+
multipole
[
9
]
*
yscale
*
yscale
*
phidp
[
5
]
// + multipole[7]*yscale*zscale*phidp[9] + multipole[8]*zscale*zscale*phidp[6]
-
multipole
[
7
]
*
xscale
*
zscale
*
phidp
[
8
]
-
multipole
[
9
]
*
zscale
*
zscale
*
phidp
[
6
])
*
0xFFFFFFFF
);
// - multipole[8]*xscale*xscale*phidp[4] - multipole[9]*xscale*yscale*phidp[7]);
//
torqueBuffers
[
i
+
PADDED_NUM_ATOMS
]
+=
(
long
long
)
(
0.5
f
*
EPSILON_FACTOR
*
(
multipole
[
1
]
*
zscale
*
phidp
[
3
]
-
multipole
[
3
]
*
xscale
*
phidp
[
1
]
// cAmoebaSim.pTorque[3*i+2] += 0.5f*EPSILON_FACTOR*(multipole[2]*xscale*phidp[1] - multipole[1]*yscale*phidp[2]
+
2
*
(
multipole
[
4
]
-
multipole
[
6
])
*
xscale
*
zscale
*
phidp
[
8
]
// + 2*(multipole[5]-multipole[4])*xscale*yscale*phidp[7]
+
multipole
[
7
]
*
yscale
*
zscale
*
phidp
[
9
]
+
multipole
[
8
]
*
zscale
*
zscale
*
phidp
[
6
]
// + multipole[7]*xscale*xscale*phidp[4] + multipole[9]*xscale*zscale*phidp[8]
-
multipole
[
8
]
*
xscale
*
xscale
*
phidp
[
4
]
-
multipole
[
9
]
*
xscale
*
yscale
*
phidp
[
7
])
*
0xFFFFFFFF
);
// - multipole[7]*yscale*yscale*phidp[5] - multipole[8]*yscale*zscale*phidp[9]);
//
torqueBuffers
[
i
+
PADDED_NUM_ATOMS
*
2
]
+=
(
long
long
)
(
0.5
f
*
EPSILON_FACTOR
*
(
multipole
[
2
]
*
xscale
*
phidp
[
1
]
-
multipole
[
1
]
*
yscale
*
phidp
[
2
]
// // Compute the force and energy.
+
2
*
(
multipole
[
5
]
-
multipole
[
4
])
*
xscale
*
yscale
*
phidp
[
7
]
//
+
multipole
[
7
]
*
xscale
*
xscale
*
phidp
[
4
]
+
multipole
[
9
]
*
xscale
*
zscale
*
phidp
[
8
]
// multipole[1] *= xscale;
-
multipole
[
7
]
*
yscale
*
yscale
*
phidp
[
5
]
-
multipole
[
8
]
*
yscale
*
zscale
*
phidp
[
9
])
*
0xFFFFFFFF
);
// multipole[2] *= yscale;
// multipole[3] *= zscale;
// Compute the force and energy.
// multipole[4] *= xscale*xscale;
// multipole[5] *= yscale*yscale;
multipole
[
1
]
*=
xscale
;
// multipole[6] *= zscale*zscale;
multipole
[
2
]
*=
yscale
;
// multipole[7] *= xscale*yscale;
multipole
[
3
]
*=
zscale
;
// multipole[8] *= xscale*zscale;
multipole
[
4
]
*=
xscale
*
xscale
;
// multipole[9] *= yscale*zscale;
multipole
[
5
]
*=
yscale
*
yscale
;
//
multipole
[
6
]
*=
zscale
*
zscale
;
// inducedDipole[0] = cAmoebaSim.pInducedDipole[i*3];
multipole
[
7
]
*=
xscale
*
yscale
;
// inducedDipole[1] = cAmoebaSim.pInducedDipole[i*3+1];
multipole
[
8
]
*=
xscale
*
zscale
;
// inducedDipole[2] = cAmoebaSim.pInducedDipole[i*3+2];
multipole
[
9
]
*=
yscale
*
zscale
;
// inducedDipolePolar[0] = cAmoebaSim.pInducedDipolePolar[i*3];
// inducedDipolePolar[1] = cAmoebaSim.pInducedDipolePolar[i*3+1];
inducedDipole
[
0
]
=
inducedDipole_global
[
i
*
3
];
// inducedDipolePolar[2] = cAmoebaSim.pInducedDipolePolar[i*3+2];
inducedDipole
[
1
]
=
inducedDipole_global
[
i
*
3
+
1
];
// real* phi = &cAmoebaSim.pPhi[20*i];
inducedDipole
[
2
]
=
inducedDipole_global
[
i
*
3
+
2
];
// real* phip = &cAmoebaSim.pPhip[10*i];
inducedDipolePolar
[
0
]
=
inducedDipolePolar_global
[
i
*
3
];
// real* phid = &cAmoebaSim.pPhid[10*i];
inducedDipolePolar
[
1
]
=
inducedDipolePolar_global
[
i
*
3
+
1
];
// real4 f = make_real4(0, 0, 0, 0);
inducedDipolePolar
[
2
]
=
inducedDipolePolar_global
[
i
*
3
+
2
];
//
const
real
*
phi
=
&
phi_global
[
20
*
i
];
// energy += GRID_SIZE_X*invPeriodicBoxSize.x*inducedDipole[0]*phi[1];
const
real
*
phip
=
&
phip_global
[
10
*
i
];
// energy += GRID_SIZE_Y*invPeriodicBoxSize.y*inducedDipole[1]*phi[2];
const
real
*
phid
=
&
phid_global
[
10
*
i
];
// energy += GRID_SIZE_Z*invPeriodicBoxSize.z*inducedDipole[2]*phi[3];
real4
f
=
make_real4
(
0
,
0
,
0
,
0
);
//
// for (int k = 0; k < 3; k++) {
energy
+=
GRID_SIZE_X
*
invPeriodicBoxSize
.
x
*
inducedDipole
[
0
]
*
phi
[
1
];
//
energy
+=
GRID_SIZE_Y
*
invPeriodicBoxSize
.
y
*
inducedDipole
[
1
]
*
phi
[
2
];
// int j1 = deriv1[k+1];
energy
+=
GRID_SIZE_Z
*
invPeriodicBoxSize
.
z
*
inducedDipole
[
2
]
*
phi
[
3
];
// int j2 = deriv2[k+1];
// int j3 = deriv3[k+1];
for
(
int
k
=
0
;
k
<
3
;
k
++
)
{
//
int
j1
=
deriv1
[
k
+
1
];
// f.x += (inducedDipole[k]+inducedDipolePolar[k])*phi[j1]*(scales[k]/xscale);
int
j2
=
deriv2
[
k
+
1
];
// f.y += (inducedDipole[k]+inducedDipolePolar[k])*phi[j2]*(scales[k]/yscale);
int
j3
=
deriv3
[
k
+
1
];
// f.z += (inducedDipole[k]+inducedDipolePolar[k])*phi[j3]*(scales[k]/zscale);
f
.
x
+=
(
inducedDipole
[
k
]
+
inducedDipolePolar
[
k
])
*
phi
[
j1
]
*
(
scales
[
k
]
/
xscale
);
//
f
.
y
+=
(
inducedDipole
[
k
]
+
inducedDipolePolar
[
k
])
*
phi
[
j2
]
*
(
scales
[
k
]
/
yscale
);
// if( cAmoebaSim.polarizationType == 0 )
f
.
z
+=
(
inducedDipole
[
k
]
+
inducedDipolePolar
[
k
])
*
phi
[
j3
]
*
(
scales
[
k
]
/
zscale
);
// {
#ifndef DIRECT_POLARIZATION
// f.x += (inducedDipole[k]*phip[j1] + inducedDipolePolar[k]*phid[j1])*(scales[k]/xscale);
f
.
x
+=
(
inducedDipole
[
k
]
*
phip
[
j1
]
+
inducedDipolePolar
[
k
]
*
phid
[
j1
])
*
(
scales
[
k
]
/
xscale
);
// f.y += (inducedDipole[k]*phip[j2] + inducedDipolePolar[k]*phid[j2])*(scales[k]/yscale);
f
.
y
+=
(
inducedDipole
[
k
]
*
phip
[
j2
]
+
inducedDipolePolar
[
k
]
*
phid
[
j2
])
*
(
scales
[
k
]
/
yscale
);
// f.z += (inducedDipole[k]*phip[j3] + inducedDipolePolar[k]*phid[j3])*(scales[k]/zscale);
f
.
z
+=
(
inducedDipole
[
k
]
*
phip
[
j3
]
+
inducedDipolePolar
[
k
]
*
phid
[
j3
])
*
(
scales
[
k
]
/
zscale
);
// }
#endif
//
}
//
// }
f
.
x
*=
GRID_SIZE_X
*
invPeriodicBoxSize
.
x
;
//
f
.
y
*=
GRID_SIZE_Y
*
invPeriodicBoxSize
.
y
;
// f.x *= GRID_SIZE_X*invPeriodicBoxSize.x;
f
.
z
*=
GRID_SIZE_Z
*
invPeriodicBoxSize
.
z
;
// f.y *= GRID_SIZE_Y*invPeriodicBoxSize.y;
for
(
int
k
=
0
;
k
<
10
;
k
++
)
{
// f.z *= GRID_SIZE_Z*invPeriodicBoxSize.z;
f
.
x
+=
multipole
[
k
]
*
phidp
[
deriv1
[
k
]];
// for (int k = 0; k < 10; k++) {
f
.
y
+=
multipole
[
k
]
*
phidp
[
deriv2
[
k
]];
// f.x += multipole[k]*phidp[deriv1[k]];
f
.
z
+=
multipole
[
k
]
*
phidp
[
deriv3
[
k
]];
// f.y += multipole[k]*phidp[deriv2[k]];
}
// f.z += multipole[k]*phidp[deriv3[k]];
f
.
x
*=
0.5
f
*
EPSILON_FACTOR
*
xscale
;
// }
f
.
y
*=
0.5
f
*
EPSILON_FACTOR
*
yscale
;
//
f
.
z
*=
0.5
f
*
EPSILON_FACTOR
*
zscale
;
// f.x *= 0.5f*EPSILON_FACTOR*xscale;
forceBuffers
[
i
]
-=
static_cast
<
unsigned
long
long
>
((
long
long
)
(
f
.
x
*
0xFFFFFFFF
));
// f.y *= 0.5f*EPSILON_FACTOR*yscale;
forceBuffers
[
i
+
PADDED_NUM_ATOMS
]
-=
static_cast
<
unsigned
long
long
>
((
long
long
)
(
f
.
y
*
0xFFFFFFFF
));
// f.z *= 0.5f*EPSILON_FACTOR*zscale;
forceBuffers
[
i
+
PADDED_NUM_ATOMS
*
2
]
-=
static_cast
<
unsigned
long
long
>
((
long
long
)
(
f
.
z
*
0xFFFFFFFF
));
//
}
// real4 force = cSim.pForce4[i];
energyBuffer
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
0.5
f
*
EPSILON_FACTOR
*
energy
;
// force.x -= f.x;
}
// force.y -= f.y;
// force.z -= f.z;
extern
"C"
__global__
void
recordInducedFieldDipoles
(
const
real
*
__restrict__
phid
,
real
*
const
__restrict__
phip
,
// cSim.pForce4[i] = force;
real
*
__restrict__
inducedDipole
,
real
*
__restrict__
inducedDipolePolar
,
real4
invPeriodicBoxSize
)
{
// }
real
xscale
=
GRID_SIZE_X
*
invPeriodicBoxSize
.
x
;
// cSim.pEnergy[blockIdx.x*blockDim.x+threadIdx.x] += 0.5f*EPSILON_FACTOR*energy;
real
yscale
=
GRID_SIZE_Y
*
invPeriodicBoxSize
.
y
;
//}
real
zscale
=
GRID_SIZE_Z
*
invPeriodicBoxSize
.
z
;
//
for
(
int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
i
<
NUM_ATOMS
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
//extern "C" __global__ void kRecordFixedMultipoleField_kernel(real* output) {
inducedDipole
[
3
*
i
]
-=
xscale
*
phid
[
10
*
i
+
1
];
// const real xscale = GRID_SIZE_X*invPeriodicBoxSize.x;
inducedDipole
[
3
*
i
+
1
]
-=
yscale
*
phid
[
10
*
i
+
2
];
// const real yscale = GRID_SIZE_Y*invPeriodicBoxSize.y;
inducedDipole
[
3
*
i
+
2
]
-=
zscale
*
phid
[
10
*
i
+
3
];
// const real zscale = GRID_SIZE_Z*invPeriodicBoxSize.z;
inducedDipolePolar
[
3
*
i
]
-=
xscale
*
phip
[
10
*
i
+
1
];
// for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
inducedDipolePolar
[
3
*
i
+
1
]
-=
yscale
*
phip
[
10
*
i
+
2
];
// output[3*i] = -xscale*cAmoebaSim.pPhi[20*i+1];
inducedDipolePolar
[
3
*
i
+
2
]
-=
zscale
*
phip
[
10
*
i
+
3
];
// output[3*i+1] = -yscale*cAmoebaSim.pPhi[20*i+2];
}
// output[3*i+2] = -zscale*cAmoebaSim.pPhi[20*i+3];
}
// }
//}
//
//extern "C" __global__ void kRecordInducedDipoleField_kernel(real* output, real* outputPolar) {
// const real xscale = GRID_SIZE_X*invPeriodicBoxSize.x;
// const real yscale = GRID_SIZE_Y*invPeriodicBoxSize.y;
// const real zscale = GRID_SIZE_Z*invPeriodicBoxSize.z;
// for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
// output[3*i] -= xscale*cAmoebaSim.pPhid[10*i+1];
// output[3*i+1] -= yscale*cAmoebaSim.pPhid[10*i+2];
// output[3*i+2] -= zscale*cAmoebaSim.pPhid[10*i+3];
// outputPolar[3*i] -= xscale*cAmoebaSim.pPhip[10*i+1];
// outputPolar[3*i+1] -= yscale*cAmoebaSim.pPhip[10*i+2];
// outputPolar[3*i+2] -= zscale*cAmoebaSim.pPhip[10*i+3];
// }
//}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment