Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
f352d116
Commit
f352d116
authored
Aug 18, 2012
by
Peter Eastman
Browse files
Continuing to convert AmoebaMultipoleForce: PME with direct polarization now works
parent
7a60fd73
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
486 additions
and
501 deletions
+486
-501
plugins/amoeba/platforms/cuda2/src/AmoebaCudaKernels.cpp
plugins/amoeba/platforms/cuda2/src/AmoebaCudaKernels.cpp
+47
-56
plugins/amoeba/platforms/cuda2/src/AmoebaCudaKernels.h
plugins/amoeba/platforms/cuda2/src/AmoebaCudaKernels.h
+2
-1
plugins/amoeba/platforms/cuda2/src/kernels/multipolePme.cu
plugins/amoeba/platforms/cuda2/src/kernels/multipolePme.cu
+437
-444
No files found.
plugins/amoeba/platforms/cuda2/src/AmoebaCudaKernels.cpp
View file @
f352d116
...
...
@@ -1166,11 +1166,13 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
pmeUpdateBsplinesKernel
=
cu
.
getKernel
(
module
,
"updateBsplines"
);
pmeAtomRangeKernel
=
cu
.
getKernel
(
module
,
"findAtomRangeForGrid"
);
pmeSpreadFixedMultipolesKernel
=
cu
.
getKernel
(
module
,
"gridSpreadFixedMultipoles"
);
pmeSpreadInducedDipolesKernel
=
cu
.
getKernel
(
module
,
"gridSpreadInducedDipoles"
);
pmeConvolutionKernel
=
cu
.
getKernel
(
module
,
"reciprocalConvolution"
);
pmeFixedPotentialKernel
=
cu
.
getKernel
(
module
,
"computeFixedPotentialFromGrid"
);
pmeInducedPotentialKernel
=
cu
.
getKernel
(
module
,
"computeInducedPotentialFromGrid"
);
pmeFixedForceKernel
=
cu
.
getKernel
(
module
,
"computeFixedMultipoleForceAndEnergy"
);
//
pmeIn
terpolate
ForceKernel = cu.getKernel(module, "
gridInter
pol
at
eForce");
//
pme
FinishSpreadCharge
Kernel = cu.getKernel(module, "
finishSpreadCharge
");
pmeIn
duced
ForceKernel
=
cu
.
getKernel
(
module
,
"
computeInducedDi
poleForce
AndEnergy
"
);
pme
RecordInducedFieldDipoles
Kernel
=
cu
.
getKernel
(
module
,
"
recordInducedFieldDipoles
"
);
// cuFuncSetCacheConfig(pmeInterpolateForceKernel, CU_FUNC_CACHE_PREFER_L1);
// Create required data structures.
...
...
@@ -1415,12 +1417,15 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
&
labFrameDipoles
->
getDevicePointer
(),
&
labFrameQuadrupoles
->
getDevicePointer
(),
&
inducedDipole
->
getDevicePointer
(),
&
inducedDipolePolar
->
getDevicePointer
(),
&
dampingAndThole
->
getDevicePointer
()};
cu
.
executeKernel
(
electrostaticsKernel
,
electrostaticsArgs
,
numForceThreadBlocks
*
forceThreadBlockSize
,
forceThreadBlockSize
);
// Map torques to force.
void
*
mapTorqueArgs
[]
=
{
&
cu
.
getForce
().
getDevicePointer
(),
&
torque
->
getDevicePointer
(),
&
cu
.
getPosq
().
getDevicePointer
(),
&
multipoleParticles
->
getDevicePointer
()};
cu
.
executeKernel
(
mapTorqueKernel
,
mapTorqueArgs
,
cu
.
getNumAtoms
());
}
else
{
//
Compute induced dipoles
.
//
Reciprocal space calculation
.
unsigned
int
maxTiles
=
nb
.
getInteractingTiles
().
getSize
();
void
*
pmeUpdateBsplinesArgs
[]
=
{
&
cu
.
getPosq
().
getDevicePointer
(),
&
pmeIgrid
->
getDevicePointer
(),
&
pmeAtomGridIndex
->
getDevicePointer
(),
...
...
@@ -1433,9 +1438,8 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
cu
.
executeKernel
(
pmeAtomRangeKernel
,
pmeAtomRangeArgs
,
cu
.
getNumAtoms
(),
cu
.
ThreadBlockSize
,
cu
.
ThreadBlockSize
*
PmeOrder
*
PmeOrder
*
elementSize
);
void
*
pmeSpreadFixedMultipolesArgs
[]
=
{
&
cu
.
getPosq
().
getDevicePointer
(),
&
labFrameDipoles
->
getDevicePointer
(),
&
labFrameQuadrupoles
->
getDevicePointer
(),
&
pmeGrid
->
getDevicePointer
(),
&
pmeAtomGridIndex
->
getDevicePointer
(),
&
pmeAtomRange
->
getDevicePointer
(),
&
pmeTheta1
->
getDevicePointer
(),
&
pmeTheta2
->
getDevicePointer
(),
&
pmeTheta3
->
getDevicePointer
(),
cu
.
getPeriodicBoxSizePointer
(),
cu
.
getInvPeriodicBoxSizePointer
()};
cu
.
executeKernel
(
pmeSpreadFixedMultipolesKernel
,
pmeSpreadFixedMultipolesArgs
,
cu
.
getNumAtoms
(),
cu
.
ThreadBlockSize
,
cu
.
ThreadBlockSize
*
PmeOrder
*
PmeOrder
*
elementSize
);
&
pmeTheta1
->
getDevicePointer
(),
&
pmeTheta2
->
getDevicePointer
(),
&
pmeTheta3
->
getDevicePointer
(),
cu
.
getInvPeriodicBoxSizePointer
()};
cu
.
executeKernel
(
pmeSpreadFixedMultipolesKernel
,
pmeSpreadFixedMultipolesArgs
,
cu
.
getNumAtoms
());
if
(
cu
.
getUseDoublePrecision
())
cufftExecZ2Z
(
fft
,
(
double2
*
)
pmeGrid
->
getDevicePointer
(),
(
double2
*
)
pmeGrid
->
getDevicePointer
(),
CUFFT_FORWARD
);
else
...
...
@@ -1448,23 +1452,16 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
else
cufftExecC2C
(
fft
,
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
CUFFT_INVERSE
);
void
*
pmeFixedPotentialArgs
[]
=
{
&
pmeGrid
->
getDevicePointer
(),
&
pmePhi
->
getDevicePointer
(),
&
field
->
getDevicePointer
(),
&
pmeIgrid
->
getDevicePointer
(),
&
pmeTheta1
->
getDevicePointer
(),
&
pmeTheta2
->
getDevicePointer
(),
&
pmeTheta3
->
getDevicePointer
(),
&
labFrameDipoles
->
getDevicePointer
(),
cu
.
getInvPeriodicBoxSizePointer
()};
&
fieldPolar
->
getDevicePointer
(),
&
pmeIgrid
->
getDevicePointer
(),
&
pmeTheta1
->
getDevicePointer
(),
&
pmeTheta2
->
getDevicePointer
(),
&
pmeTheta3
->
getDevicePointer
(),
&
labFrameDipoles
->
getDevicePointer
(),
cu
.
getInvPeriodicBoxSizePointer
()};
cu
.
executeKernel
(
pmeFixedPotentialKernel
,
pmeFixedPotentialArgs
,
cu
.
getNumAtoms
());
void
*
pmeFixedForceArgs
[]
=
{
&
cu
.
getPosq
().
getDevicePointer
(),
&
cu
.
getForce
().
getDevicePointer
(),
&
torque
->
getDevicePointer
(),
&
cu
.
getEnergyBuffer
().
getDevicePointer
(),
&
labFrameDipoles
->
getDevicePointer
(),
&
labFrameQuadrupoles
->
getDevicePointer
(),
&
pmePhi
->
getDevicePointer
(),
cu
.
getPeriodicBoxSizePointer
(),
cu
.
getInvPeriodicBoxSizePointer
()};
&
pmePhi
->
getDevicePointer
(),
cu
.
getInvPeriodicBoxSizePointer
()};
cu
.
executeKernel
(
pmeFixedForceKernel
,
pmeFixedForceArgs
,
cu
.
getNumAtoms
());
printf
(
"reciprocal:
\n
"
);
vector
<
long
long
>
f
;
printf
(
"force
\n
"
);
cu
.
getForce
().
download
(
f
);
for
(
int
i
=
0
;
i
<
cu
.
getNumAtoms
();
i
++
)
printf
(
"%d: %g %g %g
\n
"
,
i
,
f
[
i
]
/
(
double
)
0xFFFFFFFF
,
f
[
i
+
cu
.
getPaddedNumAtoms
()]
/
(
double
)
0xFFFFFFFF
,
f
[
i
+
cu
.
getPaddedNumAtoms
()
*
2
]
/
(
double
)
0xFFFFFFFF
);
// printf("torque\n");
// torque->download(f);
// for (int i = 0; i < cu.getNumAtoms(); i++)
// printf("%d: %g %g %g\n", i, f[i]/(double) 0xFFFFFFFF, f[i+cu.getPaddedNumAtoms()]/(double) 0xFFFFFFFF, f[i+cu.getPaddedNumAtoms()*2]/(double) 0xFFFFFFFF);
// Direct space calculation.
void
*
computeFixedFieldArgs
[]
=
{
&
field
->
getDevicePointer
(),
&
fieldPolar
->
getDevicePointer
(),
&
cu
.
getPosq
().
getDevicePointer
(),
&
nb
.
getExclusionIndices
().
getDevicePointer
(),
&
nb
.
getExclusionRowIndices
().
getDevicePointer
(),
&
covalentFlags
->
getDevicePointer
(),
&
polarizationGroupFlags
->
getDevicePointer
(),
&
startTileIndex
,
&
numTileIndices
,
...
...
@@ -1475,38 +1472,29 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
void
*
recordInducedDipolesArgs
[]
=
{
&
field
->
getDevicePointer
(),
&
fieldPolar
->
getDevicePointer
(),
&
inducedDipole
->
getDevicePointer
(),
&
inducedDipolePolar
->
getDevicePointer
(),
&
polarizability
->
getDevicePointer
()};
cu
.
executeKernel
(
recordInducedDipolesKernel
,
recordInducedDipolesArgs
,
cu
.
getNumAtoms
());
printf
(
"direct:
\n
"
);
printf
(
"force
\n
"
);
cu
.
getForce
().
download
(
f
);
for
(
int
i
=
0
;
i
<
cu
.
getNumAtoms
();
i
++
)
printf
(
"%d: %g %g %g
\n
"
,
i
,
f
[
i
]
/
(
double
)
0xFFFFFFFF
,
f
[
i
+
cu
.
getPaddedNumAtoms
()]
/
(
double
)
0xFFFFFFFF
,
f
[
i
+
cu
.
getPaddedNumAtoms
()
*
2
]
/
(
double
)
0xFFFFFFFF
);
// printf("torque\n");
// torque->download(f);
// for (int i = 0; i < cu.getNumAtoms(); i++)
// printf("%d: %g %g %g\n", i, f[i]/(double) 0xFFFFFFFF, f[i+cu.getPaddedNumAtoms()]/(double) 0xFFFFFFFF, f[i+cu.getPaddedNumAtoms()*2]/(double) 0xFFFFFFFF);
// vector<float> d, dp;
// printf("phi\n");
// pmePhi->download(d);
// for (int i = 0; i < d.size(); i++)
// printf("%d: %g\n", i, d[i]);
// printf("dipoles\n");
// labFrameDipoles->download(d);
// for (int i = 0; i < cu.getNumAtoms(); i++)
// printf("%d: %g %g %g\n", i, d[3*i], d[3*i+1], d[3*i+2]);
// printf("quadrupoles\n");
// labFrameQuadrupoles->download(d);
// for (int i = 0; i < cu.getNumAtoms(); i++)
// printf("%d: %g %g %g %g %g %g\n", i, d[5*i], d[5*i+1], d[5*i+2], d[5*i+3], d[5*i+4], -(d[5*i]+d[5*i+3]));
// printf("induced dipoles\n");
// inducedDipole->download(d);
// inducedDipolePolar->download(dp);
// for (int i = 0; i < cu.getNumAtoms(); i++)
// printf("%d: %g %g %g, %g %g %g\n", i, d[3*i], d[3*i+1], d[3*i+2], dp[3*i], dp[3*i+1], dp[3*i+2]);
// printf("positions\n");
// vector<float4> p;
// cu.getPosq().download(p);
// for (int i = 0; i < cu.getNumAtoms(); i++)
// printf("%d: %g %g %g %g\n", i, p[i].x, p[i].y, p[i].z, p[i].w);
// Reciprocal space calculation for the induced dipoles.
void
*
pmeSpreadInducedDipolesArgs
[]
=
{
&
cu
.
getPosq
().
getDevicePointer
(),
&
inducedDipole
->
getDevicePointer
(),
&
inducedDipolePolar
->
getDevicePointer
(),
&
pmeGrid
->
getDevicePointer
(),
&
pmeAtomGridIndex
->
getDevicePointer
(),
&
pmeAtomRange
->
getDevicePointer
(),
&
pmeTheta1
->
getDevicePointer
(),
&
pmeTheta2
->
getDevicePointer
(),
&
pmeTheta3
->
getDevicePointer
(),
cu
.
getInvPeriodicBoxSizePointer
()};
cu
.
executeKernel
(
pmeSpreadInducedDipolesKernel
,
pmeSpreadInducedDipolesArgs
,
cu
.
getNumAtoms
());
if
(
cu
.
getUseDoublePrecision
())
cufftExecZ2Z
(
fft
,
(
double2
*
)
pmeGrid
->
getDevicePointer
(),
(
double2
*
)
pmeGrid
->
getDevicePointer
(),
CUFFT_FORWARD
);
else
cufftExecC2C
(
fft
,
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
CUFFT_FORWARD
);
cu
.
executeKernel
(
pmeConvolutionKernel
,
pmeConvolutionArgs
,
cu
.
getNumAtoms
());
if
(
cu
.
getUseDoublePrecision
())
cufftExecZ2Z
(
fft
,
(
double2
*
)
pmeGrid
->
getDevicePointer
(),
(
double2
*
)
pmeGrid
->
getDevicePointer
(),
CUFFT_INVERSE
);
else
cufftExecC2C
(
fft
,
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
CUFFT_INVERSE
);
void
*
pmeInducedPotentialArgs
[]
=
{
&
pmeGrid
->
getDevicePointer
(),
&
pmePhid
->
getDevicePointer
(),
&
pmePhip
->
getDevicePointer
(),
&
pmePhidp
->
getDevicePointer
(),
&
pmeIgrid
->
getDevicePointer
(),
&
pmeTheta1
->
getDevicePointer
(),
&
pmeTheta2
->
getDevicePointer
(),
&
pmeTheta3
->
getDevicePointer
(),
cu
.
getInvPeriodicBoxSizePointer
()};
cu
.
executeKernel
(
pmeInducedPotentialKernel
,
pmeInducedPotentialArgs
,
cu
.
getNumAtoms
());
// void* pmeRecordInducedFieldDipolesArgs[] = {&pmePhid->getDevicePointer(), &pmePhip->getDevicePointer(),
// &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), cu.getInvPeriodicBoxSizePointer()};
// cu.executeKernel(pmeRecordInducedFieldDipolesKernel, pmeRecordInducedFieldDipolesArgs, cu.getNumAtoms());
// vector<float2> errors;
...
...
@@ -1541,11 +1529,14 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
&
labFrameDipoles
->
getDevicePointer
(),
&
labFrameQuadrupoles
->
getDevicePointer
(),
&
inducedDipole
->
getDevicePointer
(),
&
inducedDipolePolar
->
getDevicePointer
(),
&
dampingAndThole
->
getDevicePointer
()};
cu
.
executeKernel
(
electrostaticsKernel
,
electrostaticsArgs
,
numForceThreadBlocks
*
forceThreadBlockSize
,
forceThreadBlockSize
);
printf
(
"electrostatic:
\n
"
);
printf
(
"force
\n
"
);
cu
.
getForce
().
download
(
f
);
for
(
int
i
=
0
;
i
<
cu
.
getNumAtoms
();
i
++
)
printf
(
"%d: %g %g %g
\n
"
,
i
,
f
[
i
]
/
(
double
)
0xFFFFFFFF
,
f
[
i
+
cu
.
getPaddedNumAtoms
()]
/
(
double
)
0xFFFFFFFF
,
f
[
i
+
cu
.
getPaddedNumAtoms
()
*
2
]
/
(
double
)
0xFFFFFFFF
);
void
*
pmeInducedForceArgs
[]
=
{
&
cu
.
getPosq
().
getDevicePointer
(),
&
cu
.
getForce
().
getDevicePointer
(),
&
torque
->
getDevicePointer
(),
&
cu
.
getEnergyBuffer
().
getDevicePointer
(),
&
labFrameDipoles
->
getDevicePointer
(),
&
labFrameQuadrupoles
->
getDevicePointer
(),
&
inducedDipole
->
getDevicePointer
(),
&
inducedDipolePolar
->
getDevicePointer
(),
&
pmePhi
->
getDevicePointer
(),
&
pmePhid
->
getDevicePointer
(),
&
pmePhip
->
getDevicePointer
(),
&
pmePhidp
->
getDevicePointer
(),
cu
.
getInvPeriodicBoxSizePointer
()};
cu
.
executeKernel
(
pmeInducedForceKernel
,
pmeInducedForceArgs
,
cu
.
getNumAtoms
());
// Map torques to force.
void
*
mapTorqueArgs
[]
=
{
&
cu
.
getForce
().
getDevicePointer
(),
&
torque
->
getDevicePointer
(),
&
cu
.
getPosq
().
getDevicePointer
(),
&
multipoleParticles
->
getDevicePointer
()};
cu
.
executeKernel
(
mapTorqueKernel
,
mapTorqueArgs
,
cu
.
getNumAtoms
());
...
...
plugins/amoeba/platforms/cuda2/src/AmoebaCudaKernels.h
View file @
f352d116
...
...
@@ -425,7 +425,8 @@ private:
CudaSort
*
sort
;
cufftHandle
fft
;
CUfunction
computeMomentsKernel
,
recordInducedDipolesKernel
,
computeFixedFieldKernel
,
computeInducedFieldKernel
,
updateInducedFieldKernel
,
electrostaticsKernel
,
mapTorqueKernel
;
CUfunction
pmeUpdateBsplinesKernel
,
pmeAtomRangeKernel
,
pmeSpreadFixedMultipolesKernel
,
pmeConvolutionKernel
,
pmeFixedPotentialKernel
,
pmeFixedForceKernel
;
CUfunction
pmeUpdateBsplinesKernel
,
pmeAtomRangeKernel
,
pmeSpreadFixedMultipolesKernel
,
pmeSpreadInducedDipolesKernel
,
pmeConvolutionKernel
,
pmeFixedPotentialKernel
,
pmeInducedPotentialKernel
;
CUfunction
pmeFixedForceKernel
,
pmeInducedForceKernel
,
pmeRecordInducedFieldDipolesKernel
;
static
const
int
PmeOrder
=
5
;
};
...
...
plugins/amoeba/platforms/cuda2/src/kernels/multipolePme.cu
View file @
f352d116
...
...
@@ -161,8 +161,7 @@ extern "C" __global__ void findAtomRangeForGrid(int2* __restrict__ pmeAtomGridIn
}
extern
"C"
__global__
void
gridSpreadFixedMultipoles
(
const
real4
*
__restrict__
posq
,
const
real
*
__restrict__
labFrameDipole
,
const
real
*
__restrict__
labFrameQuadrupole
,
real2
*
__restrict__
pmeGrid
,
int2
*
__restrict__
pmeAtomGridIndex
,
int
*
__restrict__
pmeAtomRange
,
const
real4
*
__restrict__
theta1
,
const
real4
*
__restrict__
theta2
,
const
real4
*
__restrict__
theta3
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
)
{
const
real4
*
__restrict__
theta1
,
const
real4
*
__restrict__
theta2
,
const
real4
*
__restrict__
theta3
,
real4
invPeriodicBoxSize
)
{
const
real
xscale
=
GRID_SIZE_X
*
invPeriodicBoxSize
.
x
;
const
real
yscale
=
GRID_SIZE_Y
*
invPeriodicBoxSize
.
y
;
const
real
zscale
=
GRID_SIZE_Z
*
invPeriodicBoxSize
.
z
;
...
...
@@ -248,88 +247,90 @@ extern "C" __global__ void gridSpreadFixedMultipoles(const real4* __restrict__ p
}
}
//extern "C" __global__ void kGridSpreadInducedDipoles_kernel() {
// const real xscale = GRID_SIZE_X*invPeriodicBoxSize.x;
// const real yscale = GRID_SIZE_Y*invPeriodicBoxSize.y;
// const real zscale = GRID_SIZE_Z*invPeriodicBoxSize.z;
// unsigned int numGridPoints = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
// unsigned int numThreads = gridDim.x*blockDim.x;
// for (int gridIndex = blockIdx.x*blockDim.x+threadIdx.x; gridIndex < numGridPoints; gridIndex += numThreads) {
// int3 gridPoint;
// gridPoint.x = gridIndex/(GRID_SIZE_Y*GRID_SIZE_Z);
// int remainder = gridIndex-gridPoint.x*GRID_SIZE_Y*GRID_SIZE_Z;
// gridPoint.y = remainder/GRID_SIZE_Z;
// gridPoint.z = remainder-gridPoint.y*GRID_SIZE_Z;
// real2 result = make_real2(0, 0);
// for (int ix = 0; ix < PME_ORDER; ++ix) {
// int x = gridPoint.x-ix+(gridPoint.x >= ix ? 0 : GRID_SIZE_X);
// for (int iy = 0; iy < PME_ORDER; ++iy) {
// int y = gridPoint.y-iy+(gridPoint.y >= iy ? 0 : GRID_SIZE_Y);
// int z1 = gridPoint.z-PME_ORDER+1;
// z1 += (z1 >= 0 ? 0 : GRID_SIZE_Z);
// int z2 = (z1 < gridPoint.z ? gridPoint.z : GRID_SIZE_Z-1);
// int gridIndex1 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z1;
// int gridIndex2 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z2;
// int firstAtom = pmeAtomRange[gridIndex1];
// int lastAtom = pmeAtomRange[gridIndex2+1];
// for (int i = firstAtom; i < lastAtom; ++i) {
// int2 atomData = pmeAtomGridIndex[i];
// int atomIndex = atomData.x;
// int z = atomData.y;
// int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : GRID_SIZE_Z);
// if (iz >= GRID_SIZE_Z)
// iz -= GRID_SIZE_Z;
// real inducedDipoleX = xscale*cAmoebaSim.pInducedDipole[atomIndex*3];
// real inducedDipoleY = yscale*cAmoebaSim.pInducedDipole[atomIndex*3+1];
// real inducedDipoleZ = zscale*cAmoebaSim.pInducedDipole[atomIndex*3+2];
// real inducedDipolePolarX = xscale*cAmoebaSim.pInducedDipolePolar[atomIndex*3];
// real inducedDipolePolarY = yscale*cAmoebaSim.pInducedDipolePolar[atomIndex*3+1];
// real inducedDipolePolarZ = zscale*cAmoebaSim.pInducedDipolePolar[atomIndex*3+2];
// real4 t = theta1[atomIndex*PME_ORDER+ix];
// real4 u = theta2[atomIndex*PME_ORDER+iy];
// real4 v = theta3[atomIndex*PME_ORDER+iz];
// real term01 = inducedDipoleY*u.y*v.x + inducedDipoleZ*u.x*v.y;
// real term11 = inducedDipoleX*u.x*v.x;
// real term02 = inducedDipolePolarY*u.y*v.x + inducedDipolePolarZ*u.x*v.y;
// real term12 = inducedDipolePolarX*u.x*v.x;
// result.x += term01*t.x + term11*t.y;
// result.y += term02*t.x + term12*t.y;
// }
// if (z1 > gridPoint.z) {
// gridIndex1 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z;
// gridIndex2 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+gridPoint.z;
// firstAtom = pmeAtomRange[gridIndex1];
// lastAtom = pmeAtomRange[gridIndex2+1];
// for (int i = firstAtom; i < lastAtom; ++i) {
// int2 atomData = pmeAtomGridIndex[i];
// int atomIndex = atomData.x;
// int z = atomData.y;
// int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : GRID_SIZE_Z);
// if (iz >= GRID_SIZE_Z)
// iz -= GRID_SIZE_Z;
// real inducedDipoleX = xscale*cAmoebaSim.pInducedDipole[atomIndex*3];
// real inducedDipoleY = yscale*cAmoebaSim.pInducedDipole[atomIndex*3+1];
// real inducedDipoleZ = zscale*cAmoebaSim.pInducedDipole[atomIndex*3+2];
// real inducedDipolePolarX = xscale*cAmoebaSim.pInducedDipolePolar[atomIndex*3];
// real inducedDipolePolarY = yscale*cAmoebaSim.pInducedDipolePolar[atomIndex*3+1];
// real inducedDipolePolarZ = zscale*cAmoebaSim.pInducedDipolePolar[atomIndex*3+2];
// real4 t = theta1[atomIndex*PME_ORDER+ix];
// real4 u = theta2[atomIndex*PME_ORDER+iy];
// real4 v = theta3[atomIndex*PME_ORDER+iz];
// real term01 = inducedDipoleY*u.y*v.x + inducedDipoleZ*u.x*v.y;
// real term11 = inducedDipoleX*u.x*v.x;
// real term02 = inducedDipolePolarY*u.y*v.x + inducedDipolePolarZ*u.x*v.y;
// real term12 = inducedDipolePolarX*u.x*v.x;
// result.x += term01*t.x + term11*t.y;
// result.y += term02*t.x + term12*t.y;
// }
// }
// }
// }
// pmeGrid[gridIndex] = result;
// }
//}
//
extern
"C"
__global__
void
gridSpreadInducedDipoles
(
const
real4
*
__restrict__
posq
,
const
real
*
__restrict__
inducedDipole
,
const
real
*
__restrict__
inducedDipolePolar
,
real2
*
__restrict__
pmeGrid
,
int2
*
__restrict__
pmeAtomGridIndex
,
int
*
__restrict__
pmeAtomRange
,
const
real4
*
__restrict__
theta1
,
const
real4
*
__restrict__
theta2
,
const
real4
*
__restrict__
theta3
,
real4
invPeriodicBoxSize
)
{
const
real
xscale
=
GRID_SIZE_X
*
invPeriodicBoxSize
.
x
;
const
real
yscale
=
GRID_SIZE_Y
*
invPeriodicBoxSize
.
y
;
const
real
zscale
=
GRID_SIZE_Z
*
invPeriodicBoxSize
.
z
;
unsigned
int
numGridPoints
=
GRID_SIZE_X
*
GRID_SIZE_Y
*
GRID_SIZE_Z
;
unsigned
int
numThreads
=
gridDim
.
x
*
blockDim
.
x
;
for
(
int
gridIndex
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
gridIndex
<
numGridPoints
;
gridIndex
+=
numThreads
)
{
int3
gridPoint
;
gridPoint
.
x
=
gridIndex
/
(
GRID_SIZE_Y
*
GRID_SIZE_Z
);
int
remainder
=
gridIndex
-
gridPoint
.
x
*
GRID_SIZE_Y
*
GRID_SIZE_Z
;
gridPoint
.
y
=
remainder
/
GRID_SIZE_Z
;
gridPoint
.
z
=
remainder
-
gridPoint
.
y
*
GRID_SIZE_Z
;
real2
result
=
make_real2
(
0
,
0
);
for
(
int
ix
=
0
;
ix
<
PME_ORDER
;
++
ix
)
{
int
x
=
gridPoint
.
x
-
ix
+
(
gridPoint
.
x
>=
ix
?
0
:
GRID_SIZE_X
);
for
(
int
iy
=
0
;
iy
<
PME_ORDER
;
++
iy
)
{
int
y
=
gridPoint
.
y
-
iy
+
(
gridPoint
.
y
>=
iy
?
0
:
GRID_SIZE_Y
);
int
z1
=
gridPoint
.
z
-
PME_ORDER
+
1
;
z1
+=
(
z1
>=
0
?
0
:
GRID_SIZE_Z
);
int
z2
=
(
z1
<
gridPoint
.
z
?
gridPoint
.
z
:
GRID_SIZE_Z
-
1
);
int
gridIndex1
=
x
*
GRID_SIZE_Y
*
GRID_SIZE_Z
+
y
*
GRID_SIZE_Z
+
z1
;
int
gridIndex2
=
x
*
GRID_SIZE_Y
*
GRID_SIZE_Z
+
y
*
GRID_SIZE_Z
+
z2
;
int
firstAtom
=
pmeAtomRange
[
gridIndex1
];
int
lastAtom
=
pmeAtomRange
[
gridIndex2
+
1
];
for
(
int
i
=
firstAtom
;
i
<
lastAtom
;
++
i
)
{
int2
atomData
=
pmeAtomGridIndex
[
i
];
int
atomIndex
=
atomData
.
x
;
int
z
=
atomData
.
y
;
int
iz
=
gridPoint
.
z
-
z
+
(
gridPoint
.
z
>=
z
?
0
:
GRID_SIZE_Z
);
if
(
iz
>=
GRID_SIZE_Z
)
iz
-=
GRID_SIZE_Z
;
real
inducedDipoleX
=
xscale
*
inducedDipole
[
atomIndex
*
3
];
real
inducedDipoleY
=
yscale
*
inducedDipole
[
atomIndex
*
3
+
1
];
real
inducedDipoleZ
=
zscale
*
inducedDipole
[
atomIndex
*
3
+
2
];
real
inducedDipolePolarX
=
xscale
*
inducedDipolePolar
[
atomIndex
*
3
];
real
inducedDipolePolarY
=
yscale
*
inducedDipolePolar
[
atomIndex
*
3
+
1
];
real
inducedDipolePolarZ
=
zscale
*
inducedDipolePolar
[
atomIndex
*
3
+
2
];
real4
t
=
theta1
[
atomIndex
*
PME_ORDER
+
ix
];
real4
u
=
theta2
[
atomIndex
*
PME_ORDER
+
iy
];
real4
v
=
theta3
[
atomIndex
*
PME_ORDER
+
iz
];
real
term01
=
inducedDipoleY
*
u
.
y
*
v
.
x
+
inducedDipoleZ
*
u
.
x
*
v
.
y
;
real
term11
=
inducedDipoleX
*
u
.
x
*
v
.
x
;
real
term02
=
inducedDipolePolarY
*
u
.
y
*
v
.
x
+
inducedDipolePolarZ
*
u
.
x
*
v
.
y
;
real
term12
=
inducedDipolePolarX
*
u
.
x
*
v
.
x
;
result
.
x
+=
term01
*
t
.
x
+
term11
*
t
.
y
;
result
.
y
+=
term02
*
t
.
x
+
term12
*
t
.
y
;
}
if
(
z1
>
gridPoint
.
z
)
{
gridIndex1
=
x
*
GRID_SIZE_Y
*
GRID_SIZE_Z
+
y
*
GRID_SIZE_Z
;
gridIndex2
=
x
*
GRID_SIZE_Y
*
GRID_SIZE_Z
+
y
*
GRID_SIZE_Z
+
gridPoint
.
z
;
firstAtom
=
pmeAtomRange
[
gridIndex1
];
lastAtom
=
pmeAtomRange
[
gridIndex2
+
1
];
for
(
int
i
=
firstAtom
;
i
<
lastAtom
;
++
i
)
{
int2
atomData
=
pmeAtomGridIndex
[
i
];
int
atomIndex
=
atomData
.
x
;
int
z
=
atomData
.
y
;
int
iz
=
gridPoint
.
z
-
z
+
(
gridPoint
.
z
>=
z
?
0
:
GRID_SIZE_Z
);
if
(
iz
>=
GRID_SIZE_Z
)
iz
-=
GRID_SIZE_Z
;
real
inducedDipoleX
=
xscale
*
inducedDipole
[
atomIndex
*
3
];
real
inducedDipoleY
=
yscale
*
inducedDipole
[
atomIndex
*
3
+
1
];
real
inducedDipoleZ
=
zscale
*
inducedDipole
[
atomIndex
*
3
+
2
];
real
inducedDipolePolarX
=
xscale
*
inducedDipolePolar
[
atomIndex
*
3
];
real
inducedDipolePolarY
=
yscale
*
inducedDipolePolar
[
atomIndex
*
3
+
1
];
real
inducedDipolePolarZ
=
zscale
*
inducedDipolePolar
[
atomIndex
*
3
+
2
];
real4
t
=
theta1
[
atomIndex
*
PME_ORDER
+
ix
];
real4
u
=
theta2
[
atomIndex
*
PME_ORDER
+
iy
];
real4
v
=
theta3
[
atomIndex
*
PME_ORDER
+
iz
];
real
term01
=
inducedDipoleY
*
u
.
y
*
v
.
x
+
inducedDipoleZ
*
u
.
x
*
v
.
y
;
real
term11
=
inducedDipoleX
*
u
.
x
*
v
.
x
;
real
term02
=
inducedDipolePolarY
*
u
.
y
*
v
.
x
+
inducedDipolePolarZ
*
u
.
x
*
v
.
y
;
real
term12
=
inducedDipolePolarX
*
u
.
x
*
v
.
x
;
result
.
x
+=
term01
*
t
.
x
+
term11
*
t
.
y
;
result
.
y
+=
term02
*
t
.
x
+
term12
*
t
.
y
;
}
}
}
}
pmeGrid
[
gridIndex
]
=
result
;
}
}
extern
"C"
__global__
void
reciprocalConvolution
(
real2
*
__restrict__
pmeGrid
,
const
real
*
__restrict__
pmeBsplineModuliX
,
const
real
*
__restrict__
pmeBsplineModuliY
,
const
real
*
__restrict__
pmeBsplineModuliZ
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
)
{
const
unsigned
int
gridSize
=
GRID_SIZE_X
*
GRID_SIZE_Y
*
GRID_SIZE_Z
;
...
...
@@ -362,7 +363,7 @@ extern "C" __global__ void reciprocalConvolution(real2* __restrict__ pmeGrid, co
}
extern
"C"
__global__
void
computeFixedPotentialFromGrid
(
const
real2
*
__restrict__
pmeGrid
,
real
*
__restrict__
phi
,
long
long
*
__restrict__
fieldBuffers
,
const
int4
*
__restrict__
igrid
,
const
real4
*
__restrict__
theta1
,
long
long
*
__restrict__
fieldBuffers
,
long
long
*
__restrict__
fieldPolarBuffers
,
const
int4
*
__restrict__
igrid
,
const
real4
*
__restrict__
theta1
,
const
real4
*
__restrict__
theta2
,
const
real4
*
__restrict__
theta3
,
const
real
*
__restrict__
labFrameDipole
,
real4
invPeriodicBoxSize
)
{
// extract the permanent multipole field at each site
...
...
@@ -468,216 +469,224 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict
phi
[
20
*
m
+
18
]
=
tuv012
;
phi
[
20
*
m
+
19
]
=
tuv111
;
real
dipoleScale
=
(
4
/
(
real
)
3
)
*
(
EWALD_ALPHA
*
EWALD_ALPHA
*
EWALD_ALPHA
)
/
SQRT
(
M_PI
);
fieldBuffers
[
m
]
=
(
long
long
)
((
dipoleScale
*
labFrameDipole
[
m
*
3
]
-
GRID_SIZE_X
*
invPeriodicBoxSize
.
x
*
tuv100
)
*
0xFFFFFFFF
);
fieldBuffers
[
m
+
PADDED_NUM_ATOMS
]
=
(
long
long
)
((
dipoleScale
*
labFrameDipole
[
m
*
3
+
1
]
-
GRID_SIZE_Y
*
invPeriodicBoxSize
.
y
*
tuv010
)
*
0xFFFFFFFF
);
fieldBuffers
[
m
+
2
*
PADDED_NUM_ATOMS
]
=
(
long
long
)
((
dipoleScale
*
labFrameDipole
[
m
*
3
+
2
]
-
GRID_SIZE_Z
*
invPeriodicBoxSize
.
z
*
tuv001
)
*
0xFFFFFFFF
);
long
long
fieldx
=
(
long
long
)
((
dipoleScale
*
labFrameDipole
[
m
*
3
]
-
GRID_SIZE_X
*
invPeriodicBoxSize
.
x
*
tuv100
)
*
0xFFFFFFFF
);
fieldBuffers
[
m
]
=
fieldx
;
fieldPolarBuffers
[
m
]
=
fieldx
;
long
long
fieldy
=
(
long
long
)
((
dipoleScale
*
labFrameDipole
[
m
*
3
+
1
]
-
GRID_SIZE_Y
*
invPeriodicBoxSize
.
y
*
tuv010
)
*
0xFFFFFFFF
);
fieldBuffers
[
m
+
PADDED_NUM_ATOMS
]
=
fieldy
;
fieldPolarBuffers
[
m
+
PADDED_NUM_ATOMS
]
=
fieldy
;
long
long
fieldz
=
(
long
long
)
((
dipoleScale
*
labFrameDipole
[
m
*
3
+
2
]
-
GRID_SIZE_Z
*
invPeriodicBoxSize
.
z
*
tuv001
)
*
0xFFFFFFFF
);
fieldBuffers
[
m
+
2
*
PADDED_NUM_ATOMS
]
=
fieldz
;
fieldPolarBuffers
[
m
+
2
*
PADDED_NUM_ATOMS
]
=
fieldz
;
}
}
//extern "C" __global__ void kComputeInducedPotentialFromGrid_kernel() {
// // extract the induced dipole field at each site
//
// for (int m = blockIdx.x*blockDim.x+threadIdx.x; m < NUM_ATOMS; m += blockDim.x*gridDim.x) {
// int4 gridPoint = igrid[m];
// real tuv100_1 = 0;
// real tuv010_1 = 0;
// real tuv001_1 = 0;
// real tuv200_1 = 0;
// real tuv020_1 = 0;
// real tuv002_1 = 0;
// real tuv110_1 = 0;
// real tuv101_1 = 0;
// real tuv011_1 = 0;
// real tuv100_2 = 0;
// real tuv010_2 = 0;
// real tuv001_2 = 0;
// real tuv200_2 = 0;
// real tuv020_2 = 0;
// real tuv002_2 = 0;
// real tuv110_2 = 0;
// real tuv101_2 = 0;
// real tuv011_2 = 0;
// real tuv000 = 0;
// real tuv001 = 0;
// real tuv010 = 0;
// real tuv100 = 0;
// real tuv200 = 0;
// real tuv020 = 0;
// real tuv002 = 0;
// real tuv110 = 0;
// real tuv101 = 0;
// real tuv011 = 0;
// real tuv300 = 0;
// real tuv030 = 0;
// real tuv003 = 0;
// real tuv210 = 0;
// real tuv201 = 0;
// real tuv120 = 0;
// real tuv021 = 0;
// real tuv102 = 0;
// real tuv012 = 0;
// real tuv111 = 0;
// for (int iz = 0; iz < PME_ORDER; iz++) {
// int k = gridPoint.z+iz-(gridPoint.z+iz >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
// real4 v = theta3[m*PME_ORDER+iz];
// real tu00_1 = 0;
// real tu01_1 = 0;
// real tu10_1 = 0;
// real tu20_1 = 0;
// real tu11_1 = 0;
// real tu02_1 = 0;
// real tu00_2 = 0;
// real tu01_2 = 0;
// real tu10_2 = 0;
// real tu20_2 = 0;
// real tu11_2 = 0;
// real tu02_2 = 0;
// real tu00 = 0;
// real tu10 = 0;
// real tu01 = 0;
// real tu20 = 0;
// real tu11 = 0;
// real tu02 = 0;
// real tu30 = 0;
// real tu21 = 0;
// real tu12 = 0;
// real tu03 = 0;
// for (int iy = 0; iy < PME_ORDER; iy++) {
// int j = gridPoint.y+iy-(gridPoint.y+iy >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
// real4 u = theta2[m*PME_ORDER+iy];
// real t0_1 = 0;
// real t1_1 = 0;
// real t2_1 = 0;
// real t0_2 = 0;
// real t1_2 = 0;
// real t2_2 = 0;
// real t3 = 0;
// for (int ix = 0; ix < PME_ORDER; ix++) {
// int i = gridPoint.x+ix-(gridPoint.x+ix >= GRID_SIZE_X ? GRID_SIZE_X : 0);
// int gridIndex = i*GRID_SIZE_Y*GRID_SIZE_Z + j*GRID_SIZE_Z + k;
// real2 tq = pmeGrid[gridIndex];
// real4 tadd = theta1[m*PME_ORDER+ix];
// t0_1 += tq.x*tadd.x;
// t1_1 += tq.x*tadd.y;
// t2_1 += tq.x*tadd.z;
// t0_2 += tq.y*tadd.x;
// t1_2 += tq.y*tadd.y;
// t2_2 += tq.y*tadd.z;
// t3 += (tq.x+tq.y)*tadd.w;
// }
// tu00_1 += t0_1*u.x;
// tu10_1 += t1_1*u.x;
// tu01_1 += t0_1*u.y;
// tu20_1 += t2_1*u.x;
// tu11_1 += t1_1*u.y;
// tu02_1 += t0_1*u.z;
// tu00_2 += t0_2*u.x;
// tu10_2 += t1_2*u.x;
// tu01_2 += t0_2*u.y;
// tu20_2 += t2_2*u.x;
// tu11_2 += t1_2*u.y;
// tu02_2 += t0_2*u.z;
// real t0 = t0_1 + t0_2;
// real t1 = t1_1 + t1_2;
// real t2 = t2_1 + t2_2;
// tu00 += t0*u.x;
// tu10 += t1*u.x;
// tu01 += t0*u.y;
// tu20 += t2*u.x;
// tu11 += t1*u.y;
// tu02 += t0*u.z;
// tu30 += t3*u.x;
// tu21 += t2*u.y;
// tu12 += t1*u.z;
// tu03 += t0*u.w;
// }
// tuv100_1 += tu10_1*v.x;
// tuv010_1 += tu01_1*v.x;
// tuv001_1 += tu00_1*v.y;
// tuv200_1 += tu20_1*v.x;
// tuv020_1 += tu02_1*v.x;
// tuv002_1 += tu00_1*v.z;
// tuv110_1 += tu11_1*v.x;
// tuv101_1 += tu10_1*v.y;
// tuv011_1 += tu01_1*v.y;
// tuv100_2 += tu10_2*v.x;
// tuv010_2 += tu01_2*v.x;
// tuv001_2 += tu00_2*v.y;
// tuv200_2 += tu20_2*v.x;
// tuv020_2 += tu02_2*v.x;
// tuv002_2 += tu00_2*v.z;
// tuv110_2 += tu11_2*v.x;
// tuv101_2 += tu10_2*v.y;
// tuv011_2 += tu01_2*v.y;
// tuv000 += tu00*v.x;
// tuv100 += tu10*v.x;
// tuv010 += tu01*v.x;
// tuv001 += tu00*v.y;
// tuv200 += tu20*v.x;
// tuv020 += tu02*v.x;
// tuv002 += tu00*v.z;
// tuv110 += tu11*v.x;
// tuv101 += tu10*v.y;
// tuv011 += tu01*v.y;
// tuv300 += tu30*v.x;
// tuv030 += tu03*v.x;
// tuv003 += tu00*v.w;
// tuv210 += tu21*v.x;
// tuv201 += tu20*v.y;
// tuv120 += tu12*v.x;
// tuv021 += tu02*v.y;
// tuv102 += tu10*v.z;
// tuv012 += tu01*v.z;
// tuv111 += tu11*v.y;
// }
// phid[10*m] = 0;
// phid[10*m+1] = tuv100_1;
// phid[10*m+2] = tuv010_1;
// phid[10*m+3] = tuv001_1;
// phid[10*m+4] = tuv200_1;
// phid[10*m+5] = tuv020_1;
// phid[10*m+6] = tuv002_1;
// phid[10*m+7] = tuv110_1;
// phid[10*m+8] = tuv101_1;
// phid[10*m+9] = tuv011_1;
//
// phip[10*m] = 0;
// phip[10*m+1] = tuv100_2;
// phip[10*m+2] = tuv010_2;
// phip[10*m+3] = tuv001_2;
// phip[10*m+4] = tuv200_2;
// phip[10*m+5] = tuv020_2;
// phip[10*m+6] = tuv002_2;
// phip[10*m+7] = tuv110_2;
// phip[10*m+8] = tuv101_2;
// phip[10*m+9] = tuv011_2;
//
// phidp[20*m] = tuv000;
// phidp[20*m+1] = tuv100;
// phidp[20*m+2] = tuv010;
// phidp[20*m+3] = tuv001;
// phidp[20*m+4] = tuv200;
// phidp[20*m+5] = tuv020;
// phidp[20*m+6] = tuv002;
// phidp[20*m+7] = tuv110;
// phidp[20*m+8] = tuv101;
// phidp[20*m+9] = tuv011;
// phidp[20*m+10] = tuv300;
// phidp[20*m+11] = tuv030;
// phidp[20*m+12] = tuv003;
// phidp[20*m+13] = tuv210;
// phidp[20*m+14] = tuv201;
// phidp[20*m+15] = tuv120;
// phidp[20*m+16] = tuv021;
// phidp[20*m+17] = tuv102;
// phidp[20*m+18] = tuv012;
// phidp[20*m+19] = tuv111;
// }
//}
extern
"C"
__global__
void
computeInducedPotentialFromGrid
(
const
real2
*
__restrict__
pmeGrid
,
real
*
__restrict__
phid
,
real
*
__restrict__
phip
,
real
*
__restrict__
phidp
,
const
int4
*
__restrict__
igrid
,
const
real4
*
__restrict__
theta1
,
const
real4
*
__restrict__
theta2
,
const
real4
*
__restrict__
theta3
,
real4
invPeriodicBoxSize
)
{
// extract the induced dipole field at each site
for
(
int
m
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
m
<
NUM_ATOMS
;
m
+=
blockDim
.
x
*
gridDim
.
x
)
{
int4
gridPoint
=
igrid
[
m
];
real
tuv100_1
=
0
;
real
tuv010_1
=
0
;
real
tuv001_1
=
0
;
real
tuv200_1
=
0
;
real
tuv020_1
=
0
;
real
tuv002_1
=
0
;
real
tuv110_1
=
0
;
real
tuv101_1
=
0
;
real
tuv011_1
=
0
;
real
tuv100_2
=
0
;
real
tuv010_2
=
0
;
real
tuv001_2
=
0
;
real
tuv200_2
=
0
;
real
tuv020_2
=
0
;
real
tuv002_2
=
0
;
real
tuv110_2
=
0
;
real
tuv101_2
=
0
;
real
tuv011_2
=
0
;
real
tuv000
=
0
;
real
tuv001
=
0
;
real
tuv010
=
0
;
real
tuv100
=
0
;
real
tuv200
=
0
;
real
tuv020
=
0
;
real
tuv002
=
0
;
real
tuv110
=
0
;
real
tuv101
=
0
;
real
tuv011
=
0
;
real
tuv300
=
0
;
real
tuv030
=
0
;
real
tuv003
=
0
;
real
tuv210
=
0
;
real
tuv201
=
0
;
real
tuv120
=
0
;
real
tuv021
=
0
;
real
tuv102
=
0
;
real
tuv012
=
0
;
real
tuv111
=
0
;
for
(
int
iz
=
0
;
iz
<
PME_ORDER
;
iz
++
)
{
int
k
=
gridPoint
.
z
+
iz
-
(
gridPoint
.
z
+
iz
>=
GRID_SIZE_Z
?
GRID_SIZE_Z
:
0
);
real4
v
=
theta3
[
m
*
PME_ORDER
+
iz
];
real
tu00_1
=
0
;
real
tu01_1
=
0
;
real
tu10_1
=
0
;
real
tu20_1
=
0
;
real
tu11_1
=
0
;
real
tu02_1
=
0
;
real
tu00_2
=
0
;
real
tu01_2
=
0
;
real
tu10_2
=
0
;
real
tu20_2
=
0
;
real
tu11_2
=
0
;
real
tu02_2
=
0
;
real
tu00
=
0
;
real
tu10
=
0
;
real
tu01
=
0
;
real
tu20
=
0
;
real
tu11
=
0
;
real
tu02
=
0
;
real
tu30
=
0
;
real
tu21
=
0
;
real
tu12
=
0
;
real
tu03
=
0
;
for
(
int
iy
=
0
;
iy
<
PME_ORDER
;
iy
++
)
{
int
j
=
gridPoint
.
y
+
iy
-
(
gridPoint
.
y
+
iy
>=
GRID_SIZE_Y
?
GRID_SIZE_Y
:
0
);
real4
u
=
theta2
[
m
*
PME_ORDER
+
iy
];
real
t0_1
=
0
;
real
t1_1
=
0
;
real
t2_1
=
0
;
real
t0_2
=
0
;
real
t1_2
=
0
;
real
t2_2
=
0
;
real
t3
=
0
;
for
(
int
ix
=
0
;
ix
<
PME_ORDER
;
ix
++
)
{
int
i
=
gridPoint
.
x
+
ix
-
(
gridPoint
.
x
+
ix
>=
GRID_SIZE_X
?
GRID_SIZE_X
:
0
);
int
gridIndex
=
i
*
GRID_SIZE_Y
*
GRID_SIZE_Z
+
j
*
GRID_SIZE_Z
+
k
;
real2
tq
=
pmeGrid
[
gridIndex
];
real4
tadd
=
theta1
[
m
*
PME_ORDER
+
ix
];
t0_1
+=
tq
.
x
*
tadd
.
x
;
t1_1
+=
tq
.
x
*
tadd
.
y
;
t2_1
+=
tq
.
x
*
tadd
.
z
;
t0_2
+=
tq
.
y
*
tadd
.
x
;
t1_2
+=
tq
.
y
*
tadd
.
y
;
t2_2
+=
tq
.
y
*
tadd
.
z
;
t3
+=
(
tq
.
x
+
tq
.
y
)
*
tadd
.
w
;
}
tu00_1
+=
t0_1
*
u
.
x
;
tu10_1
+=
t1_1
*
u
.
x
;
tu01_1
+=
t0_1
*
u
.
y
;
tu20_1
+=
t2_1
*
u
.
x
;
tu11_1
+=
t1_1
*
u
.
y
;
tu02_1
+=
t0_1
*
u
.
z
;
tu00_2
+=
t0_2
*
u
.
x
;
tu10_2
+=
t1_2
*
u
.
x
;
tu01_2
+=
t0_2
*
u
.
y
;
tu20_2
+=
t2_2
*
u
.
x
;
tu11_2
+=
t1_2
*
u
.
y
;
tu02_2
+=
t0_2
*
u
.
z
;
real
t0
=
t0_1
+
t0_2
;
real
t1
=
t1_1
+
t1_2
;
real
t2
=
t2_1
+
t2_2
;
tu00
+=
t0
*
u
.
x
;
tu10
+=
t1
*
u
.
x
;
tu01
+=
t0
*
u
.
y
;
tu20
+=
t2
*
u
.
x
;
tu11
+=
t1
*
u
.
y
;
tu02
+=
t0
*
u
.
z
;
tu30
+=
t3
*
u
.
x
;
tu21
+=
t2
*
u
.
y
;
tu12
+=
t1
*
u
.
z
;
tu03
+=
t0
*
u
.
w
;
}
tuv100_1
+=
tu10_1
*
v
.
x
;
tuv010_1
+=
tu01_1
*
v
.
x
;
tuv001_1
+=
tu00_1
*
v
.
y
;
tuv200_1
+=
tu20_1
*
v
.
x
;
tuv020_1
+=
tu02_1
*
v
.
x
;
tuv002_1
+=
tu00_1
*
v
.
z
;
tuv110_1
+=
tu11_1
*
v
.
x
;
tuv101_1
+=
tu10_1
*
v
.
y
;
tuv011_1
+=
tu01_1
*
v
.
y
;
tuv100_2
+=
tu10_2
*
v
.
x
;
tuv010_2
+=
tu01_2
*
v
.
x
;
tuv001_2
+=
tu00_2
*
v
.
y
;
tuv200_2
+=
tu20_2
*
v
.
x
;
tuv020_2
+=
tu02_2
*
v
.
x
;
tuv002_2
+=
tu00_2
*
v
.
z
;
tuv110_2
+=
tu11_2
*
v
.
x
;
tuv101_2
+=
tu10_2
*
v
.
y
;
tuv011_2
+=
tu01_2
*
v
.
y
;
tuv000
+=
tu00
*
v
.
x
;
tuv100
+=
tu10
*
v
.
x
;
tuv010
+=
tu01
*
v
.
x
;
tuv001
+=
tu00
*
v
.
y
;
tuv200
+=
tu20
*
v
.
x
;
tuv020
+=
tu02
*
v
.
x
;
tuv002
+=
tu00
*
v
.
z
;
tuv110
+=
tu11
*
v
.
x
;
tuv101
+=
tu10
*
v
.
y
;
tuv011
+=
tu01
*
v
.
y
;
tuv300
+=
tu30
*
v
.
x
;
tuv030
+=
tu03
*
v
.
x
;
tuv003
+=
tu00
*
v
.
w
;
tuv210
+=
tu21
*
v
.
x
;
tuv201
+=
tu20
*
v
.
y
;
tuv120
+=
tu12
*
v
.
x
;
tuv021
+=
tu02
*
v
.
y
;
tuv102
+=
tu10
*
v
.
z
;
tuv012
+=
tu01
*
v
.
z
;
tuv111
+=
tu11
*
v
.
y
;
}
phid
[
10
*
m
]
=
0
;
phid
[
10
*
m
+
1
]
=
tuv100_1
;
phid
[
10
*
m
+
2
]
=
tuv010_1
;
phid
[
10
*
m
+
3
]
=
tuv001_1
;
phid
[
10
*
m
+
4
]
=
tuv200_1
;
phid
[
10
*
m
+
5
]
=
tuv020_1
;
phid
[
10
*
m
+
6
]
=
tuv002_1
;
phid
[
10
*
m
+
7
]
=
tuv110_1
;
phid
[
10
*
m
+
8
]
=
tuv101_1
;
phid
[
10
*
m
+
9
]
=
tuv011_1
;
phip
[
10
*
m
]
=
0
;
phip
[
10
*
m
+
1
]
=
tuv100_2
;
phip
[
10
*
m
+
2
]
=
tuv010_2
;
phip
[
10
*
m
+
3
]
=
tuv001_2
;
phip
[
10
*
m
+
4
]
=
tuv200_2
;
phip
[
10
*
m
+
5
]
=
tuv020_2
;
phip
[
10
*
m
+
6
]
=
tuv002_2
;
phip
[
10
*
m
+
7
]
=
tuv110_2
;
phip
[
10
*
m
+
8
]
=
tuv101_2
;
phip
[
10
*
m
+
9
]
=
tuv011_2
;
phidp
[
20
*
m
]
=
tuv000
;
phidp
[
20
*
m
+
1
]
=
tuv100
;
phidp
[
20
*
m
+
2
]
=
tuv010
;
phidp
[
20
*
m
+
3
]
=
tuv001
;
phidp
[
20
*
m
+
4
]
=
tuv200
;
phidp
[
20
*
m
+
5
]
=
tuv020
;
phidp
[
20
*
m
+
6
]
=
tuv002
;
phidp
[
20
*
m
+
7
]
=
tuv110
;
phidp
[
20
*
m
+
8
]
=
tuv101
;
phidp
[
20
*
m
+
9
]
=
tuv011
;
phidp
[
20
*
m
+
10
]
=
tuv300
;
phidp
[
20
*
m
+
11
]
=
tuv030
;
phidp
[
20
*
m
+
12
]
=
tuv003
;
phidp
[
20
*
m
+
13
]
=
tuv210
;
phidp
[
20
*
m
+
14
]
=
tuv201
;
phidp
[
20
*
m
+
15
]
=
tuv120
;
phidp
[
20
*
m
+
16
]
=
tuv021
;
phidp
[
20
*
m
+
17
]
=
tuv102
;
phidp
[
20
*
m
+
18
]
=
tuv012
;
phidp
[
20
*
m
+
19
]
=
tuv111
;
}
}
extern
"C"
__global__
void
computeFixedMultipoleForceAndEnergy
(
real4
*
__restrict__
posq
,
unsigned
long
long
*
__restrict__
forceBuffers
,
long
long
*
__restrict__
torqueBuffers
,
real
*
__restrict__
energyBuffer
,
const
real
*
__restrict__
labFrameDipole
,
const
real
*
__restrict__
labFrameQuadrupole
,
const
real
*
__restrict__
phi
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
)
{
const
real
*
__restrict__
labFrameQuadrupole
,
const
real
*
__restrict__
phi
_global
,
real4
invPeriodicBoxSize
)
{
real
multipole
[
10
];
const
int
deriv1
[]
=
{
1
,
4
,
7
,
8
,
10
,
15
,
17
,
13
,
14
,
19
};
const
int
deriv2
[]
=
{
2
,
7
,
5
,
9
,
13
,
11
,
18
,
15
,
19
,
16
};
...
...
@@ -700,22 +709,22 @@ extern "C" __global__ void computeFixedMultipoleForceAndEnergy(real4* __restrict
multipole
[
8
]
=
2
*
labFrameQuadrupole
[
i
*
5
+
2
];
multipole
[
9
]
=
2
*
labFrameQuadrupole
[
i
*
5
+
4
];
const
real
*
atomP
hi
=
&
phi
[
20
*
i
];
const
real
*
p
hi
=
&
phi
_global
[
20
*
i
];
torqueBuffers
[
i
]
=
(
long
long
)
(
EPSILON_FACTOR
*
(
multipole
[
3
]
*
yscale
*
atomP
hi
[
2
]
-
multipole
[
2
]
*
zscale
*
atomP
hi
[
3
]
+
2
*
(
multipole
[
6
]
-
multipole
[
5
])
*
yscale
*
zscale
*
atomP
hi
[
9
]
+
multipole
[
8
]
*
xscale
*
yscale
*
atomP
hi
[
7
]
+
multipole
[
9
]
*
yscale
*
yscale
*
atomP
hi
[
5
]
-
multipole
[
7
]
*
xscale
*
zscale
*
atomP
hi
[
8
]
-
multipole
[
9
]
*
zscale
*
zscale
*
atomP
hi
[
6
])
*
0xFFFFFFFF
);
torqueBuffers
[
i
]
=
(
long
long
)
(
EPSILON_FACTOR
*
(
multipole
[
3
]
*
yscale
*
p
hi
[
2
]
-
multipole
[
2
]
*
zscale
*
p
hi
[
3
]
+
2
*
(
multipole
[
6
]
-
multipole
[
5
])
*
yscale
*
zscale
*
p
hi
[
9
]
+
multipole
[
8
]
*
xscale
*
yscale
*
p
hi
[
7
]
+
multipole
[
9
]
*
yscale
*
yscale
*
p
hi
[
5
]
-
multipole
[
7
]
*
xscale
*
zscale
*
p
hi
[
8
]
-
multipole
[
9
]
*
zscale
*
zscale
*
p
hi
[
6
])
*
0xFFFFFFFF
);
torqueBuffers
[
i
+
PADDED_NUM_ATOMS
]
=
(
long
long
)
(
EPSILON_FACTOR
*
(
multipole
[
1
]
*
zscale
*
atomP
hi
[
3
]
-
multipole
[
3
]
*
xscale
*
atomP
hi
[
1
]
+
2
*
(
multipole
[
4
]
-
multipole
[
6
])
*
xscale
*
zscale
*
atomP
hi
[
8
]
+
multipole
[
7
]
*
yscale
*
zscale
*
atomP
hi
[
9
]
+
multipole
[
8
]
*
zscale
*
zscale
*
atomP
hi
[
6
]
-
multipole
[
8
]
*
xscale
*
xscale
*
atomP
hi
[
4
]
-
multipole
[
9
]
*
xscale
*
yscale
*
atomP
hi
[
7
])
*
0xFFFFFFFF
);
torqueBuffers
[
i
+
PADDED_NUM_ATOMS
]
=
(
long
long
)
(
EPSILON_FACTOR
*
(
multipole
[
1
]
*
zscale
*
p
hi
[
3
]
-
multipole
[
3
]
*
xscale
*
p
hi
[
1
]
+
2
*
(
multipole
[
4
]
-
multipole
[
6
])
*
xscale
*
zscale
*
p
hi
[
8
]
+
multipole
[
7
]
*
yscale
*
zscale
*
p
hi
[
9
]
+
multipole
[
8
]
*
zscale
*
zscale
*
p
hi
[
6
]
-
multipole
[
8
]
*
xscale
*
xscale
*
p
hi
[
4
]
-
multipole
[
9
]
*
xscale
*
yscale
*
p
hi
[
7
])
*
0xFFFFFFFF
);
torqueBuffers
[
i
+
PADDED_NUM_ATOMS
*
2
]
=
(
long
long
)
(
EPSILON_FACTOR
*
(
multipole
[
2
]
*
xscale
*
atomP
hi
[
1
]
-
multipole
[
1
]
*
yscale
*
atomP
hi
[
2
]
+
2
*
(
multipole
[
5
]
-
multipole
[
4
])
*
xscale
*
yscale
*
atomP
hi
[
7
]
+
multipole
[
7
]
*
xscale
*
xscale
*
atomP
hi
[
4
]
+
multipole
[
9
]
*
xscale
*
zscale
*
atomP
hi
[
8
]
-
multipole
[
7
]
*
yscale
*
yscale
*
atomP
hi
[
5
]
-
multipole
[
8
]
*
yscale
*
zscale
*
atomP
hi
[
9
])
*
0xFFFFFFFF
);
torqueBuffers
[
i
+
PADDED_NUM_ATOMS
*
2
]
=
(
long
long
)
(
EPSILON_FACTOR
*
(
multipole
[
2
]
*
xscale
*
p
hi
[
1
]
-
multipole
[
1
]
*
yscale
*
p
hi
[
2
]
+
2
*
(
multipole
[
5
]
-
multipole
[
4
])
*
xscale
*
yscale
*
p
hi
[
7
]
+
multipole
[
7
]
*
xscale
*
xscale
*
p
hi
[
4
]
+
multipole
[
9
]
*
xscale
*
zscale
*
p
hi
[
8
]
-
multipole
[
7
]
*
yscale
*
yscale
*
p
hi
[
5
]
-
multipole
[
8
]
*
yscale
*
zscale
*
p
hi
[
9
])
*
0xFFFFFFFF
);
// Compute the force and energy.
...
...
@@ -731,10 +740,10 @@ extern "C" __global__ void computeFixedMultipoleForceAndEnergy(real4* __restrict
real4
f
=
make_real4
(
0
,
0
,
0
,
0
);
for
(
int
k
=
0
;
k
<
10
;
k
++
)
{
energy
+=
multipole
[
k
]
*
atomP
hi
[
k
];
f
.
x
+=
multipole
[
k
]
*
atomP
hi
[
deriv1
[
k
]];
f
.
y
+=
multipole
[
k
]
*
atomP
hi
[
deriv2
[
k
]];
f
.
z
+=
multipole
[
k
]
*
atomP
hi
[
deriv3
[
k
]];
energy
+=
multipole
[
k
]
*
p
hi
[
k
];
f
.
x
+=
multipole
[
k
]
*
p
hi
[
deriv1
[
k
]];
f
.
y
+=
multipole
[
k
]
*
p
hi
[
deriv2
[
k
]];
f
.
z
+=
multipole
[
k
]
*
p
hi
[
deriv3
[
k
]];
}
f
.
x
*=
EPSILON_FACTOR
*
xscale
;
f
.
y
*=
EPSILON_FACTOR
*
yscale
;
...
...
@@ -746,141 +755,125 @@ extern "C" __global__ void computeFixedMultipoleForceAndEnergy(real4* __restrict
energyBuffer
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
0.5
f
*
EPSILON_FACTOR
*
energy
;
}
//extern "C" __global__ void kComputeInducedDipoleForceAndEnergy_kernel() {
// real multipole[10];
// real inducedDipole[3];
// real inducedDipolePolar[3];
// real scales[3];
// const int deriv1[] = {1, 4, 7, 8, 10, 15, 17, 13, 14, 19};
// const int deriv2[] = {2, 7, 5, 9, 13, 11, 18, 15, 19, 16};
// const int deriv3[] = {3, 8, 9, 6, 14, 16, 12, 19, 17, 18};
// const real xscale = GRID_SIZE_X*invPeriodicBoxSize.x;
// const real yscale = GRID_SIZE_Y*invPeriodicBoxSize.y;
// const real zscale = GRID_SIZE_Z*invPeriodicBoxSize.z;
// scales[0] = xscale;
// scales[1] = yscale;
// scales[2] = zscale;
// real energy = 0;
// for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
// // Compute the torque.
//
// multipole[0] = posq[i].w;
// multipole[1] = labFrameDipole[i*3];
// multipole[2] = labFrameDipole[i*3+1];
// multipole[3] = labFrameDipole[i*3+2];
// multipole[4] = labFrameQuadrupole[i*5];
// multipole[5] = labFrameQuadrupole[i*5+3];
// multipole[6] = -(multipole[4]+multipole[5]);
// multipole[7] = 2*labFrameQuadrupole[i*5+1];
// multipole[8] = 2*labFrameQuadrupole[i*5+2];
// multipole[9] = 2*labFrameQuadrupole[i*5+4];
// real* phidp = &cAmoebaSim.pPhidp[20*i];
//
// cAmoebaSim.pTorque[3*i] += 0.5f*EPSILON_FACTOR*(multipole[3]*yscale*phidp[2] - multipole[2]*zscale*phidp[3]
// + 2*(multipole[6]-multipole[5])*yscale*zscale*phidp[9]
// + multipole[8]*xscale*yscale*phidp[7] + multipole[9]*yscale*yscale*phidp[5]
// - multipole[7]*xscale*zscale*phidp[8] - multipole[9]*zscale*zscale*phidp[6]);
//
// cAmoebaSim.pTorque[3*i+1] += 0.5f*EPSILON_FACTOR*(multipole[1]*zscale*phidp[3] - multipole[3]*xscale*phidp[1]
// + 2*(multipole[4]-multipole[6])*xscale*zscale*phidp[8]
// + multipole[7]*yscale*zscale*phidp[9] + multipole[8]*zscale*zscale*phidp[6]
// - multipole[8]*xscale*xscale*phidp[4] - multipole[9]*xscale*yscale*phidp[7]);
//
// cAmoebaSim.pTorque[3*i+2] += 0.5f*EPSILON_FACTOR*(multipole[2]*xscale*phidp[1] - multipole[1]*yscale*phidp[2]
// + 2*(multipole[5]-multipole[4])*xscale*yscale*phidp[7]
// + multipole[7]*xscale*xscale*phidp[4] + multipole[9]*xscale*zscale*phidp[8]
// - multipole[7]*yscale*yscale*phidp[5] - multipole[8]*yscale*zscale*phidp[9]);
//
// // Compute the force and energy.
//
// multipole[1] *= xscale;
// multipole[2] *= yscale;
// multipole[3] *= zscale;
// multipole[4] *= xscale*xscale;
// multipole[5] *= yscale*yscale;
// multipole[6] *= zscale*zscale;
// multipole[7] *= xscale*yscale;
// multipole[8] *= xscale*zscale;
// multipole[9] *= yscale*zscale;
//
// inducedDipole[0] = cAmoebaSim.pInducedDipole[i*3];
// inducedDipole[1] = cAmoebaSim.pInducedDipole[i*3+1];
// inducedDipole[2] = cAmoebaSim.pInducedDipole[i*3+2];
// inducedDipolePolar[0] = cAmoebaSim.pInducedDipolePolar[i*3];
// inducedDipolePolar[1] = cAmoebaSim.pInducedDipolePolar[i*3+1];
// inducedDipolePolar[2] = cAmoebaSim.pInducedDipolePolar[i*3+2];
// real* phi = &cAmoebaSim.pPhi[20*i];
// real* phip = &cAmoebaSim.pPhip[10*i];
// real* phid = &cAmoebaSim.pPhid[10*i];
// real4 f = make_real4(0, 0, 0, 0);
//
// energy += GRID_SIZE_X*invPeriodicBoxSize.x*inducedDipole[0]*phi[1];
// energy += GRID_SIZE_Y*invPeriodicBoxSize.y*inducedDipole[1]*phi[2];
// energy += GRID_SIZE_Z*invPeriodicBoxSize.z*inducedDipole[2]*phi[3];
//
// for (int k = 0; k < 3; k++) {
//
// int j1 = deriv1[k+1];
// int j2 = deriv2[k+1];
// int j3 = deriv3[k+1];
//
// f.x += (inducedDipole[k]+inducedDipolePolar[k])*phi[j1]*(scales[k]/xscale);
// f.y += (inducedDipole[k]+inducedDipolePolar[k])*phi[j2]*(scales[k]/yscale);
// f.z += (inducedDipole[k]+inducedDipolePolar[k])*phi[j3]*(scales[k]/zscale);
//
// if( cAmoebaSim.polarizationType == 0 )
// {
// f.x += (inducedDipole[k]*phip[j1] + inducedDipolePolar[k]*phid[j1])*(scales[k]/xscale);
// f.y += (inducedDipole[k]*phip[j2] + inducedDipolePolar[k]*phid[j2])*(scales[k]/yscale);
// f.z += (inducedDipole[k]*phip[j3] + inducedDipolePolar[k]*phid[j3])*(scales[k]/zscale);
// }
//
//
// }
//
// f.x *= GRID_SIZE_X*invPeriodicBoxSize.x;
// f.y *= GRID_SIZE_Y*invPeriodicBoxSize.y;
// f.z *= GRID_SIZE_Z*invPeriodicBoxSize.z;
// for (int k = 0; k < 10; k++) {
// f.x += multipole[k]*phidp[deriv1[k]];
// f.y += multipole[k]*phidp[deriv2[k]];
// f.z += multipole[k]*phidp[deriv3[k]];
// }
//
// f.x *= 0.5f*EPSILON_FACTOR*xscale;
// f.y *= 0.5f*EPSILON_FACTOR*yscale;
// f.z *= 0.5f*EPSILON_FACTOR*zscale;
//
// real4 force = cSim.pForce4[i];
// force.x -= f.x;
// force.y -= f.y;
// force.z -= f.z;
// cSim.pForce4[i] = force;
// }
// cSim.pEnergy[blockIdx.x*blockDim.x+threadIdx.x] += 0.5f*EPSILON_FACTOR*energy;
//}
//
//extern "C" __global__ void kRecordFixedMultipoleField_kernel(real* output) {
// const real xscale = GRID_SIZE_X*invPeriodicBoxSize.x;
// const real yscale = GRID_SIZE_Y*invPeriodicBoxSize.y;
// const real zscale = GRID_SIZE_Z*invPeriodicBoxSize.z;
// for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
// output[3*i] = -xscale*cAmoebaSim.pPhi[20*i+1];
// output[3*i+1] = -yscale*cAmoebaSim.pPhi[20*i+2];
// output[3*i+2] = -zscale*cAmoebaSim.pPhi[20*i+3];
// }
//}
//
//extern "C" __global__ void kRecordInducedDipoleField_kernel(real* output, real* outputPolar) {
// const real xscale = GRID_SIZE_X*invPeriodicBoxSize.x;
// const real yscale = GRID_SIZE_Y*invPeriodicBoxSize.y;
// const real zscale = GRID_SIZE_Z*invPeriodicBoxSize.z;
// for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
// output[3*i] -= xscale*cAmoebaSim.pPhid[10*i+1];
// output[3*i+1] -= yscale*cAmoebaSim.pPhid[10*i+2];
// output[3*i+2] -= zscale*cAmoebaSim.pPhid[10*i+3];
// outputPolar[3*i] -= xscale*cAmoebaSim.pPhip[10*i+1];
// outputPolar[3*i+1] -= yscale*cAmoebaSim.pPhip[10*i+2];
// outputPolar[3*i+2] -= zscale*cAmoebaSim.pPhip[10*i+3];
// }
//}
extern
"C"
__global__
void
computeInducedDipoleForceAndEnergy
(
real4
*
__restrict__
posq
,
unsigned
long
long
*
__restrict__
forceBuffers
,
long
long
*
__restrict__
torqueBuffers
,
real
*
__restrict__
energyBuffer
,
const
real
*
__restrict__
labFrameDipole
,
const
real
*
__restrict__
labFrameQuadrupole
,
const
real
*
__restrict__
inducedDipole_global
,
const
real
*
__restrict__
inducedDipolePolar_global
,
const
real
*
__restrict__
phi_global
,
const
real
*
__restrict__
phid_global
,
const
real
*
__restrict__
phip_global
,
const
real
*
__restrict__
phidp_global
,
real4
invPeriodicBoxSize
)
{
real
multipole
[
10
];
real
inducedDipole
[
3
];
real
inducedDipolePolar
[
3
];
real
scales
[
3
];
const
int
deriv1
[]
=
{
1
,
4
,
7
,
8
,
10
,
15
,
17
,
13
,
14
,
19
};
const
int
deriv2
[]
=
{
2
,
7
,
5
,
9
,
13
,
11
,
18
,
15
,
19
,
16
};
const
int
deriv3
[]
=
{
3
,
8
,
9
,
6
,
14
,
16
,
12
,
19
,
17
,
18
};
const
real
xscale
=
GRID_SIZE_X
*
invPeriodicBoxSize
.
x
;
const
real
yscale
=
GRID_SIZE_Y
*
invPeriodicBoxSize
.
y
;
const
real
zscale
=
GRID_SIZE_Z
*
invPeriodicBoxSize
.
z
;
scales
[
0
]
=
xscale
;
scales
[
1
]
=
yscale
;
scales
[
2
]
=
zscale
;
real
energy
=
0
;
for
(
int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
i
<
NUM_ATOMS
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
// Compute the torque.
multipole
[
0
]
=
posq
[
i
].
w
;
multipole
[
1
]
=
labFrameDipole
[
i
*
3
];
multipole
[
2
]
=
labFrameDipole
[
i
*
3
+
1
];
multipole
[
3
]
=
labFrameDipole
[
i
*
3
+
2
];
multipole
[
4
]
=
labFrameQuadrupole
[
i
*
5
];
multipole
[
5
]
=
labFrameQuadrupole
[
i
*
5
+
3
];
multipole
[
6
]
=
-
(
multipole
[
4
]
+
multipole
[
5
]);
multipole
[
7
]
=
2
*
labFrameQuadrupole
[
i
*
5
+
1
];
multipole
[
8
]
=
2
*
labFrameQuadrupole
[
i
*
5
+
2
];
multipole
[
9
]
=
2
*
labFrameQuadrupole
[
i
*
5
+
4
];
const
real
*
phidp
=
&
phidp_global
[
20
*
i
];
torqueBuffers
[
i
]
+=
(
long
long
)
(
0.5
f
*
EPSILON_FACTOR
*
(
multipole
[
3
]
*
yscale
*
phidp
[
2
]
-
multipole
[
2
]
*
zscale
*
phidp
[
3
]
+
2
*
(
multipole
[
6
]
-
multipole
[
5
])
*
yscale
*
zscale
*
phidp
[
9
]
+
multipole
[
8
]
*
xscale
*
yscale
*
phidp
[
7
]
+
multipole
[
9
]
*
yscale
*
yscale
*
phidp
[
5
]
-
multipole
[
7
]
*
xscale
*
zscale
*
phidp
[
8
]
-
multipole
[
9
]
*
zscale
*
zscale
*
phidp
[
6
])
*
0xFFFFFFFF
);
torqueBuffers
[
i
+
PADDED_NUM_ATOMS
]
+=
(
long
long
)
(
0.5
f
*
EPSILON_FACTOR
*
(
multipole
[
1
]
*
zscale
*
phidp
[
3
]
-
multipole
[
3
]
*
xscale
*
phidp
[
1
]
+
2
*
(
multipole
[
4
]
-
multipole
[
6
])
*
xscale
*
zscale
*
phidp
[
8
]
+
multipole
[
7
]
*
yscale
*
zscale
*
phidp
[
9
]
+
multipole
[
8
]
*
zscale
*
zscale
*
phidp
[
6
]
-
multipole
[
8
]
*
xscale
*
xscale
*
phidp
[
4
]
-
multipole
[
9
]
*
xscale
*
yscale
*
phidp
[
7
])
*
0xFFFFFFFF
);
torqueBuffers
[
i
+
PADDED_NUM_ATOMS
*
2
]
+=
(
long
long
)
(
0.5
f
*
EPSILON_FACTOR
*
(
multipole
[
2
]
*
xscale
*
phidp
[
1
]
-
multipole
[
1
]
*
yscale
*
phidp
[
2
]
+
2
*
(
multipole
[
5
]
-
multipole
[
4
])
*
xscale
*
yscale
*
phidp
[
7
]
+
multipole
[
7
]
*
xscale
*
xscale
*
phidp
[
4
]
+
multipole
[
9
]
*
xscale
*
zscale
*
phidp
[
8
]
-
multipole
[
7
]
*
yscale
*
yscale
*
phidp
[
5
]
-
multipole
[
8
]
*
yscale
*
zscale
*
phidp
[
9
])
*
0xFFFFFFFF
);
// Compute the force and energy.
multipole
[
1
]
*=
xscale
;
multipole
[
2
]
*=
yscale
;
multipole
[
3
]
*=
zscale
;
multipole
[
4
]
*=
xscale
*
xscale
;
multipole
[
5
]
*=
yscale
*
yscale
;
multipole
[
6
]
*=
zscale
*
zscale
;
multipole
[
7
]
*=
xscale
*
yscale
;
multipole
[
8
]
*=
xscale
*
zscale
;
multipole
[
9
]
*=
yscale
*
zscale
;
inducedDipole
[
0
]
=
inducedDipole_global
[
i
*
3
];
inducedDipole
[
1
]
=
inducedDipole_global
[
i
*
3
+
1
];
inducedDipole
[
2
]
=
inducedDipole_global
[
i
*
3
+
2
];
inducedDipolePolar
[
0
]
=
inducedDipolePolar_global
[
i
*
3
];
inducedDipolePolar
[
1
]
=
inducedDipolePolar_global
[
i
*
3
+
1
];
inducedDipolePolar
[
2
]
=
inducedDipolePolar_global
[
i
*
3
+
2
];
const
real
*
phi
=
&
phi_global
[
20
*
i
];
const
real
*
phip
=
&
phip_global
[
10
*
i
];
const
real
*
phid
=
&
phid_global
[
10
*
i
];
real4
f
=
make_real4
(
0
,
0
,
0
,
0
);
energy
+=
GRID_SIZE_X
*
invPeriodicBoxSize
.
x
*
inducedDipole
[
0
]
*
phi
[
1
];
energy
+=
GRID_SIZE_Y
*
invPeriodicBoxSize
.
y
*
inducedDipole
[
1
]
*
phi
[
2
];
energy
+=
GRID_SIZE_Z
*
invPeriodicBoxSize
.
z
*
inducedDipole
[
2
]
*
phi
[
3
];
for
(
int
k
=
0
;
k
<
3
;
k
++
)
{
int
j1
=
deriv1
[
k
+
1
];
int
j2
=
deriv2
[
k
+
1
];
int
j3
=
deriv3
[
k
+
1
];
f
.
x
+=
(
inducedDipole
[
k
]
+
inducedDipolePolar
[
k
])
*
phi
[
j1
]
*
(
scales
[
k
]
/
xscale
);
f
.
y
+=
(
inducedDipole
[
k
]
+
inducedDipolePolar
[
k
])
*
phi
[
j2
]
*
(
scales
[
k
]
/
yscale
);
f
.
z
+=
(
inducedDipole
[
k
]
+
inducedDipolePolar
[
k
])
*
phi
[
j3
]
*
(
scales
[
k
]
/
zscale
);
#ifndef DIRECT_POLARIZATION
f
.
x
+=
(
inducedDipole
[
k
]
*
phip
[
j1
]
+
inducedDipolePolar
[
k
]
*
phid
[
j1
])
*
(
scales
[
k
]
/
xscale
);
f
.
y
+=
(
inducedDipole
[
k
]
*
phip
[
j2
]
+
inducedDipolePolar
[
k
]
*
phid
[
j2
])
*
(
scales
[
k
]
/
yscale
);
f
.
z
+=
(
inducedDipole
[
k
]
*
phip
[
j3
]
+
inducedDipolePolar
[
k
]
*
phid
[
j3
])
*
(
scales
[
k
]
/
zscale
);
#endif
}
f
.
x
*=
GRID_SIZE_X
*
invPeriodicBoxSize
.
x
;
f
.
y
*=
GRID_SIZE_Y
*
invPeriodicBoxSize
.
y
;
f
.
z
*=
GRID_SIZE_Z
*
invPeriodicBoxSize
.
z
;
for
(
int
k
=
0
;
k
<
10
;
k
++
)
{
f
.
x
+=
multipole
[
k
]
*
phidp
[
deriv1
[
k
]];
f
.
y
+=
multipole
[
k
]
*
phidp
[
deriv2
[
k
]];
f
.
z
+=
multipole
[
k
]
*
phidp
[
deriv3
[
k
]];
}
f
.
x
*=
0.5
f
*
EPSILON_FACTOR
*
xscale
;
f
.
y
*=
0.5
f
*
EPSILON_FACTOR
*
yscale
;
f
.
z
*=
0.5
f
*
EPSILON_FACTOR
*
zscale
;
forceBuffers
[
i
]
-=
static_cast
<
unsigned
long
long
>
((
long
long
)
(
f
.
x
*
0xFFFFFFFF
));
forceBuffers
[
i
+
PADDED_NUM_ATOMS
]
-=
static_cast
<
unsigned
long
long
>
((
long
long
)
(
f
.
y
*
0xFFFFFFFF
));
forceBuffers
[
i
+
PADDED_NUM_ATOMS
*
2
]
-=
static_cast
<
unsigned
long
long
>
((
long
long
)
(
f
.
z
*
0xFFFFFFFF
));
}
energyBuffer
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
0.5
f
*
EPSILON_FACTOR
*
energy
;
}
extern
"C"
__global__
void
recordInducedFieldDipoles
(
const
real
*
__restrict__
phid
,
real
*
const
__restrict__
phip
,
real
*
__restrict__
inducedDipole
,
real
*
__restrict__
inducedDipolePolar
,
real4
invPeriodicBoxSize
)
{
real
xscale
=
GRID_SIZE_X
*
invPeriodicBoxSize
.
x
;
real
yscale
=
GRID_SIZE_Y
*
invPeriodicBoxSize
.
y
;
real
zscale
=
GRID_SIZE_Z
*
invPeriodicBoxSize
.
z
;
for
(
int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
i
<
NUM_ATOMS
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
inducedDipole
[
3
*
i
]
-=
xscale
*
phid
[
10
*
i
+
1
];
inducedDipole
[
3
*
i
+
1
]
-=
yscale
*
phid
[
10
*
i
+
2
];
inducedDipole
[
3
*
i
+
2
]
-=
zscale
*
phid
[
10
*
i
+
3
];
inducedDipolePolar
[
3
*
i
]
-=
xscale
*
phip
[
10
*
i
+
1
];
inducedDipolePolar
[
3
*
i
+
1
]
-=
yscale
*
phip
[
10
*
i
+
2
];
inducedDipolePolar
[
3
*
i
+
2
]
-=
zscale
*
phip
[
10
*
i
+
3
];
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment