Commit f352d116 authored by Peter Eastman's avatar Peter Eastman
Browse files

Continuing to convert AmoebaMultipoleForce: PME with direct polarization now works

parent 7a60fd73
......@@ -1166,11 +1166,13 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
pmeUpdateBsplinesKernel = cu.getKernel(module, "updateBsplines");
pmeAtomRangeKernel = cu.getKernel(module, "findAtomRangeForGrid");
pmeSpreadFixedMultipolesKernel = cu.getKernel(module, "gridSpreadFixedMultipoles");
pmeSpreadInducedDipolesKernel = cu.getKernel(module, "gridSpreadInducedDipoles");
pmeConvolutionKernel = cu.getKernel(module, "reciprocalConvolution");
pmeFixedPotentialKernel = cu.getKernel(module, "computeFixedPotentialFromGrid");
pmeInducedPotentialKernel = cu.getKernel(module, "computeInducedPotentialFromGrid");
pmeFixedForceKernel = cu.getKernel(module, "computeFixedMultipoleForceAndEnergy");
// pmeInterpolateForceKernel = cu.getKernel(module, "gridInterpolateForce");
// pmeFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadCharge");
pmeInducedForceKernel = cu.getKernel(module, "computeInducedDipoleForceAndEnergy");
pmeRecordInducedFieldDipolesKernel = cu.getKernel(module, "recordInducedFieldDipoles");
// cuFuncSetCacheConfig(pmeInterpolateForceKernel, CU_FUNC_CACHE_PREFER_L1);
// Create required data structures.
......@@ -1415,12 +1417,15 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
&labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(), &inducedDipole->getDevicePointer(),
&inducedDipolePolar->getDevicePointer(), &dampingAndThole->getDevicePointer()};
cu.executeKernel(electrostaticsKernel, electrostaticsArgs, numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
// Map torques to force.
void* mapTorqueArgs[] = {&cu.getForce().getDevicePointer(), &torque->getDevicePointer(),
&cu.getPosq().getDevicePointer(), &multipoleParticles->getDevicePointer()};
cu.executeKernel(mapTorqueKernel, mapTorqueArgs, cu.getNumAtoms());
}
else {
// Compute induced dipoles.
// Reciprocal space calculation.
unsigned int maxTiles = nb.getInteractingTiles().getSize();
void* pmeUpdateBsplinesArgs[] = {&cu.getPosq().getDevicePointer(), &pmeIgrid->getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(),
......@@ -1433,9 +1438,8 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
cu.executeKernel(pmeAtomRangeKernel, pmeAtomRangeArgs, cu.getNumAtoms(), cu.ThreadBlockSize, cu.ThreadBlockSize*PmeOrder*PmeOrder*elementSize);
void* pmeSpreadFixedMultipolesArgs[] = {&cu.getPosq().getDevicePointer(), &labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(),
&pmeGrid->getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(), &pmeAtomRange->getDevicePointer(),
&pmeTheta1->getDevicePointer(), &pmeTheta2->getDevicePointer(), &pmeTheta3->getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer()};
cu.executeKernel(pmeSpreadFixedMultipolesKernel, pmeSpreadFixedMultipolesArgs, cu.getNumAtoms(), cu.ThreadBlockSize, cu.ThreadBlockSize*PmeOrder*PmeOrder*elementSize);
&pmeTheta1->getDevicePointer(), &pmeTheta2->getDevicePointer(), &pmeTheta3->getDevicePointer(), cu.getInvPeriodicBoxSizePointer()};
cu.executeKernel(pmeSpreadFixedMultipolesKernel, pmeSpreadFixedMultipolesArgs, cu.getNumAtoms());
if (cu.getUseDoublePrecision())
cufftExecZ2Z(fft, (double2*) pmeGrid->getDevicePointer(), (double2*) pmeGrid->getDevicePointer(), CUFFT_FORWARD);
else
......@@ -1448,23 +1452,16 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
else
cufftExecC2C(fft, (float2*) pmeGrid->getDevicePointer(), (float2*) pmeGrid->getDevicePointer(), CUFFT_INVERSE);
void* pmeFixedPotentialArgs[] = {&pmeGrid->getDevicePointer(), &pmePhi->getDevicePointer(), &field->getDevicePointer(),
&pmeIgrid->getDevicePointer(), &pmeTheta1->getDevicePointer(), &pmeTheta2->getDevicePointer(), &pmeTheta3->getDevicePointer(),
&labFrameDipoles->getDevicePointer(), cu.getInvPeriodicBoxSizePointer()};
&fieldPolar ->getDevicePointer(), &pmeIgrid->getDevicePointer(), &pmeTheta1->getDevicePointer(), &pmeTheta2->getDevicePointer(),
&pmeTheta3->getDevicePointer(), &labFrameDipoles->getDevicePointer(), cu.getInvPeriodicBoxSizePointer()};
cu.executeKernel(pmeFixedPotentialKernel, pmeFixedPotentialArgs, cu.getNumAtoms());
void* pmeFixedForceArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &torque->getDevicePointer(),
&cu.getEnergyBuffer().getDevicePointer(), &labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(),
&pmePhi->getDevicePointer(), cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()};
&pmePhi->getDevicePointer(), cu.getInvPeriodicBoxSizePointer()};
cu.executeKernel(pmeFixedForceKernel, pmeFixedForceArgs, cu.getNumAtoms());
printf("reciprocal:\n");
vector<long long> f;
printf("force\n");
cu.getForce().download(f);
for (int i = 0; i < cu.getNumAtoms(); i++)
printf("%d: %g %g %g\n", i, f[i]/(double) 0xFFFFFFFF, f[i+cu.getPaddedNumAtoms()]/(double) 0xFFFFFFFF, f[i+cu.getPaddedNumAtoms()*2]/(double) 0xFFFFFFFF);
// printf("torque\n");
// torque->download(f);
// for (int i = 0; i < cu.getNumAtoms(); i++)
// printf("%d: %g %g %g\n", i, f[i]/(double) 0xFFFFFFFF, f[i+cu.getPaddedNumAtoms()]/(double) 0xFFFFFFFF, f[i+cu.getPaddedNumAtoms()*2]/(double) 0xFFFFFFFF);
// Direct space calculation.
void* computeFixedFieldArgs[] = {&field->getDevicePointer(), &fieldPolar->getDevicePointer(), &cu.getPosq().getDevicePointer(),
&nb.getExclusionIndices().getDevicePointer(), &nb.getExclusionRowIndices().getDevicePointer(),
&covalentFlags->getDevicePointer(), &polarizationGroupFlags->getDevicePointer(), &startTileIndex, &numTileIndices,
......@@ -1475,38 +1472,29 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
void* recordInducedDipolesArgs[] = {&field->getDevicePointer(), &fieldPolar->getDevicePointer(),
&inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), &polarizability->getDevicePointer()};
cu.executeKernel(recordInducedDipolesKernel, recordInducedDipolesArgs, cu.getNumAtoms());
printf("direct:\n");
printf("force\n");
cu.getForce().download(f);
for (int i = 0; i < cu.getNumAtoms(); i++)
printf("%d: %g %g %g\n", i, f[i]/(double) 0xFFFFFFFF, f[i+cu.getPaddedNumAtoms()]/(double) 0xFFFFFFFF, f[i+cu.getPaddedNumAtoms()*2]/(double) 0xFFFFFFFF);
// printf("torque\n");
// torque->download(f);
// for (int i = 0; i < cu.getNumAtoms(); i++)
// printf("%d: %g %g %g\n", i, f[i]/(double) 0xFFFFFFFF, f[i+cu.getPaddedNumAtoms()]/(double) 0xFFFFFFFF, f[i+cu.getPaddedNumAtoms()*2]/(double) 0xFFFFFFFF);
// vector<float> d, dp;
// printf("phi\n");
// pmePhi->download(d);
// for (int i = 0; i < d.size(); i++)
// printf("%d: %g\n", i, d[i]);
// printf("dipoles\n");
// labFrameDipoles->download(d);
// for (int i = 0; i < cu.getNumAtoms(); i++)
// printf("%d: %g %g %g\n", i, d[3*i], d[3*i+1], d[3*i+2]);
// printf("quadrupoles\n");
// labFrameQuadrupoles->download(d);
// for (int i = 0; i < cu.getNumAtoms(); i++)
// printf("%d: %g %g %g %g %g %g\n", i, d[5*i], d[5*i+1], d[5*i+2], d[5*i+3], d[5*i+4], -(d[5*i]+d[5*i+3]));
// printf("induced dipoles\n");
// inducedDipole->download(d);
// inducedDipolePolar->download(dp);
// for (int i = 0; i < cu.getNumAtoms(); i++)
// printf("%d: %g %g %g, %g %g %g\n", i, d[3*i], d[3*i+1], d[3*i+2], dp[3*i], dp[3*i+1], dp[3*i+2]);
// printf("positions\n");
// vector<float4> p;
// cu.getPosq().download(p);
// for (int i = 0; i < cu.getNumAtoms(); i++)
// printf("%d: %g %g %g %g\n", i, p[i].x, p[i].y, p[i].z, p[i].w);
// Reciprocal space calculation for the induced dipoles.
void* pmeSpreadInducedDipolesArgs[] = {&cu.getPosq().getDevicePointer(), &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(),
&pmeGrid->getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(), &pmeAtomRange->getDevicePointer(),
&pmeTheta1->getDevicePointer(), &pmeTheta2->getDevicePointer(), &pmeTheta3->getDevicePointer(), cu.getInvPeriodicBoxSizePointer()};
cu.executeKernel(pmeSpreadInducedDipolesKernel, pmeSpreadInducedDipolesArgs, cu.getNumAtoms());
if (cu.getUseDoublePrecision())
cufftExecZ2Z(fft, (double2*) pmeGrid->getDevicePointer(), (double2*) pmeGrid->getDevicePointer(), CUFFT_FORWARD);
else
cufftExecC2C(fft, (float2*) pmeGrid->getDevicePointer(), (float2*) pmeGrid->getDevicePointer(), CUFFT_FORWARD);
cu.executeKernel(pmeConvolutionKernel, pmeConvolutionArgs, cu.getNumAtoms());
if (cu.getUseDoublePrecision())
cufftExecZ2Z(fft, (double2*) pmeGrid->getDevicePointer(), (double2*) pmeGrid->getDevicePointer(), CUFFT_INVERSE);
else
cufftExecC2C(fft, (float2*) pmeGrid->getDevicePointer(), (float2*) pmeGrid->getDevicePointer(), CUFFT_INVERSE);
void* pmeInducedPotentialArgs[] = {&pmeGrid->getDevicePointer(), &pmePhid->getDevicePointer(), &pmePhip->getDevicePointer(),
&pmePhidp->getDevicePointer(), &pmeIgrid->getDevicePointer(), &pmeTheta1->getDevicePointer(), &pmeTheta2->getDevicePointer(),
&pmeTheta3->getDevicePointer(), cu.getInvPeriodicBoxSizePointer()};
cu.executeKernel(pmeInducedPotentialKernel, pmeInducedPotentialArgs, cu.getNumAtoms());
// void* pmeRecordInducedFieldDipolesArgs[] = {&pmePhid->getDevicePointer(), &pmePhip->getDevicePointer(),
// &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), cu.getInvPeriodicBoxSizePointer()};
// cu.executeKernel(pmeRecordInducedFieldDipolesKernel, pmeRecordInducedFieldDipolesArgs, cu.getNumAtoms());
// vector<float2> errors;
......@@ -1541,11 +1529,14 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
&labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(), &inducedDipole->getDevicePointer(),
&inducedDipolePolar->getDevicePointer(), &dampingAndThole->getDevicePointer()};
cu.executeKernel(electrostaticsKernel, electrostaticsArgs, numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
printf("electrostatic:\n");
printf("force\n");
cu.getForce().download(f);
for (int i = 0; i < cu.getNumAtoms(); i++)
printf("%d: %g %g %g\n", i, f[i]/(double) 0xFFFFFFFF, f[i+cu.getPaddedNumAtoms()]/(double) 0xFFFFFFFF, f[i+cu.getPaddedNumAtoms()*2]/(double) 0xFFFFFFFF);
void* pmeInducedForceArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &torque->getDevicePointer(),
&cu.getEnergyBuffer().getDevicePointer(), &labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(),
&inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), &pmePhi->getDevicePointer(), &pmePhid->getDevicePointer(),
&pmePhip->getDevicePointer(), &pmePhidp->getDevicePointer(), cu.getInvPeriodicBoxSizePointer()};
cu.executeKernel(pmeInducedForceKernel, pmeInducedForceArgs, cu.getNumAtoms());
// Map torques to force.
void* mapTorqueArgs[] = {&cu.getForce().getDevicePointer(), &torque->getDevicePointer(),
&cu.getPosq().getDevicePointer(), &multipoleParticles->getDevicePointer()};
cu.executeKernel(mapTorqueKernel, mapTorqueArgs, cu.getNumAtoms());
......
......@@ -425,7 +425,8 @@ private:
CudaSort* sort;
cufftHandle fft;
CUfunction computeMomentsKernel, recordInducedDipolesKernel, computeFixedFieldKernel, computeInducedFieldKernel, updateInducedFieldKernel, electrostaticsKernel, mapTorqueKernel;
CUfunction pmeUpdateBsplinesKernel, pmeAtomRangeKernel, pmeSpreadFixedMultipolesKernel, pmeConvolutionKernel, pmeFixedPotentialKernel, pmeFixedForceKernel;
CUfunction pmeUpdateBsplinesKernel, pmeAtomRangeKernel, pmeSpreadFixedMultipolesKernel, pmeSpreadInducedDipolesKernel, pmeConvolutionKernel, pmeFixedPotentialKernel, pmeInducedPotentialKernel;
CUfunction pmeFixedForceKernel, pmeInducedForceKernel, pmeRecordInducedFieldDipolesKernel;
static const int PmeOrder = 5;
};
......
......@@ -161,8 +161,7 @@ extern "C" __global__ void findAtomRangeForGrid(int2* __restrict__ pmeAtomGridIn
}
extern "C" __global__ void gridSpreadFixedMultipoles(const real4* __restrict__ posq, const real* __restrict__ labFrameDipole,
const real* __restrict__ labFrameQuadrupole, real2* __restrict__ pmeGrid, int2* __restrict__ pmeAtomGridIndex, int* __restrict__ pmeAtomRange,
const real4* __restrict__ theta1, const real4* __restrict__ theta2, const real4* __restrict__ theta3,
real4 periodicBoxSize, real4 invPeriodicBoxSize) {
const real4* __restrict__ theta1, const real4* __restrict__ theta2, const real4* __restrict__ theta3, real4 invPeriodicBoxSize) {
const real xscale = GRID_SIZE_X*invPeriodicBoxSize.x;
const real yscale = GRID_SIZE_Y*invPeriodicBoxSize.y;
const real zscale = GRID_SIZE_Z*invPeriodicBoxSize.z;
......@@ -248,88 +247,90 @@ extern "C" __global__ void gridSpreadFixedMultipoles(const real4* __restrict__ p
}
}
//extern "C" __global__ void kGridSpreadInducedDipoles_kernel() {
// const real xscale = GRID_SIZE_X*invPeriodicBoxSize.x;
// const real yscale = GRID_SIZE_Y*invPeriodicBoxSize.y;
// const real zscale = GRID_SIZE_Z*invPeriodicBoxSize.z;
// unsigned int numGridPoints = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
// unsigned int numThreads = gridDim.x*blockDim.x;
// for (int gridIndex = blockIdx.x*blockDim.x+threadIdx.x; gridIndex < numGridPoints; gridIndex += numThreads) {
// int3 gridPoint;
// gridPoint.x = gridIndex/(GRID_SIZE_Y*GRID_SIZE_Z);
// int remainder = gridIndex-gridPoint.x*GRID_SIZE_Y*GRID_SIZE_Z;
// gridPoint.y = remainder/GRID_SIZE_Z;
// gridPoint.z = remainder-gridPoint.y*GRID_SIZE_Z;
// real2 result = make_real2(0, 0);
// for (int ix = 0; ix < PME_ORDER; ++ix) {
// int x = gridPoint.x-ix+(gridPoint.x >= ix ? 0 : GRID_SIZE_X);
// for (int iy = 0; iy < PME_ORDER; ++iy) {
// int y = gridPoint.y-iy+(gridPoint.y >= iy ? 0 : GRID_SIZE_Y);
// int z1 = gridPoint.z-PME_ORDER+1;
// z1 += (z1 >= 0 ? 0 : GRID_SIZE_Z);
// int z2 = (z1 < gridPoint.z ? gridPoint.z : GRID_SIZE_Z-1);
// int gridIndex1 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z1;
// int gridIndex2 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z2;
// int firstAtom = pmeAtomRange[gridIndex1];
// int lastAtom = pmeAtomRange[gridIndex2+1];
// for (int i = firstAtom; i < lastAtom; ++i) {
// int2 atomData = pmeAtomGridIndex[i];
// int atomIndex = atomData.x;
// int z = atomData.y;
// int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : GRID_SIZE_Z);
// if (iz >= GRID_SIZE_Z)
// iz -= GRID_SIZE_Z;
// real inducedDipoleX = xscale*cAmoebaSim.pInducedDipole[atomIndex*3];
// real inducedDipoleY = yscale*cAmoebaSim.pInducedDipole[atomIndex*3+1];
// real inducedDipoleZ = zscale*cAmoebaSim.pInducedDipole[atomIndex*3+2];
// real inducedDipolePolarX = xscale*cAmoebaSim.pInducedDipolePolar[atomIndex*3];
// real inducedDipolePolarY = yscale*cAmoebaSim.pInducedDipolePolar[atomIndex*3+1];
// real inducedDipolePolarZ = zscale*cAmoebaSim.pInducedDipolePolar[atomIndex*3+2];
// real4 t = theta1[atomIndex*PME_ORDER+ix];
// real4 u = theta2[atomIndex*PME_ORDER+iy];
// real4 v = theta3[atomIndex*PME_ORDER+iz];
// real term01 = inducedDipoleY*u.y*v.x + inducedDipoleZ*u.x*v.y;
// real term11 = inducedDipoleX*u.x*v.x;
// real term02 = inducedDipolePolarY*u.y*v.x + inducedDipolePolarZ*u.x*v.y;
// real term12 = inducedDipolePolarX*u.x*v.x;
// result.x += term01*t.x + term11*t.y;
// result.y += term02*t.x + term12*t.y;
// }
// if (z1 > gridPoint.z) {
// gridIndex1 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z;
// gridIndex2 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+gridPoint.z;
// firstAtom = pmeAtomRange[gridIndex1];
// lastAtom = pmeAtomRange[gridIndex2+1];
// for (int i = firstAtom; i < lastAtom; ++i) {
// int2 atomData = pmeAtomGridIndex[i];
// int atomIndex = atomData.x;
// int z = atomData.y;
// int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : GRID_SIZE_Z);
// if (iz >= GRID_SIZE_Z)
// iz -= GRID_SIZE_Z;
// real inducedDipoleX = xscale*cAmoebaSim.pInducedDipole[atomIndex*3];
// real inducedDipoleY = yscale*cAmoebaSim.pInducedDipole[atomIndex*3+1];
// real inducedDipoleZ = zscale*cAmoebaSim.pInducedDipole[atomIndex*3+2];
// real inducedDipolePolarX = xscale*cAmoebaSim.pInducedDipolePolar[atomIndex*3];
// real inducedDipolePolarY = yscale*cAmoebaSim.pInducedDipolePolar[atomIndex*3+1];
// real inducedDipolePolarZ = zscale*cAmoebaSim.pInducedDipolePolar[atomIndex*3+2];
// real4 t = theta1[atomIndex*PME_ORDER+ix];
// real4 u = theta2[atomIndex*PME_ORDER+iy];
// real4 v = theta3[atomIndex*PME_ORDER+iz];
// real term01 = inducedDipoleY*u.y*v.x + inducedDipoleZ*u.x*v.y;
// real term11 = inducedDipoleX*u.x*v.x;
// real term02 = inducedDipolePolarY*u.y*v.x + inducedDipolePolarZ*u.x*v.y;
// real term12 = inducedDipolePolarX*u.x*v.x;
// result.x += term01*t.x + term11*t.y;
// result.y += term02*t.x + term12*t.y;
// }
// }
// }
// }
// pmeGrid[gridIndex] = result;
// }
//}
//
extern "C" __global__ void gridSpreadInducedDipoles(const real4* __restrict__ posq, const real* __restrict__ inducedDipole,
const real* __restrict__ inducedDipolePolar, real2* __restrict__ pmeGrid, int2* __restrict__ pmeAtomGridIndex, int* __restrict__ pmeAtomRange,
const real4* __restrict__ theta1, const real4* __restrict__ theta2, const real4* __restrict__ theta3, real4 invPeriodicBoxSize) {
const real xscale = GRID_SIZE_X*invPeriodicBoxSize.x;
const real yscale = GRID_SIZE_Y*invPeriodicBoxSize.y;
const real zscale = GRID_SIZE_Z*invPeriodicBoxSize.z;
unsigned int numGridPoints = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
unsigned int numThreads = gridDim.x*blockDim.x;
for (int gridIndex = blockIdx.x*blockDim.x+threadIdx.x; gridIndex < numGridPoints; gridIndex += numThreads) {
int3 gridPoint;
gridPoint.x = gridIndex/(GRID_SIZE_Y*GRID_SIZE_Z);
int remainder = gridIndex-gridPoint.x*GRID_SIZE_Y*GRID_SIZE_Z;
gridPoint.y = remainder/GRID_SIZE_Z;
gridPoint.z = remainder-gridPoint.y*GRID_SIZE_Z;
real2 result = make_real2(0, 0);
for (int ix = 0; ix < PME_ORDER; ++ix) {
int x = gridPoint.x-ix+(gridPoint.x >= ix ? 0 : GRID_SIZE_X);
for (int iy = 0; iy < PME_ORDER; ++iy) {
int y = gridPoint.y-iy+(gridPoint.y >= iy ? 0 : GRID_SIZE_Y);
int z1 = gridPoint.z-PME_ORDER+1;
z1 += (z1 >= 0 ? 0 : GRID_SIZE_Z);
int z2 = (z1 < gridPoint.z ? gridPoint.z : GRID_SIZE_Z-1);
int gridIndex1 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z1;
int gridIndex2 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z2;
int firstAtom = pmeAtomRange[gridIndex1];
int lastAtom = pmeAtomRange[gridIndex2+1];
for (int i = firstAtom; i < lastAtom; ++i) {
int2 atomData = pmeAtomGridIndex[i];
int atomIndex = atomData.x;
int z = atomData.y;
int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : GRID_SIZE_Z);
if (iz >= GRID_SIZE_Z)
iz -= GRID_SIZE_Z;
real inducedDipoleX = xscale*inducedDipole[atomIndex*3];
real inducedDipoleY = yscale*inducedDipole[atomIndex*3+1];
real inducedDipoleZ = zscale*inducedDipole[atomIndex*3+2];
real inducedDipolePolarX = xscale*inducedDipolePolar[atomIndex*3];
real inducedDipolePolarY = yscale*inducedDipolePolar[atomIndex*3+1];
real inducedDipolePolarZ = zscale*inducedDipolePolar[atomIndex*3+2];
real4 t = theta1[atomIndex*PME_ORDER+ix];
real4 u = theta2[atomIndex*PME_ORDER+iy];
real4 v = theta3[atomIndex*PME_ORDER+iz];
real term01 = inducedDipoleY*u.y*v.x + inducedDipoleZ*u.x*v.y;
real term11 = inducedDipoleX*u.x*v.x;
real term02 = inducedDipolePolarY*u.y*v.x + inducedDipolePolarZ*u.x*v.y;
real term12 = inducedDipolePolarX*u.x*v.x;
result.x += term01*t.x + term11*t.y;
result.y += term02*t.x + term12*t.y;
}
if (z1 > gridPoint.z) {
gridIndex1 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z;
gridIndex2 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+gridPoint.z;
firstAtom = pmeAtomRange[gridIndex1];
lastAtom = pmeAtomRange[gridIndex2+1];
for (int i = firstAtom; i < lastAtom; ++i) {
int2 atomData = pmeAtomGridIndex[i];
int atomIndex = atomData.x;
int z = atomData.y;
int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : GRID_SIZE_Z);
if (iz >= GRID_SIZE_Z)
iz -= GRID_SIZE_Z;
real inducedDipoleX = xscale*inducedDipole[atomIndex*3];
real inducedDipoleY = yscale*inducedDipole[atomIndex*3+1];
real inducedDipoleZ = zscale*inducedDipole[atomIndex*3+2];
real inducedDipolePolarX = xscale*inducedDipolePolar[atomIndex*3];
real inducedDipolePolarY = yscale*inducedDipolePolar[atomIndex*3+1];
real inducedDipolePolarZ = zscale*inducedDipolePolar[atomIndex*3+2];
real4 t = theta1[atomIndex*PME_ORDER+ix];
real4 u = theta2[atomIndex*PME_ORDER+iy];
real4 v = theta3[atomIndex*PME_ORDER+iz];
real term01 = inducedDipoleY*u.y*v.x + inducedDipoleZ*u.x*v.y;
real term11 = inducedDipoleX*u.x*v.x;
real term02 = inducedDipolePolarY*u.y*v.x + inducedDipolePolarZ*u.x*v.y;
real term12 = inducedDipolePolarX*u.x*v.x;
result.x += term01*t.x + term11*t.y;
result.y += term02*t.x + term12*t.y;
}
}
}
}
pmeGrid[gridIndex] = result;
}
}
extern "C" __global__ void reciprocalConvolution(real2* __restrict__ pmeGrid, const real* __restrict__ pmeBsplineModuliX,
const real* __restrict__ pmeBsplineModuliY, const real* __restrict__ pmeBsplineModuliZ, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
......@@ -362,7 +363,7 @@ extern "C" __global__ void reciprocalConvolution(real2* __restrict__ pmeGrid, co
}
extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict__ pmeGrid, real* __restrict__ phi,
long long* __restrict__ fieldBuffers, const int4* __restrict__ igrid, const real4* __restrict__ theta1,
long long* __restrict__ fieldBuffers, long long* __restrict__ fieldPolarBuffers, const int4* __restrict__ igrid, const real4* __restrict__ theta1,
const real4* __restrict__ theta2, const real4* __restrict__ theta3, const real* __restrict__ labFrameDipole, real4 invPeriodicBoxSize) {
// extract the permanent multipole field at each site
......@@ -468,216 +469,224 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict
phi[20*m+18] = tuv012;
phi[20*m+19] = tuv111;
real dipoleScale = (4/(real) 3)*(EWALD_ALPHA*EWALD_ALPHA*EWALD_ALPHA)/SQRT(M_PI);
fieldBuffers[m] = (long long) ((dipoleScale*labFrameDipole[m*3]-GRID_SIZE_X*invPeriodicBoxSize.x*tuv100)*0xFFFFFFFF);
fieldBuffers[m+PADDED_NUM_ATOMS] = (long long) ((dipoleScale*labFrameDipole[m*3+1]-GRID_SIZE_Y*invPeriodicBoxSize.y*tuv010)*0xFFFFFFFF);
fieldBuffers[m+2*PADDED_NUM_ATOMS] = (long long) ((dipoleScale*labFrameDipole[m*3+2]-GRID_SIZE_Z*invPeriodicBoxSize.z*tuv001)*0xFFFFFFFF);
long long fieldx = (long long) ((dipoleScale*labFrameDipole[m*3]-GRID_SIZE_X*invPeriodicBoxSize.x*tuv100)*0xFFFFFFFF);
fieldBuffers[m] = fieldx;
fieldPolarBuffers[m] = fieldx;
long long fieldy = (long long) ((dipoleScale*labFrameDipole[m*3+1]-GRID_SIZE_Y*invPeriodicBoxSize.y*tuv010)*0xFFFFFFFF);
fieldBuffers[m+PADDED_NUM_ATOMS] = fieldy;
fieldPolarBuffers[m+PADDED_NUM_ATOMS] = fieldy;
long long fieldz = (long long) ((dipoleScale*labFrameDipole[m*3+2]-GRID_SIZE_Z*invPeriodicBoxSize.z*tuv001)*0xFFFFFFFF);
fieldBuffers[m+2*PADDED_NUM_ATOMS] = fieldz;
fieldPolarBuffers[m+2*PADDED_NUM_ATOMS] = fieldz;
}
}
//extern "C" __global__ void kComputeInducedPotentialFromGrid_kernel() {
// // extract the induced dipole field at each site
//
// for (int m = blockIdx.x*blockDim.x+threadIdx.x; m < NUM_ATOMS; m += blockDim.x*gridDim.x) {
// int4 gridPoint = igrid[m];
// real tuv100_1 = 0;
// real tuv010_1 = 0;
// real tuv001_1 = 0;
// real tuv200_1 = 0;
// real tuv020_1 = 0;
// real tuv002_1 = 0;
// real tuv110_1 = 0;
// real tuv101_1 = 0;
// real tuv011_1 = 0;
// real tuv100_2 = 0;
// real tuv010_2 = 0;
// real tuv001_2 = 0;
// real tuv200_2 = 0;
// real tuv020_2 = 0;
// real tuv002_2 = 0;
// real tuv110_2 = 0;
// real tuv101_2 = 0;
// real tuv011_2 = 0;
// real tuv000 = 0;
// real tuv001 = 0;
// real tuv010 = 0;
// real tuv100 = 0;
// real tuv200 = 0;
// real tuv020 = 0;
// real tuv002 = 0;
// real tuv110 = 0;
// real tuv101 = 0;
// real tuv011 = 0;
// real tuv300 = 0;
// real tuv030 = 0;
// real tuv003 = 0;
// real tuv210 = 0;
// real tuv201 = 0;
// real tuv120 = 0;
// real tuv021 = 0;
// real tuv102 = 0;
// real tuv012 = 0;
// real tuv111 = 0;
// for (int iz = 0; iz < PME_ORDER; iz++) {
// int k = gridPoint.z+iz-(gridPoint.z+iz >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
// real4 v = theta3[m*PME_ORDER+iz];
// real tu00_1 = 0;
// real tu01_1 = 0;
// real tu10_1 = 0;
// real tu20_1 = 0;
// real tu11_1 = 0;
// real tu02_1 = 0;
// real tu00_2 = 0;
// real tu01_2 = 0;
// real tu10_2 = 0;
// real tu20_2 = 0;
// real tu11_2 = 0;
// real tu02_2 = 0;
// real tu00 = 0;
// real tu10 = 0;
// real tu01 = 0;
// real tu20 = 0;
// real tu11 = 0;
// real tu02 = 0;
// real tu30 = 0;
// real tu21 = 0;
// real tu12 = 0;
// real tu03 = 0;
// for (int iy = 0; iy < PME_ORDER; iy++) {
// int j = gridPoint.y+iy-(gridPoint.y+iy >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
// real4 u = theta2[m*PME_ORDER+iy];
// real t0_1 = 0;
// real t1_1 = 0;
// real t2_1 = 0;
// real t0_2 = 0;
// real t1_2 = 0;
// real t2_2 = 0;
// real t3 = 0;
// for (int ix = 0; ix < PME_ORDER; ix++) {
// int i = gridPoint.x+ix-(gridPoint.x+ix >= GRID_SIZE_X ? GRID_SIZE_X : 0);
// int gridIndex = i*GRID_SIZE_Y*GRID_SIZE_Z + j*GRID_SIZE_Z + k;
// real2 tq = pmeGrid[gridIndex];
// real4 tadd = theta1[m*PME_ORDER+ix];
// t0_1 += tq.x*tadd.x;
// t1_1 += tq.x*tadd.y;
// t2_1 += tq.x*tadd.z;
// t0_2 += tq.y*tadd.x;
// t1_2 += tq.y*tadd.y;
// t2_2 += tq.y*tadd.z;
// t3 += (tq.x+tq.y)*tadd.w;
// }
// tu00_1 += t0_1*u.x;
// tu10_1 += t1_1*u.x;
// tu01_1 += t0_1*u.y;
// tu20_1 += t2_1*u.x;
// tu11_1 += t1_1*u.y;
// tu02_1 += t0_1*u.z;
// tu00_2 += t0_2*u.x;
// tu10_2 += t1_2*u.x;
// tu01_2 += t0_2*u.y;
// tu20_2 += t2_2*u.x;
// tu11_2 += t1_2*u.y;
// tu02_2 += t0_2*u.z;
// real t0 = t0_1 + t0_2;
// real t1 = t1_1 + t1_2;
// real t2 = t2_1 + t2_2;
// tu00 += t0*u.x;
// tu10 += t1*u.x;
// tu01 += t0*u.y;
// tu20 += t2*u.x;
// tu11 += t1*u.y;
// tu02 += t0*u.z;
// tu30 += t3*u.x;
// tu21 += t2*u.y;
// tu12 += t1*u.z;
// tu03 += t0*u.w;
// }
// tuv100_1 += tu10_1*v.x;
// tuv010_1 += tu01_1*v.x;
// tuv001_1 += tu00_1*v.y;
// tuv200_1 += tu20_1*v.x;
// tuv020_1 += tu02_1*v.x;
// tuv002_1 += tu00_1*v.z;
// tuv110_1 += tu11_1*v.x;
// tuv101_1 += tu10_1*v.y;
// tuv011_1 += tu01_1*v.y;
// tuv100_2 += tu10_2*v.x;
// tuv010_2 += tu01_2*v.x;
// tuv001_2 += tu00_2*v.y;
// tuv200_2 += tu20_2*v.x;
// tuv020_2 += tu02_2*v.x;
// tuv002_2 += tu00_2*v.z;
// tuv110_2 += tu11_2*v.x;
// tuv101_2 += tu10_2*v.y;
// tuv011_2 += tu01_2*v.y;
// tuv000 += tu00*v.x;
// tuv100 += tu10*v.x;
// tuv010 += tu01*v.x;
// tuv001 += tu00*v.y;
// tuv200 += tu20*v.x;
// tuv020 += tu02*v.x;
// tuv002 += tu00*v.z;
// tuv110 += tu11*v.x;
// tuv101 += tu10*v.y;
// tuv011 += tu01*v.y;
// tuv300 += tu30*v.x;
// tuv030 += tu03*v.x;
// tuv003 += tu00*v.w;
// tuv210 += tu21*v.x;
// tuv201 += tu20*v.y;
// tuv120 += tu12*v.x;
// tuv021 += tu02*v.y;
// tuv102 += tu10*v.z;
// tuv012 += tu01*v.z;
// tuv111 += tu11*v.y;
// }
// phid[10*m] = 0;
// phid[10*m+1] = tuv100_1;
// phid[10*m+2] = tuv010_1;
// phid[10*m+3] = tuv001_1;
// phid[10*m+4] = tuv200_1;
// phid[10*m+5] = tuv020_1;
// phid[10*m+6] = tuv002_1;
// phid[10*m+7] = tuv110_1;
// phid[10*m+8] = tuv101_1;
// phid[10*m+9] = tuv011_1;
//
// phip[10*m] = 0;
// phip[10*m+1] = tuv100_2;
// phip[10*m+2] = tuv010_2;
// phip[10*m+3] = tuv001_2;
// phip[10*m+4] = tuv200_2;
// phip[10*m+5] = tuv020_2;
// phip[10*m+6] = tuv002_2;
// phip[10*m+7] = tuv110_2;
// phip[10*m+8] = tuv101_2;
// phip[10*m+9] = tuv011_2;
//
// phidp[20*m] = tuv000;
// phidp[20*m+1] = tuv100;
// phidp[20*m+2] = tuv010;
// phidp[20*m+3] = tuv001;
// phidp[20*m+4] = tuv200;
// phidp[20*m+5] = tuv020;
// phidp[20*m+6] = tuv002;
// phidp[20*m+7] = tuv110;
// phidp[20*m+8] = tuv101;
// phidp[20*m+9] = tuv011;
// phidp[20*m+10] = tuv300;
// phidp[20*m+11] = tuv030;
// phidp[20*m+12] = tuv003;
// phidp[20*m+13] = tuv210;
// phidp[20*m+14] = tuv201;
// phidp[20*m+15] = tuv120;
// phidp[20*m+16] = tuv021;
// phidp[20*m+17] = tuv102;
// phidp[20*m+18] = tuv012;
// phidp[20*m+19] = tuv111;
// }
//}
extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restrict__ pmeGrid, real* __restrict__ phid,
real* __restrict__ phip, real* __restrict__ phidp, const int4* __restrict__ igrid, const real4* __restrict__ theta1,
const real4* __restrict__ theta2, const real4* __restrict__ theta3, real4 invPeriodicBoxSize) {
// extract the induced dipole field at each site
for (int m = blockIdx.x*blockDim.x+threadIdx.x; m < NUM_ATOMS; m += blockDim.x*gridDim.x) {
int4 gridPoint = igrid[m];
real tuv100_1 = 0;
real tuv010_1 = 0;
real tuv001_1 = 0;
real tuv200_1 = 0;
real tuv020_1 = 0;
real tuv002_1 = 0;
real tuv110_1 = 0;
real tuv101_1 = 0;
real tuv011_1 = 0;
real tuv100_2 = 0;
real tuv010_2 = 0;
real tuv001_2 = 0;
real tuv200_2 = 0;
real tuv020_2 = 0;
real tuv002_2 = 0;
real tuv110_2 = 0;
real tuv101_2 = 0;
real tuv011_2 = 0;
real tuv000 = 0;
real tuv001 = 0;
real tuv010 = 0;
real tuv100 = 0;
real tuv200 = 0;
real tuv020 = 0;
real tuv002 = 0;
real tuv110 = 0;
real tuv101 = 0;
real tuv011 = 0;
real tuv300 = 0;
real tuv030 = 0;
real tuv003 = 0;
real tuv210 = 0;
real tuv201 = 0;
real tuv120 = 0;
real tuv021 = 0;
real tuv102 = 0;
real tuv012 = 0;
real tuv111 = 0;
for (int iz = 0; iz < PME_ORDER; iz++) {
int k = gridPoint.z+iz-(gridPoint.z+iz >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
real4 v = theta3[m*PME_ORDER+iz];
real tu00_1 = 0;
real tu01_1 = 0;
real tu10_1 = 0;
real tu20_1 = 0;
real tu11_1 = 0;
real tu02_1 = 0;
real tu00_2 = 0;
real tu01_2 = 0;
real tu10_2 = 0;
real tu20_2 = 0;
real tu11_2 = 0;
real tu02_2 = 0;
real tu00 = 0;
real tu10 = 0;
real tu01 = 0;
real tu20 = 0;
real tu11 = 0;
real tu02 = 0;
real tu30 = 0;
real tu21 = 0;
real tu12 = 0;
real tu03 = 0;
for (int iy = 0; iy < PME_ORDER; iy++) {
int j = gridPoint.y+iy-(gridPoint.y+iy >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
real4 u = theta2[m*PME_ORDER+iy];
real t0_1 = 0;
real t1_1 = 0;
real t2_1 = 0;
real t0_2 = 0;
real t1_2 = 0;
real t2_2 = 0;
real t3 = 0;
for (int ix = 0; ix < PME_ORDER; ix++) {
int i = gridPoint.x+ix-(gridPoint.x+ix >= GRID_SIZE_X ? GRID_SIZE_X : 0);
int gridIndex = i*GRID_SIZE_Y*GRID_SIZE_Z + j*GRID_SIZE_Z + k;
real2 tq = pmeGrid[gridIndex];
real4 tadd = theta1[m*PME_ORDER+ix];
t0_1 += tq.x*tadd.x;
t1_1 += tq.x*tadd.y;
t2_1 += tq.x*tadd.z;
t0_2 += tq.y*tadd.x;
t1_2 += tq.y*tadd.y;
t2_2 += tq.y*tadd.z;
t3 += (tq.x+tq.y)*tadd.w;
}
tu00_1 += t0_1*u.x;
tu10_1 += t1_1*u.x;
tu01_1 += t0_1*u.y;
tu20_1 += t2_1*u.x;
tu11_1 += t1_1*u.y;
tu02_1 += t0_1*u.z;
tu00_2 += t0_2*u.x;
tu10_2 += t1_2*u.x;
tu01_2 += t0_2*u.y;
tu20_2 += t2_2*u.x;
tu11_2 += t1_2*u.y;
tu02_2 += t0_2*u.z;
real t0 = t0_1 + t0_2;
real t1 = t1_1 + t1_2;
real t2 = t2_1 + t2_2;
tu00 += t0*u.x;
tu10 += t1*u.x;
tu01 += t0*u.y;
tu20 += t2*u.x;
tu11 += t1*u.y;
tu02 += t0*u.z;
tu30 += t3*u.x;
tu21 += t2*u.y;
tu12 += t1*u.z;
tu03 += t0*u.w;
}
tuv100_1 += tu10_1*v.x;
tuv010_1 += tu01_1*v.x;
tuv001_1 += tu00_1*v.y;
tuv200_1 += tu20_1*v.x;
tuv020_1 += tu02_1*v.x;
tuv002_1 += tu00_1*v.z;
tuv110_1 += tu11_1*v.x;
tuv101_1 += tu10_1*v.y;
tuv011_1 += tu01_1*v.y;
tuv100_2 += tu10_2*v.x;
tuv010_2 += tu01_2*v.x;
tuv001_2 += tu00_2*v.y;
tuv200_2 += tu20_2*v.x;
tuv020_2 += tu02_2*v.x;
tuv002_2 += tu00_2*v.z;
tuv110_2 += tu11_2*v.x;
tuv101_2 += tu10_2*v.y;
tuv011_2 += tu01_2*v.y;
tuv000 += tu00*v.x;
tuv100 += tu10*v.x;
tuv010 += tu01*v.x;
tuv001 += tu00*v.y;
tuv200 += tu20*v.x;
tuv020 += tu02*v.x;
tuv002 += tu00*v.z;
tuv110 += tu11*v.x;
tuv101 += tu10*v.y;
tuv011 += tu01*v.y;
tuv300 += tu30*v.x;
tuv030 += tu03*v.x;
tuv003 += tu00*v.w;
tuv210 += tu21*v.x;
tuv201 += tu20*v.y;
tuv120 += tu12*v.x;
tuv021 += tu02*v.y;
tuv102 += tu10*v.z;
tuv012 += tu01*v.z;
tuv111 += tu11*v.y;
}
phid[10*m] = 0;
phid[10*m+1] = tuv100_1;
phid[10*m+2] = tuv010_1;
phid[10*m+3] = tuv001_1;
phid[10*m+4] = tuv200_1;
phid[10*m+5] = tuv020_1;
phid[10*m+6] = tuv002_1;
phid[10*m+7] = tuv110_1;
phid[10*m+8] = tuv101_1;
phid[10*m+9] = tuv011_1;
phip[10*m] = 0;
phip[10*m+1] = tuv100_2;
phip[10*m+2] = tuv010_2;
phip[10*m+3] = tuv001_2;
phip[10*m+4] = tuv200_2;
phip[10*m+5] = tuv020_2;
phip[10*m+6] = tuv002_2;
phip[10*m+7] = tuv110_2;
phip[10*m+8] = tuv101_2;
phip[10*m+9] = tuv011_2;
phidp[20*m] = tuv000;
phidp[20*m+1] = tuv100;
phidp[20*m+2] = tuv010;
phidp[20*m+3] = tuv001;
phidp[20*m+4] = tuv200;
phidp[20*m+5] = tuv020;
phidp[20*m+6] = tuv002;
phidp[20*m+7] = tuv110;
phidp[20*m+8] = tuv101;
phidp[20*m+9] = tuv011;
phidp[20*m+10] = tuv300;
phidp[20*m+11] = tuv030;
phidp[20*m+12] = tuv003;
phidp[20*m+13] = tuv210;
phidp[20*m+14] = tuv201;
phidp[20*m+15] = tuv120;
phidp[20*m+16] = tuv021;
phidp[20*m+17] = tuv102;
phidp[20*m+18] = tuv012;
phidp[20*m+19] = tuv111;
}
}
extern "C" __global__ void computeFixedMultipoleForceAndEnergy(real4* __restrict__ posq, unsigned long long* __restrict__ forceBuffers,
long long* __restrict__ torqueBuffers, real* __restrict__ energyBuffer, const real* __restrict__ labFrameDipole,
const real* __restrict__ labFrameQuadrupole, const real* __restrict__ phi, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
const real* __restrict__ labFrameQuadrupole, const real* __restrict__ phi_global, real4 invPeriodicBoxSize) {
real multipole[10];
const int deriv1[] = {1, 4, 7, 8, 10, 15, 17, 13, 14, 19};
const int deriv2[] = {2, 7, 5, 9, 13, 11, 18, 15, 19, 16};
......@@ -700,22 +709,22 @@ extern "C" __global__ void computeFixedMultipoleForceAndEnergy(real4* __restrict
multipole[8] = 2*labFrameQuadrupole[i*5+2];
multipole[9] = 2*labFrameQuadrupole[i*5+4];
const real* atomPhi = &phi[20*i];
const real* phi = &phi_global[20*i];
torqueBuffers[i] = (long long) (EPSILON_FACTOR*(multipole[3]*yscale*atomPhi[2] - multipole[2]*zscale*atomPhi[3]
+ 2*(multipole[6]-multipole[5])*yscale*zscale*atomPhi[9]
+ multipole[8]*xscale*yscale*atomPhi[7] + multipole[9]*yscale*yscale*atomPhi[5]
- multipole[7]*xscale*zscale*atomPhi[8] - multipole[9]*zscale*zscale*atomPhi[6])*0xFFFFFFFF);
torqueBuffers[i] = (long long) (EPSILON_FACTOR*(multipole[3]*yscale*phi[2] - multipole[2]*zscale*phi[3]
+ 2*(multipole[6]-multipole[5])*yscale*zscale*phi[9]
+ multipole[8]*xscale*yscale*phi[7] + multipole[9]*yscale*yscale*phi[5]
- multipole[7]*xscale*zscale*phi[8] - multipole[9]*zscale*zscale*phi[6])*0xFFFFFFFF);
torqueBuffers[i+PADDED_NUM_ATOMS] = (long long) (EPSILON_FACTOR*(multipole[1]*zscale*atomPhi[3] - multipole[3]*xscale*atomPhi[1]
+ 2*(multipole[4]-multipole[6])*xscale*zscale*atomPhi[8]
+ multipole[7]*yscale*zscale*atomPhi[9] + multipole[8]*zscale*zscale*atomPhi[6]
- multipole[8]*xscale*xscale*atomPhi[4] - multipole[9]*xscale*yscale*atomPhi[7])*0xFFFFFFFF);
torqueBuffers[i+PADDED_NUM_ATOMS] = (long long) (EPSILON_FACTOR*(multipole[1]*zscale*phi[3] - multipole[3]*xscale*phi[1]
+ 2*(multipole[4]-multipole[6])*xscale*zscale*phi[8]
+ multipole[7]*yscale*zscale*phi[9] + multipole[8]*zscale*zscale*phi[6]
- multipole[8]*xscale*xscale*phi[4] - multipole[9]*xscale*yscale*phi[7])*0xFFFFFFFF);
torqueBuffers[i+PADDED_NUM_ATOMS*2] = (long long) (EPSILON_FACTOR*(multipole[2]*xscale*atomPhi[1] - multipole[1]*yscale*atomPhi[2]
+ 2*(multipole[5]-multipole[4])*xscale*yscale*atomPhi[7]
+ multipole[7]*xscale*xscale*atomPhi[4] + multipole[9]*xscale*zscale*atomPhi[8]
- multipole[7]*yscale*yscale*atomPhi[5] - multipole[8]*yscale*zscale*atomPhi[9])*0xFFFFFFFF);
torqueBuffers[i+PADDED_NUM_ATOMS*2] = (long long) (EPSILON_FACTOR*(multipole[2]*xscale*phi[1] - multipole[1]*yscale*phi[2]
+ 2*(multipole[5]-multipole[4])*xscale*yscale*phi[7]
+ multipole[7]*xscale*xscale*phi[4] + multipole[9]*xscale*zscale*phi[8]
- multipole[7]*yscale*yscale*phi[5] - multipole[8]*yscale*zscale*phi[9])*0xFFFFFFFF);
// Compute the force and energy.
......@@ -731,10 +740,10 @@ extern "C" __global__ void computeFixedMultipoleForceAndEnergy(real4* __restrict
real4 f = make_real4(0, 0, 0, 0);
for (int k = 0; k < 10; k++) {
energy += multipole[k]*atomPhi[k];
f.x += multipole[k]*atomPhi[deriv1[k]];
f.y += multipole[k]*atomPhi[deriv2[k]];
f.z += multipole[k]*atomPhi[deriv3[k]];
energy += multipole[k]*phi[k];
f.x += multipole[k]*phi[deriv1[k]];
f.y += multipole[k]*phi[deriv2[k]];
f.z += multipole[k]*phi[deriv3[k]];
}
f.x *= EPSILON_FACTOR*xscale;
f.y *= EPSILON_FACTOR*yscale;
......@@ -746,141 +755,125 @@ extern "C" __global__ void computeFixedMultipoleForceAndEnergy(real4* __restrict
energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += 0.5f*EPSILON_FACTOR*energy;
}
//extern "C" __global__ void kComputeInducedDipoleForceAndEnergy_kernel() {
// real multipole[10];
// real inducedDipole[3];
// real inducedDipolePolar[3];
// real scales[3];
// const int deriv1[] = {1, 4, 7, 8, 10, 15, 17, 13, 14, 19};
// const int deriv2[] = {2, 7, 5, 9, 13, 11, 18, 15, 19, 16};
// const int deriv3[] = {3, 8, 9, 6, 14, 16, 12, 19, 17, 18};
// const real xscale = GRID_SIZE_X*invPeriodicBoxSize.x;
// const real yscale = GRID_SIZE_Y*invPeriodicBoxSize.y;
// const real zscale = GRID_SIZE_Z*invPeriodicBoxSize.z;
// scales[0] = xscale;
// scales[1] = yscale;
// scales[2] = zscale;
// real energy = 0;
// for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
// // Compute the torque.
//
// multipole[0] = posq[i].w;
// multipole[1] = labFrameDipole[i*3];
// multipole[2] = labFrameDipole[i*3+1];
// multipole[3] = labFrameDipole[i*3+2];
// multipole[4] = labFrameQuadrupole[i*5];
// multipole[5] = labFrameQuadrupole[i*5+3];
// multipole[6] = -(multipole[4]+multipole[5]);
// multipole[7] = 2*labFrameQuadrupole[i*5+1];
// multipole[8] = 2*labFrameQuadrupole[i*5+2];
// multipole[9] = 2*labFrameQuadrupole[i*5+4];
// real* phidp = &cAmoebaSim.pPhidp[20*i];
//
// cAmoebaSim.pTorque[3*i] += 0.5f*EPSILON_FACTOR*(multipole[3]*yscale*phidp[2] - multipole[2]*zscale*phidp[3]
// + 2*(multipole[6]-multipole[5])*yscale*zscale*phidp[9]
// + multipole[8]*xscale*yscale*phidp[7] + multipole[9]*yscale*yscale*phidp[5]
// - multipole[7]*xscale*zscale*phidp[8] - multipole[9]*zscale*zscale*phidp[6]);
//
// cAmoebaSim.pTorque[3*i+1] += 0.5f*EPSILON_FACTOR*(multipole[1]*zscale*phidp[3] - multipole[3]*xscale*phidp[1]
// + 2*(multipole[4]-multipole[6])*xscale*zscale*phidp[8]
// + multipole[7]*yscale*zscale*phidp[9] + multipole[8]*zscale*zscale*phidp[6]
// - multipole[8]*xscale*xscale*phidp[4] - multipole[9]*xscale*yscale*phidp[7]);
//
// cAmoebaSim.pTorque[3*i+2] += 0.5f*EPSILON_FACTOR*(multipole[2]*xscale*phidp[1] - multipole[1]*yscale*phidp[2]
// + 2*(multipole[5]-multipole[4])*xscale*yscale*phidp[7]
// + multipole[7]*xscale*xscale*phidp[4] + multipole[9]*xscale*zscale*phidp[8]
// - multipole[7]*yscale*yscale*phidp[5] - multipole[8]*yscale*zscale*phidp[9]);
//
// // Compute the force and energy.
//
// multipole[1] *= xscale;
// multipole[2] *= yscale;
// multipole[3] *= zscale;
// multipole[4] *= xscale*xscale;
// multipole[5] *= yscale*yscale;
// multipole[6] *= zscale*zscale;
// multipole[7] *= xscale*yscale;
// multipole[8] *= xscale*zscale;
// multipole[9] *= yscale*zscale;
//
// inducedDipole[0] = cAmoebaSim.pInducedDipole[i*3];
// inducedDipole[1] = cAmoebaSim.pInducedDipole[i*3+1];
// inducedDipole[2] = cAmoebaSim.pInducedDipole[i*3+2];
// inducedDipolePolar[0] = cAmoebaSim.pInducedDipolePolar[i*3];
// inducedDipolePolar[1] = cAmoebaSim.pInducedDipolePolar[i*3+1];
// inducedDipolePolar[2] = cAmoebaSim.pInducedDipolePolar[i*3+2];
// real* phi = &cAmoebaSim.pPhi[20*i];
// real* phip = &cAmoebaSim.pPhip[10*i];
// real* phid = &cAmoebaSim.pPhid[10*i];
// real4 f = make_real4(0, 0, 0, 0);
//
// energy += GRID_SIZE_X*invPeriodicBoxSize.x*inducedDipole[0]*phi[1];
// energy += GRID_SIZE_Y*invPeriodicBoxSize.y*inducedDipole[1]*phi[2];
// energy += GRID_SIZE_Z*invPeriodicBoxSize.z*inducedDipole[2]*phi[3];
//
// for (int k = 0; k < 3; k++) {
//
// int j1 = deriv1[k+1];
// int j2 = deriv2[k+1];
// int j3 = deriv3[k+1];
//
// f.x += (inducedDipole[k]+inducedDipolePolar[k])*phi[j1]*(scales[k]/xscale);
// f.y += (inducedDipole[k]+inducedDipolePolar[k])*phi[j2]*(scales[k]/yscale);
// f.z += (inducedDipole[k]+inducedDipolePolar[k])*phi[j3]*(scales[k]/zscale);
//
// if( cAmoebaSim.polarizationType == 0 )
// {
// f.x += (inducedDipole[k]*phip[j1] + inducedDipolePolar[k]*phid[j1])*(scales[k]/xscale);
// f.y += (inducedDipole[k]*phip[j2] + inducedDipolePolar[k]*phid[j2])*(scales[k]/yscale);
// f.z += (inducedDipole[k]*phip[j3] + inducedDipolePolar[k]*phid[j3])*(scales[k]/zscale);
// }
//
//
// }
//
// f.x *= GRID_SIZE_X*invPeriodicBoxSize.x;
// f.y *= GRID_SIZE_Y*invPeriodicBoxSize.y;
// f.z *= GRID_SIZE_Z*invPeriodicBoxSize.z;
// for (int k = 0; k < 10; k++) {
// f.x += multipole[k]*phidp[deriv1[k]];
// f.y += multipole[k]*phidp[deriv2[k]];
// f.z += multipole[k]*phidp[deriv3[k]];
// }
//
// f.x *= 0.5f*EPSILON_FACTOR*xscale;
// f.y *= 0.5f*EPSILON_FACTOR*yscale;
// f.z *= 0.5f*EPSILON_FACTOR*zscale;
//
// real4 force = cSim.pForce4[i];
// force.x -= f.x;
// force.y -= f.y;
// force.z -= f.z;
// cSim.pForce4[i] = force;
// }
// cSim.pEnergy[blockIdx.x*blockDim.x+threadIdx.x] += 0.5f*EPSILON_FACTOR*energy;
//}
//
//extern "C" __global__ void kRecordFixedMultipoleField_kernel(real* output) {
// const real xscale = GRID_SIZE_X*invPeriodicBoxSize.x;
// const real yscale = GRID_SIZE_Y*invPeriodicBoxSize.y;
// const real zscale = GRID_SIZE_Z*invPeriodicBoxSize.z;
// for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
// output[3*i] = -xscale*cAmoebaSim.pPhi[20*i+1];
// output[3*i+1] = -yscale*cAmoebaSim.pPhi[20*i+2];
// output[3*i+2] = -zscale*cAmoebaSim.pPhi[20*i+3];
// }
//}
//
//extern "C" __global__ void kRecordInducedDipoleField_kernel(real* output, real* outputPolar) {
// const real xscale = GRID_SIZE_X*invPeriodicBoxSize.x;
// const real yscale = GRID_SIZE_Y*invPeriodicBoxSize.y;
// const real zscale = GRID_SIZE_Z*invPeriodicBoxSize.z;
// for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
// output[3*i] -= xscale*cAmoebaSim.pPhid[10*i+1];
// output[3*i+1] -= yscale*cAmoebaSim.pPhid[10*i+2];
// output[3*i+2] -= zscale*cAmoebaSim.pPhid[10*i+3];
// outputPolar[3*i] -= xscale*cAmoebaSim.pPhip[10*i+1];
// outputPolar[3*i+1] -= yscale*cAmoebaSim.pPhip[10*i+2];
// outputPolar[3*i+2] -= zscale*cAmoebaSim.pPhip[10*i+3];
// }
//}
extern "C" __global__ void computeInducedDipoleForceAndEnergy(real4* __restrict__ posq, unsigned long long* __restrict__ forceBuffers,
long long* __restrict__ torqueBuffers, real* __restrict__ energyBuffer, const real* __restrict__ labFrameDipole,
const real* __restrict__ labFrameQuadrupole, const real* __restrict__ inducedDipole_global, const real* __restrict__ inducedDipolePolar_global,
const real* __restrict__ phi_global, const real* __restrict__ phid_global, const real* __restrict__ phip_global,
const real* __restrict__ phidp_global, real4 invPeriodicBoxSize) {
real multipole[10];
real inducedDipole[3];
real inducedDipolePolar[3];
real scales[3];
const int deriv1[] = {1, 4, 7, 8, 10, 15, 17, 13, 14, 19};
const int deriv2[] = {2, 7, 5, 9, 13, 11, 18, 15, 19, 16};
const int deriv3[] = {3, 8, 9, 6, 14, 16, 12, 19, 17, 18};
const real xscale = GRID_SIZE_X*invPeriodicBoxSize.x;
const real yscale = GRID_SIZE_Y*invPeriodicBoxSize.y;
const real zscale = GRID_SIZE_Z*invPeriodicBoxSize.z;
scales[0] = xscale;
scales[1] = yscale;
scales[2] = zscale;
real energy = 0;
for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
// Compute the torque.
multipole[0] = posq[i].w;
multipole[1] = labFrameDipole[i*3];
multipole[2] = labFrameDipole[i*3+1];
multipole[3] = labFrameDipole[i*3+2];
multipole[4] = labFrameQuadrupole[i*5];
multipole[5] = labFrameQuadrupole[i*5+3];
multipole[6] = -(multipole[4]+multipole[5]);
multipole[7] = 2*labFrameQuadrupole[i*5+1];
multipole[8] = 2*labFrameQuadrupole[i*5+2];
multipole[9] = 2*labFrameQuadrupole[i*5+4];
const real* phidp = &phidp_global[20*i];
torqueBuffers[i] += (long long) (0.5f*EPSILON_FACTOR*(multipole[3]*yscale*phidp[2] - multipole[2]*zscale*phidp[3]
+ 2*(multipole[6]-multipole[5])*yscale*zscale*phidp[9]
+ multipole[8]*xscale*yscale*phidp[7] + multipole[9]*yscale*yscale*phidp[5]
- multipole[7]*xscale*zscale*phidp[8] - multipole[9]*zscale*zscale*phidp[6])*0xFFFFFFFF);
torqueBuffers[i+PADDED_NUM_ATOMS] += (long long) (0.5f*EPSILON_FACTOR*(multipole[1]*zscale*phidp[3] - multipole[3]*xscale*phidp[1]
+ 2*(multipole[4]-multipole[6])*xscale*zscale*phidp[8]
+ multipole[7]*yscale*zscale*phidp[9] + multipole[8]*zscale*zscale*phidp[6]
- multipole[8]*xscale*xscale*phidp[4] - multipole[9]*xscale*yscale*phidp[7])*0xFFFFFFFF);
torqueBuffers[i+PADDED_NUM_ATOMS*2] += (long long) (0.5f*EPSILON_FACTOR*(multipole[2]*xscale*phidp[1] - multipole[1]*yscale*phidp[2]
+ 2*(multipole[5]-multipole[4])*xscale*yscale*phidp[7]
+ multipole[7]*xscale*xscale*phidp[4] + multipole[9]*xscale*zscale*phidp[8]
- multipole[7]*yscale*yscale*phidp[5] - multipole[8]*yscale*zscale*phidp[9])*0xFFFFFFFF);
// Compute the force and energy.
multipole[1] *= xscale;
multipole[2] *= yscale;
multipole[3] *= zscale;
multipole[4] *= xscale*xscale;
multipole[5] *= yscale*yscale;
multipole[6] *= zscale*zscale;
multipole[7] *= xscale*yscale;
multipole[8] *= xscale*zscale;
multipole[9] *= yscale*zscale;
inducedDipole[0] = inducedDipole_global[i*3];
inducedDipole[1] = inducedDipole_global[i*3+1];
inducedDipole[2] = inducedDipole_global[i*3+2];
inducedDipolePolar[0] = inducedDipolePolar_global[i*3];
inducedDipolePolar[1] = inducedDipolePolar_global[i*3+1];
inducedDipolePolar[2] = inducedDipolePolar_global[i*3+2];
const real* phi = &phi_global[20*i];
const real* phip = &phip_global[10*i];
const real* phid = &phid_global[10*i];
real4 f = make_real4(0, 0, 0, 0);
energy += GRID_SIZE_X*invPeriodicBoxSize.x*inducedDipole[0]*phi[1];
energy += GRID_SIZE_Y*invPeriodicBoxSize.y*inducedDipole[1]*phi[2];
energy += GRID_SIZE_Z*invPeriodicBoxSize.z*inducedDipole[2]*phi[3];
for (int k = 0; k < 3; k++) {
int j1 = deriv1[k+1];
int j2 = deriv2[k+1];
int j3 = deriv3[k+1];
f.x += (inducedDipole[k]+inducedDipolePolar[k])*phi[j1]*(scales[k]/xscale);
f.y += (inducedDipole[k]+inducedDipolePolar[k])*phi[j2]*(scales[k]/yscale);
f.z += (inducedDipole[k]+inducedDipolePolar[k])*phi[j3]*(scales[k]/zscale);
#ifndef DIRECT_POLARIZATION
f.x += (inducedDipole[k]*phip[j1] + inducedDipolePolar[k]*phid[j1])*(scales[k]/xscale);
f.y += (inducedDipole[k]*phip[j2] + inducedDipolePolar[k]*phid[j2])*(scales[k]/yscale);
f.z += (inducedDipole[k]*phip[j3] + inducedDipolePolar[k]*phid[j3])*(scales[k]/zscale);
#endif
}
f.x *= GRID_SIZE_X*invPeriodicBoxSize.x;
f.y *= GRID_SIZE_Y*invPeriodicBoxSize.y;
f.z *= GRID_SIZE_Z*invPeriodicBoxSize.z;
for (int k = 0; k < 10; k++) {
f.x += multipole[k]*phidp[deriv1[k]];
f.y += multipole[k]*phidp[deriv2[k]];
f.z += multipole[k]*phidp[deriv3[k]];
}
f.x *= 0.5f*EPSILON_FACTOR*xscale;
f.y *= 0.5f*EPSILON_FACTOR*yscale;
f.z *= 0.5f*EPSILON_FACTOR*zscale;
forceBuffers[i] -= static_cast<unsigned long long>((long long) (f.x*0xFFFFFFFF));
forceBuffers[i+PADDED_NUM_ATOMS] -= static_cast<unsigned long long>((long long) (f.y*0xFFFFFFFF));
forceBuffers[i+PADDED_NUM_ATOMS*2] -= static_cast<unsigned long long>((long long) (f.z*0xFFFFFFFF));
}
energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += 0.5f*EPSILON_FACTOR*energy;
}
extern "C" __global__ void recordInducedFieldDipoles(const real* __restrict__ phid, real* const __restrict__ phip,
real* __restrict__ inducedDipole, real* __restrict__ inducedDipolePolar, real4 invPeriodicBoxSize) {
real xscale = GRID_SIZE_X*invPeriodicBoxSize.x;
real yscale = GRID_SIZE_Y*invPeriodicBoxSize.y;
real zscale = GRID_SIZE_Z*invPeriodicBoxSize.z;
for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
inducedDipole[3*i] -= xscale*phid[10*i+1];
inducedDipole[3*i+1] -= yscale*phid[10*i+2];
inducedDipole[3*i+2] -= zscale*phid[10*i+3];
inducedDipolePolar[3*i] -= xscale*phip[10*i+1];
inducedDipolePolar[3*i+1] -= yscale*phip[10*i+2];
inducedDipolePolar[3*i+2] -= zscale*phip[10*i+3];
}
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment