Commit 3b91c945 authored by Peter Eastman
Browse files

Continuing CUDA implementation of triclinic boxes for AmoebaMultipoleForce

parent c83f2a12
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2008-2013 Stanford University and the Authors. * * Portions copyright (c) 2008-2015 Stanford University and the Authors. *
* Authors: Peter Eastman, Mark Friedrichs * * Authors: Peter Eastman, Mark Friedrichs *
* Contributors: * * Contributors: *
* * * *
...@@ -801,7 +801,7 @@ CudaCalcAmoebaMultipoleForceKernel::CudaCalcAmoebaMultipoleForceKernel(std::stri ...@@ -801,7 +801,7 @@ CudaCalcAmoebaMultipoleForceKernel::CudaCalcAmoebaMultipoleForceKernel(std::stri
diisCoefficients(NULL), inducedDipolePolar(NULL), inducedDipoleErrors(NULL), prevDipoles(NULL), prevDipolesPolar(NULL), prevDipolesGk(NULL), diisCoefficients(NULL), inducedDipolePolar(NULL), inducedDipoleErrors(NULL), prevDipoles(NULL), prevDipolesPolar(NULL), prevDipolesGk(NULL),
prevDipolesGkPolar(NULL), prevErrors(NULL), diisMatrix(NULL), polarizability(NULL), covalentFlags(NULL), polarizationGroupFlags(NULL), prevDipolesGkPolar(NULL), prevErrors(NULL), diisMatrix(NULL), polarizability(NULL), covalentFlags(NULL), polarizationGroupFlags(NULL),
pmeGrid(NULL), pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeIgrid(NULL), pmePhi(NULL), pmeGrid(NULL), pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeIgrid(NULL), pmePhi(NULL),
pmePhid(NULL), pmePhip(NULL), pmePhidp(NULL), pmeAtomGridIndex(NULL), lastPositions(NULL), sort(NULL), gkKernel(NULL) { pmePhid(NULL), pmePhip(NULL), pmePhidp(NULL), pmeCphi(NULL), pmeAtomGridIndex(NULL), lastPositions(NULL), sort(NULL), gkKernel(NULL) {
} }
CudaCalcAmoebaMultipoleForceKernel::~CudaCalcAmoebaMultipoleForceKernel() { CudaCalcAmoebaMultipoleForceKernel::~CudaCalcAmoebaMultipoleForceKernel() {
...@@ -876,6 +876,8 @@ CudaCalcAmoebaMultipoleForceKernel::~CudaCalcAmoebaMultipoleForceKernel() { ...@@ -876,6 +876,8 @@ CudaCalcAmoebaMultipoleForceKernel::~CudaCalcAmoebaMultipoleForceKernel() {
delete pmePhip; delete pmePhip;
if (pmePhidp != NULL) if (pmePhidp != NULL)
delete pmePhidp; delete pmePhidp;
if (pmeCphi != NULL)
delete pmeCphi;
if (pmeAtomGridIndex != NULL) if (pmeAtomGridIndex != NULL)
delete pmeAtomGridIndex; delete pmeAtomGridIndex;
if (lastPositions != NULL) if (lastPositions != NULL)
...@@ -1192,6 +1194,7 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const ...@@ -1192,6 +1194,7 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
CUmodule module = cu.createModule(CudaKernelSources::vectorOps+CudaAmoebaKernelSources::multipolePme, pmeDefines); CUmodule module = cu.createModule(CudaKernelSources::vectorOps+CudaAmoebaKernelSources::multipolePme, pmeDefines);
pmeGridIndexKernel = cu.getKernel(module, "findAtomGridIndex"); pmeGridIndexKernel = cu.getKernel(module, "findAtomGridIndex");
pmeTransformMultipolesKernel = cu.getKernel(module, "transformMultipolesToFractionalCoordinates"); pmeTransformMultipolesKernel = cu.getKernel(module, "transformMultipolesToFractionalCoordinates");
pmeTransformPotentialKernel = cu.getKernel(module, "transformPotentialToCartesianCoordinates");
pmeSpreadFixedMultipolesKernel = cu.getKernel(module, "gridSpreadFixedMultipoles"); pmeSpreadFixedMultipolesKernel = cu.getKernel(module, "gridSpreadFixedMultipoles");
pmeSpreadInducedDipolesKernel = cu.getKernel(module, "gridSpreadInducedDipoles"); pmeSpreadInducedDipolesKernel = cu.getKernel(module, "gridSpreadInducedDipoles");
pmeFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadCharge"); pmeFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadCharge");
...@@ -1219,6 +1222,7 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const ...@@ -1219,6 +1222,7 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
pmePhid = new CudaArray(cu, 10*numMultipoles, elementSize, "pmePhid"); pmePhid = new CudaArray(cu, 10*numMultipoles, elementSize, "pmePhid");
pmePhip = new CudaArray(cu, 10*numMultipoles, elementSize, "pmePhip"); pmePhip = new CudaArray(cu, 10*numMultipoles, elementSize, "pmePhip");
pmePhidp = new CudaArray(cu, 20*numMultipoles, elementSize, "pmePhidp"); pmePhidp = new CudaArray(cu, 20*numMultipoles, elementSize, "pmePhidp");
pmeCphi = new CudaArray(cu, 10*numMultipoles, elementSize, "pmeCphi");
pmeAtomRange = CudaArray::create<int>(cu, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange"); pmeAtomRange = CudaArray::create<int>(cu, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange");
pmeAtomGridIndex = CudaArray::create<int2>(cu, numMultipoles, "pmeAtomGridIndex"); pmeAtomGridIndex = CudaArray::create<int2>(cu, numMultipoles, "pmeAtomGridIndex");
sort = new CudaSort(cu, new SortTrait(), cu.getNumAtoms()); sort = new CudaSort(cu, new SortTrait(), cu.getNumAtoms());
...@@ -1520,14 +1524,16 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in ...@@ -1520,14 +1524,16 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
unsigned int maxTiles = nb.getInteractingTiles().getSize(); unsigned int maxTiles = nb.getInteractingTiles().getSize();
void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(), void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(),
cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]}; cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeGridIndexKernel, gridIndexArgs, cu.getNumAtoms(), cu.ThreadBlockSize, cu.ThreadBlockSize*PmeOrder*PmeOrder*elementSize); cu.executeKernel(pmeGridIndexKernel, gridIndexArgs, cu.getNumAtoms(), cu.ThreadBlockSize, cu.ThreadBlockSize*PmeOrder*PmeOrder*elementSize);
sort->sort(*pmeAtomGridIndex); sort->sort(*pmeAtomGridIndex);
void* pmeTransformMultipolesArgs[] = {&labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(), void* pmeTransformMultipolesArgs[] = {&labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(),
&fracDipoles->getDevicePointer(), &fracQuadrupoles->getDevicePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]}; &fracDipoles->getDevicePointer(), &fracQuadrupoles->getDevicePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeTransformMultipolesKernel, pmeTransformMultipolesArgs, cu.getNumAtoms()); cu.executeKernel(pmeTransformMultipolesKernel, pmeTransformMultipolesArgs, cu.getNumAtoms());
void* pmeSpreadFixedMultipolesArgs[] = {&cu.getPosq().getDevicePointer(), &fracDipoles->getDevicePointer(), &fracQuadrupoles->getDevicePointer(), void* pmeSpreadFixedMultipolesArgs[] = {&cu.getPosq().getDevicePointer(), &fracDipoles->getDevicePointer(), &fracQuadrupoles->getDevicePointer(),
&pmeGrid->getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(), cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]}; &pmeGrid->getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeSpreadFixedMultipolesKernel, pmeSpreadFixedMultipolesArgs, cu.getNumAtoms()); cu.executeKernel(pmeSpreadFixedMultipolesKernel, pmeSpreadFixedMultipolesArgs, cu.getNumAtoms());
void* finishSpreadArgs[] = {&pmeGrid->getDevicePointer()}; void* finishSpreadArgs[] = {&pmeGrid->getDevicePointer()};
if (cu.getUseDoublePrecision()) if (cu.getUseDoublePrecision())
...@@ -1547,9 +1553,11 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in ...@@ -1547,9 +1553,11 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
&fieldPolar ->getDevicePointer(), &cu.getPosq().getDevicePointer(), &labFrameDipoles->getDevicePointer(), &fieldPolar ->getDevicePointer(), &cu.getPosq().getDevicePointer(), &labFrameDipoles->getDevicePointer(),
cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex->getDevicePointer()}; cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex->getDevicePointer()};
cu.executeKernel(pmeFixedPotentialKernel, pmeFixedPotentialArgs, cu.getNumAtoms()); cu.executeKernel(pmeFixedPotentialKernel, pmeFixedPotentialArgs, cu.getNumAtoms());
void* pmeTransformFixedPotentialArgs[] = {&pmePhi->getDevicePointer(), &pmeCphi->getDevicePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeTransformPotentialKernel, pmeTransformFixedPotentialArgs, cu.getNumAtoms());
void* pmeFixedForceArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &torque->getDevicePointer(), void* pmeFixedForceArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &torque->getDevicePointer(),
&cu.getEnergyBuffer().getDevicePointer(), &labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(), &labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(),
&fracDipoles->getDevicePointer(), &fracQuadrupoles->getDevicePointer(), &pmePhi->getDevicePointer(), &fracDipoles->getDevicePointer(), &fracQuadrupoles->getDevicePointer(), &pmePhi->getDevicePointer(), &pmeCphi->getDevicePointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]}; recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeFixedForceKernel, pmeFixedForceArgs, cu.getNumAtoms()); cu.executeKernel(pmeFixedForceKernel, pmeFixedForceArgs, cu.getNumAtoms());
...@@ -1570,7 +1578,8 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in ...@@ -1570,7 +1578,8 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
cu.clearBuffer(*pmeGrid); cu.clearBuffer(*pmeGrid);
void* pmeSpreadInducedDipolesArgs[] = {&cu.getPosq().getDevicePointer(), &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), void* pmeSpreadInducedDipolesArgs[] = {&cu.getPosq().getDevicePointer(), &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(),
&pmeGrid->getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(), cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]}; &pmeGrid->getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeSpreadInducedDipolesKernel, pmeSpreadInducedDipolesArgs, cu.getNumAtoms()); cu.executeKernel(pmeSpreadInducedDipolesKernel, pmeSpreadInducedDipolesArgs, cu.getNumAtoms());
if (cu.getUseDoublePrecision()) if (cu.getUseDoublePrecision())
cu.executeKernel(pmeFinishSpreadChargeKernel, finishSpreadArgs, pmeGrid->getSize()); cu.executeKernel(pmeFinishSpreadChargeKernel, finishSpreadArgs, pmeGrid->getSize());
...@@ -1634,11 +1643,13 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in ...@@ -1634,11 +1643,13 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
&labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(), &inducedDipole->getDevicePointer(), &labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(), &inducedDipole->getDevicePointer(),
&inducedDipolePolar->getDevicePointer(), &dampingAndThole->getDevicePointer()}; &inducedDipolePolar->getDevicePointer(), &dampingAndThole->getDevicePointer()};
cu.executeKernel(electrostaticsKernel, electrostaticsArgs, numForceThreadBlocks*electrostaticsThreads, electrostaticsThreads); cu.executeKernel(electrostaticsKernel, electrostaticsArgs, numForceThreadBlocks*electrostaticsThreads, electrostaticsThreads);
void* pmeTransformInducedPotentialArgs[] = {&pmePhidp->getDevicePointer(), &pmeCphi->getDevicePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeTransformPotentialKernel, pmeTransformInducedPotentialArgs, cu.getNumAtoms());
void* pmeInducedForceArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &torque->getDevicePointer(), void* pmeInducedForceArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &torque->getDevicePointer(),
&cu.getEnergyBuffer().getDevicePointer(), &labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(), &labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(),
&fracDipoles->getDevicePointer(), &fracQuadrupoles->getDevicePointer(), &fracDipoles->getDevicePointer(), &fracQuadrupoles->getDevicePointer(),
&inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), &pmePhi->getDevicePointer(), &pmePhid->getDevicePointer(), &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), &pmePhi->getDevicePointer(), &pmePhid->getDevicePointer(),
&pmePhip->getDevicePointer(), &pmePhidp->getDevicePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]}; &pmePhip->getDevicePointer(), &pmePhidp->getDevicePointer(), &pmeCphi->getDevicePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeInducedForceKernel, pmeInducedForceArgs, cu.getNumAtoms()); cu.executeKernel(pmeInducedForceKernel, pmeInducedForceArgs, cu.getNumAtoms());
} }
......
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2008-2013 Stanford University and the Authors. * * Portions copyright (c) 2008-2015 Stanford University and the Authors. *
* Authors: Mark Friedrichs, Peter Eastman * * Authors: Mark Friedrichs, Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -421,6 +421,7 @@ private: ...@@ -421,6 +421,7 @@ private:
CudaArray* pmePhid; CudaArray* pmePhid;
CudaArray* pmePhip; CudaArray* pmePhip;
CudaArray* pmePhidp; CudaArray* pmePhidp;
CudaArray* pmeCphi;
CudaArray* pmeAtomRange; CudaArray* pmeAtomRange;
CudaArray* pmeAtomGridIndex; CudaArray* pmeAtomGridIndex;
CudaArray* lastPositions; CudaArray* lastPositions;
...@@ -430,7 +431,7 @@ private: ...@@ -430,7 +431,7 @@ private:
CUfunction pmeGridIndexKernel, pmeSpreadFixedMultipolesKernel, pmeSpreadInducedDipolesKernel, pmeFinishSpreadChargeKernel, pmeConvolutionKernel; CUfunction pmeGridIndexKernel, pmeSpreadFixedMultipolesKernel, pmeSpreadInducedDipolesKernel, pmeFinishSpreadChargeKernel, pmeConvolutionKernel;
CUfunction pmeFixedPotentialKernel, pmeInducedPotentialKernel, pmeFixedForceKernel, pmeInducedForceKernel, pmeRecordInducedFieldDipolesKernel, computePotentialKernel; CUfunction pmeFixedPotentialKernel, pmeInducedPotentialKernel, pmeFixedForceKernel, pmeInducedForceKernel, pmeRecordInducedFieldDipolesKernel, computePotentialKernel;
CUfunction recordDIISDipolesKernel, buildMatrixKernel; CUfunction recordDIISDipolesKernel, buildMatrixKernel;
CUfunction pmeTransformMultipolesKernel; CUfunction pmeTransformMultipolesKernel, pmeTransformPotentialKernel;
CudaCalcAmoebaGeneralizedKirkwoodForceKernel* gkKernel; CudaCalcAmoebaGeneralizedKirkwoodForceKernel* gkKernel;
static const int PmeOrder = 5; static const int PmeOrder = 5;
static const int MaxPrevDIISDipoles = 20; static const int MaxPrevDIISDipoles = 20;
......
...@@ -73,12 +73,12 @@ __device__ void computeBSplinePoint(real4* thetai, real w, real* array) { ...@@ -73,12 +73,12 @@ __device__ void computeBSplinePoint(real4* thetai, real w, real* array) {
* Compute the index of the grid point each atom is associated with. * Compute the index of the grid point each atom is associated with.
*/ */
extern "C" __global__ void findAtomGridIndex(const real4* __restrict__ posq, int2* __restrict__ pmeAtomGridIndex, extern "C" __global__ void findAtomGridIndex(const real4* __restrict__ posq, int2* __restrict__ pmeAtomGridIndex,
real4 periodicBoxSize, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) { real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) { for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
real4 pos = posq[i]; real4 pos = posq[i];
pos.x -= floor(pos.x*recipBoxVecX.x)*periodicBoxSize.x; pos -= periodicBoxVecZ*floor(pos.z*recipBoxVecZ.z+0.5f);
pos.y -= floor(pos.y*recipBoxVecY.y)*periodicBoxSize.y; pos -= periodicBoxVecY*floor(pos.y*recipBoxVecY.z+0.5f);
pos.z -= floor(pos.z*recipBoxVecZ.z)*periodicBoxSize.z; pos -= periodicBoxVecX*floor(pos.x*recipBoxVecX.z+0.5f);
// First axis. // First axis.
...@@ -109,6 +109,7 @@ extern "C" __global__ void findAtomGridIndex(const real4* __restrict__ posq, int ...@@ -109,6 +109,7 @@ extern "C" __global__ void findAtomGridIndex(const real4* __restrict__ posq, int
pmeAtomGridIndex[i] = make_int2(i, igrid1*GRID_SIZE_Y*GRID_SIZE_Z+igrid2*GRID_SIZE_Z+igrid3); pmeAtomGridIndex[i] = make_int2(i, igrid1*GRID_SIZE_Y*GRID_SIZE_Z+igrid2*GRID_SIZE_Z+igrid3);
} }
} }
/** /**
* Convert the fixed multipoles from Cartesian to fractional coordinates. * Convert the fixed multipoles from Cartesian to fractional coordinates.
*/ */
...@@ -161,9 +162,55 @@ extern "C" __global__ void transformMultipolesToFractionalCoordinates(const real ...@@ -161,9 +162,55 @@ extern "C" __global__ void transformMultipolesToFractionalCoordinates(const real
} }
} }
/**
 * Convert the potential from fractional to Cartesian coordinates.
 *
 * For every atom, 10 values are read from fphi (stride of 20 values per atom) and 10 values
 * are written to cphi (stride of 10 values per atom):
 *   - entry 0 is copied unchanged;
 *   - entries 1-3 are multiplied by the 3x3 matrix a, built from the reciprocal box vectors
 *     scaled by the grid dimensions;
 *   - entries 4-9 are multiplied by the 6x6 matrix b, built from products of elements of a.
 *     They correspond to the component pairs (0,0), (1,1), (2,2), (0,1), (0,2), (1,2).
 *
 * Atoms are processed in a grid-stride loop, so any launch configuration is valid.
 *
 * @param fphi          input values in fractional coordinates (20 values per atom, first 10 used)
 * @param cphi          output values in Cartesian coordinates (10 values per atom)
 * @param recipBoxVecX  first reciprocal box vector
 * @param recipBoxVecY  second reciprocal box vector
 * @param recipBoxVecZ  third reciprocal box vector
 */
extern "C" __global__ void transformPotentialToCartesianCoordinates(const real* __restrict__ fphi, real* __restrict__ cphi, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
    // Build matrices for transforming the potential.

    __shared__ real a[3][3];
    if (threadIdx.x == 0) {
        a[0][0] = GRID_SIZE_X*recipBoxVecX.x;
        a[1][0] = GRID_SIZE_X*recipBoxVecY.x;
        a[2][0] = GRID_SIZE_X*recipBoxVecZ.x;
        a[0][1] = GRID_SIZE_Y*recipBoxVecX.y;
        a[1][1] = GRID_SIZE_Y*recipBoxVecY.y;
        a[2][1] = GRID_SIZE_Y*recipBoxVecZ.y;
        a[0][2] = GRID_SIZE_Z*recipBoxVecX.z;
        a[1][2] = GRID_SIZE_Z*recipBoxVecY.z;
        a[2][2] = GRID_SIZE_Z*recipBoxVecZ.z;
    }
    __syncthreads();

    // Component pair (index1[n], index2[n]) addressed by each of the six second-derivative entries.

    const int index1[] = {0, 1, 2, 0, 0, 1};
    const int index2[] = {0, 1, 2, 1, 2, 2};
    __shared__ real b[6][6];

    // Block-stride loop so that all 36 entries are filled even if the block has fewer than 36 threads.

    for (int entry = threadIdx.x; entry < 36; entry += blockDim.x) {
        int i = entry/6;    // output (Cartesian) component
        int j = entry-6*i;  // input (fractional) component
        b[i][j] = a[index1[i]][index1[j]]*a[index2[i]][index2[j]];

        // Only six of the nine symmetric tensor elements are stored, so an off-diagonal *input*
        // component stands for two elements and must contribute twice: doubled for a diagonal
        // output, or with both index orderings for an off-diagonal output.

        if (index1[j] != index2[j])
            b[i][j] += (i < 3 ? b[i][j] : a[index1[i]][index2[j]]*a[index2[i]][index1[j]]);
    }
    __syncthreads();

    // Transform the potential.

    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
        const real* in = &fphi[20*i];
        real* out = &cphi[10*i];
        out[0] = in[0];
        out[1] = a[0][0]*in[1] + a[0][1]*in[2] + a[0][2]*in[3];
        out[2] = a[1][0]*in[1] + a[1][1]*in[2] + a[1][2]*in[3];
        out[3] = a[2][0]*in[1] + a[2][1]*in[2] + a[2][2]*in[3];
        for (int j = 0; j < 6; j++) {
            // Accumulate in a register and store once instead of updating global memory each step.
            real sum = 0;
            for (int k = 0; k < 6; k++)
                sum += b[j][k]*in[4+k];
            out[4+j] = sum;
        }
    }
}
extern "C" __global__ void gridSpreadFixedMultipoles(const real4* __restrict__ posq, const real* __restrict__ fracDipole, extern "C" __global__ void gridSpreadFixedMultipoles(const real4* __restrict__ posq, const real* __restrict__ fracDipole,
const real* __restrict__ fracQuadrupole, real2* __restrict__ pmeGrid, int2* __restrict__ pmeAtomGridIndex, const real* __restrict__ fracQuadrupole, real2* __restrict__ pmeGrid, int2* __restrict__ pmeAtomGridIndex,
real4 periodicBoxSize, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) { real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
real array[PME_ORDER*PME_ORDER]; real array[PME_ORDER*PME_ORDER];
real4 theta1[PME_ORDER]; real4 theta1[PME_ORDER];
real4 theta2[PME_ORDER]; real4 theta2[PME_ORDER];
...@@ -175,28 +222,28 @@ extern "C" __global__ void gridSpreadFixedMultipoles(const real4* __restrict__ p ...@@ -175,28 +222,28 @@ extern "C" __global__ void gridSpreadFixedMultipoles(const real4* __restrict__ p
for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) { for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
int m = pmeAtomGridIndex[i].x; int m = pmeAtomGridIndex[i].x;
real4 pos = posq[m]; real4 pos = posq[m];
pos.x -= floor(pos.x*recipBoxVecX.x)*periodicBoxSize.x; pos -= periodicBoxVecZ*floor(pos.z*recipBoxVecZ.z+0.5f);
pos.y -= floor(pos.y*recipBoxVecY.y)*periodicBoxSize.y; pos -= periodicBoxVecY*floor(pos.y*recipBoxVecY.z+0.5f);
pos.z -= floor(pos.z*recipBoxVecZ.z)*periodicBoxSize.z; pos -= periodicBoxVecX*floor(pos.x*recipBoxVecX.z+0.5f);
// Since we need the full set of thetas, it's faster to compute them here than load them // Since we need the full set of thetas, it's faster to compute them here than load them
// from global memory. // from global memory.
real w = pos.x*recipBoxVecX.x+pos.y*recipBoxVecY.x+pos.z*recipBoxVecZ.x; real w = pos.x*recipBoxVecX.x+pos.y*recipBoxVecY.x+pos.z*recipBoxVecZ.x;
real fr = GRID_SIZE_X*(w-(int)(w+0.5f)+0.5f); real fr = GRID_SIZE_X*(w-(int)(w+0.5f)+0.5f);
int ifr = (int) fr; int ifr = (int) floor(fr);
w = fr - ifr; w = fr - ifr;
int igrid1 = ifr-PME_ORDER+1; int igrid1 = ifr-PME_ORDER+1;
computeBSplinePoint(theta1, w, array); computeBSplinePoint(theta1, w, array);
w = pos.y*recipBoxVecY.y+pos.z*recipBoxVecZ.y; w = pos.y*recipBoxVecY.y+pos.z*recipBoxVecZ.y;
fr = GRID_SIZE_Y*(w-(int)(w+0.5f)+0.5f); fr = GRID_SIZE_Y*(w-(int)(w+0.5f)+0.5f);
ifr = (int) fr; ifr = (int) floor(fr);
w = fr - ifr; w = fr - ifr;
int igrid2 = ifr-PME_ORDER+1; int igrid2 = ifr-PME_ORDER+1;
computeBSplinePoint(theta2, w, array); computeBSplinePoint(theta2, w, array);
w = pos.z*recipBoxVecZ.z; w = pos.z*recipBoxVecZ.z;
fr = GRID_SIZE_Z*(w-(int)(w+0.5f)+0.5f); fr = GRID_SIZE_Z*(w-(int)(w+0.5f)+0.5f);
ifr = (int) fr; ifr = (int) floor(fr);
w = fr - ifr; w = fr - ifr;
int igrid3 = ifr-PME_ORDER+1; int igrid3 = ifr-PME_ORDER+1;
computeBSplinePoint(theta3, w, array); computeBSplinePoint(theta3, w, array);
...@@ -252,14 +299,24 @@ extern "C" __global__ void gridSpreadFixedMultipoles(const real4* __restrict__ p ...@@ -252,14 +299,24 @@ extern "C" __global__ void gridSpreadFixedMultipoles(const real4* __restrict__ p
extern "C" __global__ void gridSpreadInducedDipoles(const real4* __restrict__ posq, const real* __restrict__ inducedDipole, extern "C" __global__ void gridSpreadInducedDipoles(const real4* __restrict__ posq, const real* __restrict__ inducedDipole,
const real* __restrict__ inducedDipolePolar, real2* __restrict__ pmeGrid, int2* __restrict__ pmeAtomGridIndex, const real* __restrict__ inducedDipolePolar, real2* __restrict__ pmeGrid, int2* __restrict__ pmeAtomGridIndex,
real4 periodicBoxSize, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) { real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
const real xscale = GRID_SIZE_X*recipBoxVecX.x;
const real yscale = GRID_SIZE_Y*recipBoxVecY.y;
const real zscale = GRID_SIZE_Z*recipBoxVecZ.z;
real array[PME_ORDER*PME_ORDER]; real array[PME_ORDER*PME_ORDER];
real4 theta1[PME_ORDER]; real4 theta1[PME_ORDER];
real4 theta2[PME_ORDER]; real4 theta2[PME_ORDER];
real4 theta3[PME_ORDER]; real4 theta3[PME_ORDER];
__shared__ real cartToFrac[3][3];
if (threadIdx.x == 0) {
cartToFrac[0][0] = GRID_SIZE_X*recipBoxVecX.x;
cartToFrac[0][1] = GRID_SIZE_X*recipBoxVecY.x;
cartToFrac[0][2] = GRID_SIZE_X*recipBoxVecZ.x;
cartToFrac[1][0] = GRID_SIZE_Y*recipBoxVecX.y;
cartToFrac[1][1] = GRID_SIZE_Y*recipBoxVecY.y;
cartToFrac[1][2] = GRID_SIZE_Y*recipBoxVecZ.y;
cartToFrac[2][0] = GRID_SIZE_Z*recipBoxVecX.z;
cartToFrac[2][1] = GRID_SIZE_Z*recipBoxVecY.z;
cartToFrac[2][2] = GRID_SIZE_Z*recipBoxVecZ.z;
}
__syncthreads();
// Process the atoms in spatially sorted order. This improves cache performance when loading // Process the atoms in spatially sorted order. This improves cache performance when loading
// the grid values. // the grid values.
...@@ -267,28 +324,28 @@ extern "C" __global__ void gridSpreadInducedDipoles(const real4* __restrict__ po ...@@ -267,28 +324,28 @@ extern "C" __global__ void gridSpreadInducedDipoles(const real4* __restrict__ po
for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) { for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
int m = pmeAtomGridIndex[i].x; int m = pmeAtomGridIndex[i].x;
real4 pos = posq[m]; real4 pos = posq[m];
pos.x -= floor(pos.x*recipBoxVecX.x)*periodicBoxSize.x; pos -= periodicBoxVecZ*floor(pos.z*recipBoxVecZ.z+0.5f);
pos.y -= floor(pos.y*recipBoxVecY.y)*periodicBoxSize.y; pos -= periodicBoxVecY*floor(pos.y*recipBoxVecY.z+0.5f);
pos.z -= floor(pos.z*recipBoxVecZ.z)*periodicBoxSize.z; pos -= periodicBoxVecX*floor(pos.x*recipBoxVecX.z+0.5f);
// Since we need the full set of thetas, it's faster to compute them here than load them // Since we need the full set of thetas, it's faster to compute them here than load them
// from global memory. // from global memory.
real w = pos.x*recipBoxVecX.x+pos.y*recipBoxVecY.x+pos.z*recipBoxVecZ.x; real w = pos.x*recipBoxVecX.x+pos.y*recipBoxVecY.x+pos.z*recipBoxVecZ.x;
real fr = GRID_SIZE_X*(w-(int)(w+0.5f)+0.5f); real fr = GRID_SIZE_X*(w-(int)(w+0.5f)+0.5f);
int ifr = (int) fr; int ifr = (int) floor(fr);
w = fr - ifr; w = fr - ifr;
int igrid1 = ifr-PME_ORDER+1; int igrid1 = ifr-PME_ORDER+1;
computeBSplinePoint(theta1, w, array); computeBSplinePoint(theta1, w, array);
w = pos.y*recipBoxVecY.y+pos.z*recipBoxVecZ.y; w = pos.y*recipBoxVecY.y+pos.z*recipBoxVecZ.y;
fr = GRID_SIZE_Y*(w-(int)(w+0.5f)+0.5f); fr = GRID_SIZE_Y*(w-(int)(w+0.5f)+0.5f);
ifr = (int) fr; ifr = (int) floor(fr);
w = fr - ifr; w = fr - ifr;
int igrid2 = ifr-PME_ORDER+1; int igrid2 = ifr-PME_ORDER+1;
computeBSplinePoint(theta2, w, array); computeBSplinePoint(theta2, w, array);
w = pos.z*recipBoxVecZ.z; w = pos.z*recipBoxVecZ.z;
fr = GRID_SIZE_Z*(w-(int)(w+0.5f)+0.5f); fr = GRID_SIZE_Z*(w-(int)(w+0.5f)+0.5f);
ifr = (int) fr; ifr = (int) floor(fr);
w = fr - ifr; w = fr - ifr;
int igrid3 = ifr-PME_ORDER+1; int igrid3 = ifr-PME_ORDER+1;
computeBSplinePoint(theta3, w, array); computeBSplinePoint(theta3, w, array);
...@@ -316,16 +373,18 @@ extern "C" __global__ void gridSpreadInducedDipoles(const real4* __restrict__ po ...@@ -316,16 +373,18 @@ extern "C" __global__ void gridSpreadInducedDipoles(const real4* __restrict__ po
int index = ybase + zindex; int index = ybase + zindex;
real4 v = theta3[iz]; real4 v = theta3[iz];
real inducedDipoleX = xscale*inducedDipole[m*3]; real3 cinducedDipole = make_real3(inducedDipole[m*3], inducedDipole[m*3+1], inducedDipole[m*3+2]);
real inducedDipoleY = yscale*inducedDipole[m*3+1]; real3 cinducedDipolePolar = make_real3(inducedDipolePolar[m*3], inducedDipolePolar[m*3+1], inducedDipolePolar[m*3+2]);
real inducedDipoleZ = zscale*inducedDipole[m*3+2]; real3 finducedDipole = make_real3(cinducedDipole.x*cartToFrac[0][0] + cinducedDipole.y*cartToFrac[0][1] + cinducedDipole.z*cartToFrac[0][2],
real inducedDipolePolarX = xscale*inducedDipolePolar[m*3]; cinducedDipole.x*cartToFrac[1][0] + cinducedDipole.y*cartToFrac[1][1] + cinducedDipole.z*cartToFrac[1][2],
real inducedDipolePolarY = yscale*inducedDipolePolar[m*3+1]; cinducedDipole.x*cartToFrac[2][0] + cinducedDipole.y*cartToFrac[2][1] + cinducedDipole.z*cartToFrac[2][2]);
real inducedDipolePolarZ = zscale*inducedDipolePolar[m*3+2]; real3 finducedDipolePolar = make_real3(cinducedDipolePolar.x*cartToFrac[0][0] + cinducedDipolePolar.y*cartToFrac[0][1] + cinducedDipolePolar.z*cartToFrac[0][2],
real term01 = inducedDipoleY*u.y*v.x + inducedDipoleZ*u.x*v.y; cinducedDipolePolar.x*cartToFrac[1][0] + cinducedDipolePolar.y*cartToFrac[1][1] + cinducedDipolePolar.z*cartToFrac[1][2],
real term11 = inducedDipoleX*u.x*v.x; cinducedDipolePolar.x*cartToFrac[2][0] + cinducedDipolePolar.y*cartToFrac[2][1] + cinducedDipolePolar.z*cartToFrac[2][2]);
real term02 = inducedDipolePolarY*u.y*v.x + inducedDipolePolarZ*u.x*v.y; real term01 = finducedDipole.y*u.y*v.x + finducedDipole.z*u.x*v.y;
real term12 = inducedDipolePolarX*u.x*v.x; real term11 = finducedDipole.x*u.x*v.x;
real term02 = finducedDipolePolar.y*u.y*v.x + finducedDipolePolar.z*u.x*v.y;
real term12 = finducedDipolePolar.x*u.x*v.x;
real add1 = term01*t.x + term11*t.y; real add1 = term01*t.x + term11*t.y;
real add2 = term02*t.x + term12*t.y; real add2 = term02*t.x + term12*t.y;
#ifdef USE_DOUBLE_PRECISION #ifdef USE_DOUBLE_PRECISION
...@@ -392,6 +451,19 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict ...@@ -392,6 +451,19 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict
real4 theta1[PME_ORDER]; real4 theta1[PME_ORDER];
real4 theta2[PME_ORDER]; real4 theta2[PME_ORDER];
real4 theta3[PME_ORDER]; real4 theta3[PME_ORDER];
__shared__ real fracToCart[3][3];
if (threadIdx.x == 0) {
fracToCart[0][0] = GRID_SIZE_X*recipBoxVecX.x;
fracToCart[1][0] = GRID_SIZE_X*recipBoxVecY.x;
fracToCart[2][0] = GRID_SIZE_X*recipBoxVecZ.x;
fracToCart[0][1] = GRID_SIZE_Y*recipBoxVecX.y;
fracToCart[1][1] = GRID_SIZE_Y*recipBoxVecY.y;
fracToCart[2][1] = GRID_SIZE_Y*recipBoxVecZ.y;
fracToCart[0][2] = GRID_SIZE_Z*recipBoxVecX.z;
fracToCart[1][2] = GRID_SIZE_Z*recipBoxVecY.z;
fracToCart[2][2] = GRID_SIZE_Z*recipBoxVecZ.z;
}
__syncthreads();
// Process the atoms in spatially sorted order. This improves cache performance when loading // Process the atoms in spatially sorted order. This improves cache performance when loading
// the grid values. // the grid values.
...@@ -530,13 +602,13 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict ...@@ -530,13 +602,13 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict
phi[20*m+18] = tuv012; phi[20*m+18] = tuv012;
phi[20*m+19] = tuv111; phi[20*m+19] = tuv111;
real dipoleScale = (4/(real) 3)*(EWALD_ALPHA*EWALD_ALPHA*EWALD_ALPHA)/SQRT_PI; real dipoleScale = (4/(real) 3)*(EWALD_ALPHA*EWALD_ALPHA*EWALD_ALPHA)/SQRT_PI;
long long fieldx = (long long) ((dipoleScale*labFrameDipole[m*3]-GRID_SIZE_X*recipBoxVecX.x*tuv100)*0x100000000); long long fieldx = (long long) ((dipoleScale*labFrameDipole[m*3]-tuv100*fracToCart[0][0]-tuv010*fracToCart[0][1]-tuv001*fracToCart[0][2])*0x100000000);
fieldBuffers[m] = fieldx; fieldBuffers[m] = fieldx;
fieldPolarBuffers[m] = fieldx; fieldPolarBuffers[m] = fieldx;
long long fieldy = (long long) ((dipoleScale*labFrameDipole[m*3+1]-GRID_SIZE_Y*recipBoxVecY.y*tuv010)*0x100000000); long long fieldy = (long long) ((dipoleScale*labFrameDipole[m*3+1]-tuv100*fracToCart[1][0]-tuv010*fracToCart[1][1]-tuv001*fracToCart[1][2])*0x100000000);
fieldBuffers[m+PADDED_NUM_ATOMS] = fieldy; fieldBuffers[m+PADDED_NUM_ATOMS] = fieldy;
fieldPolarBuffers[m+PADDED_NUM_ATOMS] = fieldy; fieldPolarBuffers[m+PADDED_NUM_ATOMS] = fieldy;
long long fieldz = (long long) ((dipoleScale*labFrameDipole[m*3+2]-GRID_SIZE_Z*recipBoxVecZ.z*tuv001)*0x100000000); long long fieldz = (long long) ((dipoleScale*labFrameDipole[m*3+2]-tuv100*fracToCart[2][0]-tuv010*fracToCart[2][1]-tuv001*fracToCart[2][2])*0x100000000);
fieldBuffers[m+2*PADDED_NUM_ATOMS] = fieldz; fieldBuffers[m+2*PADDED_NUM_ATOMS] = fieldz;
fieldPolarBuffers[m+2*PADDED_NUM_ATOMS] = fieldz; fieldPolarBuffers[m+2*PADDED_NUM_ATOMS] = fieldz;
} }
...@@ -786,14 +858,11 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri ...@@ -786,14 +858,11 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri
extern "C" __global__ void computeFixedMultipoleForceAndEnergy(real4* __restrict__ posq, unsigned long long* __restrict__ forceBuffers, extern "C" __global__ void computeFixedMultipoleForceAndEnergy(real4* __restrict__ posq, unsigned long long* __restrict__ forceBuffers,
long long* __restrict__ torqueBuffers, real* __restrict__ energyBuffer, const real* __restrict__ labFrameDipole, long long* __restrict__ torqueBuffers, real* __restrict__ energyBuffer, const real* __restrict__ labFrameDipole,
const real* __restrict__ labFrameQuadrupole, const real* __restrict__ fracDipole, const real* __restrict__ fracQuadrupole, const real* __restrict__ labFrameQuadrupole, const real* __restrict__ fracDipole, const real* __restrict__ fracQuadrupole,
const real* __restrict__ phi_global, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) { const real* __restrict__ phi_global, const real* __restrict__ cphi_global, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
real multipole[10]; real multipole[10];
const int deriv1[] = {1, 4, 7, 8, 10, 15, 17, 13, 14, 19}; const int deriv1[] = {1, 4, 7, 8, 10, 15, 17, 13, 14, 19};
const int deriv2[] = {2, 7, 5, 9, 13, 11, 18, 15, 19, 16}; const int deriv2[] = {2, 7, 5, 9, 13, 11, 18, 15, 19, 16};
const int deriv3[] = {3, 8, 9, 6, 14, 16, 12, 19, 17, 18}; const int deriv3[] = {3, 8, 9, 6, 14, 16, 12, 19, 17, 18};
const real xscale = GRID_SIZE_X*recipBoxVecX.x;
const real yscale = GRID_SIZE_Y*recipBoxVecY.y;
const real zscale = GRID_SIZE_Z*recipBoxVecZ.z;
real energy = 0; real energy = 0;
__shared__ real fracToCart[3][3]; __shared__ real fracToCart[3][3];
if (threadIdx.x == 0) { if (threadIdx.x == 0) {
...@@ -822,22 +891,22 @@ extern "C" __global__ void computeFixedMultipoleForceAndEnergy(real4* __restrict ...@@ -822,22 +891,22 @@ extern "C" __global__ void computeFixedMultipoleForceAndEnergy(real4* __restrict
multipole[8] = 2*labFrameQuadrupole[i*5+2]; multipole[8] = 2*labFrameQuadrupole[i*5+2];
multipole[9] = 2*labFrameQuadrupole[i*5+4]; multipole[9] = 2*labFrameQuadrupole[i*5+4];
const real* phi = &phi_global[20*i]; const real* cphi = &cphi_global[10*i];
torqueBuffers[i] = (long long) (EPSILON_FACTOR*(multipole[3]*yscale*phi[2] - multipole[2]*zscale*phi[3] torqueBuffers[i] = (long long) (EPSILON_FACTOR*(multipole[3]*cphi[2] - multipole[2]*cphi[3]
+ 2*(multipole[6]-multipole[5])*yscale*zscale*phi[9] + 2*(multipole[6]-multipole[5])*cphi[9]
+ multipole[8]*xscale*yscale*phi[7] + multipole[9]*yscale*yscale*phi[5] + multipole[8]*cphi[7] + multipole[9]*cphi[5]
- multipole[7]*xscale*zscale*phi[8] - multipole[9]*zscale*zscale*phi[6])*0x100000000); - multipole[7]*cphi[8] - multipole[9]*cphi[6])*0x100000000);
torqueBuffers[i+PADDED_NUM_ATOMS] = (long long) (EPSILON_FACTOR*(multipole[1]*zscale*phi[3] - multipole[3]*xscale*phi[1] torqueBuffers[i+PADDED_NUM_ATOMS] = (long long) (EPSILON_FACTOR*(multipole[1]*cphi[3] - multipole[3]*cphi[1]
+ 2*(multipole[4]-multipole[6])*xscale*zscale*phi[8] + 2*(multipole[4]-multipole[6])*cphi[8]
+ multipole[7]*yscale*zscale*phi[9] + multipole[8]*zscale*zscale*phi[6] + multipole[7]*cphi[9] + multipole[8]*cphi[6]
- multipole[8]*xscale*xscale*phi[4] - multipole[9]*xscale*yscale*phi[7])*0x100000000); - multipole[8]*cphi[4] - multipole[9]*cphi[7])*0x100000000);
torqueBuffers[i+PADDED_NUM_ATOMS*2] = (long long) (EPSILON_FACTOR*(multipole[2]*xscale*phi[1] - multipole[1]*yscale*phi[2] torqueBuffers[i+PADDED_NUM_ATOMS*2] = (long long) (EPSILON_FACTOR*(multipole[2]*cphi[1] - multipole[1]*cphi[2]
+ 2*(multipole[5]-multipole[4])*xscale*yscale*phi[7] + 2*(multipole[5]-multipole[4])*cphi[7]
+ multipole[7]*xscale*xscale*phi[4] + multipole[9]*xscale*zscale*phi[8] + multipole[7]*cphi[4] + multipole[9]*cphi[8]
- multipole[7]*yscale*yscale*phi[5] - multipole[8]*yscale*zscale*phi[9])*0x100000000); - multipole[7]*cphi[5] - multipole[8]*cphi[9])*0x100000000);
// Compute the force and energy. // Compute the force and energy.
...@@ -851,6 +920,7 @@ extern "C" __global__ void computeFixedMultipoleForceAndEnergy(real4* __restrict ...@@ -851,6 +920,7 @@ extern "C" __global__ void computeFixedMultipoleForceAndEnergy(real4* __restrict
multipole[8] = fracQuadrupole[i*6+2]; multipole[8] = fracQuadrupole[i*6+2];
multipole[9] = fracQuadrupole[i*6+4]; multipole[9] = fracQuadrupole[i*6+4];
const real* phi = &phi_global[20*i];
real4 f = make_real4(0, 0, 0, 0); real4 f = make_real4(0, 0, 0, 0);
for (int k = 0; k < 10; k++) { for (int k = 0; k < 10; k++) {
energy += multipole[k]*phi[k]; energy += multipole[k]*phi[k];
...@@ -873,20 +943,13 @@ extern "C" __global__ void computeInducedDipoleForceAndEnergy(real4* __restrict_ ...@@ -873,20 +943,13 @@ extern "C" __global__ void computeInducedDipoleForceAndEnergy(real4* __restrict_
const real* __restrict__ labFrameQuadrupole, const real* __restrict__ fracDipole, const real* __restrict__ fracQuadrupole, const real* __restrict__ labFrameQuadrupole, const real* __restrict__ fracDipole, const real* __restrict__ fracQuadrupole,
const real* __restrict__ inducedDipole_global, const real* __restrict__ inducedDipolePolar_global, const real* __restrict__ inducedDipole_global, const real* __restrict__ inducedDipolePolar_global,
const real* __restrict__ phi_global, const real* __restrict__ phid_global, const real* __restrict__ phip_global, const real* __restrict__ phi_global, const real* __restrict__ phid_global, const real* __restrict__ phip_global,
const real* __restrict__ phidp_global, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) { const real* __restrict__ phidp_global, const real* __restrict__ cphi_global, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
real multipole[10]; real multipole[10];
real cinducedDipole[3], inducedDipole[3]; real cinducedDipole[3], inducedDipole[3];
real cinducedDipolePolar[3], inducedDipolePolar[3]; real cinducedDipolePolar[3], inducedDipolePolar[3];
real scales[3];
const int deriv1[] = {1, 4, 7, 8, 10, 15, 17, 13, 14, 19}; const int deriv1[] = {1, 4, 7, 8, 10, 15, 17, 13, 14, 19};
const int deriv2[] = {2, 7, 5, 9, 13, 11, 18, 15, 19, 16}; const int deriv2[] = {2, 7, 5, 9, 13, 11, 18, 15, 19, 16};
const int deriv3[] = {3, 8, 9, 6, 14, 16, 12, 19, 17, 18}; const int deriv3[] = {3, 8, 9, 6, 14, 16, 12, 19, 17, 18};
const real xscale = GRID_SIZE_X*recipBoxVecX.x;
const real yscale = GRID_SIZE_Y*recipBoxVecY.y;
const real zscale = GRID_SIZE_Z*recipBoxVecZ.z;
scales[0] = xscale;
scales[1] = yscale;
scales[2] = zscale;
real energy = 0; real energy = 0;
__shared__ real fracToCart[3][3]; __shared__ real fracToCart[3][3];
if (threadIdx.x == 0) { if (threadIdx.x == 0) {
...@@ -914,22 +977,22 @@ extern "C" __global__ void computeInducedDipoleForceAndEnergy(real4* __restrict_ ...@@ -914,22 +977,22 @@ extern "C" __global__ void computeInducedDipoleForceAndEnergy(real4* __restrict_
multipole[7] = 2*labFrameQuadrupole[i*5+1]; multipole[7] = 2*labFrameQuadrupole[i*5+1];
multipole[8] = 2*labFrameQuadrupole[i*5+2]; multipole[8] = 2*labFrameQuadrupole[i*5+2];
multipole[9] = 2*labFrameQuadrupole[i*5+4]; multipole[9] = 2*labFrameQuadrupole[i*5+4];
const real* phidp = &phidp_global[20*i]; const real* cphi = &cphi_global[10*i];
torqueBuffers[i] += (long long) (0.5f*EPSILON_FACTOR*(multipole[3]*yscale*phidp[2] - multipole[2]*zscale*phidp[3] torqueBuffers[i] += (long long) (0.5f*EPSILON_FACTOR*(multipole[3]*cphi[2] - multipole[2]*cphi[3]
+ 2*(multipole[6]-multipole[5])*yscale*zscale*phidp[9] + 2*(multipole[6]-multipole[5])*cphi[9]
+ multipole[8]*xscale*yscale*phidp[7] + multipole[9]*yscale*yscale*phidp[5] + multipole[8]*cphi[7] + multipole[9]*cphi[5]
- multipole[7]*xscale*zscale*phidp[8] - multipole[9]*zscale*zscale*phidp[6])*0x100000000); - multipole[7]*cphi[8] - multipole[9]*cphi[6])*0x100000000);
torqueBuffers[i+PADDED_NUM_ATOMS] += (long long) (0.5f*EPSILON_FACTOR*(multipole[1]*zscale*phidp[3] - multipole[3]*xscale*phidp[1] torqueBuffers[i+PADDED_NUM_ATOMS] += (long long) (0.5f*EPSILON_FACTOR*(multipole[1]*cphi[3] - multipole[3]*cphi[1]
+ 2*(multipole[4]-multipole[6])*xscale*zscale*phidp[8] + 2*(multipole[4]-multipole[6])*cphi[8]
+ multipole[7]*yscale*zscale*phidp[9] + multipole[8]*zscale*zscale*phidp[6] + multipole[7]*cphi[9] + multipole[8]*cphi[6]
- multipole[8]*xscale*xscale*phidp[4] - multipole[9]*xscale*yscale*phidp[7])*0x100000000); - multipole[8]*cphi[4] - multipole[9]*cphi[7])*0x100000000);
torqueBuffers[i+PADDED_NUM_ATOMS*2] += (long long) (0.5f*EPSILON_FACTOR*(multipole[2]*xscale*phidp[1] - multipole[1]*yscale*phidp[2] torqueBuffers[i+PADDED_NUM_ATOMS*2] += (long long) (0.5f*EPSILON_FACTOR*(multipole[2]*cphi[1] - multipole[1]*cphi[2]
+ 2*(multipole[5]-multipole[4])*xscale*yscale*phidp[7] + 2*(multipole[5]-multipole[4])*cphi[7]
+ multipole[7]*xscale*xscale*phidp[4] + multipole[9]*xscale*zscale*phidp[8] + multipole[7]*cphi[4] + multipole[9]*cphi[8]
- multipole[7]*yscale*yscale*phidp[5] - multipole[8]*yscale*zscale*phidp[9])*0x100000000); - multipole[7]*cphi[5] - multipole[8]*cphi[9])*0x100000000);
// Compute the force and energy. // Compute the force and energy.
...@@ -981,6 +1044,7 @@ extern "C" __global__ void computeInducedDipoleForceAndEnergy(real4* __restrict_ ...@@ -981,6 +1044,7 @@ extern "C" __global__ void computeInducedDipoleForceAndEnergy(real4* __restrict_
#endif #endif
} }
const real* phidp = &phidp_global[20*i];
for (int k = 0; k < 10; k++) { for (int k = 0; k < 10; k++) {
f.x += multipole[k]*phidp[deriv1[k]]; f.x += multipole[k]*phidp[deriv1[k]];
f.y += multipole[k]*phidp[deriv2[k]]; f.y += multipole[k]*phidp[deriv2[k]];
...@@ -998,15 +1062,25 @@ extern "C" __global__ void computeInducedDipoleForceAndEnergy(real4* __restrict_ ...@@ -998,15 +1062,25 @@ extern "C" __global__ void computeInducedDipoleForceAndEnergy(real4* __restrict_
extern "C" __global__ void recordInducedFieldDipoles(const real* __restrict__ phid, real* const __restrict__ phip, extern "C" __global__ void recordInducedFieldDipoles(const real* __restrict__ phid, real* const __restrict__ phip,
long long* __restrict__ inducedField, long long* __restrict__ inducedFieldPolar, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) { long long* __restrict__ inducedField, long long* __restrict__ inducedFieldPolar, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
real xscale = GRID_SIZE_X*recipBoxVecX.x*0x100000000; __shared__ real fracToCart[3][3];
real yscale = GRID_SIZE_Y*recipBoxVecY.y*0x100000000; if (threadIdx.x == 0) {
real zscale = GRID_SIZE_Z*recipBoxVecZ.z*0x100000000; fracToCart[0][0] = GRID_SIZE_X*recipBoxVecX.x;
fracToCart[1][0] = GRID_SIZE_X*recipBoxVecY.x;
fracToCart[2][0] = GRID_SIZE_X*recipBoxVecZ.x;
fracToCart[0][1] = GRID_SIZE_Y*recipBoxVecX.y;
fracToCart[1][1] = GRID_SIZE_Y*recipBoxVecY.y;
fracToCart[2][1] = GRID_SIZE_Y*recipBoxVecZ.y;
fracToCart[0][2] = GRID_SIZE_Z*recipBoxVecX.z;
fracToCart[1][2] = GRID_SIZE_Z*recipBoxVecY.z;
fracToCart[2][2] = GRID_SIZE_Z*recipBoxVecZ.z;
}
__syncthreads();
for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) { for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
inducedField[i] -= (long long) (xscale*phid[10*i+1]); inducedField[i] -= (long long) (0x100000000*(phid[10*i+1]*fracToCart[0][0] + phid[10*i+2]*fracToCart[0][1] + phid[10*i+3]*fracToCart[0][2]));
inducedField[i+PADDED_NUM_ATOMS] -= (long long) (yscale*phid[10*i+2]); inducedField[i+PADDED_NUM_ATOMS] -= (long long) (0x100000000*(phid[10*i+1]*fracToCart[1][0] + phid[10*i+2]*fracToCart[1][1] + phid[10*i+3]*fracToCart[1][2]));
inducedField[i+PADDED_NUM_ATOMS*2] -= (long long) (zscale*phid[10*i+3]); inducedField[i+PADDED_NUM_ATOMS*2] -= (long long) (0x100000000*(phid[10*i+1]*fracToCart[2][0] + phid[10*i+2]*fracToCart[2][1] + phid[10*i+3]*fracToCart[2][2]));
inducedFieldPolar[i] -= (long long) (xscale*phip[10*i+1]); inducedFieldPolar[i] -= (long long) (0x100000000*(phip[10*i+1]*fracToCart[0][0] + phip[10*i+2]*fracToCart[0][1] + phip[10*i+3]*fracToCart[0][2]));
inducedFieldPolar[i+PADDED_NUM_ATOMS] -= (long long) (yscale*phip[10*i+2]); inducedFieldPolar[i+PADDED_NUM_ATOMS] -= (long long) (0x100000000*(phip[10*i+1]*fracToCart[1][0] + phip[10*i+2]*fracToCart[1][1] + phip[10*i+3]*fracToCart[1][2]));
inducedFieldPolar[i+PADDED_NUM_ATOMS*2] -= (long long) (zscale*phip[10*i+3]); inducedFieldPolar[i+PADDED_NUM_ATOMS*2] -= (long long) (0x100000000*(phip[10*i+1]*fracToCart[2][0] + phip[10*i+2]*fracToCart[2][1] + phip[10*i+3]*fracToCart[2][2]));
} }
} }
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment