Commit 9e34d39e authored by peastman's avatar peastman Committed by GitHub
Browse files

Merge pull request #1583 from peastman/amoebapme

Optimizations to AMOEBA PME
parents e38c6907 47fe6d40
...@@ -822,7 +822,7 @@ CudaCalcAmoebaMultipoleForceKernel::CudaCalcAmoebaMultipoleForceKernel(std::stri ...@@ -822,7 +822,7 @@ CudaCalcAmoebaMultipoleForceKernel::CudaCalcAmoebaMultipoleForceKernel(std::stri
inducedDipoleFieldGradientGk(NULL), inducedDipoleFieldGradientGkPolar(NULL), extrapolatedDipoleFieldGradient(NULL), extrapolatedDipoleFieldGradientPolar(NULL), inducedDipoleFieldGradientGk(NULL), inducedDipoleFieldGradientGkPolar(NULL), extrapolatedDipoleFieldGradient(NULL), extrapolatedDipoleFieldGradientPolar(NULL),
extrapolatedDipoleFieldGradientGk(NULL), extrapolatedDipoleFieldGradientGkPolar(NULL), covalentFlags(NULL), polarizationGroupFlags(NULL), extrapolatedDipoleFieldGradientGk(NULL), extrapolatedDipoleFieldGradientGkPolar(NULL), covalentFlags(NULL), polarizationGroupFlags(NULL),
pmeGrid(NULL), pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeIgrid(NULL), pmePhi(NULL), pmeGrid(NULL), pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeIgrid(NULL), pmePhi(NULL),
pmePhid(NULL), pmePhip(NULL), pmePhidp(NULL), pmeCphi(NULL), pmeAtomGridIndex(NULL), lastPositions(NULL), sort(NULL), gkKernel(NULL) { pmePhid(NULL), pmePhip(NULL), pmePhidp(NULL), pmeCphi(NULL), lastPositions(NULL), sort(NULL), gkKernel(NULL) {
} }
CudaCalcAmoebaMultipoleForceKernel::~CudaCalcAmoebaMultipoleForceKernel() { CudaCalcAmoebaMultipoleForceKernel::~CudaCalcAmoebaMultipoleForceKernel() {
...@@ -927,8 +927,6 @@ CudaCalcAmoebaMultipoleForceKernel::~CudaCalcAmoebaMultipoleForceKernel() { ...@@ -927,8 +927,6 @@ CudaCalcAmoebaMultipoleForceKernel::~CudaCalcAmoebaMultipoleForceKernel() {
delete pmePhidp; delete pmePhidp;
if (pmeCphi != NULL) if (pmeCphi != NULL)
delete pmeCphi; delete pmeCphi;
if (pmeAtomGridIndex != NULL)
delete pmeAtomGridIndex;
if (lastPositions != NULL) if (lastPositions != NULL)
delete lastPositions; delete lastPositions;
if (sort != NULL) if (sort != NULL)
...@@ -1253,7 +1251,6 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const ...@@ -1253,7 +1251,6 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
else if (polarizationType == AmoebaMultipoleForce::Extrapolated) else if (polarizationType == AmoebaMultipoleForce::Extrapolated)
pmeDefines["EXTRAPOLATED_POLARIZATION"] = ""; pmeDefines["EXTRAPOLATED_POLARIZATION"] = "";
CUmodule module = cu.createModule(CudaKernelSources::vectorOps+CudaAmoebaKernelSources::multipolePme, pmeDefines); CUmodule module = cu.createModule(CudaKernelSources::vectorOps+CudaAmoebaKernelSources::multipolePme, pmeDefines);
pmeGridIndexKernel = cu.getKernel(module, "findAtomGridIndex");
pmeTransformMultipolesKernel = cu.getKernel(module, "transformMultipolesToFractionalCoordinates"); pmeTransformMultipolesKernel = cu.getKernel(module, "transformMultipolesToFractionalCoordinates");
pmeTransformPotentialKernel = cu.getKernel(module, "transformPotentialToCartesianCoordinates"); pmeTransformPotentialKernel = cu.getKernel(module, "transformPotentialToCartesianCoordinates");
pmeSpreadFixedMultipolesKernel = cu.getKernel(module, "gridSpreadFixedMultipoles"); pmeSpreadFixedMultipolesKernel = cu.getKernel(module, "gridSpreadFixedMultipoles");
...@@ -1285,7 +1282,6 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const ...@@ -1285,7 +1282,6 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
pmePhidp = new CudaArray(cu, 20*numMultipoles, elementSize, "pmePhidp"); pmePhidp = new CudaArray(cu, 20*numMultipoles, elementSize, "pmePhidp");
pmeCphi = new CudaArray(cu, 10*numMultipoles, elementSize, "pmeCphi"); pmeCphi = new CudaArray(cu, 10*numMultipoles, elementSize, "pmeCphi");
pmeAtomRange = CudaArray::create<int>(cu, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange"); pmeAtomRange = CudaArray::create<int>(cu, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange");
pmeAtomGridIndex = CudaArray::create<int2>(cu, numMultipoles, "pmeAtomGridIndex");
sort = new CudaSort(cu, new SortTrait(), cu.getNumAtoms()); sort = new CudaSort(cu, new SortTrait(), cu.getNumAtoms());
cufftResult result = cufftPlan3d(&fft, gridSizeX, gridSizeY, gridSizeZ, cu.getUseDoublePrecision() ? CUFFT_Z2Z : CUFFT_C2C); cufftResult result = cufftPlan3d(&fft, gridSizeX, gridSizeY, gridSizeZ, cu.getUseDoublePrecision() ? CUFFT_Z2Z : CUFFT_C2C);
if (result != CUFFT_SUCCESS) if (result != CUFFT_SUCCESS)
...@@ -1569,16 +1565,11 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in ...@@ -1569,16 +1565,11 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
// Reciprocal space calculation. // Reciprocal space calculation.
unsigned int maxTiles = nb.getInteractingTiles().getSize(); unsigned int maxTiles = nb.getInteractingTiles().getSize();
void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(),
cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeGridIndexKernel, gridIndexArgs, cu.getNumAtoms(), cu.ThreadBlockSize, cu.ThreadBlockSize*PmeOrder*PmeOrder*elementSize);
sort->sort(*pmeAtomGridIndex);
void* pmeTransformMultipolesArgs[] = {&labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(), void* pmeTransformMultipolesArgs[] = {&labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(),
&fracDipoles->getDevicePointer(), &fracQuadrupoles->getDevicePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]}; &fracDipoles->getDevicePointer(), &fracQuadrupoles->getDevicePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeTransformMultipolesKernel, pmeTransformMultipolesArgs, cu.getNumAtoms()); cu.executeKernel(pmeTransformMultipolesKernel, pmeTransformMultipolesArgs, cu.getNumAtoms());
void* pmeSpreadFixedMultipolesArgs[] = {&cu.getPosq().getDevicePointer(), &fracDipoles->getDevicePointer(), &fracQuadrupoles->getDevicePointer(), void* pmeSpreadFixedMultipolesArgs[] = {&cu.getPosq().getDevicePointer(), &fracDipoles->getDevicePointer(), &fracQuadrupoles->getDevicePointer(),
&pmeGrid->getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(), &pmeGrid->getDevicePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]}; recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeSpreadFixedMultipolesKernel, pmeSpreadFixedMultipolesArgs, cu.getNumAtoms()); cu.executeKernel(pmeSpreadFixedMultipolesKernel, pmeSpreadFixedMultipolesArgs, cu.getNumAtoms());
void* finishSpreadArgs[] = {&pmeGrid->getDevicePointer()}; void* finishSpreadArgs[] = {&pmeGrid->getDevicePointer()};
...@@ -1590,7 +1581,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in ...@@ -1590,7 +1581,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
cufftExecC2C(fft, (float2*) pmeGrid->getDevicePointer(), (float2*) pmeGrid->getDevicePointer(), CUFFT_FORWARD); cufftExecC2C(fft, (float2*) pmeGrid->getDevicePointer(), (float2*) pmeGrid->getDevicePointer(), CUFFT_FORWARD);
void* pmeConvolutionArgs[] = {&pmeGrid->getDevicePointer(), &pmeBsplineModuliX->getDevicePointer(), &pmeBsplineModuliY->getDevicePointer(), void* pmeConvolutionArgs[] = {&pmeGrid->getDevicePointer(), &pmeBsplineModuliX->getDevicePointer(), &pmeBsplineModuliY->getDevicePointer(),
&pmeBsplineModuliZ->getDevicePointer(), cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]}; &pmeBsplineModuliZ->getDevicePointer(), cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeConvolutionKernel, pmeConvolutionArgs, cu.getNumAtoms()); cu.executeKernel(pmeConvolutionKernel, pmeConvolutionArgs, gridSizeX*gridSizeY*gridSizeZ, 256);
if (cu.getUseDoublePrecision()) if (cu.getUseDoublePrecision())
cufftExecZ2Z(fft, (double2*) pmeGrid->getDevicePointer(), (double2*) pmeGrid->getDevicePointer(), CUFFT_INVERSE); cufftExecZ2Z(fft, (double2*) pmeGrid->getDevicePointer(), (double2*) pmeGrid->getDevicePointer(), CUFFT_INVERSE);
else else
...@@ -1598,7 +1589,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in ...@@ -1598,7 +1589,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
void* pmeFixedPotentialArgs[] = {&pmeGrid->getDevicePointer(), &pmePhi->getDevicePointer(), &field->getDevicePointer(), void* pmeFixedPotentialArgs[] = {&pmeGrid->getDevicePointer(), &pmePhi->getDevicePointer(), &field->getDevicePointer(),
&fieldPolar ->getDevicePointer(), &cu.getPosq().getDevicePointer(), &labFrameDipoles->getDevicePointer(), &fieldPolar ->getDevicePointer(), &cu.getPosq().getDevicePointer(), &labFrameDipoles->getDevicePointer(),
cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex->getDevicePointer()}; recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeFixedPotentialKernel, pmeFixedPotentialArgs, cu.getNumAtoms()); cu.executeKernel(pmeFixedPotentialKernel, pmeFixedPotentialArgs, cu.getNumAtoms());
void* pmeTransformFixedPotentialArgs[] = {&pmePhi->getDevicePointer(), &pmeCphi->getDevicePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]}; void* pmeTransformFixedPotentialArgs[] = {&pmePhi->getDevicePointer(), &pmeCphi->getDevicePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeTransformPotentialKernel, pmeTransformFixedPotentialArgs, cu.getNumAtoms()); cu.executeKernel(pmeTransformPotentialKernel, pmeTransformFixedPotentialArgs, cu.getNumAtoms());
...@@ -1625,7 +1616,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in ...@@ -1625,7 +1616,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
cu.clearBuffer(*pmeGrid); cu.clearBuffer(*pmeGrid);
void* pmeSpreadInducedDipolesArgs[] = {&cu.getPosq().getDevicePointer(), &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), void* pmeSpreadInducedDipolesArgs[] = {&cu.getPosq().getDevicePointer(), &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(),
&pmeGrid->getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(), &pmeGrid->getDevicePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]}; recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeSpreadInducedDipolesKernel, pmeSpreadInducedDipolesArgs, cu.getNumAtoms()); cu.executeKernel(pmeSpreadInducedDipolesKernel, pmeSpreadInducedDipolesArgs, cu.getNumAtoms());
if (cu.getUseDoublePrecision()) if (cu.getUseDoublePrecision())
...@@ -1634,15 +1625,14 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in ...@@ -1634,15 +1625,14 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
cufftExecZ2Z(fft, (double2*) pmeGrid->getDevicePointer(), (double2*) pmeGrid->getDevicePointer(), CUFFT_FORWARD); cufftExecZ2Z(fft, (double2*) pmeGrid->getDevicePointer(), (double2*) pmeGrid->getDevicePointer(), CUFFT_FORWARD);
else else
cufftExecC2C(fft, (float2*) pmeGrid->getDevicePointer(), (float2*) pmeGrid->getDevicePointer(), CUFFT_FORWARD); cufftExecC2C(fft, (float2*) pmeGrid->getDevicePointer(), (float2*) pmeGrid->getDevicePointer(), CUFFT_FORWARD);
cu.executeKernel(pmeConvolutionKernel, pmeConvolutionArgs, cu.getNumAtoms()); cu.executeKernel(pmeConvolutionKernel, pmeConvolutionArgs, gridSizeX*gridSizeY*gridSizeZ, 256);
if (cu.getUseDoublePrecision()) if (cu.getUseDoublePrecision())
cufftExecZ2Z(fft, (double2*) pmeGrid->getDevicePointer(), (double2*) pmeGrid->getDevicePointer(), CUFFT_INVERSE); cufftExecZ2Z(fft, (double2*) pmeGrid->getDevicePointer(), (double2*) pmeGrid->getDevicePointer(), CUFFT_INVERSE);
else else
cufftExecC2C(fft, (float2*) pmeGrid->getDevicePointer(), (float2*) pmeGrid->getDevicePointer(), CUFFT_INVERSE); cufftExecC2C(fft, (float2*) pmeGrid->getDevicePointer(), (float2*) pmeGrid->getDevicePointer(), CUFFT_INVERSE);
void* pmeInducedPotentialArgs[] = {&pmeGrid->getDevicePointer(), &pmePhid->getDevicePointer(), &pmePhip->getDevicePointer(), void* pmeInducedPotentialArgs[] = {&pmeGrid->getDevicePointer(), &pmePhid->getDevicePointer(), &pmePhip->getDevicePointer(),
&pmePhidp->getDevicePointer(), &cu.getPosq().getDevicePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), &pmePhidp->getDevicePointer(), &cu.getPosq().getDevicePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(),
cu.getPeriodicBoxVecZPointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], cu.getPeriodicBoxVecZPointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
&pmeAtomGridIndex->getDevicePointer()};
cu.executeKernel(pmeInducedPotentialKernel, pmeInducedPotentialArgs, cu.getNumAtoms()); cu.executeKernel(pmeInducedPotentialKernel, pmeInducedPotentialArgs, cu.getNumAtoms());
// Iterate until the dipoles converge. // Iterate until the dipoles converge.
...@@ -1771,7 +1761,7 @@ void CudaCalcAmoebaMultipoleForceKernel::computeInducedField(void** recipBoxVect ...@@ -1771,7 +1761,7 @@ void CudaCalcAmoebaMultipoleForceKernel::computeInducedField(void** recipBoxVect
cu.executeKernel(computeInducedFieldKernel, &computeInducedFieldArgs[0], numForceThreadBlocks*inducedFieldThreads, inducedFieldThreads); cu.executeKernel(computeInducedFieldKernel, &computeInducedFieldArgs[0], numForceThreadBlocks*inducedFieldThreads, inducedFieldThreads);
cu.clearBuffer(*pmeGrid); cu.clearBuffer(*pmeGrid);
void* pmeSpreadInducedDipolesArgs[] = {&cu.getPosq().getDevicePointer(), &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), void* pmeSpreadInducedDipolesArgs[] = {&cu.getPosq().getDevicePointer(), &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(),
&pmeGrid->getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(), &pmeGrid->getDevicePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]}; recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeSpreadInducedDipolesKernel, pmeSpreadInducedDipolesArgs, cu.getNumAtoms()); cu.executeKernel(pmeSpreadInducedDipolesKernel, pmeSpreadInducedDipolesArgs, cu.getNumAtoms());
if (cu.getUseDoublePrecision()) { if (cu.getUseDoublePrecision()) {
...@@ -1784,15 +1774,14 @@ void CudaCalcAmoebaMultipoleForceKernel::computeInducedField(void** recipBoxVect ...@@ -1784,15 +1774,14 @@ void CudaCalcAmoebaMultipoleForceKernel::computeInducedField(void** recipBoxVect
cufftExecC2C(fft, (float2*) pmeGrid->getDevicePointer(), (float2*) pmeGrid->getDevicePointer(), CUFFT_FORWARD); cufftExecC2C(fft, (float2*) pmeGrid->getDevicePointer(), (float2*) pmeGrid->getDevicePointer(), CUFFT_FORWARD);
void* pmeConvolutionArgs[] = {&pmeGrid->getDevicePointer(), &pmeBsplineModuliX->getDevicePointer(), &pmeBsplineModuliY->getDevicePointer(), void* pmeConvolutionArgs[] = {&pmeGrid->getDevicePointer(), &pmeBsplineModuliX->getDevicePointer(), &pmeBsplineModuliY->getDevicePointer(),
&pmeBsplineModuliZ->getDevicePointer(), cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]}; &pmeBsplineModuliZ->getDevicePointer(), cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeConvolutionKernel, pmeConvolutionArgs, cu.getNumAtoms()); cu.executeKernel(pmeConvolutionKernel, pmeConvolutionArgs, gridSizeX*gridSizeY*gridSizeZ, 256);
if (cu.getUseDoublePrecision()) if (cu.getUseDoublePrecision())
cufftExecZ2Z(fft, (double2*) pmeGrid->getDevicePointer(), (double2*) pmeGrid->getDevicePointer(), CUFFT_INVERSE); cufftExecZ2Z(fft, (double2*) pmeGrid->getDevicePointer(), (double2*) pmeGrid->getDevicePointer(), CUFFT_INVERSE);
else else
cufftExecC2C(fft, (float2*) pmeGrid->getDevicePointer(), (float2*) pmeGrid->getDevicePointer(), CUFFT_INVERSE); cufftExecC2C(fft, (float2*) pmeGrid->getDevicePointer(), (float2*) pmeGrid->getDevicePointer(), CUFFT_INVERSE);
void* pmeInducedPotentialArgs[] = {&pmeGrid->getDevicePointer(), &pmePhid->getDevicePointer(), &pmePhip->getDevicePointer(), void* pmeInducedPotentialArgs[] = {&pmeGrid->getDevicePointer(), &pmePhid->getDevicePointer(), &pmePhip->getDevicePointer(),
&pmePhidp->getDevicePointer(), &cu.getPosq().getDevicePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), &pmePhidp->getDevicePointer(), &cu.getPosq().getDevicePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(),
cu.getPeriodicBoxVecZPointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], cu.getPeriodicBoxVecZPointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
&pmeAtomGridIndex->getDevicePointer()};
cu.executeKernel(pmeInducedPotentialKernel, pmeInducedPotentialArgs, cu.getNumAtoms()); cu.executeKernel(pmeInducedPotentialKernel, pmeInducedPotentialArgs, cu.getNumAtoms());
if (polarizationType == AmoebaMultipoleForce::Extrapolated) { if (polarizationType == AmoebaMultipoleForce::Extrapolated) {
void* pmeRecordInducedFieldDipolesArgs[] = {&pmePhid->getDevicePointer(), &pmePhip->getDevicePointer(), void* pmeRecordInducedFieldDipolesArgs[] = {&pmePhid->getDevicePointer(), &pmePhip->getDevicePointer(),
...@@ -1896,11 +1885,11 @@ bool CudaCalcAmoebaMultipoleForceKernel::iterateDipolesByDIIS(int iteration) { ...@@ -1896,11 +1885,11 @@ bool CudaCalcAmoebaMultipoleForceKernel::iterateDipolesByDIIS(int iteration) {
void* updateInducedFieldArgs[] = {&inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), void* updateInducedFieldArgs[] = {&inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(),
&prevDipoles->getDevicePointer(), &prevDipolesPolar->getDevicePointer(), &diisCoefficients->getDevicePointer(), &numPrev}; &prevDipoles->getDevicePointer(), &prevDipolesPolar->getDevicePointer(), &diisCoefficients->getDevicePointer(), &numPrev};
cu.executeKernel(updateInducedFieldKernel, updateInducedFieldArgs, cu.getNumThreadBlocks()*cu.ThreadBlockSize); cu.executeKernel(updateInducedFieldKernel, updateInducedFieldArgs, 3*cu.getNumAtoms(), 256);
if (gkKernel != NULL) { if (gkKernel != NULL) {
void* updateInducedFieldGkArgs[] = {&gkKernel->getInducedDipoles()->getDevicePointer(), &gkKernel->getInducedDipolesPolar()->getDevicePointer(), void* updateInducedFieldGkArgs[] = {&gkKernel->getInducedDipoles()->getDevicePointer(), &gkKernel->getInducedDipolesPolar()->getDevicePointer(),
&prevDipolesGk->getDevicePointer(), &prevDipolesGkPolar->getDevicePointer(), &diisCoefficients->getDevicePointer(), &numPrev}; &prevDipolesGk->getDevicePointer(), &prevDipolesGkPolar->getDevicePointer(), &diisCoefficients->getDevicePointer(), &numPrev};
cu.executeKernel(updateInducedFieldKernel, updateInducedFieldGkArgs, cu.getNumThreadBlocks()*cu.ThreadBlockSize); cu.executeKernel(updateInducedFieldKernel, updateInducedFieldGkArgs, 3*cu.getNumAtoms(), 256);
} }
return false; return false;
} }
......
...@@ -465,12 +465,11 @@ private: ...@@ -465,12 +465,11 @@ private:
CudaArray* pmePhidp; CudaArray* pmePhidp;
CudaArray* pmeCphi; CudaArray* pmeCphi;
CudaArray* pmeAtomRange; CudaArray* pmeAtomRange;
CudaArray* pmeAtomGridIndex;
CudaArray* lastPositions; CudaArray* lastPositions;
CudaSort* sort; CudaSort* sort;
cufftHandle fft; cufftHandle fft;
CUfunction computeMomentsKernel, recordInducedDipolesKernel, computeFixedFieldKernel, computeInducedFieldKernel, updateInducedFieldKernel, electrostaticsKernel, mapTorqueKernel; CUfunction computeMomentsKernel, recordInducedDipolesKernel, computeFixedFieldKernel, computeInducedFieldKernel, updateInducedFieldKernel, electrostaticsKernel, mapTorqueKernel;
CUfunction pmeGridIndexKernel, pmeSpreadFixedMultipolesKernel, pmeSpreadInducedDipolesKernel, pmeFinishSpreadChargeKernel, pmeConvolutionKernel; CUfunction pmeSpreadFixedMultipolesKernel, pmeSpreadInducedDipolesKernel, pmeFinishSpreadChargeKernel, pmeConvolutionKernel;
CUfunction pmeFixedPotentialKernel, pmeInducedPotentialKernel, pmeFixedForceKernel, pmeInducedForceKernel, pmeRecordInducedFieldDipolesKernel, computePotentialKernel; CUfunction pmeFixedPotentialKernel, pmeInducedPotentialKernel, pmeFixedForceKernel, pmeInducedForceKernel, pmeRecordInducedFieldDipolesKernel, computePotentialKernel;
CUfunction recordDIISDipolesKernel, buildMatrixKernel; CUfunction recordDIISDipolesKernel, buildMatrixKernel;
CUfunction initExtrapolatedKernel, iterateExtrapolatedKernel, computeExtrapolatedKernel, addExtrapolatedGradientKernel; CUfunction initExtrapolatedKernel, iterateExtrapolatedKernel, computeExtrapolatedKernel, addExtrapolatedGradientKernel;
......
...@@ -69,47 +69,6 @@ __device__ void computeBSplinePoint(real4* thetai, real w, real* array) { ...@@ -69,47 +69,6 @@ __device__ void computeBSplinePoint(real4* thetai, real w, real* array) {
thetai[i-1] = make_real4(ARRAY(PME_ORDER,i), ARRAY(PME_ORDER-1,i), ARRAY(PME_ORDER-2,i), ARRAY(PME_ORDER-3,i)); thetai[i-1] = make_real4(ARRAY(PME_ORDER,i), ARRAY(PME_ORDER-1,i), ARRAY(PME_ORDER-2,i), ARRAY(PME_ORDER-3,i));
} }
/**
* Compute the index of the grid point each atom is associated with.
*/
extern "C" __global__ void findAtomGridIndex(const real4* __restrict__ posq, int2* __restrict__ pmeAtomGridIndex,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
real4 pos = posq[i];
pos -= periodicBoxVecZ*floor(pos.z*recipBoxVecZ.z+0.5f);
pos -= periodicBoxVecY*floor(pos.y*recipBoxVecY.z+0.5f);
pos -= periodicBoxVecX*floor(pos.x*recipBoxVecX.z+0.5f);
// First axis.
real w = pos.x*recipBoxVecX.x+pos.y*recipBoxVecY.x+pos.z*recipBoxVecZ.x;
real fr = GRID_SIZE_X*(w-(int)(w+0.5f)+0.5f);
int ifr = (int) fr;
int igrid1 = ifr-PME_ORDER+1;
// Second axis.
w = pos.y*recipBoxVecY.y+pos.z*recipBoxVecZ.y;
fr = GRID_SIZE_Y*(w-(int)(w+0.5f)+0.5f);
ifr = (int) fr;
int igrid2 = ifr-PME_ORDER+1;
// Third axis.
w = pos.z*recipBoxVecZ.z;
fr = GRID_SIZE_Z*(w-(int)(w+0.5f)+0.5f);
ifr = (int) fr;
int igrid3 = ifr-PME_ORDER+1;
// Record the grid point.
igrid1 += (igrid1 < 0 ? GRID_SIZE_X : 0);
igrid2 += (igrid2 < 0 ? GRID_SIZE_Y : 0);
igrid3 += (igrid3 < 0 ? GRID_SIZE_Z : 0);
pmeAtomGridIndex[i] = make_int2(i, igrid1*GRID_SIZE_Y*GRID_SIZE_Z+igrid2*GRID_SIZE_Z+igrid3);
}
}
/** /**
* Convert the fixed multipoles from Cartesian to fractional coordinates. * Convert the fixed multipoles from Cartesian to fractional coordinates.
*/ */
...@@ -209,7 +168,7 @@ extern "C" __global__ void transformPotentialToCartesianCoordinates(const real* ...@@ -209,7 +168,7 @@ extern "C" __global__ void transformPotentialToCartesianCoordinates(const real*
} }
extern "C" __global__ void gridSpreadFixedMultipoles(const real4* __restrict__ posq, const real* __restrict__ fracDipole, extern "C" __global__ void gridSpreadFixedMultipoles(const real4* __restrict__ posq, const real* __restrict__ fracDipole,
const real* __restrict__ fracQuadrupole, real2* __restrict__ pmeGrid, int2* __restrict__ pmeAtomGridIndex, const real* __restrict__ fracQuadrupole, real2* __restrict__ pmeGrid,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) { real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
#if __CUDA_ARCH__ < 500 #if __CUDA_ARCH__ < 500
real array[PME_ORDER*PME_ORDER]; real array[PME_ORDER*PME_ORDER];
...@@ -300,7 +259,7 @@ extern "C" __global__ void gridSpreadFixedMultipoles(const real4* __restrict__ p ...@@ -300,7 +259,7 @@ extern "C" __global__ void gridSpreadFixedMultipoles(const real4* __restrict__ p
} }
extern "C" __global__ void gridSpreadInducedDipoles(const real4* __restrict__ posq, const real* __restrict__ inducedDipole, extern "C" __global__ void gridSpreadInducedDipoles(const real4* __restrict__ posq, const real* __restrict__ inducedDipole,
const real* __restrict__ inducedDipolePolar, real2* __restrict__ pmeGrid, int2* __restrict__ pmeAtomGridIndex, const real* __restrict__ inducedDipolePolar, real2* __restrict__ pmeGrid,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) { real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
#if __CUDA_ARCH__ < 500 #if __CUDA_ARCH__ < 500
real array[PME_ORDER*PME_ORDER]; real array[PME_ORDER*PME_ORDER];
...@@ -451,7 +410,7 @@ extern "C" __global__ void reciprocalConvolution(real2* __restrict__ pmeGrid, co ...@@ -451,7 +410,7 @@ extern "C" __global__ void reciprocalConvolution(real2* __restrict__ pmeGrid, co
extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict__ pmeGrid, real* __restrict__ phi, extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict__ pmeGrid, real* __restrict__ phi,
long long* __restrict__ fieldBuffers, long long* __restrict__ fieldPolarBuffers, const real4* __restrict__ posq, long long* __restrict__ fieldBuffers, long long* __restrict__ fieldPolarBuffers, const real4* __restrict__ posq,
const real* __restrict__ labFrameDipole, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, const real* __restrict__ labFrameDipole, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ, int2* __restrict__ pmeAtomGridIndex) { real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
#if __CUDA_ARCH__ < 500 #if __CUDA_ARCH__ < 500
real array[PME_ORDER*PME_ORDER]; real array[PME_ORDER*PME_ORDER];
#else #else
...@@ -476,11 +435,7 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict ...@@ -476,11 +435,7 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict
} }
__syncthreads(); __syncthreads();
// Process the atoms in spatially sorted order. This improves cache performance when loading for (int m = blockIdx.x*blockDim.x+threadIdx.x; m < NUM_ATOMS; m += blockDim.x*gridDim.x) {
// the grid values.
for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
int m = pmeAtomGridIndex[i].x;
real4 pos = posq[m]; real4 pos = posq[m];
pos -= periodicBoxVecZ*floor(pos.z*recipBoxVecZ.z+0.5f); pos -= periodicBoxVecZ*floor(pos.z*recipBoxVecZ.z+0.5f);
pos -= periodicBoxVecY*floor(pos.y*recipBoxVecY.z+0.5f); pos -= periodicBoxVecY*floor(pos.y*recipBoxVecY.z+0.5f);
...@@ -533,9 +488,9 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict ...@@ -533,9 +488,9 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict
real tuv102 = 0; real tuv102 = 0;
real tuv012 = 0; real tuv012 = 0;
real tuv111 = 0; real tuv111 = 0;
for (int iz = 0; iz < PME_ORDER; iz++) { for (int ix = 0; ix < PME_ORDER; ix++) {
int k = igrid3+iz-(igrid3+iz >= GRID_SIZE_Z ? GRID_SIZE_Z : 0); int i = igrid1+ix-(igrid1+ix >= GRID_SIZE_X ? GRID_SIZE_X : 0);
real4 v = theta3[iz]; real4 v = theta1[ix];
real tu00 = 0; real tu00 = 0;
real tu10 = 0; real tu10 = 0;
real tu01 = 0; real tu01 = 0;
...@@ -550,47 +505,47 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict ...@@ -550,47 +505,47 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict
int j = igrid2+iy-(igrid2+iy >= GRID_SIZE_Y ? GRID_SIZE_Y : 0); int j = igrid2+iy-(igrid2+iy >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
real4 u = theta2[iy]; real4 u = theta2[iy];
real4 t = make_real4(0, 0, 0, 0); real4 t = make_real4(0, 0, 0, 0);
for (int ix = 0; ix < PME_ORDER; ix++) { for (int iz = 0; iz < PME_ORDER; iz++) {
int i = igrid1+ix-(igrid1+ix >= GRID_SIZE_X ? GRID_SIZE_X : 0); int k = igrid3+iz-(igrid3+iz >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
int gridIndex = i*GRID_SIZE_Y*GRID_SIZE_Z + j*GRID_SIZE_Z + k; int gridIndex = i*GRID_SIZE_Y*GRID_SIZE_Z + j*GRID_SIZE_Z + k;
real tq = pmeGrid[gridIndex].x; real tq = pmeGrid[gridIndex].x;
real4 tadd = theta1[ix]; real4 tadd = theta3[iz];
t.x += tq*tadd.x; t.x += tq*tadd.x;
t.y += tq*tadd.y; t.y += tq*tadd.y;
t.z += tq*tadd.z; t.z += tq*tadd.z;
t.w += tq*tadd.w; t.w += tq*tadd.w;
} }
tu00 += t.x*u.x; tu00 += u.x*t.x;
tu10 += t.y*u.x; tu10 += u.y*t.x;
tu01 += t.x*u.y; tu01 += u.x*t.y;
tu20 += t.z*u.x; tu20 += u.z*t.x;
tu11 += t.y*u.y; tu11 += u.y*t.y;
tu02 += t.x*u.z; tu02 += u.x*t.z;
tu30 += t.w*u.x; tu30 += u.w*t.x;
tu21 += t.z*u.y; tu21 += u.z*t.y;
tu12 += t.y*u.z; tu12 += u.y*t.z;
tu03 += t.x*u.w; tu03 += u.x*t.w;
} }
tuv000 += tu00*v.x; tuv000 += v.x*tu00;
tuv100 += tu10*v.x; tuv100 += v.y*tu00;
tuv010 += tu01*v.x; tuv010 += v.x*tu10;
tuv001 += tu00*v.y; tuv001 += v.x*tu01;
tuv200 += tu20*v.x; tuv200 += v.z*tu00;
tuv020 += tu02*v.x; tuv020 += v.x*tu20;
tuv002 += tu00*v.z; tuv002 += v.x*tu02;
tuv110 += tu11*v.x; tuv110 += v.y*tu10;
tuv101 += tu10*v.y; tuv101 += v.y*tu01;
tuv011 += tu01*v.y; tuv011 += v.x*tu11;
tuv300 += tu30*v.x; tuv300 += v.w*tu00;
tuv030 += tu03*v.x; tuv030 += v.x*tu30;
tuv003 += tu00*v.w; tuv003 += v.x*tu03;
tuv210 += tu21*v.x; tuv210 += v.z*tu10;
tuv201 += tu20*v.y; tuv201 += v.z*tu01;
tuv120 += tu12*v.x; tuv120 += v.y*tu20;
tuv021 += tu02*v.y; tuv021 += v.x*tu21;
tuv102 += tu10*v.z; tuv102 += v.y*tu02;
tuv012 += tu01*v.z; tuv012 += v.x*tu12;
tuv111 += tu11*v.y; tuv111 += v.y*tu11;
} }
phi[m] = tuv000; phi[m] = tuv000;
phi[m+NUM_ATOMS] = tuv100; phi[m+NUM_ATOMS] = tuv100;
...@@ -628,7 +583,7 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict ...@@ -628,7 +583,7 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict
extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restrict__ pmeGrid, real* __restrict__ phid, extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restrict__ pmeGrid, real* __restrict__ phid,
real* __restrict__ phip, real* __restrict__ phidp, const real4* __restrict__ posq, real* __restrict__ phip, real* __restrict__ phidp, const real4* __restrict__ posq,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, real3 recipBoxVecX, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, real3 recipBoxVecX,
real3 recipBoxVecY, real3 recipBoxVecZ, int2* __restrict__ pmeAtomGridIndex) { real3 recipBoxVecY, real3 recipBoxVecZ) {
#if __CUDA_ARCH__ < 500 #if __CUDA_ARCH__ < 500
real array[PME_ORDER*PME_ORDER]; real array[PME_ORDER*PME_ORDER];
#else #else
...@@ -640,11 +595,7 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri ...@@ -640,11 +595,7 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri
real4 theta2[PME_ORDER]; real4 theta2[PME_ORDER];
real4 theta3[PME_ORDER]; real4 theta3[PME_ORDER];
// Process the atoms in spatially sorted order. This improves cache performance when loading for (int m = blockIdx.x*blockDim.x+threadIdx.x; m < NUM_ATOMS; m += blockDim.x*gridDim.x) {
// the grid values.
for (int atom = blockIdx.x*blockDim.x+threadIdx.x; atom < NUM_ATOMS; atom += blockDim.x*gridDim.x) {
int m = pmeAtomGridIndex[atom].x;
real4 pos = posq[m]; real4 pos = posq[m];
pos -= periodicBoxVecZ*floor(pos.z*recipBoxVecZ.z+0.5f); pos -= periodicBoxVecZ*floor(pos.z*recipBoxVecZ.z+0.5f);
pos -= periodicBoxVecY*floor(pos.y*recipBoxVecY.z+0.5f); pos -= periodicBoxVecY*floor(pos.y*recipBoxVecY.z+0.5f);
...@@ -715,9 +666,9 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri ...@@ -715,9 +666,9 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri
real tuv102 = 0; real tuv102 = 0;
real tuv012 = 0; real tuv012 = 0;
real tuv111 = 0; real tuv111 = 0;
for (int iz = 0; iz < PME_ORDER; iz++) { for (int ix = 0; ix < PME_ORDER; ix++) {
int k = igrid3+iz-(igrid3+iz >= GRID_SIZE_Z ? GRID_SIZE_Z : 0); int i = igrid1+ix-(igrid1+ix >= GRID_SIZE_X ? GRID_SIZE_X : 0);
real4 v = theta3[iz]; real4 v = theta1[ix];
real tu00_1 = 0; real tu00_1 = 0;
real tu01_1 = 0; real tu01_1 = 0;
real tu10_1 = 0; real tu10_1 = 0;
...@@ -750,11 +701,11 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri ...@@ -750,11 +701,11 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri
real t1_2 = 0; real t1_2 = 0;
real t2_2 = 0; real t2_2 = 0;
real t3 = 0; real t3 = 0;
for (int ix = 0; ix < PME_ORDER; ix++) { for (int iz = 0; iz < PME_ORDER; iz++) {
int i = igrid1+ix-(igrid1+ix >= GRID_SIZE_X ? GRID_SIZE_X : 0); int k = igrid3+iz-(igrid3+iz >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
int gridIndex = i*GRID_SIZE_Y*GRID_SIZE_Z + j*GRID_SIZE_Z + k; int gridIndex = i*GRID_SIZE_Y*GRID_SIZE_Z + j*GRID_SIZE_Z + k;
real2 tq = pmeGrid[gridIndex]; real2 tq = pmeGrid[gridIndex];
real4 tadd = theta1[ix]; real4 tadd = theta3[iz];
t0_1 += tq.x*tadd.x; t0_1 += tq.x*tadd.x;
t1_1 += tq.x*tadd.y; t1_1 += tq.x*tadd.y;
t2_1 += tq.x*tadd.z; t2_1 += tq.x*tadd.z;
...@@ -763,70 +714,70 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri ...@@ -763,70 +714,70 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri
t2_2 += tq.y*tadd.z; t2_2 += tq.y*tadd.z;
t3 += (tq.x+tq.y)*tadd.w; t3 += (tq.x+tq.y)*tadd.w;
} }
tu00_1 += t0_1*u.x; tu00_1 += u.x*t0_1;
tu10_1 += t1_1*u.x; tu10_1 += u.y*t0_1;
tu01_1 += t0_1*u.y; tu01_1 += u.x*t1_1;
tu20_1 += t2_1*u.x; tu20_1 += u.z*t0_1;
tu11_1 += t1_1*u.y; tu11_1 += u.y*t1_1;
tu02_1 += t0_1*u.z; tu02_1 += u.x*t2_1;
tu00_2 += t0_2*u.x; tu00_2 += u.x*t0_2;
tu10_2 += t1_2*u.x; tu10_2 += u.y*t0_2;
tu01_2 += t0_2*u.y; tu01_2 += u.x*t1_2;
tu20_2 += t2_2*u.x; tu20_2 += u.z*t0_2;
tu11_2 += t1_2*u.y; tu11_2 += u.y*t1_2;
tu02_2 += t0_2*u.z; tu02_2 += u.x*t2_2;
real t0 = t0_1 + t0_2; real t0 = t0_1 + t0_2;
real t1 = t1_1 + t1_2; real t1 = t1_1 + t1_2;
real t2 = t2_1 + t2_2; real t2 = t2_1 + t2_2;
tu00 += t0*u.x; tu00 += u.x*t0;
tu10 += t1*u.x; tu10 += u.y*t0;
tu01 += t0*u.y; tu01 += u.x*t1;
tu20 += t2*u.x; tu20 += u.z*t0;
tu11 += t1*u.y; tu11 += u.y*t1;
tu02 += t0*u.z; tu02 += u.x*t2;
tu30 += t3*u.x; tu30 += u.w*t0;
tu21 += t2*u.y; tu21 += u.z*t1;
tu12 += t1*u.z; tu12 += u.y*t2;
tu03 += t0*u.w; tu03 += u.x*t3;
} }
tuv100_1 += tu10_1*v.x; tuv100_1 += v.y*tu00_1;
tuv010_1 += tu01_1*v.x; tuv010_1 += v.x*tu10_1;
tuv001_1 += tu00_1*v.y; tuv001_1 += v.x*tu01_1;
tuv200_1 += tu20_1*v.x; tuv200_1 += v.z*tu00_1;
tuv020_1 += tu02_1*v.x; tuv020_1 += v.x*tu20_1;
tuv002_1 += tu00_1*v.z; tuv002_1 += v.x*tu02_1;
tuv110_1 += tu11_1*v.x; tuv110_1 += v.y*tu10_1;
tuv101_1 += tu10_1*v.y; tuv101_1 += v.y*tu01_1;
tuv011_1 += tu01_1*v.y; tuv011_1 += v.x*tu11_1;
tuv100_2 += tu10_2*v.x; tuv100_2 += v.y*tu00_2;
tuv010_2 += tu01_2*v.x; tuv010_2 += v.x*tu10_2;
tuv001_2 += tu00_2*v.y; tuv001_2 += v.x*tu01_2;
tuv200_2 += tu20_2*v.x; tuv200_2 += v.z*tu00_2;
tuv020_2 += tu02_2*v.x; tuv020_2 += v.x*tu20_2;
tuv002_2 += tu00_2*v.z; tuv002_2 += v.x*tu02_2;
tuv110_2 += tu11_2*v.x; tuv110_2 += v.y*tu10_2;
tuv101_2 += tu10_2*v.y; tuv101_2 += v.y*tu01_2;
tuv011_2 += tu01_2*v.y; tuv011_2 += v.x*tu11_2;
tuv000 += tu00*v.x; tuv000 += v.x*tu00;
tuv100 += tu10*v.x; tuv100 += v.y*tu00;
tuv010 += tu01*v.x; tuv010 += v.x*tu10;
tuv001 += tu00*v.y; tuv001 += v.x*tu01;
tuv200 += tu20*v.x; tuv200 += v.z*tu00;
tuv020 += tu02*v.x; tuv020 += v.x*tu20;
tuv002 += tu00*v.z; tuv002 += v.x*tu02;
tuv110 += tu11*v.x; tuv110 += v.y*tu10;
tuv101 += tu10*v.y; tuv101 += v.y*tu01;
tuv011 += tu01*v.y; tuv011 += v.x*tu11;
tuv300 += tu30*v.x; tuv300 += v.w*tu00;
tuv030 += tu03*v.x; tuv030 += v.x*tu30;
tuv003 += tu00*v.w; tuv003 += v.x*tu03;
tuv210 += tu21*v.x; tuv210 += v.z*tu10;
tuv201 += tu20*v.y; tuv201 += v.z*tu01;
tuv120 += tu12*v.x; tuv120 += v.y*tu20;
tuv021 += tu02*v.y; tuv021 += v.x*tu21;
tuv102 += tu10*v.z; tuv102 += v.y*tu02;
tuv012 += tu01*v.z; tuv012 += v.x*tu12;
tuv111 += tu11*v.y; tuv111 += v.y*tu11;
} }
phid[m] = 0; phid[m] = 0;
phid[m+NUM_ATOMS] = tuv100_1; phid[m+NUM_ATOMS] = tuv100_1;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment