Commit c83f2a12 authored by Peter Eastman's avatar Peter Eastman
Browse files

Began CUDA implementation of triclinic boxes for AmoebaMultipoleForce

parent 82e118fb
...@@ -796,8 +796,8 @@ private: ...@@ -796,8 +796,8 @@ private:
CudaCalcAmoebaMultipoleForceKernel::CudaCalcAmoebaMultipoleForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) : CudaCalcAmoebaMultipoleForceKernel::CudaCalcAmoebaMultipoleForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) :
CalcAmoebaMultipoleForceKernel(name, platform), cu(cu), system(system), hasInitializedScaleFactors(false), hasInitializedFFT(false), multipolesAreValid(false), CalcAmoebaMultipoleForceKernel(name, platform), cu(cu), system(system), hasInitializedScaleFactors(false), hasInitializedFFT(false), multipolesAreValid(false),
multipoleParticles(NULL), molecularDipoles(NULL), molecularQuadrupoles(NULL), labFrameDipoles(NULL), labFrameQuadrupoles(NULL), multipoleParticles(NULL), molecularDipoles(NULL), molecularQuadrupoles(NULL), labFrameDipoles(NULL), labFrameQuadrupoles(NULL), fracDipoles(NULL),
field(NULL), fieldPolar(NULL), inducedField(NULL), inducedFieldPolar(NULL), torque(NULL), dampingAndThole(NULL), inducedDipole(NULL), fracQuadrupoles(NULL), field(NULL), fieldPolar(NULL), inducedField(NULL), inducedFieldPolar(NULL), torque(NULL), dampingAndThole(NULL), inducedDipole(NULL),
diisCoefficients(NULL), inducedDipolePolar(NULL), inducedDipoleErrors(NULL), prevDipoles(NULL), prevDipolesPolar(NULL), prevDipolesGk(NULL), diisCoefficients(NULL), inducedDipolePolar(NULL), inducedDipoleErrors(NULL), prevDipoles(NULL), prevDipolesPolar(NULL), prevDipolesGk(NULL),
prevDipolesGkPolar(NULL), prevErrors(NULL), diisMatrix(NULL), polarizability(NULL), covalentFlags(NULL), polarizationGroupFlags(NULL), prevDipolesGkPolar(NULL), prevErrors(NULL), diisMatrix(NULL), polarizability(NULL), covalentFlags(NULL), polarizationGroupFlags(NULL),
pmeGrid(NULL), pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeIgrid(NULL), pmePhi(NULL), pmeGrid(NULL), pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeIgrid(NULL), pmePhi(NULL),
...@@ -816,6 +816,10 @@ CudaCalcAmoebaMultipoleForceKernel::~CudaCalcAmoebaMultipoleForceKernel() { ...@@ -816,6 +816,10 @@ CudaCalcAmoebaMultipoleForceKernel::~CudaCalcAmoebaMultipoleForceKernel() {
delete labFrameDipoles; delete labFrameDipoles;
if (labFrameQuadrupoles != NULL) if (labFrameQuadrupoles != NULL)
delete labFrameQuadrupoles; delete labFrameQuadrupoles;
if (fracDipoles != NULL)
delete fracDipoles;
if (fracQuadrupoles != NULL)
delete fracQuadrupoles;
if (field != NULL) if (field != NULL)
delete field; delete field;
if (fieldPolar != NULL) if (fieldPolar != NULL)
...@@ -968,7 +972,9 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const ...@@ -968,7 +972,9 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float)); int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
labFrameDipoles = new CudaArray(cu, 3*paddedNumAtoms, elementSize, "labFrameDipoles"); labFrameDipoles = new CudaArray(cu, 3*paddedNumAtoms, elementSize, "labFrameDipoles");
labFrameQuadrupoles = new CudaArray(cu, 9*paddedNumAtoms, elementSize, "labFrameQuadrupoles"); labFrameQuadrupoles = new CudaArray(cu, 5*paddedNumAtoms, elementSize, "labFrameQuadrupoles");
fracDipoles = new CudaArray(cu, 3*paddedNumAtoms, elementSize, "fracDipoles");
fracQuadrupoles = new CudaArray(cu, 6*paddedNumAtoms, elementSize, "fracQuadrupoles");
field = new CudaArray(cu, 3*paddedNumAtoms, sizeof(long long), "field"); field = new CudaArray(cu, 3*paddedNumAtoms, sizeof(long long), "field");
fieldPolar = new CudaArray(cu, 3*paddedNumAtoms, sizeof(long long), "fieldPolar"); fieldPolar = new CudaArray(cu, 3*paddedNumAtoms, sizeof(long long), "fieldPolar");
torque = new CudaArray(cu, 3*paddedNumAtoms, sizeof(long long), "torque"); torque = new CudaArray(cu, 3*paddedNumAtoms, sizeof(long long), "torque");
...@@ -1185,6 +1191,7 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const ...@@ -1185,6 +1191,7 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
pmeDefines["DIRECT_POLARIZATION"] = ""; pmeDefines["DIRECT_POLARIZATION"] = "";
CUmodule module = cu.createModule(CudaKernelSources::vectorOps+CudaAmoebaKernelSources::multipolePme, pmeDefines); CUmodule module = cu.createModule(CudaKernelSources::vectorOps+CudaAmoebaKernelSources::multipolePme, pmeDefines);
pmeGridIndexKernel = cu.getKernel(module, "findAtomGridIndex"); pmeGridIndexKernel = cu.getKernel(module, "findAtomGridIndex");
pmeTransformMultipolesKernel = cu.getKernel(module, "transformMultipolesToFractionalCoordinates");
pmeSpreadFixedMultipolesKernel = cu.getKernel(module, "gridSpreadFixedMultipoles"); pmeSpreadFixedMultipolesKernel = cu.getKernel(module, "gridSpreadFixedMultipoles");
pmeSpreadInducedDipolesKernel = cu.getKernel(module, "gridSpreadInducedDipoles"); pmeSpreadInducedDipolesKernel = cu.getKernel(module, "gridSpreadInducedDipoles");
pmeFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadCharge"); pmeFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadCharge");
...@@ -1483,15 +1490,44 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in ...@@ -1483,15 +1490,44 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
gkKernel->finishComputation(*torque, *labFrameDipoles, *labFrameQuadrupoles, *inducedDipole, *inducedDipolePolar, *dampingAndThole, *covalentFlags, *polarizationGroupFlags); gkKernel->finishComputation(*torque, *labFrameDipoles, *labFrameQuadrupoles, *inducedDipole, *inducedDipolePolar, *dampingAndThole, *covalentFlags, *polarizationGroupFlags);
} }
else { else {
// Compute reciprocal box vectors.
Vec3 boxVectors[3];
cu.getPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
double determinant = boxVectors[0][0]*boxVectors[1][1]*boxVectors[2][2];
double scale = 1.0/determinant;
double3 recipBoxVectors[3];
recipBoxVectors[0] = make_double3(boxVectors[1][1]*boxVectors[2][2]*scale, 0, 0);
recipBoxVectors[1] = make_double3(-boxVectors[1][0]*boxVectors[2][2]*scale, boxVectors[0][0]*boxVectors[2][2]*scale, 0);
recipBoxVectors[2] = make_double3((boxVectors[1][0]*boxVectors[2][1]-boxVectors[1][1]*boxVectors[2][0])*scale, -boxVectors[0][0]*boxVectors[2][1]*scale, boxVectors[0][0]*boxVectors[1][1]*scale);
float3 recipBoxVectorsFloat[3];
void* recipBoxVectorPointer[3];
if (cu.getUseDoublePrecision()) {
recipBoxVectorPointer[0] = &recipBoxVectors[0];
recipBoxVectorPointer[1] = &recipBoxVectors[1];
recipBoxVectorPointer[2] = &recipBoxVectors[2];
}
else {
recipBoxVectorsFloat[0] = make_float3((float) recipBoxVectors[0].x, 0, 0);
recipBoxVectorsFloat[1] = make_float3((float) recipBoxVectors[1].x, (float) recipBoxVectors[1].y, 0);
recipBoxVectorsFloat[2] = make_float3((float) recipBoxVectors[2].x, (float) recipBoxVectors[2].y, (float) recipBoxVectors[2].z);
recipBoxVectorPointer[0] = &recipBoxVectorsFloat[0];
recipBoxVectorPointer[1] = &recipBoxVectorsFloat[1];
recipBoxVectorPointer[2] = &recipBoxVectorsFloat[2];
}
// Reciprocal space calculation. // Reciprocal space calculation.
unsigned int maxTiles = nb.getInteractingTiles().getSize(); unsigned int maxTiles = nb.getInteractingTiles().getSize();
void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(), void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(),
cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()}; cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeGridIndexKernel, gridIndexArgs, cu.getNumAtoms(), cu.ThreadBlockSize, cu.ThreadBlockSize*PmeOrder*PmeOrder*elementSize); cu.executeKernel(pmeGridIndexKernel, gridIndexArgs, cu.getNumAtoms(), cu.ThreadBlockSize, cu.ThreadBlockSize*PmeOrder*PmeOrder*elementSize);
sort->sort(*pmeAtomGridIndex); sort->sort(*pmeAtomGridIndex);
void* pmeSpreadFixedMultipolesArgs[] = {&cu.getPosq().getDevicePointer(), &labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(), void* pmeTransformMultipolesArgs[] = {&labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(),
&pmeGrid->getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(), cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()}; &fracDipoles->getDevicePointer(), &fracQuadrupoles->getDevicePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeTransformMultipolesKernel, pmeTransformMultipolesArgs, cu.getNumAtoms());
void* pmeSpreadFixedMultipolesArgs[] = {&cu.getPosq().getDevicePointer(), &fracDipoles->getDevicePointer(), &fracQuadrupoles->getDevicePointer(),
&pmeGrid->getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(), cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeSpreadFixedMultipolesKernel, pmeSpreadFixedMultipolesArgs, cu.getNumAtoms()); cu.executeKernel(pmeSpreadFixedMultipolesKernel, pmeSpreadFixedMultipolesArgs, cu.getNumAtoms());
void* finishSpreadArgs[] = {&pmeGrid->getDevicePointer()}; void* finishSpreadArgs[] = {&pmeGrid->getDevicePointer()};
if (cu.getUseDoublePrecision()) if (cu.getUseDoublePrecision())
...@@ -1501,7 +1537,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in ...@@ -1501,7 +1537,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
else else
cufftExecC2C(fft, (float2*) pmeGrid->getDevicePointer(), (float2*) pmeGrid->getDevicePointer(), CUFFT_FORWARD); cufftExecC2C(fft, (float2*) pmeGrid->getDevicePointer(), (float2*) pmeGrid->getDevicePointer(), CUFFT_FORWARD);
void* pmeConvolutionArgs[] = {&pmeGrid->getDevicePointer(), &pmeBsplineModuliX->getDevicePointer(), &pmeBsplineModuliY->getDevicePointer(), void* pmeConvolutionArgs[] = {&pmeGrid->getDevicePointer(), &pmeBsplineModuliX->getDevicePointer(), &pmeBsplineModuliY->getDevicePointer(),
&pmeBsplineModuliZ->getDevicePointer(), cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()}; &pmeBsplineModuliZ->getDevicePointer(), cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeConvolutionKernel, pmeConvolutionArgs, cu.getNumAtoms()); cu.executeKernel(pmeConvolutionKernel, pmeConvolutionArgs, cu.getNumAtoms());
if (cu.getUseDoublePrecision()) if (cu.getUseDoublePrecision())
cufftExecZ2Z(fft, (double2*) pmeGrid->getDevicePointer(), (double2*) pmeGrid->getDevicePointer(), CUFFT_INVERSE); cufftExecZ2Z(fft, (double2*) pmeGrid->getDevicePointer(), (double2*) pmeGrid->getDevicePointer(), CUFFT_INVERSE);
...@@ -1509,11 +1545,12 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in ...@@ -1509,11 +1545,12 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
cufftExecC2C(fft, (float2*) pmeGrid->getDevicePointer(), (float2*) pmeGrid->getDevicePointer(), CUFFT_INVERSE); cufftExecC2C(fft, (float2*) pmeGrid->getDevicePointer(), (float2*) pmeGrid->getDevicePointer(), CUFFT_INVERSE);
void* pmeFixedPotentialArgs[] = {&pmeGrid->getDevicePointer(), &pmePhi->getDevicePointer(), &field->getDevicePointer(), void* pmeFixedPotentialArgs[] = {&pmeGrid->getDevicePointer(), &pmePhi->getDevicePointer(), &field->getDevicePointer(),
&fieldPolar ->getDevicePointer(), &cu.getPosq().getDevicePointer(), &labFrameDipoles->getDevicePointer(), &fieldPolar ->getDevicePointer(), &cu.getPosq().getDevicePointer(), &labFrameDipoles->getDevicePointer(),
cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer(), &pmeAtomGridIndex->getDevicePointer()}; cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex->getDevicePointer()};
cu.executeKernel(pmeFixedPotentialKernel, pmeFixedPotentialArgs, cu.getNumAtoms()); cu.executeKernel(pmeFixedPotentialKernel, pmeFixedPotentialArgs, cu.getNumAtoms());
void* pmeFixedForceArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &torque->getDevicePointer(), void* pmeFixedForceArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &torque->getDevicePointer(),
&cu.getEnergyBuffer().getDevicePointer(), &labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(), &labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(),
&pmePhi->getDevicePointer(), cu.getInvPeriodicBoxSizePointer()}; &fracDipoles->getDevicePointer(), &fracQuadrupoles->getDevicePointer(), &pmePhi->getDevicePointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeFixedForceKernel, pmeFixedForceArgs, cu.getNumAtoms()); cu.executeKernel(pmeFixedForceKernel, pmeFixedForceArgs, cu.getNumAtoms());
// Direct space calculation. // Direct space calculation.
...@@ -1521,7 +1558,8 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in ...@@ -1521,7 +1558,8 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
void* computeFixedFieldArgs[] = {&field->getDevicePointer(), &fieldPolar->getDevicePointer(), &cu.getPosq().getDevicePointer(), void* computeFixedFieldArgs[] = {&field->getDevicePointer(), &fieldPolar->getDevicePointer(), &cu.getPosq().getDevicePointer(),
&covalentFlags->getDevicePointer(), &polarizationGroupFlags->getDevicePointer(), &nb.getExclusionTiles().getDevicePointer(), &startTileIndex, &numTileIndices, &covalentFlags->getDevicePointer(), &polarizationGroupFlags->getDevicePointer(), &nb.getExclusionTiles().getDevicePointer(), &startTileIndex, &numTileIndices,
&nb.getInteractingTiles().getDevicePointer(), &nb.getInteractionCount().getDevicePointer(), cu.getPeriodicBoxSizePointer(), &nb.getInteractingTiles().getDevicePointer(), &nb.getInteractionCount().getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), &maxTiles, &nb.getBlockCenters().getDevicePointer(), &nb.getInteractingAtoms().getDevicePointer(), cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
&maxTiles, &nb.getBlockCenters().getDevicePointer(), &nb.getInteractingAtoms().getDevicePointer(),
&labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(), &dampingAndThole->getDevicePointer()}; &labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(), &dampingAndThole->getDevicePointer()};
cu.executeKernel(computeFixedFieldKernel, computeFixedFieldArgs, numForceThreadBlocks*fixedFieldThreads, fixedFieldThreads); cu.executeKernel(computeFixedFieldKernel, computeFixedFieldArgs, numForceThreadBlocks*fixedFieldThreads, fixedFieldThreads);
void* recordInducedDipolesArgs[] = {&field->getDevicePointer(), &fieldPolar->getDevicePointer(), void* recordInducedDipolesArgs[] = {&field->getDevicePointer(), &fieldPolar->getDevicePointer(),
...@@ -1532,7 +1570,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in ...@@ -1532,7 +1570,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
cu.clearBuffer(*pmeGrid); cu.clearBuffer(*pmeGrid);
void* pmeSpreadInducedDipolesArgs[] = {&cu.getPosq().getDevicePointer(), &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), void* pmeSpreadInducedDipolesArgs[] = {&cu.getPosq().getDevicePointer(), &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(),
&pmeGrid->getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(), cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()}; &pmeGrid->getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(), cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeSpreadInducedDipolesKernel, pmeSpreadInducedDipolesArgs, cu.getNumAtoms()); cu.executeKernel(pmeSpreadInducedDipolesKernel, pmeSpreadInducedDipolesArgs, cu.getNumAtoms());
if (cu.getUseDoublePrecision()) if (cu.getUseDoublePrecision())
cu.executeKernel(pmeFinishSpreadChargeKernel, finishSpreadArgs, pmeGrid->getSize()); cu.executeKernel(pmeFinishSpreadChargeKernel, finishSpreadArgs, pmeGrid->getSize());
...@@ -1546,7 +1584,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in ...@@ -1546,7 +1584,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
else else
cufftExecC2C(fft, (float2*) pmeGrid->getDevicePointer(), (float2*) pmeGrid->getDevicePointer(), CUFFT_INVERSE); cufftExecC2C(fft, (float2*) pmeGrid->getDevicePointer(), (float2*) pmeGrid->getDevicePointer(), CUFFT_INVERSE);
void* pmeInducedPotentialArgs[] = {&pmeGrid->getDevicePointer(), &pmePhid->getDevicePointer(), &pmePhip->getDevicePointer(), void* pmeInducedPotentialArgs[] = {&pmeGrid->getDevicePointer(), &pmePhid->getDevicePointer(), &pmePhip->getDevicePointer(),
&pmePhidp->getDevicePointer(), &cu.getPosq().getDevicePointer(), cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer(), &pmePhidp->getDevicePointer(), &cu.getPosq().getDevicePointer(), cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2],
&pmeAtomGridIndex->getDevicePointer()}; &pmeAtomGridIndex->getDevicePointer()};
cu.executeKernel(pmeInducedPotentialKernel, pmeInducedPotentialArgs, cu.getNumAtoms()); cu.executeKernel(pmeInducedPotentialKernel, pmeInducedPotentialArgs, cu.getNumAtoms());
...@@ -1559,7 +1597,8 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in ...@@ -1559,7 +1597,8 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
void* computeInducedFieldArgs[] = {&inducedField->getDevicePointer(), &inducedFieldPolar->getDevicePointer(), &cu.getPosq().getDevicePointer(), void* computeInducedFieldArgs[] = {&inducedField->getDevicePointer(), &inducedFieldPolar->getDevicePointer(), &cu.getPosq().getDevicePointer(),
&nb.getExclusionTiles().getDevicePointer(), &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), &startTileIndex, &numTileIndices, &nb.getExclusionTiles().getDevicePointer(), &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), &startTileIndex, &numTileIndices,
&nb.getInteractingTiles().getDevicePointer(), &nb.getInteractionCount().getDevicePointer(), cu.getPeriodicBoxSizePointer(), &nb.getInteractingTiles().getDevicePointer(), &nb.getInteractionCount().getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), &maxTiles, &nb.getBlockCenters().getDevicePointer(), &nb.getInteractingAtoms().getDevicePointer(), cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
&maxTiles, &nb.getBlockCenters().getDevicePointer(), &nb.getInteractingAtoms().getDevicePointer(),
&dampingAndThole->getDevicePointer()}; &dampingAndThole->getDevicePointer()};
cu.executeKernel(computeInducedFieldKernel, computeInducedFieldArgs, numForceThreadBlocks*inducedFieldThreads, inducedFieldThreads); cu.executeKernel(computeInducedFieldKernel, computeInducedFieldArgs, numForceThreadBlocks*inducedFieldThreads, inducedFieldThreads);
cu.clearBuffer(*pmeGrid); cu.clearBuffer(*pmeGrid);
...@@ -1577,7 +1616,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in ...@@ -1577,7 +1616,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
cufftExecC2C(fft, (float2*) pmeGrid->getDevicePointer(), (float2*) pmeGrid->getDevicePointer(), CUFFT_INVERSE); cufftExecC2C(fft, (float2*) pmeGrid->getDevicePointer(), (float2*) pmeGrid->getDevicePointer(), CUFFT_INVERSE);
cu.executeKernel(pmeInducedPotentialKernel, pmeInducedPotentialArgs, cu.getNumAtoms()); cu.executeKernel(pmeInducedPotentialKernel, pmeInducedPotentialArgs, cu.getNumAtoms());
void* pmeRecordInducedFieldDipolesArgs[] = {&pmePhid->getDevicePointer(), &pmePhip->getDevicePointer(), void* pmeRecordInducedFieldDipolesArgs[] = {&pmePhid->getDevicePointer(), &pmePhip->getDevicePointer(),
&inducedField->getDevicePointer(), &inducedFieldPolar->getDevicePointer(), cu.getInvPeriodicBoxSizePointer()}; &inducedField->getDevicePointer(), &inducedFieldPolar->getDevicePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeRecordInducedFieldDipolesKernel, pmeRecordInducedFieldDipolesArgs, cu.getNumAtoms()); cu.executeKernel(pmeRecordInducedFieldDipolesKernel, pmeRecordInducedFieldDipolesArgs, cu.getNumAtoms());
bool converged = iterateDipolesByDIIS(i); bool converged = iterateDipolesByDIIS(i);
if (converged) if (converged)
...@@ -1590,14 +1629,16 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in ...@@ -1590,14 +1629,16 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
&cu.getPosq().getDevicePointer(), &covalentFlags->getDevicePointer(), &polarizationGroupFlags->getDevicePointer(), &cu.getPosq().getDevicePointer(), &covalentFlags->getDevicePointer(), &polarizationGroupFlags->getDevicePointer(),
&nb.getExclusionTiles().getDevicePointer(), &startTileIndex, &numTileIndices, &nb.getExclusionTiles().getDevicePointer(), &startTileIndex, &numTileIndices,
&nb.getInteractingTiles().getDevicePointer(), &nb.getInteractionCount().getDevicePointer(), &nb.getInteractingTiles().getDevicePointer(), &nb.getInteractionCount().getDevicePointer(),
cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer(), &maxTiles, &nb.getBlockCenters().getDevicePointer(), &nb.getInteractingAtoms().getDevicePointer(), cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
&maxTiles, &nb.getBlockCenters().getDevicePointer(), &nb.getInteractingAtoms().getDevicePointer(),
&labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(), &inducedDipole->getDevicePointer(), &labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(), &inducedDipole->getDevicePointer(),
&inducedDipolePolar->getDevicePointer(), &dampingAndThole->getDevicePointer()}; &inducedDipolePolar->getDevicePointer(), &dampingAndThole->getDevicePointer()};
cu.executeKernel(electrostaticsKernel, electrostaticsArgs, numForceThreadBlocks*electrostaticsThreads, electrostaticsThreads); cu.executeKernel(electrostaticsKernel, electrostaticsArgs, numForceThreadBlocks*electrostaticsThreads, electrostaticsThreads);
void* pmeInducedForceArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &torque->getDevicePointer(), void* pmeInducedForceArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &torque->getDevicePointer(),
&cu.getEnergyBuffer().getDevicePointer(), &labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(), &labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(),
&fracDipoles->getDevicePointer(), &fracQuadrupoles->getDevicePointer(),
&inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), &pmePhi->getDevicePointer(), &pmePhid->getDevicePointer(), &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), &pmePhi->getDevicePointer(), &pmePhid->getDevicePointer(),
&pmePhip->getDevicePointer(), &pmePhidp->getDevicePointer(), cu.getInvPeriodicBoxSizePointer()}; &pmePhip->getDevicePointer(), &pmePhidp->getDevicePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeInducedForceKernel, pmeInducedForceArgs, cu.getNumAtoms()); cu.executeKernel(pmeInducedForceKernel, pmeInducedForceArgs, cu.getNumAtoms());
} }
...@@ -1785,7 +1826,8 @@ void CudaCalcAmoebaMultipoleForceKernel::getElectrostaticPotential(ContextImpl& ...@@ -1785,7 +1826,8 @@ void CudaCalcAmoebaMultipoleForceKernel::getElectrostaticPotential(ContextImpl&
void* computePotentialArgs[] = {&cu.getPosq().getDevicePointer(), &labFrameDipoles->getDevicePointer(), void* computePotentialArgs[] = {&cu.getPosq().getDevicePointer(), &labFrameDipoles->getDevicePointer(),
&labFrameQuadrupoles->getDevicePointer(), &inducedDipole->getDevicePointer(), &points.getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(), &inducedDipole->getDevicePointer(), &points.getDevicePointer(),
&potential.getDevicePointer(), &numPoints, cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()}; &potential.getDevicePointer(), &numPoints, cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer(),
cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer()};
int blockSize = 128; int blockSize = 128;
cu.executeKernel(computePotentialKernel, computePotentialArgs, numPoints, blockSize, blockSize*15*elementSize); cu.executeKernel(computePotentialKernel, computePotentialArgs, numPoints, blockSize, blockSize*15*elementSize);
outputElectrostaticPotential.resize(numPoints); outputElectrostaticPotential.resize(numPoints);
......
...@@ -391,6 +391,8 @@ private: ...@@ -391,6 +391,8 @@ private:
CudaArray* molecularQuadrupoles; CudaArray* molecularQuadrupoles;
CudaArray* labFrameDipoles; CudaArray* labFrameDipoles;
CudaArray* labFrameQuadrupoles; CudaArray* labFrameQuadrupoles;
CudaArray* fracDipoles;
CudaArray* fracQuadrupoles;
CudaArray* field; CudaArray* field;
CudaArray* fieldPolar; CudaArray* fieldPolar;
CudaArray* inducedField; CudaArray* inducedField;
...@@ -428,6 +430,7 @@ private: ...@@ -428,6 +430,7 @@ private:
CUfunction pmeGridIndexKernel, pmeSpreadFixedMultipolesKernel, pmeSpreadInducedDipolesKernel, pmeFinishSpreadChargeKernel, pmeConvolutionKernel; CUfunction pmeGridIndexKernel, pmeSpreadFixedMultipolesKernel, pmeSpreadInducedDipolesKernel, pmeFinishSpreadChargeKernel, pmeConvolutionKernel;
CUfunction pmeFixedPotentialKernel, pmeInducedPotentialKernel, pmeFixedForceKernel, pmeInducedForceKernel, pmeRecordInducedFieldDipolesKernel, computePotentialKernel; CUfunction pmeFixedPotentialKernel, pmeInducedPotentialKernel, pmeFixedForceKernel, pmeInducedForceKernel, pmeRecordInducedFieldDipolesKernel, computePotentialKernel;
CUfunction recordDIISDipolesKernel, buildMatrixKernel; CUfunction recordDIISDipolesKernel, buildMatrixKernel;
CUfunction pmeTransformMultipolesKernel;
CudaCalcAmoebaGeneralizedKirkwoodForceKernel* gkKernel; CudaCalcAmoebaGeneralizedKirkwoodForceKernel* gkKernel;
static const int PmeOrder = 5; static const int PmeOrder = 5;
static const int MaxPrevDIISDipoles = 20; static const int MaxPrevDIISDipoles = 20;
......
...@@ -65,7 +65,9 @@ extern "C" __global__ void computeElectrostatics( ...@@ -65,7 +65,9 @@ extern "C" __global__ void computeElectrostatics(
const real4* __restrict__ posq, const uint2* __restrict__ covalentFlags, const unsigned int* __restrict__ polarizationGroupFlags, const real4* __restrict__ posq, const uint2* __restrict__ covalentFlags, const unsigned int* __restrict__ polarizationGroupFlags,
const ushort2* __restrict__ exclusionTiles, unsigned int startTileIndex, unsigned int numTileIndices, const ushort2* __restrict__ exclusionTiles, unsigned int startTileIndex, unsigned int numTileIndices,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const real4* __restrict__ blockCenter, const unsigned int* __restrict__ interactingAtoms, const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, const real4* __restrict__ blockCenter,
const unsigned int* __restrict__ interactingAtoms,
#endif #endif
const real* __restrict__ labFrameDipole, const real* __restrict__ labFrameQuadrupole, const real* __restrict__ inducedDipole, const real* __restrict__ labFrameDipole, const real* __restrict__ labFrameQuadrupole, const real* __restrict__ inducedDipole,
const real* __restrict__ inducedDipolePolar, const float2* __restrict__ dampingAndThole) { const real* __restrict__ inducedDipolePolar, const float2* __restrict__ dampingAndThole) {
......
...@@ -440,7 +440,9 @@ extern "C" __global__ void computeFixedField( ...@@ -440,7 +440,9 @@ extern "C" __global__ void computeFixedField(
const uint2* __restrict__ covalentFlags, const unsigned int* __restrict__ polarizationGroupFlags, const ushort2* __restrict__ exclusionTiles, const uint2* __restrict__ covalentFlags, const unsigned int* __restrict__ polarizationGroupFlags, const ushort2* __restrict__ exclusionTiles,
unsigned int startTileIndex, unsigned int numTileIndices, unsigned int startTileIndex, unsigned int numTileIndices,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const real4* __restrict__ blockCenter, const unsigned int* __restrict__ interactingAtoms, const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, const real4* __restrict__ blockCenter,
const unsigned int* __restrict__ interactingAtoms,
#elif defined USE_GK #elif defined USE_GK
const real* __restrict__ bornRadii, unsigned long long* __restrict__ gkFieldBuffers, const real* __restrict__ bornRadii, unsigned long long* __restrict__ gkFieldBuffers,
#endif #endif
...@@ -494,9 +496,7 @@ extern "C" __global__ void computeFixedField( ...@@ -494,9 +496,7 @@ extern "C" __global__ void computeFixedField(
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
real3 delta = trimTo3(localData[tbx+j].posq-data.posq); real3 delta = trimTo3(localData[tbx+j].posq-data.posq);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; APPLY_PERIODIC_TO_DELTA(delta)
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif #endif
int atom2 = y*TILE_SIZE+j; int atom2 = y*TILE_SIZE+j;
if (atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) { if (atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
...@@ -532,9 +532,7 @@ extern "C" __global__ void computeFixedField( ...@@ -532,9 +532,7 @@ extern "C" __global__ void computeFixedField(
for (j = 0; j < TILE_SIZE; j++) { for (j = 0; j < TILE_SIZE; j++) {
real3 delta = trimTo3(localData[tbx+tj].posq-data.posq); real3 delta = trimTo3(localData[tbx+tj].posq-data.posq);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; APPLY_PERIODIC_TO_DELTA(delta)
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif #endif
int atom2 = y*TILE_SIZE+tj; int atom2 = y*TILE_SIZE+tj;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) { if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
...@@ -675,9 +673,7 @@ extern "C" __global__ void computeFixedField( ...@@ -675,9 +673,7 @@ extern "C" __global__ void computeFixedField(
for (j = 0; j < TILE_SIZE; j++) { for (j = 0; j < TILE_SIZE; j++) {
real3 delta = trimTo3(localData[tbx+tj].posq-data.posq); real3 delta = trimTo3(localData[tbx+tj].posq-data.posq);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; APPLY_PERIODIC_TO_DELTA(delta)
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif #endif
int atom2 = atomIndices[tbx+tj]; int atom2 = atomIndices[tbx+tj];
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) { if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
......
...@@ -207,7 +207,8 @@ extern "C" __global__ void computeInducedField( ...@@ -207,7 +207,8 @@ extern "C" __global__ void computeInducedField(
unsigned long long* __restrict__ field, unsigned long long* __restrict__ fieldPolar, const real4* __restrict__ posq, const ushort2* __restrict__ exclusionTiles, unsigned long long* __restrict__ field, unsigned long long* __restrict__ fieldPolar, const real4* __restrict__ posq, const ushort2* __restrict__ exclusionTiles,
const real* __restrict__ inducedDipole, const real* __restrict__ inducedDipolePolar, unsigned int startTileIndex, unsigned int numTileIndices, const real* __restrict__ inducedDipole, const real* __restrict__ inducedDipolePolar, unsigned int startTileIndex, unsigned int numTileIndices,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const real4* __restrict__ blockCenter, const unsigned int* __restrict__ interactingAtoms, const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, const real4* __restrict__ blockCenter, const unsigned int* __restrict__ interactingAtoms,
#elif defined USE_GK #elif defined USE_GK
unsigned long long* __restrict__ fieldS, unsigned long long* __restrict__ fieldPolarS, const real* __restrict__ inducedDipoleS, unsigned long long* __restrict__ fieldS, unsigned long long* __restrict__ fieldPolarS, const real* __restrict__ inducedDipoleS,
const real* __restrict__ inducedDipolePolarS, const real* __restrict__ bornRadii, const real* __restrict__ inducedDipolePolarS, const real* __restrict__ bornRadii,
...@@ -251,9 +252,7 @@ extern "C" __global__ void computeInducedField( ...@@ -251,9 +252,7 @@ extern "C" __global__ void computeInducedField(
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
real3 delta = localData[tbx+j].pos-data.pos; real3 delta = localData[tbx+j].pos-data.pos;
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; APPLY_PERIODIC_TO_DELTA(delta)
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif #endif
int atom2 = y*TILE_SIZE+j; int atom2 = y*TILE_SIZE+j;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS)
...@@ -273,9 +272,7 @@ extern "C" __global__ void computeInducedField( ...@@ -273,9 +272,7 @@ extern "C" __global__ void computeInducedField(
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
real3 delta = localData[tbx+tj].pos-data.pos; real3 delta = localData[tbx+tj].pos-data.pos;
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; APPLY_PERIODIC_TO_DELTA(delta)
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif #endif
int atom2 = y*TILE_SIZE+j; int atom2 = y*TILE_SIZE+j;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS)
...@@ -404,9 +401,7 @@ extern "C" __global__ void computeInducedField( ...@@ -404,9 +401,7 @@ extern "C" __global__ void computeInducedField(
for (j = 0; j < TILE_SIZE; j++) { for (j = 0; j < TILE_SIZE; j++) {
real3 delta = localData[tbx+tj].pos-data.pos; real3 delta = localData[tbx+tj].pos-data.pos;
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; APPLY_PERIODIC_TO_DELTA(delta)
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif #endif
int atom2 = atomIndices[tbx+tj]; int atom2 = atomIndices[tbx+tj];
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS)
......
...@@ -73,30 +73,30 @@ __device__ void computeBSplinePoint(real4* thetai, real w, real* array) { ...@@ -73,30 +73,30 @@ __device__ void computeBSplinePoint(real4* thetai, real w, real* array) {
* Compute the index of the grid point each atom is associated with. * Compute the index of the grid point each atom is associated with.
*/ */
extern "C" __global__ void findAtomGridIndex(const real4* __restrict__ posq, int2* __restrict__ pmeAtomGridIndex, extern "C" __global__ void findAtomGridIndex(const real4* __restrict__ posq, int2* __restrict__ pmeAtomGridIndex,
real4 periodicBoxSize, real4 invPeriodicBoxSize) { real4 periodicBoxSize, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) { for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
real4 pos = posq[i]; real4 pos = posq[i];
pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x; pos.x -= floor(pos.x*recipBoxVecX.x)*periodicBoxSize.x;
pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y; pos.y -= floor(pos.y*recipBoxVecY.y)*periodicBoxSize.y;
pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z; pos.z -= floor(pos.z*recipBoxVecZ.z)*periodicBoxSize.z;
// First axis. // First axis.
real w = pos.x*invPeriodicBoxSize.x; real w = pos.x*recipBoxVecX.x+pos.y*recipBoxVecY.x+pos.z*recipBoxVecZ.x;
real fr = GRID_SIZE_X*(w-(int)(w+0.5f)+0.5f); real fr = GRID_SIZE_X*(w-(int)(w+0.5f)+0.5f);
int ifr = (int) fr; int ifr = (int) fr;
int igrid1 = ifr-PME_ORDER+1; int igrid1 = ifr-PME_ORDER+1;
// Second axis. // Second axis.
w = pos.y*invPeriodicBoxSize.y; w = pos.y*recipBoxVecY.y+pos.z*recipBoxVecZ.y;
fr = GRID_SIZE_Y*(w-(int)(w+0.5f)+0.5f); fr = GRID_SIZE_Y*(w-(int)(w+0.5f)+0.5f);
ifr = (int) fr; ifr = (int) fr;
int igrid2 = ifr-PME_ORDER+1; int igrid2 = ifr-PME_ORDER+1;
// Third axis. // Third axis.
w = pos.z*invPeriodicBoxSize.z; w = pos.z*recipBoxVecZ.z;
fr = GRID_SIZE_Z*(w-(int)(w+0.5f)+0.5f); fr = GRID_SIZE_Z*(w-(int)(w+0.5f)+0.5f);
ifr = (int) fr; ifr = (int) fr;
int igrid3 = ifr-PME_ORDER+1; int igrid3 = ifr-PME_ORDER+1;
...@@ -109,13 +109,61 @@ extern "C" __global__ void findAtomGridIndex(const real4* __restrict__ posq, int ...@@ -109,13 +109,61 @@ extern "C" __global__ void findAtomGridIndex(const real4* __restrict__ posq, int
pmeAtomGridIndex[i] = make_int2(i, igrid1*GRID_SIZE_Y*GRID_SIZE_Z+igrid2*GRID_SIZE_Z+igrid3); pmeAtomGridIndex[i] = make_int2(i, igrid1*GRID_SIZE_Y*GRID_SIZE_Z+igrid2*GRID_SIZE_Z+igrid3);
} }
} }
/**
* Convert the fixed multipoles from Cartesian to fractional coordinates.
*/
extern "C" __global__ void transformMultipolesToFractionalCoordinates(const real* __restrict__ labFrameDipole, const real* __restrict__ labFrameQuadrupole,
real* __restrict__ fracDipole, real* __restrict__ fracQuadrupole, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
// Build matrices for transforming the dipoles and quadrupoles.
__shared__ real a[3][3];
if (threadIdx.x == 0) {
a[0][0] = GRID_SIZE_X*recipBoxVecX.x;
a[0][1] = GRID_SIZE_X*recipBoxVecY.x;
a[0][2] = GRID_SIZE_X*recipBoxVecZ.x;
a[1][0] = GRID_SIZE_Y*recipBoxVecX.y;
a[1][1] = GRID_SIZE_Y*recipBoxVecY.y;
a[1][2] = GRID_SIZE_Y*recipBoxVecZ.y;
a[2][0] = GRID_SIZE_Z*recipBoxVecX.z;
a[2][1] = GRID_SIZE_Z*recipBoxVecY.z;
a[2][2] = GRID_SIZE_Z*recipBoxVecZ.z;
}
__syncthreads();
int index1[] = {0, 0, 0, 1, 1, 2};
int index2[] = {0, 1, 2, 1, 2, 2};
__shared__ real b[6][6];
if (threadIdx.x < 36) {
int i = threadIdx.x/6;
int j = threadIdx.x-6*i;
b[i][j] = a[index1[i]][index1[j]]*a[index2[i]][index2[j]];
if (index1[i] != index2[i])
b[i][j] += a[index1[i]][index2[j]]*a[index2[i]][index1[j]];
}
__syncthreads();
extern "C" __global__ void gridSpreadFixedMultipoles(const real4* __restrict__ posq, const real* __restrict__ labFrameDipole, // Transform the multipoles.
const real* __restrict__ labFrameQuadrupole, real2* __restrict__ pmeGrid, int2* __restrict__ pmeAtomGridIndex,
real4 periodicBoxSize, real4 invPeriodicBoxSize) { real quadScale[] = {1, 2, 2, 1, 2, 1};
const real xscale = GRID_SIZE_X*invPeriodicBoxSize.x; for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
const real yscale = GRID_SIZE_Y*invPeriodicBoxSize.y; for (int j = 0; j < 3; j++) {
const real zscale = GRID_SIZE_Z*invPeriodicBoxSize.z; real dipole = 0;
for (int k = 0; k < 3; k++)
dipole += a[j][k]*labFrameDipole[3*i+k];
fracDipole[3*i+j] = dipole;
}
for (int j = 0; j < 6; j++) {
real quadrupole = 0;
for (int k = 0; k < 5; k++)
quadrupole += quadScale[k]*b[j][k]*labFrameQuadrupole[5*i+k];
quadrupole -= quadScale[5]*b[j][5]*(labFrameQuadrupole[5*i]+labFrameQuadrupole[5*i+3]);
fracQuadrupole[6*i+j] = quadrupole;
}
}
}
extern "C" __global__ void gridSpreadFixedMultipoles(const real4* __restrict__ posq, const real* __restrict__ fracDipole,
const real* __restrict__ fracQuadrupole, real2* __restrict__ pmeGrid, int2* __restrict__ pmeAtomGridIndex,
real4 periodicBoxSize, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
real array[PME_ORDER*PME_ORDER]; real array[PME_ORDER*PME_ORDER];
real4 theta1[PME_ORDER]; real4 theta1[PME_ORDER];
real4 theta2[PME_ORDER]; real4 theta2[PME_ORDER];
...@@ -127,26 +175,26 @@ extern "C" __global__ void gridSpreadFixedMultipoles(const real4* __restrict__ p ...@@ -127,26 +175,26 @@ extern "C" __global__ void gridSpreadFixedMultipoles(const real4* __restrict__ p
for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) { for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
int m = pmeAtomGridIndex[i].x; int m = pmeAtomGridIndex[i].x;
real4 pos = posq[m]; real4 pos = posq[m];
pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x; pos.x -= floor(pos.x*recipBoxVecX.x)*periodicBoxSize.x;
pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y; pos.y -= floor(pos.y*recipBoxVecY.y)*periodicBoxSize.y;
pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z; pos.z -= floor(pos.z*recipBoxVecZ.z)*periodicBoxSize.z;
// Since we need the full set of thetas, it's faster to compute them here than load them // Since we need the full set of thetas, it's faster to compute them here than load them
// from global memory. // from global memory.
real w = pos.x*invPeriodicBoxSize.x; real w = pos.x*recipBoxVecX.x+pos.y*recipBoxVecY.x+pos.z*recipBoxVecZ.x;
real fr = GRID_SIZE_X*(w-(int)(w+0.5f)+0.5f); real fr = GRID_SIZE_X*(w-(int)(w+0.5f)+0.5f);
int ifr = (int) fr; int ifr = (int) fr;
w = fr - ifr; w = fr - ifr;
int igrid1 = ifr-PME_ORDER+1; int igrid1 = ifr-PME_ORDER+1;
computeBSplinePoint(theta1, w, array); computeBSplinePoint(theta1, w, array);
w = pos.y*invPeriodicBoxSize.y; w = pos.y*recipBoxVecY.y+pos.z*recipBoxVecZ.y;
fr = GRID_SIZE_Y*(w-(int)(w+0.5f)+0.5f); fr = GRID_SIZE_Y*(w-(int)(w+0.5f)+0.5f);
ifr = (int) fr; ifr = (int) fr;
w = fr - ifr; w = fr - ifr;
int igrid2 = ifr-PME_ORDER+1; int igrid2 = ifr-PME_ORDER+1;
computeBSplinePoint(theta2, w, array); computeBSplinePoint(theta2, w, array);
w = pos.z*invPeriodicBoxSize.z; w = pos.z*recipBoxVecZ.z;
fr = GRID_SIZE_Z*(w-(int)(w+0.5f)+0.5f); fr = GRID_SIZE_Z*(w-(int)(w+0.5f)+0.5f);
ifr = (int) fr; ifr = (int) fr;
w = fr - ifr; w = fr - ifr;
...@@ -177,15 +225,15 @@ extern "C" __global__ void gridSpreadFixedMultipoles(const real4* __restrict__ p ...@@ -177,15 +225,15 @@ extern "C" __global__ void gridSpreadFixedMultipoles(const real4* __restrict__ p
real4 v = theta3[iz]; real4 v = theta3[iz];
real atomCharge = pos.w; real atomCharge = pos.w;
real atomDipoleX = xscale*labFrameDipole[m*3]; real atomDipoleX = fracDipole[m*3];
real atomDipoleY = yscale*labFrameDipole[m*3+1]; real atomDipoleY = fracDipole[m*3+1];
real atomDipoleZ = zscale*labFrameDipole[m*3+2]; real atomDipoleZ = fracDipole[m*3+2];
real atomQuadrupoleXX = xscale*xscale*labFrameQuadrupole[m*5]; real atomQuadrupoleXX = fracQuadrupole[m*6];
real atomQuadrupoleXY = 2*xscale*yscale*labFrameQuadrupole[m*5+1]; real atomQuadrupoleXY = fracQuadrupole[m*6+1];
real atomQuadrupoleXZ = 2*xscale*zscale*labFrameQuadrupole[m*5+2]; real atomQuadrupoleXZ = fracQuadrupole[m*6+2];
real atomQuadrupoleYY = yscale*yscale*labFrameQuadrupole[m*5+3]; real atomQuadrupoleYY = fracQuadrupole[m*6+3];
real atomQuadrupoleYZ = 2*yscale*zscale*labFrameQuadrupole[m*5+4]; real atomQuadrupoleYZ = fracQuadrupole[m*6+4];
real atomQuadrupoleZZ = -zscale*zscale*(labFrameQuadrupole[m*5]+labFrameQuadrupole[m*5+3]); real atomQuadrupoleZZ = fracQuadrupole[m*6+5];
real term0 = atomCharge*u.x*v.x + atomDipoleY*u.y*v.x + atomDipoleZ*u.x*v.y + atomQuadrupoleYY*u.z*v.x + atomQuadrupoleZZ*u.x*v.z + atomQuadrupoleYZ*u.y*v.y; real term0 = atomCharge*u.x*v.x + atomDipoleY*u.y*v.x + atomDipoleZ*u.x*v.y + atomQuadrupoleYY*u.z*v.x + atomQuadrupoleZZ*u.x*v.z + atomQuadrupoleYZ*u.y*v.y;
real term1 = atomDipoleX*u.x*v.x + atomQuadrupoleXY*u.y*v.x + atomQuadrupoleXZ*u.x*v.y; real term1 = atomDipoleX*u.x*v.x + atomQuadrupoleXY*u.y*v.x + atomQuadrupoleXZ*u.x*v.y;
real term2 = atomQuadrupoleXX * u.x * v.x; real term2 = atomQuadrupoleXX * u.x * v.x;
...@@ -204,10 +252,10 @@ extern "C" __global__ void gridSpreadFixedMultipoles(const real4* __restrict__ p ...@@ -204,10 +252,10 @@ extern "C" __global__ void gridSpreadFixedMultipoles(const real4* __restrict__ p
extern "C" __global__ void gridSpreadInducedDipoles(const real4* __restrict__ posq, const real* __restrict__ inducedDipole, extern "C" __global__ void gridSpreadInducedDipoles(const real4* __restrict__ posq, const real* __restrict__ inducedDipole,
const real* __restrict__ inducedDipolePolar, real2* __restrict__ pmeGrid, int2* __restrict__ pmeAtomGridIndex, const real* __restrict__ inducedDipolePolar, real2* __restrict__ pmeGrid, int2* __restrict__ pmeAtomGridIndex,
real4 periodicBoxSize, real4 invPeriodicBoxSize) { real4 periodicBoxSize, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
const real xscale = GRID_SIZE_X*invPeriodicBoxSize.x; const real xscale = GRID_SIZE_X*recipBoxVecX.x;
const real yscale = GRID_SIZE_Y*invPeriodicBoxSize.y; const real yscale = GRID_SIZE_Y*recipBoxVecY.y;
const real zscale = GRID_SIZE_Z*invPeriodicBoxSize.z; const real zscale = GRID_SIZE_Z*recipBoxVecZ.z;
real array[PME_ORDER*PME_ORDER]; real array[PME_ORDER*PME_ORDER];
real4 theta1[PME_ORDER]; real4 theta1[PME_ORDER];
real4 theta2[PME_ORDER]; real4 theta2[PME_ORDER];
...@@ -219,26 +267,26 @@ extern "C" __global__ void gridSpreadInducedDipoles(const real4* __restrict__ po ...@@ -219,26 +267,26 @@ extern "C" __global__ void gridSpreadInducedDipoles(const real4* __restrict__ po
for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) { for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
int m = pmeAtomGridIndex[i].x; int m = pmeAtomGridIndex[i].x;
real4 pos = posq[m]; real4 pos = posq[m];
pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x; pos.x -= floor(pos.x*recipBoxVecX.x)*periodicBoxSize.x;
pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y; pos.y -= floor(pos.y*recipBoxVecY.y)*periodicBoxSize.y;
pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z; pos.z -= floor(pos.z*recipBoxVecZ.z)*periodicBoxSize.z;
// Since we need the full set of thetas, it's faster to compute them here than load them // Since we need the full set of thetas, it's faster to compute them here than load them
// from global memory. // from global memory.
real w = pos.x*invPeriodicBoxSize.x; real w = pos.x*recipBoxVecX.x+pos.y*recipBoxVecY.x+pos.z*recipBoxVecZ.x;
real fr = GRID_SIZE_X*(w-(int)(w+0.5f)+0.5f); real fr = GRID_SIZE_X*(w-(int)(w+0.5f)+0.5f);
int ifr = (int) fr; int ifr = (int) fr;
w = fr - ifr; w = fr - ifr;
int igrid1 = ifr-PME_ORDER+1; int igrid1 = ifr-PME_ORDER+1;
computeBSplinePoint(theta1, w, array); computeBSplinePoint(theta1, w, array);
w = pos.y*invPeriodicBoxSize.y; w = pos.y*recipBoxVecY.y+pos.z*recipBoxVecZ.y;
fr = GRID_SIZE_Y*(w-(int)(w+0.5f)+0.5f); fr = GRID_SIZE_Y*(w-(int)(w+0.5f)+0.5f);
ifr = (int) fr; ifr = (int) fr;
w = fr - ifr; w = fr - ifr;
int igrid2 = ifr-PME_ORDER+1; int igrid2 = ifr-PME_ORDER+1;
computeBSplinePoint(theta2, w, array); computeBSplinePoint(theta2, w, array);
w = pos.z*invPeriodicBoxSize.z; w = pos.z*recipBoxVecZ.z;
fr = GRID_SIZE_Z*(w-(int)(w+0.5f)+0.5f); fr = GRID_SIZE_Z*(w-(int)(w+0.5f)+0.5f);
ifr = (int) fr; ifr = (int) fr;
w = fr - ifr; w = fr - ifr;
...@@ -306,7 +354,8 @@ extern "C" __global__ void finishSpreadCharge(long long* __restrict__ pmeGrid) { ...@@ -306,7 +354,8 @@ extern "C" __global__ void finishSpreadCharge(long long* __restrict__ pmeGrid) {
} }
extern "C" __global__ void reciprocalConvolution(real2* __restrict__ pmeGrid, const real* __restrict__ pmeBsplineModuliX, extern "C" __global__ void reciprocalConvolution(real2* __restrict__ pmeGrid, const real* __restrict__ pmeBsplineModuliX,
const real* __restrict__ pmeBsplineModuliY, const real* __restrict__ pmeBsplineModuliZ, real4 periodicBoxSize, real4 invPeriodicBoxSize) { const real* __restrict__ pmeBsplineModuliY, const real* __restrict__ pmeBsplineModuliZ, real4 periodicBoxSize,
real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z; const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
real expFactor = M_PI*M_PI/(EWALD_ALPHA*EWALD_ALPHA); real expFactor = M_PI*M_PI/(EWALD_ALPHA*EWALD_ALPHA);
real scaleFactor = RECIP(M_PI*periodicBoxSize.x*periodicBoxSize.y*periodicBoxSize.z); real scaleFactor = RECIP(M_PI*periodicBoxSize.x*periodicBoxSize.y*periodicBoxSize.z);
...@@ -322,9 +371,9 @@ extern "C" __global__ void reciprocalConvolution(real2* __restrict__ pmeGrid, co ...@@ -322,9 +371,9 @@ extern "C" __global__ void reciprocalConvolution(real2* __restrict__ pmeGrid, co
int mx = (kx < (GRID_SIZE_X+1)/2) ? kx : (kx-GRID_SIZE_X); int mx = (kx < (GRID_SIZE_X+1)/2) ? kx : (kx-GRID_SIZE_X);
int my = (ky < (GRID_SIZE_Y+1)/2) ? ky : (ky-GRID_SIZE_Y); int my = (ky < (GRID_SIZE_Y+1)/2) ? ky : (ky-GRID_SIZE_Y);
int mz = (kz < (GRID_SIZE_Z+1)/2) ? kz : (kz-GRID_SIZE_Z); int mz = (kz < (GRID_SIZE_Z+1)/2) ? kz : (kz-GRID_SIZE_Z);
real mhx = mx*invPeriodicBoxSize.x; real mhx = mx*recipBoxVecX.x;
real mhy = my*invPeriodicBoxSize.y; real mhy = mx*recipBoxVecY.x+my*recipBoxVecY.y;
real mhz = mz*invPeriodicBoxSize.z; real mhz = mx*recipBoxVecZ.x+my*recipBoxVecZ.y+mz*recipBoxVecZ.z;
real bx = pmeBsplineModuliX[kx]; real bx = pmeBsplineModuliX[kx];
real by = pmeBsplineModuliY[ky]; real by = pmeBsplineModuliY[ky];
real bz = pmeBsplineModuliZ[kz]; real bz = pmeBsplineModuliZ[kz];
...@@ -338,7 +387,7 @@ extern "C" __global__ void reciprocalConvolution(real2* __restrict__ pmeGrid, co ...@@ -338,7 +387,7 @@ extern "C" __global__ void reciprocalConvolution(real2* __restrict__ pmeGrid, co
extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict__ pmeGrid, real* __restrict__ phi, extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict__ pmeGrid, real* __restrict__ phi,
long long* __restrict__ fieldBuffers, long long* __restrict__ fieldPolarBuffers, const real4* __restrict__ posq, long long* __restrict__ fieldBuffers, long long* __restrict__ fieldPolarBuffers, const real4* __restrict__ posq,
const real* __restrict__ labFrameDipole, real4 periodicBoxSize, real4 invPeriodicBoxSize, int2* __restrict__ pmeAtomGridIndex) { const real* __restrict__ labFrameDipole, real4 periodicBoxSize, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ, int2* __restrict__ pmeAtomGridIndex) {
real array[PME_ORDER*PME_ORDER]; real array[PME_ORDER*PME_ORDER];
real4 theta1[PME_ORDER]; real4 theta1[PME_ORDER];
real4 theta2[PME_ORDER]; real4 theta2[PME_ORDER];
...@@ -350,26 +399,26 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict ...@@ -350,26 +399,26 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict
for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) { for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
int m = pmeAtomGridIndex[i].x; int m = pmeAtomGridIndex[i].x;
real4 pos = posq[m]; real4 pos = posq[m];
pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x; pos.x -= floor(pos.x*recipBoxVecX.x)*periodicBoxSize.x;
pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y; pos.y -= floor(pos.y*recipBoxVecY.y)*periodicBoxSize.y;
pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z; pos.z -= floor(pos.z*recipBoxVecZ.z)*periodicBoxSize.z;
// Since we need the full set of thetas, it's faster to compute them here than load them // Since we need the full set of thetas, it's faster to compute them here than load them
// from global memory. // from global memory.
real w = pos.x*invPeriodicBoxSize.x; real w = pos.x*recipBoxVecX.x+pos.y*recipBoxVecY.x+pos.z*recipBoxVecZ.x;
real fr = GRID_SIZE_X*(w-(int)(w+0.5f)+0.5f); real fr = GRID_SIZE_X*(w-(int)(w+0.5f)+0.5f);
int ifr = (int) fr; int ifr = (int) fr;
w = fr - ifr; w = fr - ifr;
int igrid1 = ifr-PME_ORDER+1; int igrid1 = ifr-PME_ORDER+1;
computeBSplinePoint(theta1, w, array); computeBSplinePoint(theta1, w, array);
w = pos.y*invPeriodicBoxSize.y; w = pos.y*recipBoxVecY.y+pos.z*recipBoxVecZ.y;
fr = GRID_SIZE_Y*(w-(int)(w+0.5f)+0.5f); fr = GRID_SIZE_Y*(w-(int)(w+0.5f)+0.5f);
ifr = (int) fr; ifr = (int) fr;
w = fr - ifr; w = fr - ifr;
int igrid2 = ifr-PME_ORDER+1; int igrid2 = ifr-PME_ORDER+1;
computeBSplinePoint(theta2, w, array); computeBSplinePoint(theta2, w, array);
w = pos.z*invPeriodicBoxSize.z; w = pos.z*recipBoxVecZ.z;
fr = GRID_SIZE_Z*(w-(int)(w+0.5f)+0.5f); fr = GRID_SIZE_Z*(w-(int)(w+0.5f)+0.5f);
ifr = (int) fr; ifr = (int) fr;
w = fr - ifr; w = fr - ifr;
...@@ -481,13 +530,13 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict ...@@ -481,13 +530,13 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict
phi[20*m+18] = tuv012; phi[20*m+18] = tuv012;
phi[20*m+19] = tuv111; phi[20*m+19] = tuv111;
real dipoleScale = (4/(real) 3)*(EWALD_ALPHA*EWALD_ALPHA*EWALD_ALPHA)/SQRT_PI; real dipoleScale = (4/(real) 3)*(EWALD_ALPHA*EWALD_ALPHA*EWALD_ALPHA)/SQRT_PI;
long long fieldx = (long long) ((dipoleScale*labFrameDipole[m*3]-GRID_SIZE_X*invPeriodicBoxSize.x*tuv100)*0x100000000); long long fieldx = (long long) ((dipoleScale*labFrameDipole[m*3]-GRID_SIZE_X*recipBoxVecX.x*tuv100)*0x100000000);
fieldBuffers[m] = fieldx; fieldBuffers[m] = fieldx;
fieldPolarBuffers[m] = fieldx; fieldPolarBuffers[m] = fieldx;
long long fieldy = (long long) ((dipoleScale*labFrameDipole[m*3+1]-GRID_SIZE_Y*invPeriodicBoxSize.y*tuv010)*0x100000000); long long fieldy = (long long) ((dipoleScale*labFrameDipole[m*3+1]-GRID_SIZE_Y*recipBoxVecY.y*tuv010)*0x100000000);
fieldBuffers[m+PADDED_NUM_ATOMS] = fieldy; fieldBuffers[m+PADDED_NUM_ATOMS] = fieldy;
fieldPolarBuffers[m+PADDED_NUM_ATOMS] = fieldy; fieldPolarBuffers[m+PADDED_NUM_ATOMS] = fieldy;
long long fieldz = (long long) ((dipoleScale*labFrameDipole[m*3+2]-GRID_SIZE_Z*invPeriodicBoxSize.z*tuv001)*0x100000000); long long fieldz = (long long) ((dipoleScale*labFrameDipole[m*3+2]-GRID_SIZE_Z*recipBoxVecZ.z*tuv001)*0x100000000);
fieldBuffers[m+2*PADDED_NUM_ATOMS] = fieldz; fieldBuffers[m+2*PADDED_NUM_ATOMS] = fieldz;
fieldPolarBuffers[m+2*PADDED_NUM_ATOMS] = fieldz; fieldPolarBuffers[m+2*PADDED_NUM_ATOMS] = fieldz;
} }
...@@ -495,7 +544,7 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict ...@@ -495,7 +544,7 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict
extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restrict__ pmeGrid, real* __restrict__ phid, extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restrict__ pmeGrid, real* __restrict__ phid,
real* __restrict__ phip, real* __restrict__ phidp, const real4* __restrict__ posq, real* __restrict__ phip, real* __restrict__ phidp, const real4* __restrict__ posq,
real4 periodicBoxSize, real4 invPeriodicBoxSize, int2* __restrict__ pmeAtomGridIndex) { real4 periodicBoxSize, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ, int2* __restrict__ pmeAtomGridIndex) {
real array[PME_ORDER*PME_ORDER]; real array[PME_ORDER*PME_ORDER];
real4 theta1[PME_ORDER]; real4 theta1[PME_ORDER];
real4 theta2[PME_ORDER]; real4 theta2[PME_ORDER];
...@@ -507,26 +556,26 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri ...@@ -507,26 +556,26 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri
for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) { for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
int m = pmeAtomGridIndex[i].x; int m = pmeAtomGridIndex[i].x;
real4 pos = posq[m]; real4 pos = posq[m];
pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x; pos.x -= floor(pos.x*recipBoxVecX.x)*periodicBoxSize.x;
pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y; pos.y -= floor(pos.y*recipBoxVecY.y)*periodicBoxSize.y;
pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z; pos.z -= floor(pos.z*recipBoxVecZ.z)*periodicBoxSize.z;
// Since we need the full set of thetas, it's faster to compute them here than load them // Since we need the full set of thetas, it's faster to compute them here than load them
// from global memory. // from global memory.
real w = pos.x*invPeriodicBoxSize.x; real w = pos.x*recipBoxVecX.x+pos.y*recipBoxVecY.x+pos.z*recipBoxVecZ.x;
real fr = GRID_SIZE_X*(w-(int)(w+0.5f)+0.5f); real fr = GRID_SIZE_X*(w-(int)(w+0.5f)+0.5f);
int ifr = (int) fr; int ifr = (int) fr;
w = fr - ifr; w = fr - ifr;
int igrid1 = ifr-PME_ORDER+1; int igrid1 = ifr-PME_ORDER+1;
computeBSplinePoint(theta1, w, array); computeBSplinePoint(theta1, w, array);
w = pos.y*invPeriodicBoxSize.y; w = pos.y*recipBoxVecY.y+pos.z*recipBoxVecZ.y;
fr = GRID_SIZE_Y*(w-(int)(w+0.5f)+0.5f); fr = GRID_SIZE_Y*(w-(int)(w+0.5f)+0.5f);
ifr = (int) fr; ifr = (int) fr;
w = fr - ifr; w = fr - ifr;
int igrid2 = ifr-PME_ORDER+1; int igrid2 = ifr-PME_ORDER+1;
computeBSplinePoint(theta2, w, array); computeBSplinePoint(theta2, w, array);
w = pos.z*invPeriodicBoxSize.z; w = pos.z*recipBoxVecZ.z;
fr = GRID_SIZE_Z*(w-(int)(w+0.5f)+0.5f); fr = GRID_SIZE_Z*(w-(int)(w+0.5f)+0.5f);
ifr = (int) fr; ifr = (int) fr;
w = fr - ifr; w = fr - ifr;
...@@ -736,15 +785,29 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri ...@@ -736,15 +785,29 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri
extern "C" __global__ void computeFixedMultipoleForceAndEnergy(real4* __restrict__ posq, unsigned long long* __restrict__ forceBuffers, extern "C" __global__ void computeFixedMultipoleForceAndEnergy(real4* __restrict__ posq, unsigned long long* __restrict__ forceBuffers,
long long* __restrict__ torqueBuffers, real* __restrict__ energyBuffer, const real* __restrict__ labFrameDipole, long long* __restrict__ torqueBuffers, real* __restrict__ energyBuffer, const real* __restrict__ labFrameDipole,
const real* __restrict__ labFrameQuadrupole, const real* __restrict__ phi_global, real4 invPeriodicBoxSize) { const real* __restrict__ labFrameQuadrupole, const real* __restrict__ fracDipole, const real* __restrict__ fracQuadrupole,
const real* __restrict__ phi_global, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
real multipole[10]; real multipole[10];
const int deriv1[] = {1, 4, 7, 8, 10, 15, 17, 13, 14, 19}; const int deriv1[] = {1, 4, 7, 8, 10, 15, 17, 13, 14, 19};
const int deriv2[] = {2, 7, 5, 9, 13, 11, 18, 15, 19, 16}; const int deriv2[] = {2, 7, 5, 9, 13, 11, 18, 15, 19, 16};
const int deriv3[] = {3, 8, 9, 6, 14, 16, 12, 19, 17, 18}; const int deriv3[] = {3, 8, 9, 6, 14, 16, 12, 19, 17, 18};
const real xscale = GRID_SIZE_X*invPeriodicBoxSize.x; const real xscale = GRID_SIZE_X*recipBoxVecX.x;
const real yscale = GRID_SIZE_Y*invPeriodicBoxSize.y; const real yscale = GRID_SIZE_Y*recipBoxVecY.y;
const real zscale = GRID_SIZE_Z*invPeriodicBoxSize.z; const real zscale = GRID_SIZE_Z*recipBoxVecZ.z;
real energy = 0; real energy = 0;
__shared__ real fracToCart[3][3];
if (threadIdx.x == 0) {
fracToCart[0][0] = GRID_SIZE_X*recipBoxVecX.x;
fracToCart[1][0] = GRID_SIZE_X*recipBoxVecY.x;
fracToCart[2][0] = GRID_SIZE_X*recipBoxVecZ.x;
fracToCart[0][1] = GRID_SIZE_Y*recipBoxVecX.y;
fracToCart[1][1] = GRID_SIZE_Y*recipBoxVecY.y;
fracToCart[2][1] = GRID_SIZE_Y*recipBoxVecZ.y;
fracToCart[0][2] = GRID_SIZE_Z*recipBoxVecX.z;
fracToCart[1][2] = GRID_SIZE_Z*recipBoxVecY.z;
fracToCart[2][2] = GRID_SIZE_Z*recipBoxVecZ.z;
}
__syncthreads();
for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) { for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
// Compute the torque. // Compute the torque.
...@@ -778,15 +841,15 @@ extern "C" __global__ void computeFixedMultipoleForceAndEnergy(real4* __restrict ...@@ -778,15 +841,15 @@ extern "C" __global__ void computeFixedMultipoleForceAndEnergy(real4* __restrict
// Compute the force and energy. // Compute the force and energy.
multipole[1] *= xscale; multipole[1] = fracDipole[i*3];
multipole[2] *= yscale; multipole[2] = fracDipole[i*3+1];
multipole[3] *= zscale; multipole[3] = fracDipole[i*3+2];
multipole[4] *= xscale*xscale; multipole[4] = fracQuadrupole[i*6];
multipole[5] *= yscale*yscale; multipole[5] = fracQuadrupole[i*6+3];
multipole[6] *= zscale*zscale; multipole[6] = fracQuadrupole[i*6+5];
multipole[7] *= xscale*yscale; multipole[7] = fracQuadrupole[i*6+1];
multipole[8] *= xscale*zscale; multipole[8] = fracQuadrupole[i*6+2];
multipole[9] *= yscale*zscale; multipole[9] = fracQuadrupole[i*6+4];
real4 f = make_real4(0, 0, 0, 0); real4 f = make_real4(0, 0, 0, 0);
for (int k = 0; k < 10; k++) { for (int k = 0; k < 10; k++) {
...@@ -795,9 +858,9 @@ extern "C" __global__ void computeFixedMultipoleForceAndEnergy(real4* __restrict ...@@ -795,9 +858,9 @@ extern "C" __global__ void computeFixedMultipoleForceAndEnergy(real4* __restrict
f.y += multipole[k]*phi[deriv2[k]]; f.y += multipole[k]*phi[deriv2[k]];
f.z += multipole[k]*phi[deriv3[k]]; f.z += multipole[k]*phi[deriv3[k]];
} }
f.x *= EPSILON_FACTOR*xscale; f = make_real4(EPSILON_FACTOR*(f.x*fracToCart[0][0] + f.y*fracToCart[0][1] + f.z*fracToCart[0][2]),
f.y *= EPSILON_FACTOR*yscale; EPSILON_FACTOR*(f.x*fracToCart[1][0] + f.y*fracToCart[1][1] + f.z*fracToCart[1][2]),
f.z *= EPSILON_FACTOR*zscale; EPSILON_FACTOR*(f.x*fracToCart[2][0] + f.y*fracToCart[2][1] + f.z*fracToCart[2][2]), 0);
forceBuffers[i] -= static_cast<unsigned long long>((long long) (f.x*0x100000000)); forceBuffers[i] -= static_cast<unsigned long long>((long long) (f.x*0x100000000));
forceBuffers[i+PADDED_NUM_ATOMS] -= static_cast<unsigned long long>((long long) (f.y*0x100000000)); forceBuffers[i+PADDED_NUM_ATOMS] -= static_cast<unsigned long long>((long long) (f.y*0x100000000));
forceBuffers[i+PADDED_NUM_ATOMS*2] -= static_cast<unsigned long long>((long long) (f.z*0x100000000)); forceBuffers[i+PADDED_NUM_ATOMS*2] -= static_cast<unsigned long long>((long long) (f.z*0x100000000));
...@@ -807,23 +870,37 @@ extern "C" __global__ void computeFixedMultipoleForceAndEnergy(real4* __restrict ...@@ -807,23 +870,37 @@ extern "C" __global__ void computeFixedMultipoleForceAndEnergy(real4* __restrict
extern "C" __global__ void computeInducedDipoleForceAndEnergy(real4* __restrict__ posq, unsigned long long* __restrict__ forceBuffers, extern "C" __global__ void computeInducedDipoleForceAndEnergy(real4* __restrict__ posq, unsigned long long* __restrict__ forceBuffers,
long long* __restrict__ torqueBuffers, real* __restrict__ energyBuffer, const real* __restrict__ labFrameDipole, long long* __restrict__ torqueBuffers, real* __restrict__ energyBuffer, const real* __restrict__ labFrameDipole,
const real* __restrict__ labFrameQuadrupole, const real* __restrict__ inducedDipole_global, const real* __restrict__ inducedDipolePolar_global, const real* __restrict__ labFrameQuadrupole, const real* __restrict__ fracDipole, const real* __restrict__ fracQuadrupole,
const real* __restrict__ inducedDipole_global, const real* __restrict__ inducedDipolePolar_global,
const real* __restrict__ phi_global, const real* __restrict__ phid_global, const real* __restrict__ phip_global, const real* __restrict__ phi_global, const real* __restrict__ phid_global, const real* __restrict__ phip_global,
const real* __restrict__ phidp_global, real4 invPeriodicBoxSize) { const real* __restrict__ phidp_global, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
real multipole[10]; real multipole[10];
real inducedDipole[3]; real cinducedDipole[3], inducedDipole[3];
real inducedDipolePolar[3]; real cinducedDipolePolar[3], inducedDipolePolar[3];
real scales[3]; real scales[3];
const int deriv1[] = {1, 4, 7, 8, 10, 15, 17, 13, 14, 19}; const int deriv1[] = {1, 4, 7, 8, 10, 15, 17, 13, 14, 19};
const int deriv2[] = {2, 7, 5, 9, 13, 11, 18, 15, 19, 16}; const int deriv2[] = {2, 7, 5, 9, 13, 11, 18, 15, 19, 16};
const int deriv3[] = {3, 8, 9, 6, 14, 16, 12, 19, 17, 18}; const int deriv3[] = {3, 8, 9, 6, 14, 16, 12, 19, 17, 18};
const real xscale = GRID_SIZE_X*invPeriodicBoxSize.x; const real xscale = GRID_SIZE_X*recipBoxVecX.x;
const real yscale = GRID_SIZE_Y*invPeriodicBoxSize.y; const real yscale = GRID_SIZE_Y*recipBoxVecY.y;
const real zscale = GRID_SIZE_Z*invPeriodicBoxSize.z; const real zscale = GRID_SIZE_Z*recipBoxVecZ.z;
scales[0] = xscale; scales[0] = xscale;
scales[1] = yscale; scales[1] = yscale;
scales[2] = zscale; scales[2] = zscale;
real energy = 0; real energy = 0;
__shared__ real fracToCart[3][3];
if (threadIdx.x == 0) {
fracToCart[0][0] = GRID_SIZE_X*recipBoxVecX.x;
fracToCart[1][0] = GRID_SIZE_X*recipBoxVecY.x;
fracToCart[2][0] = GRID_SIZE_X*recipBoxVecZ.x;
fracToCart[0][1] = GRID_SIZE_Y*recipBoxVecX.y;
fracToCart[1][1] = GRID_SIZE_Y*recipBoxVecY.y;
fracToCart[2][1] = GRID_SIZE_Y*recipBoxVecZ.y;
fracToCart[0][2] = GRID_SIZE_Z*recipBoxVecX.z;
fracToCart[1][2] = GRID_SIZE_Z*recipBoxVecY.z;
fracToCart[2][2] = GRID_SIZE_Z*recipBoxVecZ.z;
}
__syncthreads();
for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) { for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
// Compute the torque. // Compute the torque.
...@@ -856,56 +933,62 @@ extern "C" __global__ void computeInducedDipoleForceAndEnergy(real4* __restrict_ ...@@ -856,56 +933,62 @@ extern "C" __global__ void computeInducedDipoleForceAndEnergy(real4* __restrict_
// Compute the force and energy. // Compute the force and energy.
multipole[1] *= xscale; multipole[1] = fracDipole[i*3];
multipole[2] *= yscale; multipole[2] = fracDipole[i*3+1];
multipole[3] *= zscale; multipole[3] = fracDipole[i*3+2];
multipole[4] *= xscale*xscale; multipole[4] = fracQuadrupole[i*6];
multipole[5] *= yscale*yscale; multipole[5] = fracQuadrupole[i*6+3];
multipole[6] *= zscale*zscale; multipole[6] = fracQuadrupole[i*6+5];
multipole[7] *= xscale*yscale; multipole[7] = fracQuadrupole[i*6+1];
multipole[8] *= xscale*zscale; multipole[8] = fracQuadrupole[i*6+2];
multipole[9] *= yscale*zscale; multipole[9] = fracQuadrupole[i*6+4];
inducedDipole[0] = inducedDipole_global[i*3]; cinducedDipole[0] = inducedDipole_global[i*3];
inducedDipole[1] = inducedDipole_global[i*3+1]; cinducedDipole[1] = inducedDipole_global[i*3+1];
inducedDipole[2] = inducedDipole_global[i*3+2]; cinducedDipole[2] = inducedDipole_global[i*3+2];
inducedDipolePolar[0] = inducedDipolePolar_global[i*3]; cinducedDipolePolar[0] = inducedDipolePolar_global[i*3];
inducedDipolePolar[1] = inducedDipolePolar_global[i*3+1]; cinducedDipolePolar[1] = inducedDipolePolar_global[i*3+1];
inducedDipolePolar[2] = inducedDipolePolar_global[i*3+2]; cinducedDipolePolar[2] = inducedDipolePolar_global[i*3+2];
// Multiply the dipoles by cartToFrac, which is just the transpose of fracToCart.
inducedDipole[0] = cinducedDipole[0]*fracToCart[0][0] + cinducedDipole[1]*fracToCart[1][0] + cinducedDipole[2]*fracToCart[2][0];
inducedDipole[1] = cinducedDipole[0]*fracToCart[0][1] + cinducedDipole[1]*fracToCart[1][1] + cinducedDipole[2]*fracToCart[2][1];
inducedDipole[2] = cinducedDipole[0]*fracToCart[0][2] + cinducedDipole[1]*fracToCart[1][2] + cinducedDipole[2]*fracToCart[2][2];
inducedDipolePolar[0] = cinducedDipolePolar[0]*fracToCart[0][0] + cinducedDipolePolar[1]*fracToCart[1][0] + cinducedDipolePolar[2]*fracToCart[2][0];
inducedDipolePolar[1] = cinducedDipolePolar[0]*fracToCart[0][1] + cinducedDipolePolar[1]*fracToCart[1][1] + cinducedDipolePolar[2]*fracToCart[2][1];
inducedDipolePolar[2] = cinducedDipolePolar[0]*fracToCart[0][2] + cinducedDipolePolar[1]*fracToCart[1][2] + cinducedDipolePolar[2]*fracToCart[2][2];
const real* phi = &phi_global[20*i]; const real* phi = &phi_global[20*i];
const real* phip = &phip_global[10*i]; const real* phip = &phip_global[10*i];
const real* phid = &phid_global[10*i]; const real* phid = &phid_global[10*i];
real4 f = make_real4(0, 0, 0, 0); real4 f = make_real4(0, 0, 0, 0);
energy += GRID_SIZE_X*invPeriodicBoxSize.x*inducedDipole[0]*phi[1]; energy += inducedDipole[0]*phi[1];
energy += GRID_SIZE_Y*invPeriodicBoxSize.y*inducedDipole[1]*phi[2]; energy += inducedDipole[1]*phi[2];
energy += GRID_SIZE_Z*invPeriodicBoxSize.z*inducedDipole[2]*phi[3]; energy += inducedDipole[2]*phi[3];
for (int k = 0; k < 3; k++) { for (int k = 0; k < 3; k++) {
int j1 = deriv1[k+1]; int j1 = deriv1[k+1];
int j2 = deriv2[k+1]; int j2 = deriv2[k+1];
int j3 = deriv3[k+1]; int j3 = deriv3[k+1];
f.x += (inducedDipole[k]+inducedDipolePolar[k])*phi[j1]*(scales[k]/xscale); f.x += (inducedDipole[k]+inducedDipolePolar[k])*phi[j1];
f.y += (inducedDipole[k]+inducedDipolePolar[k])*phi[j2]*(scales[k]/yscale); f.y += (inducedDipole[k]+inducedDipolePolar[k])*phi[j2];
f.z += (inducedDipole[k]+inducedDipolePolar[k])*phi[j3]*(scales[k]/zscale); f.z += (inducedDipole[k]+inducedDipolePolar[k])*phi[j3];
#ifndef DIRECT_POLARIZATION #ifndef DIRECT_POLARIZATION
f.x += (inducedDipole[k]*phip[j1] + inducedDipolePolar[k]*phid[j1])*(scales[k]/xscale); f.x += (inducedDipole[k]*phip[j1] + inducedDipolePolar[k]*phid[j1]);
f.y += (inducedDipole[k]*phip[j2] + inducedDipolePolar[k]*phid[j2])*(scales[k]/yscale); f.y += (inducedDipole[k]*phip[j2] + inducedDipolePolar[k]*phid[j2]);
f.z += (inducedDipole[k]*phip[j3] + inducedDipolePolar[k]*phid[j3])*(scales[k]/zscale); f.z += (inducedDipole[k]*phip[j3] + inducedDipolePolar[k]*phid[j3]);
#endif #endif
} }
f.x *= GRID_SIZE_X*invPeriodicBoxSize.x;
f.y *= GRID_SIZE_Y*invPeriodicBoxSize.y;
f.z *= GRID_SIZE_Z*invPeriodicBoxSize.z;
for (int k = 0; k < 10; k++) { for (int k = 0; k < 10; k++) {
f.x += multipole[k]*phidp[deriv1[k]]; f.x += multipole[k]*phidp[deriv1[k]];
f.y += multipole[k]*phidp[deriv2[k]]; f.y += multipole[k]*phidp[deriv2[k]];
f.z += multipole[k]*phidp[deriv3[k]]; f.z += multipole[k]*phidp[deriv3[k]];
} }
f.x *= 0.5f*EPSILON_FACTOR*xscale; f = make_real4(0.5f*EPSILON_FACTOR*(f.x*fracToCart[0][0] + f.y*fracToCart[0][1] + f.z*fracToCart[0][2]),
f.y *= 0.5f*EPSILON_FACTOR*yscale; 0.5f*EPSILON_FACTOR*(f.x*fracToCart[1][0] + f.y*fracToCart[1][1] + f.z*fracToCart[1][2]),
f.z *= 0.5f*EPSILON_FACTOR*zscale; 0.5f*EPSILON_FACTOR*(f.x*fracToCart[2][0] + f.y*fracToCart[2][1] + f.z*fracToCart[2][2]), 0);
forceBuffers[i] -= static_cast<unsigned long long>((long long) (f.x*0x100000000)); forceBuffers[i] -= static_cast<unsigned long long>((long long) (f.x*0x100000000));
forceBuffers[i+PADDED_NUM_ATOMS] -= static_cast<unsigned long long>((long long) (f.y*0x100000000)); forceBuffers[i+PADDED_NUM_ATOMS] -= static_cast<unsigned long long>((long long) (f.y*0x100000000));
forceBuffers[i+PADDED_NUM_ATOMS*2] -= static_cast<unsigned long long>((long long) (f.z*0x100000000)); forceBuffers[i+PADDED_NUM_ATOMS*2] -= static_cast<unsigned long long>((long long) (f.z*0x100000000));
...@@ -914,10 +997,10 @@ extern "C" __global__ void computeInducedDipoleForceAndEnergy(real4* __restrict_ ...@@ -914,10 +997,10 @@ extern "C" __global__ void computeInducedDipoleForceAndEnergy(real4* __restrict_
} }
extern "C" __global__ void recordInducedFieldDipoles(const real* __restrict__ phid, real* const __restrict__ phip, extern "C" __global__ void recordInducedFieldDipoles(const real* __restrict__ phid, real* const __restrict__ phip,
long long* __restrict__ inducedField, long long* __restrict__ inducedFieldPolar, real4 invPeriodicBoxSize) { long long* __restrict__ inducedField, long long* __restrict__ inducedFieldPolar, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
real xscale = GRID_SIZE_X*invPeriodicBoxSize.x*0x100000000; real xscale = GRID_SIZE_X*recipBoxVecX.x*0x100000000;
real yscale = GRID_SIZE_Y*invPeriodicBoxSize.y*0x100000000; real yscale = GRID_SIZE_Y*recipBoxVecY.y*0x100000000;
real zscale = GRID_SIZE_Z*invPeriodicBoxSize.z*0x100000000; real zscale = GRID_SIZE_Z*recipBoxVecZ.z*0x100000000;
for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) { for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
inducedField[i] -= (long long) (xscale*phid[10*i+1]); inducedField[i] -= (long long) (xscale*phid[10*i+1]);
inducedField[i+PADDED_NUM_ATOMS] -= (long long) (yscale*phid[10*i+2]); inducedField[i+PADDED_NUM_ATOMS] -= (long long) (yscale*phid[10*i+2]);
......
...@@ -448,7 +448,7 @@ extern "C" __global__ void mapTorqueToForce(unsigned long long* __restrict__ for ...@@ -448,7 +448,7 @@ extern "C" __global__ void mapTorqueToForce(unsigned long long* __restrict__ for
*/ */
extern "C" __global__ void computePotentialAtPoints(const real4* __restrict__ posq, const real* __restrict__ labFrameDipole, extern "C" __global__ void computePotentialAtPoints(const real4* __restrict__ posq, const real* __restrict__ labFrameDipole,
const real* __restrict__ labFrameQuadrupole, const real* __restrict__ inducedDipole, const real4* __restrict__ points, const real* __restrict__ labFrameQuadrupole, const real* __restrict__ inducedDipole, const real4* __restrict__ points,
real* __restrict__ potential, int numPoints, real4 periodicBoxSize, real4 invPeriodicBoxSize) { real* __restrict__ potential, int numPoints, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) {
extern __shared__ real4 localPosq[]; extern __shared__ real4 localPosq[];
real3* localDipole = (real3*) &localPosq[blockDim.x]; real3* localDipole = (real3*) &localPosq[blockDim.x];
real3* localInducedDipole = (real3*) &localDipole[blockDim.x]; real3* localInducedDipole = (real3*) &localDipole[blockDim.x];
...@@ -481,9 +481,7 @@ extern "C" __global__ void computePotentialAtPoints(const real4* __restrict__ po ...@@ -481,9 +481,7 @@ extern "C" __global__ void computePotentialAtPoints(const real4* __restrict__ po
for (int i = 0; i < end; i++) { for (int i = 0; i < end; i++) {
real3 delta = trimTo3(localPosq[i]-pointPos); real3 delta = trimTo3(localPosq[i]-pointPos);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; APPLY_PERIODIC_TO_DELTA(delta)
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif #endif
real r2 = dot(delta, delta); real r2 = dot(delta, delta);
real rInv = RSQRT(r2); real rInv = RSQRT(r2);
......
...@@ -65,7 +65,7 @@ __device__ float computePScaleFactor(uint2 covalent, unsigned int polarizationGr ...@@ -65,7 +65,7 @@ __device__ float computePScaleFactor(uint2 covalent, unsigned int polarizationGr
} }
__device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, bool hasExclusions, float dScale, float pScale, float mScale, float forceFactor, __device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, bool hasExclusions, float dScale, float pScale, float mScale, float forceFactor,
real& energy, real4 periodicBoxSize, real4 invPeriodicBoxSize) { real& energy, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) {
real4 delta; real4 delta;
delta.x = atom2.pos.x - atom1.pos.x; delta.x = atom2.pos.x - atom1.pos.x;
delta.y = atom2.pos.y - atom1.pos.y; delta.y = atom2.pos.y - atom1.pos.y;
...@@ -73,10 +73,7 @@ __device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, bool has ...@@ -73,10 +73,7 @@ __device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, bool has
// periodic box // periodic box
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; APPLY_PERIODIC_TO_DELTA(delta)
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
delta.w = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; delta.w = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
if (delta.w > CUTOFF_SQUARED) if (delta.w > CUTOFF_SQUARED)
return; return;
...@@ -203,7 +200,9 @@ extern "C" __global__ void computeElectrostatics( ...@@ -203,7 +200,9 @@ extern "C" __global__ void computeElectrostatics(
const real4* __restrict__ posq, const uint2* __restrict__ covalentFlags, const unsigned int* __restrict__ polarizationGroupFlags, const real4* __restrict__ posq, const uint2* __restrict__ covalentFlags, const unsigned int* __restrict__ polarizationGroupFlags,
const ushort2* __restrict__ exclusionTiles, unsigned int startTileIndex, unsigned int numTileIndices, const ushort2* __restrict__ exclusionTiles, unsigned int startTileIndex, unsigned int numTileIndices,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const real4* __restrict__ blockCenter, const unsigned int* __restrict__ interactingAtoms, const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, const real4* __restrict__ blockCenter,
const unsigned int* __restrict__ interactingAtoms,
#endif #endif
const real* __restrict__ labFrameDipole, const real* __restrict__ labFrameQuadrupole, const real* __restrict__ inducedDipole, const real* __restrict__ labFrameDipole, const real* __restrict__ labFrameQuadrupole, const real* __restrict__ inducedDipole,
const real* __restrict__ inducedDipolePolar, const float2* __restrict__ dampingAndThole) { const real* __restrict__ inducedDipolePolar, const float2* __restrict__ dampingAndThole) {
...@@ -255,7 +254,7 @@ extern "C" __global__ void computeElectrostatics( ...@@ -255,7 +254,7 @@ extern "C" __global__ void computeElectrostatics(
float d = computeDScaleFactor(polarizationGroup, j); float d = computeDScaleFactor(polarizationGroup, j);
float p = computePScaleFactor(covalent, polarizationGroup, j); float p = computePScaleFactor(covalent, polarizationGroup, j);
float m = computeMScaleFactor(covalent, j); float m = computeMScaleFactor(covalent, j);
computeOneInteraction(data, localData[tbx+j], true, d, p, m, 0.5f, energy, periodicBoxSize, invPeriodicBoxSize); computeOneInteraction(data, localData[tbx+j], true, d, p, m, 0.5f, energy, periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ);
} }
} }
if (atom1 < NUM_ATOMS) if (atom1 < NUM_ATOMS)
...@@ -283,7 +282,7 @@ extern "C" __global__ void computeElectrostatics( ...@@ -283,7 +282,7 @@ extern "C" __global__ void computeElectrostatics(
float d = computeDScaleFactor(polarizationGroup, tj); float d = computeDScaleFactor(polarizationGroup, tj);
float p = computePScaleFactor(covalent, polarizationGroup, tj); float p = computePScaleFactor(covalent, polarizationGroup, tj);
float m = computeMScaleFactor(covalent, tj); float m = computeMScaleFactor(covalent, tj);
computeOneInteraction(data, localData[tbx+tj], true, d, p, m, 1, energy, periodicBoxSize, invPeriodicBoxSize); computeOneInteraction(data, localData[tbx+tj], true, d, p, m, 1, energy, periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ);
} }
tj = (tj + 1) & (TILE_SIZE - 1); tj = (tj + 1) & (TILE_SIZE - 1);
} }
...@@ -386,7 +385,7 @@ extern "C" __global__ void computeElectrostatics( ...@@ -386,7 +385,7 @@ extern "C" __global__ void computeElectrostatics(
for (j = 0; j < TILE_SIZE; j++) { for (j = 0; j < TILE_SIZE; j++) {
int atom2 = atomIndices[tbx+tj]; int atom2 = atomIndices[tbx+tj];
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) { if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
computeOneInteraction(data, localData[tbx+tj], false, 1, 1, 1, 1, energy, periodicBoxSize, invPeriodicBoxSize); computeOneInteraction(data, localData[tbx+tj], false, 1, 1, 1, 1, energy, periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ);
} }
tj = (tj + 1) & (TILE_SIZE - 1); tj = (tj + 1) & (TILE_SIZE - 1);
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment