Commit de42858f authored by peastman's avatar peastman
Browse files

Bug fixes

parent 402e01b2
...@@ -1668,7 +1668,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon ...@@ -1668,7 +1668,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
cosSinSums.initialize(cu, (2*kmaxx-1)*(2*kmaxy-1)*(2*kmaxz-1), elementSize, "cosSinSums"); cosSinSums.initialize(cu, (2*kmaxx-1)*(2*kmaxy-1)*(2*kmaxz-1), elementSize, "cosSinSums");
} }
} }
else if ((nonbondedMethod == PME && hasCoulomb) || doLJPME) { else if (((nonbondedMethod == PME || nonbondedMethod == LJPME) && hasCoulomb) || doLJPME) {
// Compute the PME parameters. // Compute the PME parameters.
NonbondedForceImpl::calcPMEParameters(system, force, alpha, gridSizeX, gridSizeY, gridSizeZ, false); NonbondedForceImpl::calcPMEParameters(system, force, alpha, gridSizeX, gridSizeY, gridSizeZ, false);
...@@ -1934,37 +1934,34 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon ...@@ -1934,37 +1934,34 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
string source = cu.replaceStrings(CudaKernelSources::coulombLennardJones, defines); string source = cu.replaceStrings(CudaKernelSources::coulombLennardJones, defines);
charges.initialize(cu, cu.getPaddedNumAtoms(), cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float), "charges"); charges.initialize(cu, cu.getPaddedNumAtoms(), cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float), "charges");
if (hasCoulomb) { map<string, string> replacements;
map<string, string> replacements; if (usePosqCharges) {
if (usePosqCharges) { cu.setCharges(chargeVec);
cu.setCharges(chargeVec); replacements["CHARGE1"] = "posq1.w";
replacements["CHARGE1"] = "posq1.w"; replacements["CHARGE2"] = "posq2.w";
replacements["CHARGE2"] = "posq2.w"; }
} else {
if (cu.getUseDoublePrecision())
charges.upload(chargeVec);
else { else {
if (cu.getUseDoublePrecision()) vector<float> c(charges.getSize());
charges.upload(chargeVec); for (int i = 0; i < c.size(); i++)
else { c[i] = (float) chargeVec[i];
vector<float> c(charges.getSize()); charges.upload(c);
for (int i = 0; i < c.size(); i++)
c[i] = (float) chargeVec[i];
charges.upload(c);
}
replacements["CHARGE1"] = prefix+"charge1";
replacements["CHARGE2"] = prefix+"charge2";
} }
source = cu.replaceStrings(source, replacements); replacements["CHARGE1"] = prefix+"charge1";
cu.getNonbondedUtilities().addParameter(CudaNonbondedUtilities::ParameterInfo(prefix+"charge", "real", 1, charges.getElementSize(), charges.getDevicePointer())); replacements["CHARGE2"] = prefix+"charge2";
} }
if (hasCoulomb)
cu.getNonbondedUtilities().addParameter(CudaNonbondedUtilities::ParameterInfo(prefix+"charge", "real", 1, charges.getElementSize(), charges.getDevicePointer()));
if (hasLJ) { if (hasLJ) {
sigmaEpsilon.initialize<float2>(cu, cu.getPaddedNumAtoms(), "sigmaEpsilon"); sigmaEpsilon.initialize<float2>(cu, cu.getPaddedNumAtoms(), "sigmaEpsilon");
sigmaEpsilon.upload(sigmaEpsilonVector); sigmaEpsilon.upload(sigmaEpsilonVector);
map<string, string> replacements;
replacements["SIGMA_EPSILON1"] = prefix+"sigmaEpsilon1"; replacements["SIGMA_EPSILON1"] = prefix+"sigmaEpsilon1";
replacements["SIGMA_EPSILON2"] = prefix+"sigmaEpsilon2"; replacements["SIGMA_EPSILON2"] = prefix+"sigmaEpsilon2";
source = cu.replaceStrings(source, replacements);
cu.getNonbondedUtilities().addParameter(CudaNonbondedUtilities::ParameterInfo(prefix+"sigmaEpsilon", "float", 2, sizeof(float2), sigmaEpsilon.getDevicePointer())); cu.getNonbondedUtilities().addParameter(CudaNonbondedUtilities::ParameterInfo(prefix+"sigmaEpsilon", "float", 2, sizeof(float2), sigmaEpsilon.getDevicePointer()));
} }
source = cu.replaceStrings(source, replacements);
cu.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, true, force.getCutoffDistance(), exclusionList, source, force.getForceGroup(), true); cu.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, true, force.getCutoffDistance(), exclusionList, source, force.getForceGroup(), true);
// Initialize the exceptions. // Initialize the exceptions.
...@@ -2032,65 +2029,67 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF ...@@ -2032,65 +2029,67 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
// Execute the reciprocal space kernels. // Execute the reciprocal space kernels.
void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomGridIndex.getDevicePointer(), cu.getPeriodicBoxSizePointer(), if (hasCoulomb) {
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(), void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomGridIndex.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]}; cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
cu.executeKernel(pmeGridIndexKernel, gridIndexArgs, cu.getNumAtoms()); recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeGridIndexKernel, gridIndexArgs, cu.getNumAtoms());
sort->sort(pmeAtomGridIndex); sort->sort(pmeAtomGridIndex);
void* spreadArgs[] = {&cu.getPosq().getDevicePointer(), &directPmeGrid.getDevicePointer(), cu.getPeriodicBoxSizePointer(), void* spreadArgs[] = {&cu.getPosq().getDevicePointer(), &directPmeGrid.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(), cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(),
&charges.getDevicePointer()}; &charges.getDevicePointer()};
cu.executeKernel(pmeSpreadChargeKernel, spreadArgs, cu.getNumAtoms(), 128); cu.executeKernel(pmeSpreadChargeKernel, spreadArgs, cu.getNumAtoms(), 128);
if (cu.getUseDoublePrecision() || cu.getComputeCapability() < 2.0 || cu.getPlatformData().deterministicForces) { if (cu.getUseDoublePrecision() || cu.getComputeCapability() < 2.0 || cu.getPlatformData().deterministicForces) {
void* finishSpreadArgs[] = {&directPmeGrid.getDevicePointer()}; void* finishSpreadArgs[] = {&directPmeGrid.getDevicePointer()};
cu.executeKernel(pmeFinishSpreadChargeKernel, finishSpreadArgs, gridSizeX*gridSizeY*gridSizeZ, 256); cu.executeKernel(pmeFinishSpreadChargeKernel, finishSpreadArgs, gridSizeX*gridSizeY*gridSizeZ, 256);
} }
if (useCudaFFT) { if (useCudaFFT) {
if (cu.getUseDoublePrecision()) if (cu.getUseDoublePrecision())
cufftExecD2Z(fftForward, (double*) directPmeGrid.getDevicePointer(), (double2*) reciprocalPmeGrid.getDevicePointer()); cufftExecD2Z(fftForward, (double*) directPmeGrid.getDevicePointer(), (double2*) reciprocalPmeGrid.getDevicePointer());
else else
cufftExecR2C(fftForward, (float*) directPmeGrid.getDevicePointer(), (float2*) reciprocalPmeGrid.getDevicePointer()); cufftExecR2C(fftForward, (float*) directPmeGrid.getDevicePointer(), (float2*) reciprocalPmeGrid.getDevicePointer());
} }
else { else {
fft->execFFT(directPmeGrid, reciprocalPmeGrid, true); fft->execFFT(directPmeGrid, reciprocalPmeGrid, true);
} }
if (includeEnergy) { if (includeEnergy) {
void* computeEnergyArgs[] = {&reciprocalPmeGrid.getDevicePointer(), usePmeStream ? &pmeEnergyBuffer.getDevicePointer() : &cu.getEnergyBuffer().getDevicePointer(), void* computeEnergyArgs[] = {&reciprocalPmeGrid.getDevicePointer(), usePmeStream ? &pmeEnergyBuffer.getDevicePointer() : &cu.getEnergyBuffer().getDevicePointer(),
&pmeBsplineModuliX.getDevicePointer(), &pmeBsplineModuliY.getDevicePointer(), &pmeBsplineModuliZ.getDevicePointer(),
cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeEvalEnergyKernel, computeEnergyArgs, gridSizeX*gridSizeY*gridSizeZ);
}
void* convolutionArgs[] = {&reciprocalPmeGrid.getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(),
&pmeBsplineModuliX.getDevicePointer(), &pmeBsplineModuliY.getDevicePointer(), &pmeBsplineModuliZ.getDevicePointer(), &pmeBsplineModuliX.getDevicePointer(), &pmeBsplineModuliY.getDevicePointer(), &pmeBsplineModuliZ.getDevicePointer(),
cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]}; cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeEvalEnergyKernel, computeEnergyArgs, gridSizeX*gridSizeY*gridSizeZ); cu.executeKernel(pmeConvolutionKernel, convolutionArgs, gridSizeX*gridSizeY*gridSizeZ, 256);
}
void* convolutionArgs[] = {&reciprocalPmeGrid.getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(), if (useCudaFFT) {
&pmeBsplineModuliX.getDevicePointer(), &pmeBsplineModuliY.getDevicePointer(), &pmeBsplineModuliZ.getDevicePointer(), if (cu.getUseDoublePrecision())
cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]}; cufftExecZ2D(fftBackward, (double2*) reciprocalPmeGrid.getDevicePointer(), (double*) directPmeGrid.getDevicePointer());
cu.executeKernel(pmeConvolutionKernel, convolutionArgs, gridSizeX*gridSizeY*gridSizeZ, 256); else
cufftExecC2R(fftBackward, (float2*) reciprocalPmeGrid.getDevicePointer(), (float*) directPmeGrid.getDevicePointer());
}
else {
fft->execFFT(reciprocalPmeGrid, directPmeGrid, false);
}
if (useCudaFFT) { void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &directPmeGrid.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
if (cu.getUseDoublePrecision()) cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
cufftExecZ2D(fftBackward, (double2*) reciprocalPmeGrid.getDevicePointer(), (double*) directPmeGrid.getDevicePointer()); recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(),
else &charges.getDevicePointer()};
cufftExecC2R(fftBackward, (float2*) reciprocalPmeGrid.getDevicePointer(), (float*) directPmeGrid.getDevicePointer()); cu.executeKernel(pmeInterpolateForceKernel, interpolateArgs, cu.getNumAtoms(), 128);
}
else {
fft->execFFT(reciprocalPmeGrid, directPmeGrid, false);
} }
void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &directPmeGrid.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(),
&charges.getDevicePointer()};
cu.executeKernel(pmeInterpolateForceKernel, interpolateArgs, cu.getNumAtoms(), 128);
// As written, we check only the Electrostatic grid pointer to get here. We could separate them out, but for // As written, we check only the Electrostatic grid pointer to get here. We could separate them out, but for
// now we assume that LJPME can only be used if electrostatic PME is also active. // now we assume that LJPME can only be used if electrostatic PME is also active.
if (doLJPME) { if (doLJPME && hasLJ) {
void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomGridIndex.getDevicePointer(), cu.getPeriodicBoxSizePointer(), void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomGridIndex.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(), cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]}; recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
......
...@@ -1657,7 +1657,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb ...@@ -1657,7 +1657,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
cosSinSums.initialize(cl, (2*kmaxx-1)*(2*kmaxy-1)*(2*kmaxz-1), elementSize, "cosSinSums"); cosSinSums.initialize(cl, (2*kmaxx-1)*(2*kmaxy-1)*(2*kmaxz-1), elementSize, "cosSinSums");
} }
} }
else if ((nonbondedMethod == PME && hasCoulomb) || doLJPME) { else if (((nonbondedMethod == PME || nonbondedMethod == LJPME) && hasCoulomb) || doLJPME) {
// Compute the PME parameters. // Compute the PME parameters.
NonbondedForceImpl::calcPMEParameters(system, force, alpha, gridSizeX, gridSizeY, gridSizeZ, false); NonbondedForceImpl::calcPMEParameters(system, force, alpha, gridSizeX, gridSizeY, gridSizeZ, false);
...@@ -1865,37 +1865,34 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb ...@@ -1865,37 +1865,34 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
string source = cl.replaceStrings(OpenCLKernelSources::coulombLennardJones, defines); string source = cl.replaceStrings(OpenCLKernelSources::coulombLennardJones, defines);
charges.initialize(cl, cl.getPaddedNumAtoms(), cl.getUseDoublePrecision() ? sizeof(double) : sizeof(float), "charges"); charges.initialize(cl, cl.getPaddedNumAtoms(), cl.getUseDoublePrecision() ? sizeof(double) : sizeof(float), "charges");
if (hasCoulomb) { map<string, string> replacements;
map<string, string> replacements; if (usePosqCharges) {
if (usePosqCharges) { cl.setCharges(chargeVec);
cl.setCharges(chargeVec); replacements["CHARGE1"] = "posq1.w";
replacements["CHARGE1"] = "posq1.w"; replacements["CHARGE2"] = "posq2.w";
replacements["CHARGE2"] = "posq2.w"; }
} else {
if (cl.getUseDoublePrecision())
charges.upload(chargeVec);
else { else {
if (cl.getUseDoublePrecision()) vector<float> c(charges.getSize());
charges.upload(chargeVec); for (int i = 0; i < c.size(); i++)
else { c[i] = (float) chargeVec[i];
vector<float> c(charges.getSize()); charges.upload(c);
for (int i = 0; i < c.size(); i++)
c[i] = (float) chargeVec[i];
charges.upload(c);
}
replacements["CHARGE1"] = prefix+"charge1";
replacements["CHARGE2"] = prefix+"charge2";
} }
source = cl.replaceStrings(source, replacements); replacements["CHARGE1"] = prefix+"charge1";
cl.getNonbondedUtilities().addParameter(OpenCLNonbondedUtilities::ParameterInfo(prefix+"charge", "real", 1, charges.getElementSize(), charges.getDeviceBuffer())); replacements["CHARGE2"] = prefix+"charge2";
} }
if (hasCoulomb)
cl.getNonbondedUtilities().addParameter(OpenCLNonbondedUtilities::ParameterInfo(prefix+"charge", "real", 1, charges.getElementSize(), charges.getDeviceBuffer()));
if (hasLJ) { if (hasLJ) {
sigmaEpsilon.initialize<mm_float2>(cl, cl.getPaddedNumAtoms(), "sigmaEpsilon"); sigmaEpsilon.initialize<mm_float2>(cl, cl.getPaddedNumAtoms(), "sigmaEpsilon");
sigmaEpsilon.upload(sigmaEpsilonVector); sigmaEpsilon.upload(sigmaEpsilonVector);
map<string, string> replacements;
replacements["SIGMA_EPSILON1"] = prefix+"sigmaEpsilon1"; replacements["SIGMA_EPSILON1"] = prefix+"sigmaEpsilon1";
replacements["SIGMA_EPSILON2"] = prefix+"sigmaEpsilon2"; replacements["SIGMA_EPSILON2"] = prefix+"sigmaEpsilon2";
source = cl.replaceStrings(source, replacements);
cl.getNonbondedUtilities().addParameter(OpenCLNonbondedUtilities::ParameterInfo(prefix+"sigmaEpsilon", "float", 2, sizeof(cl_float2), sigmaEpsilon.getDeviceBuffer())); cl.getNonbondedUtilities().addParameter(OpenCLNonbondedUtilities::ParameterInfo(prefix+"sigmaEpsilon", "float", 2, sizeof(cl_float2), sigmaEpsilon.getDeviceBuffer()));
} }
source = cl.replaceStrings(source, replacements);
cl.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, true, force.getCutoffDistance(), exclusionList, source, force.getForceGroup()); cl.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, true, force.getCutoffDistance(), exclusionList, source, force.getForceGroup());
// Initialize the exceptions. // Initialize the exceptions.
...@@ -2096,35 +2093,20 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ ...@@ -2096,35 +2093,20 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
// Execute the reciprocal space kernels. // Execute the reciprocal space kernels.
setPeriodicBoxArgs(cl, pmeUpdateBsplinesKernel, 4); if (hasCoulomb) {
if (cl.getUseDoublePrecision()) { setPeriodicBoxArgs(cl, pmeUpdateBsplinesKernel, 4);
pmeUpdateBsplinesKernel.setArg<mm_double4>(9, recipBoxVectors[0]);
pmeUpdateBsplinesKernel.setArg<mm_double4>(10, recipBoxVectors[1]);
pmeUpdateBsplinesKernel.setArg<mm_double4>(11, recipBoxVectors[2]);
}
else {
pmeUpdateBsplinesKernel.setArg<mm_float4>(9, recipBoxVectorsFloat[0]);
pmeUpdateBsplinesKernel.setArg<mm_float4>(10, recipBoxVectorsFloat[1]);
pmeUpdateBsplinesKernel.setArg<mm_float4>(11, recipBoxVectorsFloat[2]);
}
cl.executeKernel(pmeUpdateBsplinesKernel, cl.getNumAtoms());
if (deviceIsCpu && !cl.getSupports64BitGlobalAtomics()) {
setPeriodicBoxArgs(cl, pmeSpreadChargeKernel, 5);
if (cl.getUseDoublePrecision()) { if (cl.getUseDoublePrecision()) {
pmeSpreadChargeKernel.setArg<mm_double4>(10, recipBoxVectors[0]); pmeUpdateBsplinesKernel.setArg<mm_double4>(9, recipBoxVectors[0]);
pmeSpreadChargeKernel.setArg<mm_double4>(11, recipBoxVectors[1]); pmeUpdateBsplinesKernel.setArg<mm_double4>(10, recipBoxVectors[1]);
pmeSpreadChargeKernel.setArg<mm_double4>(12, recipBoxVectors[2]); pmeUpdateBsplinesKernel.setArg<mm_double4>(11, recipBoxVectors[2]);
} }
else { else {
pmeSpreadChargeKernel.setArg<mm_float4>(10, recipBoxVectorsFloat[0]); pmeUpdateBsplinesKernel.setArg<mm_float4>(9, recipBoxVectorsFloat[0]);
pmeSpreadChargeKernel.setArg<mm_float4>(11, recipBoxVectorsFloat[1]); pmeUpdateBsplinesKernel.setArg<mm_float4>(10, recipBoxVectorsFloat[1]);
pmeSpreadChargeKernel.setArg<mm_float4>(12, recipBoxVectorsFloat[2]); pmeUpdateBsplinesKernel.setArg<mm_float4>(11, recipBoxVectorsFloat[2]);
} }
cl.executeKernel(pmeSpreadChargeKernel, 2*cl.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>(), 1); cl.executeKernel(pmeUpdateBsplinesKernel, cl.getNumAtoms());
} if (deviceIsCpu && !cl.getSupports64BitGlobalAtomics()) {
else {
sort->sort(pmeAtomGridIndex);
if (cl.getSupports64BitGlobalAtomics()) {
setPeriodicBoxArgs(cl, pmeSpreadChargeKernel, 5); setPeriodicBoxArgs(cl, pmeSpreadChargeKernel, 5);
if (cl.getUseDoublePrecision()) { if (cl.getUseDoublePrecision()) {
pmeSpreadChargeKernel.setArg<mm_double4>(10, recipBoxVectors[0]); pmeSpreadChargeKernel.setArg<mm_double4>(10, recipBoxVectors[0]);
...@@ -2136,59 +2118,76 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ ...@@ -2136,59 +2118,76 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
pmeSpreadChargeKernel.setArg<mm_float4>(11, recipBoxVectorsFloat[1]); pmeSpreadChargeKernel.setArg<mm_float4>(11, recipBoxVectorsFloat[1]);
pmeSpreadChargeKernel.setArg<mm_float4>(12, recipBoxVectorsFloat[2]); pmeSpreadChargeKernel.setArg<mm_float4>(12, recipBoxVectorsFloat[2]);
} }
cl.executeKernel(pmeSpreadChargeKernel, cl.getNumAtoms()); cl.executeKernel(pmeSpreadChargeKernel, 2*cl.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>(), 1);
cl.executeKernel(pmeFinishSpreadChargeKernel, gridSizeX*gridSizeY*gridSizeZ);
} }
else { else {
cl.executeKernel(pmeAtomRangeKernel, cl.getNumAtoms()); sort->sort(pmeAtomGridIndex);
setPeriodicBoxSizeArg(cl, pmeZIndexKernel, 2); if (cl.getSupports64BitGlobalAtomics()) {
if (cl.getUseDoublePrecision()) setPeriodicBoxArgs(cl, pmeSpreadChargeKernel, 5);
pmeZIndexKernel.setArg<mm_double4>(3, recipBoxVectors[2]); if (cl.getUseDoublePrecision()) {
else pmeSpreadChargeKernel.setArg<mm_double4>(10, recipBoxVectors[0]);
pmeZIndexKernel.setArg<mm_float4>(3, recipBoxVectorsFloat[2]); pmeSpreadChargeKernel.setArg<mm_double4>(11, recipBoxVectors[1]);
cl.executeKernel(pmeZIndexKernel, cl.getNumAtoms()); pmeSpreadChargeKernel.setArg<mm_double4>(12, recipBoxVectors[2]);
cl.executeKernel(pmeSpreadChargeKernel, cl.getNumAtoms()); }
else {
pmeSpreadChargeKernel.setArg<mm_float4>(10, recipBoxVectorsFloat[0]);
pmeSpreadChargeKernel.setArg<mm_float4>(11, recipBoxVectorsFloat[1]);
pmeSpreadChargeKernel.setArg<mm_float4>(12, recipBoxVectorsFloat[2]);
}
cl.executeKernel(pmeSpreadChargeKernel, cl.getNumAtoms());
cl.executeKernel(pmeFinishSpreadChargeKernel, gridSizeX*gridSizeY*gridSizeZ);
}
else {
cl.executeKernel(pmeAtomRangeKernel, cl.getNumAtoms());
setPeriodicBoxSizeArg(cl, pmeZIndexKernel, 2);
if (cl.getUseDoublePrecision())
pmeZIndexKernel.setArg<mm_double4>(3, recipBoxVectors[2]);
else
pmeZIndexKernel.setArg<mm_float4>(3, recipBoxVectorsFloat[2]);
cl.executeKernel(pmeZIndexKernel, cl.getNumAtoms());
cl.executeKernel(pmeSpreadChargeKernel, cl.getNumAtoms());
}
} }
fft->execFFT(pmeGrid, pmeGrid2, true);
mm_double4 boxSize = cl.getPeriodicBoxSizeDouble();
if (cl.getUseDoublePrecision()) {
pmeConvolutionKernel.setArg<mm_double4>(4, recipBoxVectors[0]);
pmeConvolutionKernel.setArg<mm_double4>(5, recipBoxVectors[1]);
pmeConvolutionKernel.setArg<mm_double4>(6, recipBoxVectors[2]);
pmeEvalEnergyKernel.setArg<mm_double4>(5, recipBoxVectors[0]);
pmeEvalEnergyKernel.setArg<mm_double4>(6, recipBoxVectors[1]);
pmeEvalEnergyKernel.setArg<mm_double4>(7, recipBoxVectors[2]);
}
else {
pmeConvolutionKernel.setArg<mm_float4>(4, recipBoxVectorsFloat[0]);
pmeConvolutionKernel.setArg<mm_float4>(5, recipBoxVectorsFloat[1]);
pmeConvolutionKernel.setArg<mm_float4>(6, recipBoxVectorsFloat[2]);
pmeEvalEnergyKernel.setArg<mm_float4>(5, recipBoxVectorsFloat[0]);
pmeEvalEnergyKernel.setArg<mm_float4>(6, recipBoxVectorsFloat[1]);
pmeEvalEnergyKernel.setArg<mm_float4>(7, recipBoxVectorsFloat[2]);
}
if (includeEnergy)
cl.executeKernel(pmeEvalEnergyKernel, gridSizeX*gridSizeY*gridSizeZ);
cl.executeKernel(pmeConvolutionKernel, gridSizeX*gridSizeY*gridSizeZ);
fft->execFFT(pmeGrid2, pmeGrid, false);
setPeriodicBoxArgs(cl, pmeInterpolateForceKernel, 3);
if (cl.getUseDoublePrecision()) {
pmeInterpolateForceKernel.setArg<mm_double4>(8, recipBoxVectors[0]);
pmeInterpolateForceKernel.setArg<mm_double4>(9, recipBoxVectors[1]);
pmeInterpolateForceKernel.setArg<mm_double4>(10, recipBoxVectors[2]);
}
else {
pmeInterpolateForceKernel.setArg<mm_float4>(8, recipBoxVectorsFloat[0]);
pmeInterpolateForceKernel.setArg<mm_float4>(9, recipBoxVectorsFloat[1]);
pmeInterpolateForceKernel.setArg<mm_float4>(10, recipBoxVectorsFloat[2]);
}
if (deviceIsCpu)
cl.executeKernel(pmeInterpolateForceKernel, 2*cl.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>(), 1);
else
cl.executeKernel(pmeInterpolateForceKernel, cl.getNumAtoms());
} }
fft->execFFT(pmeGrid, pmeGrid2, true);
mm_double4 boxSize = cl.getPeriodicBoxSizeDouble();
if (cl.getUseDoublePrecision()) {
pmeConvolutionKernel.setArg<mm_double4>(4, recipBoxVectors[0]);
pmeConvolutionKernel.setArg<mm_double4>(5, recipBoxVectors[1]);
pmeConvolutionKernel.setArg<mm_double4>(6, recipBoxVectors[2]);
pmeEvalEnergyKernel.setArg<mm_double4>(5, recipBoxVectors[0]);
pmeEvalEnergyKernel.setArg<mm_double4>(6, recipBoxVectors[1]);
pmeEvalEnergyKernel.setArg<mm_double4>(7, recipBoxVectors[2]);
}
else {
pmeConvolutionKernel.setArg<mm_float4>(4, recipBoxVectorsFloat[0]);
pmeConvolutionKernel.setArg<mm_float4>(5, recipBoxVectorsFloat[1]);
pmeConvolutionKernel.setArg<mm_float4>(6, recipBoxVectorsFloat[2]);
pmeEvalEnergyKernel.setArg<mm_float4>(5, recipBoxVectorsFloat[0]);
pmeEvalEnergyKernel.setArg<mm_float4>(6, recipBoxVectorsFloat[1]);
pmeEvalEnergyKernel.setArg<mm_float4>(7, recipBoxVectorsFloat[2]);
}
if (includeEnergy)
cl.executeKernel(pmeEvalEnergyKernel, gridSizeX*gridSizeY*gridSizeZ);
cl.executeKernel(pmeConvolutionKernel, gridSizeX*gridSizeY*gridSizeZ);
fft->execFFT(pmeGrid2, pmeGrid, false);
setPeriodicBoxArgs(cl, pmeInterpolateForceKernel, 3);
if (cl.getUseDoublePrecision()) {
pmeInterpolateForceKernel.setArg<mm_double4>(8, recipBoxVectors[0]);
pmeInterpolateForceKernel.setArg<mm_double4>(9, recipBoxVectors[1]);
pmeInterpolateForceKernel.setArg<mm_double4>(10, recipBoxVectors[2]);
}
else {
pmeInterpolateForceKernel.setArg<mm_float4>(8, recipBoxVectorsFloat[0]);
pmeInterpolateForceKernel.setArg<mm_float4>(9, recipBoxVectorsFloat[1]);
pmeInterpolateForceKernel.setArg<mm_float4>(10, recipBoxVectorsFloat[2]);
}
if (deviceIsCpu)
cl.executeKernel(pmeInterpolateForceKernel, 2*cl.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>(), 1);
else
cl.executeKernel(pmeInterpolateForceKernel, cl.getNumAtoms());
if (doLJPME) { if (doLJPME && hasLJ) {
setPeriodicBoxArgs(cl, pmeDispersionUpdateBsplinesKernel, 4); setPeriodicBoxArgs(cl, pmeDispersionUpdateBsplinesKernel, 4);
if (cl.getUseDoublePrecision()) { if (cl.getUseDoublePrecision()) {
pmeDispersionUpdateBsplinesKernel.setArg<mm_double4>(9, recipBoxVectors[0]); pmeDispersionUpdateBsplinesKernel.setArg<mm_double4>(9, recipBoxVectors[0]);
...@@ -2267,7 +2266,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ ...@@ -2267,7 +2266,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
if (includeEnergy) if (includeEnergy)
cl.executeKernel(pmeDispersionEvalEnergyKernel, gridSizeX*gridSizeY*gridSizeZ); cl.executeKernel(pmeDispersionEvalEnergyKernel, gridSizeX*gridSizeY*gridSizeZ);
cl.executeKernel(pmeDispersionConvolutionKernel, gridSizeX*gridSizeY*gridSizeZ); cl.executeKernel(pmeDispersionConvolutionKernel, gridSizeX*gridSizeY*gridSizeZ);
fft->execFFT(pmeGrid2, pmeGrid, false); dispersionFft->execFFT(pmeGrid2, pmeGrid, false);
setPeriodicBoxArgs(cl, pmeDispersionInterpolateForceKernel, 3); setPeriodicBoxArgs(cl, pmeDispersionInterpolateForceKernel, 3);
if (cl.getUseDoublePrecision()) { if (cl.getUseDoublePrecision()) {
pmeDispersionInterpolateForceKernel.setArg<mm_double4>(8, recipBoxVectors[0]); pmeDispersionInterpolateForceKernel.setArg<mm_double4>(8, recipBoxVectors[0]);
......
...@@ -18,7 +18,7 @@ __kernel void computeBornSum( ...@@ -18,7 +18,7 @@ __kernel void computeBornSum(
#else #else
__global real* restrict global_bornSum, __global real* restrict global_bornSum,
#endif #endif
__global const real4* restrict posq, , __global const real* restrict charge, __global const float2* restrict global_params, __global const real4* restrict posq, __global const real* restrict charge, __global const float2* restrict global_params,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, __global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, __global const real4* restrict blockCenter, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, __global const real4* restrict blockCenter,
......
...@@ -1269,7 +1269,7 @@ void testWater2DpmeEnergiesForcesWithExclusions() { ...@@ -1269,7 +1269,7 @@ void testWater2DpmeEnergiesForcesWithExclusions() {
const vector<Vec3>& forces = state.getForces(); const vector<Vec3>& forces = state.getForces();
ASSERT_EQUAL_TOL(refenergy, energy, 1E-4); ASSERT_EQUAL_TOL(refenergy, energy, 5E-4);
for (int n = 0; n < numAtoms; ++n) for (int n = 0; n < numAtoms; ++n)
ASSERT_EQUAL_VEC(refforces[n], forces[n], 5E-4); ASSERT_EQUAL_VEC(refforces[n], forces[n], 5E-4);
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment