Unverified commit ae686364, authored by Peter Eastman and committed by GitHub
Browse files

Improved support for devices without 64 bit atomics (#3737)

parent 48664a1f
...@@ -496,8 +496,6 @@ OpenCLContext::~OpenCLContext() { ...@@ -496,8 +496,6 @@ OpenCLContext::~OpenCLContext() {
void OpenCLContext::initialize() { void OpenCLContext::initialize() {
bonded->initialize(system); bonded->initialize(system);
numForceBuffers = std::max(numForceBuffers, (int) platformData.contexts.size()); numForceBuffers = std::max(numForceBuffers, (int) platformData.contexts.size());
numForceBuffers = std::max(numForceBuffers, bonded->getNumForceBuffers());
numForceBuffers = std::max(numForceBuffers, nonbonded->getNumForceBuffers());
int energyBufferSize = max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers()); int energyBufferSize = max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers());
if (useDoublePrecision) { if (useDoublePrecision) {
forceBuffers.initialize<mm_double4>(*this, paddedNumAtoms*numForceBuffers, "forceBuffers"); forceBuffers.initialize<mm_double4>(*this, paddedNumAtoms*numForceBuffers, "forceBuffers");
......
...@@ -799,10 +799,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb ...@@ -799,10 +799,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
} }
pmeGrid1.initialize(cl, gridElements, 2*elementSize, "pmeGrid1"); pmeGrid1.initialize(cl, gridElements, 2*elementSize, "pmeGrid1");
pmeGrid2.initialize(cl, gridElements, 2*elementSize, "pmeGrid2"); pmeGrid2.initialize(cl, gridElements, 2*elementSize, "pmeGrid2");
if (cl.getSupports64BitGlobalAtomics()) cl.addAutoclearBuffer(pmeGrid2);
cl.addAutoclearBuffer(pmeGrid2);
else
cl.addAutoclearBuffer(pmeGrid1);
pmeBsplineModuliX.initialize(cl, gridSizeX, elementSize, "pmeBsplineModuliX"); pmeBsplineModuliX.initialize(cl, gridSizeX, elementSize, "pmeBsplineModuliX");
pmeBsplineModuliY.initialize(cl, gridSizeY, elementSize, "pmeBsplineModuliY"); pmeBsplineModuliY.initialize(cl, gridSizeY, elementSize, "pmeBsplineModuliY");
pmeBsplineModuliZ.initialize(cl, gridSizeZ, elementSize, "pmeBsplineModuliZ"); pmeBsplineModuliZ.initialize(cl, gridSizeZ, elementSize, "pmeBsplineModuliZ");
...@@ -823,7 +820,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb ...@@ -823,7 +820,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
dispersionFft = new OpenCLFFT3D(cl, dispersionGridSizeX, dispersionGridSizeY, dispersionGridSizeZ, true); dispersionFft = new OpenCLFFT3D(cl, dispersionGridSizeX, dispersionGridSizeY, dispersionGridSizeZ, true);
string vendor = cl.getDevice().getInfo<CL_DEVICE_VENDOR>(); string vendor = cl.getDevice().getInfo<CL_DEVICE_VENDOR>();
bool isNvidia = (vendor.size() >= 6 && vendor.substr(0, 6) == "NVIDIA"); bool isNvidia = (vendor.size() >= 6 && vendor.substr(0, 6) == "NVIDIA");
usePmeQueue = (!cl.getPlatformData().disablePmeStream && !cl.getPlatformData().useCpuPme && cl.getSupports64BitGlobalAtomics() && isNvidia); usePmeQueue = (!cl.getPlatformData().disablePmeStream && !cl.getPlatformData().useCpuPme && isNvidia);
if (usePmeQueue) { if (usePmeQueue) {
pmeDefines["USE_PME_STREAM"] = "1"; pmeDefines["USE_PME_STREAM"] = "1";
pmeQueue = cl::CommandQueue(cl.getContext(), cl.getDevice()); pmeQueue = cl::CommandQueue(cl.getContext(), cl.getDevice());
...@@ -1082,7 +1079,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb ...@@ -1082,7 +1079,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
cl::Program program = cl.createProgram(CommonKernelSources::nonbondedParameters, paramsDefines); cl::Program program = cl.createProgram(CommonKernelSources::nonbondedParameters, paramsDefines);
computeParamsKernel = cl::Kernel(program, "computeParameters"); computeParamsKernel = cl::Kernel(program, "computeParameters");
computeExclusionParamsKernel = cl::Kernel(program, "computeExclusionParameters"); computeExclusionParamsKernel = cl::Kernel(program, "computeExclusionParameters");
info = new ForceInfo(cl.getNonbondedUtilities().getNumForceBuffers(), force); info = new ForceInfo(0, force);
cl.addForce(info); cl.addForce(info);
} }
...@@ -1138,35 +1135,10 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ ...@@ -1138,35 +1135,10 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
int elementSize = (cl.getUseDoublePrecision() ? sizeof(mm_double4) : sizeof(mm_float4)); int elementSize = (cl.getUseDoublePrecision() ? sizeof(mm_double4) : sizeof(mm_float4));
pmeGridIndexKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer()); pmeGridIndexKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
pmeGridIndexKernel.setArg<cl::Buffer>(1, pmeAtomGridIndex.getDeviceBuffer()); pmeGridIndexKernel.setArg<cl::Buffer>(1, pmeAtomGridIndex.getDeviceBuffer());
if (!cl.getSupports64BitGlobalAtomics()) {
pmeGridIndexKernel.setArg<cl::Buffer>(10, pmeBsplineTheta.getDeviceBuffer());
pmeGridIndexKernel.setArg(11, OpenCLContext::ThreadBlockSize*PmeOrder*elementSize, NULL);
pmeGridIndexKernel.setArg<cl::Buffer>(12, charges.getDeviceBuffer());
pmeAtomRangeKernel = cl::Kernel(program, "findAtomRangeForGrid");
pmeZIndexKernel = cl::Kernel(program, "recordZIndex");
pmeAtomRangeKernel.setArg<cl::Buffer>(0, pmeAtomGridIndex.getDeviceBuffer());
pmeAtomRangeKernel.setArg<cl::Buffer>(1, pmeAtomRange.getDeviceBuffer());
pmeAtomRangeKernel.setArg<cl::Buffer>(2, cl.getPosq().getDeviceBuffer());
pmeZIndexKernel.setArg<cl::Buffer>(0, pmeAtomGridIndex.getDeviceBuffer());
pmeZIndexKernel.setArg<cl::Buffer>(1, cl.getPosq().getDeviceBuffer());
}
pmeSpreadChargeKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer()); pmeSpreadChargeKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
if (cl.getSupports64BitGlobalAtomics()) pmeSpreadChargeKernel.setArg<cl::Buffer>(1, pmeGrid2.getDeviceBuffer());
pmeSpreadChargeKernel.setArg<cl::Buffer>(1, pmeGrid2.getDeviceBuffer()); pmeSpreadChargeKernel.setArg<cl::Buffer>(10, pmeAtomGridIndex.getDeviceBuffer());
else pmeSpreadChargeKernel.setArg<cl::Buffer>(11, charges.getDeviceBuffer());
pmeSpreadChargeKernel.setArg<cl::Buffer>(1, pmeGrid1.getDeviceBuffer());
if (cl.getSupports64BitGlobalAtomics()) {
pmeSpreadChargeKernel.setArg<cl::Buffer>(10, pmeAtomGridIndex.getDeviceBuffer());
pmeSpreadChargeKernel.setArg<cl::Buffer>(11, charges.getDeviceBuffer());
}
else if (deviceIsCpu)
pmeSpreadChargeKernel.setArg<cl::Buffer>(10, charges.getDeviceBuffer());
else {
pmeSpreadChargeKernel.setArg<cl::Buffer>(2, pmeAtomGridIndex.getDeviceBuffer());
pmeSpreadChargeKernel.setArg<cl::Buffer>(3, pmeAtomRange.getDeviceBuffer());
pmeSpreadChargeKernel.setArg<cl::Buffer>(4, pmeBsplineTheta.getDeviceBuffer());
pmeSpreadChargeKernel.setArg<cl::Buffer>(5, charges.getDeviceBuffer());
}
pmeConvolutionKernel.setArg<cl::Buffer>(0, pmeGrid2.getDeviceBuffer()); pmeConvolutionKernel.setArg<cl::Buffer>(0, pmeGrid2.getDeviceBuffer());
pmeConvolutionKernel.setArg<cl::Buffer>(1, pmeBsplineModuliX.getDeviceBuffer()); pmeConvolutionKernel.setArg<cl::Buffer>(1, pmeBsplineModuliX.getDeviceBuffer());
pmeConvolutionKernel.setArg<cl::Buffer>(2, pmeBsplineModuliY.getDeviceBuffer()); pmeConvolutionKernel.setArg<cl::Buffer>(2, pmeBsplineModuliY.getDeviceBuffer());
...@@ -1181,11 +1153,9 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ ...@@ -1181,11 +1153,9 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
pmeInterpolateForceKernel.setArg<cl::Buffer>(2, pmeGrid1.getDeviceBuffer()); pmeInterpolateForceKernel.setArg<cl::Buffer>(2, pmeGrid1.getDeviceBuffer());
pmeInterpolateForceKernel.setArg<cl::Buffer>(11, pmeAtomGridIndex.getDeviceBuffer()); pmeInterpolateForceKernel.setArg<cl::Buffer>(11, pmeAtomGridIndex.getDeviceBuffer());
pmeInterpolateForceKernel.setArg<cl::Buffer>(12, charges.getDeviceBuffer()); pmeInterpolateForceKernel.setArg<cl::Buffer>(12, charges.getDeviceBuffer());
if (cl.getSupports64BitGlobalAtomics()) { pmeFinishSpreadChargeKernel = cl::Kernel(program, "finishSpreadCharge");
pmeFinishSpreadChargeKernel = cl::Kernel(program, "finishSpreadCharge"); pmeFinishSpreadChargeKernel.setArg<cl::Buffer>(0, pmeGrid2.getDeviceBuffer());
pmeFinishSpreadChargeKernel.setArg<cl::Buffer>(0, pmeGrid2.getDeviceBuffer()); pmeFinishSpreadChargeKernel.setArg<cl::Buffer>(1, pmeGrid1.getDeviceBuffer());
pmeFinishSpreadChargeKernel.setArg<cl::Buffer>(1, pmeGrid1.getDeviceBuffer());
}
if (usePmeQueue) if (usePmeQueue)
syncQueue->setKernel(cl::Kernel(program, "addEnergy")); syncQueue->setKernel(cl::Kernel(program, "addEnergy"));
...@@ -1206,38 +1176,12 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ ...@@ -1206,38 +1176,12 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
pmeDispersionConvolutionKernel = cl::Kernel(program, "reciprocalConvolution"); pmeDispersionConvolutionKernel = cl::Kernel(program, "reciprocalConvolution");
pmeDispersionEvalEnergyKernel = cl::Kernel(program, "gridEvaluateEnergy"); pmeDispersionEvalEnergyKernel = cl::Kernel(program, "gridEvaluateEnergy");
pmeDispersionInterpolateForceKernel = cl::Kernel(program, "gridInterpolateForce"); pmeDispersionInterpolateForceKernel = cl::Kernel(program, "gridInterpolateForce");
int elementSize = (cl.getUseDoublePrecision() ? sizeof(mm_double4) : sizeof(mm_float4));
pmeDispersionGridIndexKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer()); pmeDispersionGridIndexKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
pmeDispersionGridIndexKernel.setArg<cl::Buffer>(1, pmeAtomGridIndex.getDeviceBuffer()); pmeDispersionGridIndexKernel.setArg<cl::Buffer>(1, pmeAtomGridIndex.getDeviceBuffer());
if (!cl.getSupports64BitGlobalAtomics()) {
pmeDispersionGridIndexKernel.setArg<cl::Buffer>(10, pmeBsplineTheta.getDeviceBuffer());
pmeDispersionGridIndexKernel.setArg(11, OpenCLContext::ThreadBlockSize*PmeOrder*elementSize, NULL);
pmeDispersionGridIndexKernel.setArg<cl::Buffer>(12, sigmaEpsilon.getDeviceBuffer());
pmeDispersionAtomRangeKernel = cl::Kernel(program, "findAtomRangeForGrid");
pmeDispersionZIndexKernel = cl::Kernel(program, "recordZIndex");
pmeDispersionAtomRangeKernel.setArg<cl::Buffer>(0, pmeAtomGridIndex.getDeviceBuffer());
pmeDispersionAtomRangeKernel.setArg<cl::Buffer>(1, pmeAtomRange.getDeviceBuffer());
pmeDispersionAtomRangeKernel.setArg<cl::Buffer>(2, cl.getPosq().getDeviceBuffer());
pmeDispersionZIndexKernel.setArg<cl::Buffer>(0, pmeAtomGridIndex.getDeviceBuffer());
pmeDispersionZIndexKernel.setArg<cl::Buffer>(1, cl.getPosq().getDeviceBuffer());
}
pmeDispersionSpreadChargeKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer()); pmeDispersionSpreadChargeKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
if (cl.getSupports64BitGlobalAtomics()) pmeDispersionSpreadChargeKernel.setArg<cl::Buffer>(1, pmeGrid2.getDeviceBuffer());
pmeDispersionSpreadChargeKernel.setArg<cl::Buffer>(1, pmeGrid2.getDeviceBuffer()); pmeDispersionSpreadChargeKernel.setArg<cl::Buffer>(10, pmeAtomGridIndex.getDeviceBuffer());
else pmeDispersionSpreadChargeKernel.setArg<cl::Buffer>(11, sigmaEpsilon.getDeviceBuffer());
pmeDispersionSpreadChargeKernel.setArg<cl::Buffer>(1, pmeGrid1.getDeviceBuffer());
if (cl.getSupports64BitGlobalAtomics()) {
pmeDispersionSpreadChargeKernel.setArg<cl::Buffer>(10, pmeAtomGridIndex.getDeviceBuffer());
pmeDispersionSpreadChargeKernel.setArg<cl::Buffer>(11, sigmaEpsilon.getDeviceBuffer());
}
else if (deviceIsCpu)
pmeDispersionSpreadChargeKernel.setArg<cl::Buffer>(10, sigmaEpsilon.getDeviceBuffer());
else {
pmeDispersionSpreadChargeKernel.setArg<cl::Buffer>(2, pmeAtomGridIndex.getDeviceBuffer());
pmeDispersionSpreadChargeKernel.setArg<cl::Buffer>(3, pmeAtomRange.getDeviceBuffer());
pmeDispersionSpreadChargeKernel.setArg<cl::Buffer>(4, pmeBsplineTheta.getDeviceBuffer());
pmeDispersionSpreadChargeKernel.setArg<cl::Buffer>(5, sigmaEpsilon.getDeviceBuffer());
}
pmeDispersionConvolutionKernel.setArg<cl::Buffer>(0, pmeGrid2.getDeviceBuffer()); pmeDispersionConvolutionKernel.setArg<cl::Buffer>(0, pmeGrid2.getDeviceBuffer());
pmeDispersionConvolutionKernel.setArg<cl::Buffer>(1, pmeDispersionBsplineModuliX.getDeviceBuffer()); pmeDispersionConvolutionKernel.setArg<cl::Buffer>(1, pmeDispersionBsplineModuliX.getDeviceBuffer());
pmeDispersionConvolutionKernel.setArg<cl::Buffer>(2, pmeDispersionBsplineModuliY.getDeviceBuffer()); pmeDispersionConvolutionKernel.setArg<cl::Buffer>(2, pmeDispersionBsplineModuliY.getDeviceBuffer());
...@@ -1252,11 +1196,9 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ ...@@ -1252,11 +1196,9 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
pmeDispersionInterpolateForceKernel.setArg<cl::Buffer>(2, pmeGrid1.getDeviceBuffer()); pmeDispersionInterpolateForceKernel.setArg<cl::Buffer>(2, pmeGrid1.getDeviceBuffer());
pmeDispersionInterpolateForceKernel.setArg<cl::Buffer>(11, pmeAtomGridIndex.getDeviceBuffer()); pmeDispersionInterpolateForceKernel.setArg<cl::Buffer>(11, pmeAtomGridIndex.getDeviceBuffer());
pmeDispersionInterpolateForceKernel.setArg<cl::Buffer>(12, sigmaEpsilon.getDeviceBuffer()); pmeDispersionInterpolateForceKernel.setArg<cl::Buffer>(12, sigmaEpsilon.getDeviceBuffer());
if (cl.getSupports64BitGlobalAtomics()) { pmeDispersionFinishSpreadChargeKernel = cl::Kernel(program, "finishSpreadCharge");
pmeDispersionFinishSpreadChargeKernel = cl::Kernel(program, "finishSpreadCharge"); pmeDispersionFinishSpreadChargeKernel.setArg<cl::Buffer>(0, pmeGrid2.getDeviceBuffer());
pmeDispersionFinishSpreadChargeKernel.setArg<cl::Buffer>(0, pmeGrid2.getDeviceBuffer()); pmeDispersionFinishSpreadChargeKernel.setArg<cl::Buffer>(1, pmeGrid1.getDeviceBuffer());
pmeDispersionFinishSpreadChargeKernel.setArg<cl::Buffer>(1, pmeGrid1.getDeviceBuffer());
}
} }
} }
} }
...@@ -1339,48 +1281,20 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ ...@@ -1339,48 +1281,20 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
pmeGridIndexKernel.setArg<mm_float4>(9, recipBoxVectorsFloat[2]); pmeGridIndexKernel.setArg<mm_float4>(9, recipBoxVectorsFloat[2]);
} }
cl.executeKernel(pmeGridIndexKernel, cl.getNumAtoms()); cl.executeKernel(pmeGridIndexKernel, cl.getNumAtoms());
if (deviceIsCpu && !cl.getSupports64BitGlobalAtomics()) { sort->sort(pmeAtomGridIndex);
setPeriodicBoxArgs(cl, pmeSpreadChargeKernel, 2); setPeriodicBoxArgs(cl, pmeSpreadChargeKernel, 2);
if (cl.getUseDoublePrecision()) { if (cl.getUseDoublePrecision()) {
pmeSpreadChargeKernel.setArg<mm_double4>(7, recipBoxVectors[0]); pmeSpreadChargeKernel.setArg<mm_double4>(7, recipBoxVectors[0]);
pmeSpreadChargeKernel.setArg<mm_double4>(8, recipBoxVectors[1]); pmeSpreadChargeKernel.setArg<mm_double4>(8, recipBoxVectors[1]);
pmeSpreadChargeKernel.setArg<mm_double4>(9, recipBoxVectors[2]); pmeSpreadChargeKernel.setArg<mm_double4>(9, recipBoxVectors[2]);
}
else {
pmeSpreadChargeKernel.setArg<mm_float4>(7, recipBoxVectorsFloat[0]);
pmeSpreadChargeKernel.setArg<mm_float4>(8, recipBoxVectorsFloat[1]);
pmeSpreadChargeKernel.setArg<mm_float4>(9, recipBoxVectorsFloat[2]);
}
cl.executeKernel(pmeSpreadChargeKernel, 2*cl.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>(), 1);
} }
else { else {
sort->sort(pmeAtomGridIndex); pmeSpreadChargeKernel.setArg<mm_float4>(7, recipBoxVectorsFloat[0]);
if (cl.getSupports64BitGlobalAtomics()) { pmeSpreadChargeKernel.setArg<mm_float4>(8, recipBoxVectorsFloat[1]);
setPeriodicBoxArgs(cl, pmeSpreadChargeKernel, 2); pmeSpreadChargeKernel.setArg<mm_float4>(9, recipBoxVectorsFloat[2]);
if (cl.getUseDoublePrecision()) {
pmeSpreadChargeKernel.setArg<mm_double4>(7, recipBoxVectors[0]);
pmeSpreadChargeKernel.setArg<mm_double4>(8, recipBoxVectors[1]);
pmeSpreadChargeKernel.setArg<mm_double4>(9, recipBoxVectors[2]);
}
else {
pmeSpreadChargeKernel.setArg<mm_float4>(7, recipBoxVectorsFloat[0]);
pmeSpreadChargeKernel.setArg<mm_float4>(8, recipBoxVectorsFloat[1]);
pmeSpreadChargeKernel.setArg<mm_float4>(9, recipBoxVectorsFloat[2]);
}
cl.executeKernel(pmeSpreadChargeKernel, cl.getNumAtoms());
cl.executeKernel(pmeFinishSpreadChargeKernel, gridSizeX*gridSizeY*gridSizeZ);
}
else {
cl.executeKernel(pmeAtomRangeKernel, cl.getNumAtoms());
setPeriodicBoxSizeArg(cl, pmeZIndexKernel, 2);
if (cl.getUseDoublePrecision())
pmeZIndexKernel.setArg<mm_double4>(3, recipBoxVectors[2]);
else
pmeZIndexKernel.setArg<mm_float4>(3, recipBoxVectorsFloat[2]);
cl.executeKernel(pmeZIndexKernel, cl.getNumAtoms());
cl.executeKernel(pmeSpreadChargeKernel, cl.getNumAtoms());
}
} }
cl.executeKernel(pmeSpreadChargeKernel, cl.getNumAtoms());
cl.executeKernel(pmeFinishSpreadChargeKernel, gridSizeX*gridSizeY*gridSizeZ);
fft->execFFT(pmeGrid1, pmeGrid2, true); fft->execFFT(pmeGrid1, pmeGrid2, true);
mm_double4 boxSize = cl.getPeriodicBoxSizeDouble(); mm_double4 boxSize = cl.getPeriodicBoxSizeDouble();
if (cl.getUseDoublePrecision()) { if (cl.getUseDoublePrecision()) {
...@@ -1433,55 +1347,23 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ ...@@ -1433,55 +1347,23 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
pmeDispersionGridIndexKernel.setArg<mm_float4>(9, recipBoxVectorsFloat[2]); pmeDispersionGridIndexKernel.setArg<mm_float4>(9, recipBoxVectorsFloat[2]);
} }
cl.executeKernel(pmeDispersionGridIndexKernel, cl.getNumAtoms()); cl.executeKernel(pmeDispersionGridIndexKernel, cl.getNumAtoms());
if (deviceIsCpu && !cl.getSupports64BitGlobalAtomics()) { if (!hasCoulomb)
cl.clearBuffer(pmeGrid1); sort->sort(pmeAtomGridIndex);
setPeriodicBoxArgs(cl, pmeDispersionSpreadChargeKernel, 2); cl.clearBuffer(pmeGrid2);
if (cl.getUseDoublePrecision()) { setPeriodicBoxArgs(cl, pmeDispersionSpreadChargeKernel, 2);
pmeDispersionSpreadChargeKernel.setArg<mm_double4>(7, recipBoxVectors[0]); if (cl.getUseDoublePrecision()) {
pmeDispersionSpreadChargeKernel.setArg<mm_double4>(8, recipBoxVectors[1]); pmeDispersionSpreadChargeKernel.setArg<mm_double4>(7, recipBoxVectors[0]);
pmeDispersionSpreadChargeKernel.setArg<mm_double4>(9, recipBoxVectors[2]); pmeDispersionSpreadChargeKernel.setArg<mm_double4>(8, recipBoxVectors[1]);
} pmeDispersionSpreadChargeKernel.setArg<mm_double4>(9, recipBoxVectors[2]);
else {
pmeDispersionSpreadChargeKernel.setArg<mm_float4>(7, recipBoxVectorsFloat[0]);
pmeDispersionSpreadChargeKernel.setArg<mm_float4>(8, recipBoxVectorsFloat[1]);
pmeDispersionSpreadChargeKernel.setArg<mm_float4>(9, recipBoxVectorsFloat[2]);
}
cl.executeKernel(pmeDispersionSpreadChargeKernel, 2*cl.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>(), 1);
} }
else { else {
if (cl.getSupports64BitGlobalAtomics()) { pmeDispersionSpreadChargeKernel.setArg<mm_float4>(7, recipBoxVectorsFloat[0]);
if (!hasCoulomb) pmeDispersionSpreadChargeKernel.setArg<mm_float4>(8, recipBoxVectorsFloat[1]);
sort->sort(pmeAtomGridIndex); pmeDispersionSpreadChargeKernel.setArg<mm_float4>(9, recipBoxVectorsFloat[2]);
cl.clearBuffer(pmeGrid2);
setPeriodicBoxArgs(cl, pmeDispersionSpreadChargeKernel, 2);
if (cl.getUseDoublePrecision()) {
pmeDispersionSpreadChargeKernel.setArg<mm_double4>(7, recipBoxVectors[0]);
pmeDispersionSpreadChargeKernel.setArg<mm_double4>(8, recipBoxVectors[1]);
pmeDispersionSpreadChargeKernel.setArg<mm_double4>(9, recipBoxVectors[2]);
}
else {
pmeDispersionSpreadChargeKernel.setArg<mm_float4>(7, recipBoxVectorsFloat[0]);
pmeDispersionSpreadChargeKernel.setArg<mm_float4>(8, recipBoxVectorsFloat[1]);
pmeDispersionSpreadChargeKernel.setArg<mm_float4>(9, recipBoxVectorsFloat[2]);
}
cl.executeKernel(pmeDispersionSpreadChargeKernel, cl.getNumAtoms());
cl.executeKernel(pmeDispersionFinishSpreadChargeKernel, gridSizeX*gridSizeY*gridSizeZ);
}
else {
sort->sort(pmeAtomGridIndex);
cl.clearBuffer(pmeGrid1);
cl.executeKernel(pmeDispersionAtomRangeKernel, cl.getNumAtoms());
setPeriodicBoxSizeArg(cl, pmeDispersionZIndexKernel, 2);
if (cl.getUseDoublePrecision())
pmeDispersionZIndexKernel.setArg<mm_double4>(3, recipBoxVectors[2]);
else
pmeDispersionZIndexKernel.setArg<mm_float4>(3, recipBoxVectorsFloat[2]);
cl.executeKernel(pmeDispersionZIndexKernel, cl.getNumAtoms());
cl.executeKernel(pmeDispersionSpreadChargeKernel, cl.getNumAtoms());
}
} }
cl.executeKernel(pmeDispersionSpreadChargeKernel, cl.getNumAtoms());
cl.executeKernel(pmeDispersionFinishSpreadChargeKernel, gridSizeX*gridSizeY*gridSizeZ);
dispersionFft->execFFT(pmeGrid1, pmeGrid2, true); dispersionFft->execFFT(pmeGrid1, pmeGrid2, true);
mm_double4 boxSize = cl.getPeriodicBoxSizeDouble();
if (cl.getUseDoublePrecision()) { if (cl.getUseDoublePrecision()) {
pmeDispersionConvolutionKernel.setArg<mm_double4>(4, recipBoxVectors[0]); pmeDispersionConvolutionKernel.setArg<mm_double4>(4, recipBoxVectors[0]);
pmeDispersionConvolutionKernel.setArg<mm_double4>(5, recipBoxVectors[1]); pmeDispersionConvolutionKernel.setArg<mm_double4>(5, recipBoxVectors[1]);
......
...@@ -56,38 +56,21 @@ private: ...@@ -56,38 +56,21 @@ private:
}; };
OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : context(context), useCutoff(false), usePeriodic(false), anyExclusions(false), usePadding(true), OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : context(context), useCutoff(false), usePeriodic(false), anyExclusions(false), usePadding(true),
numForceBuffers(0), blockSorter(NULL), pinnedCountBuffer(NULL), pinnedCountMemory(NULL), forceRebuildNeighborList(true), lastCutoff(0.0), groupFlags(0) { blockSorter(NULL), pinnedCountBuffer(NULL), pinnedCountMemory(NULL), forceRebuildNeighborList(true), lastCutoff(0.0), groupFlags(0) {
// Decide how many thread blocks and force buffers to use. // Decide how many thread blocks and force buffers to use.
deviceIsCpu = (context.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU); deviceIsCpu = (context.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU);
if (deviceIsCpu) { if (deviceIsCpu) {
numForceThreadBlocks = context.getNumThreadBlocks(); numForceThreadBlocks = context.getNumThreadBlocks();
forceThreadBlockSize = 1; forceThreadBlockSize = 1;
numForceBuffers = numForceThreadBlocks;
} }
else if (context.getSIMDWidth() == 32) { else if (context.getSIMDWidth() == 32) {
if (context.getSupports64BitGlobalAtomics()) {
numForceThreadBlocks = 4*context.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>(); numForceThreadBlocks = 4*context.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
forceThreadBlockSize = 256; forceThreadBlockSize = 256;
// Even though using longForceBuffer, still need a single forceBuffer for the reduceForces kernel to convert the long results into float4 which will be used by later kernels.
numForceBuffers = 1;
}
else {
numForceThreadBlocks = 3*context.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
forceThreadBlockSize = 256;
numForceBuffers = numForceThreadBlocks*forceThreadBlockSize/OpenCLContext::TileSize;
}
} }
else { else {
numForceThreadBlocks = context.getNumThreadBlocks(); numForceThreadBlocks = context.getNumThreadBlocks();
forceThreadBlockSize = (context.getSIMDWidth() >= 32 ? OpenCLContext::ThreadBlockSize : 32); forceThreadBlockSize = (context.getSIMDWidth() >= 32 ? OpenCLContext::ThreadBlockSize : 32);
if (context.getSupports64BitGlobalAtomics()) {
// Even though using longForceBuffer, still need a single forceBuffer for the reduceForces kernel to convert the long results into float4 which will be used by later kernels.
numForceBuffers = 1;
}
else {
numForceBuffers = numForceThreadBlocks*forceThreadBlockSize/OpenCLContext::TileSize;
}
} }
pinnedCountBuffer = new cl::Buffer(context.getContext(), CL_MEM_ALLOC_HOST_PTR, sizeof(unsigned int)); pinnedCountBuffer = new cl::Buffer(context.getContext(), CL_MEM_ALLOC_HOST_PTR, sizeof(unsigned int));
pinnedCountMemory = (unsigned int*) context.getQueue().enqueueMapBuffer(*pinnedCountBuffer, CL_TRUE, CL_MAP_READ, 0, sizeof(int)); pinnedCountMemory = (unsigned int*) context.getQueue().enqueueMapBuffer(*pinnedCountBuffer, CL_TRUE, CL_MAP_READ, 0, sizeof(int));
...@@ -724,10 +707,7 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc ...@@ -724,10 +707,7 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
// Set arguments to the Kernel. // Set arguments to the Kernel.
int index = 0; int index = 0;
if (context.getSupports64BitGlobalAtomics()) kernel.setArg<cl::Memory>(index++, context.getLongForceBuffer().getDeviceBuffer());
kernel.setArg<cl::Memory>(index++, context.getLongForceBuffer().getDeviceBuffer());
else
kernel.setArg<cl::Buffer>(index++, context.getForceBuffers().getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, context.getEnergyBuffer().getDeviceBuffer()); kernel.setArg<cl::Buffer>(index++, context.getEnergyBuffer().getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, context.getPosq().getDeviceBuffer()); kernel.setArg<cl::Buffer>(index++, context.getPosq().getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, exclusions.getDeviceBuffer()); kernel.setArg<cl::Buffer>(index++, exclusions.getDeviceBuffer());
......
...@@ -4,8 +4,24 @@ ...@@ -4,8 +4,24 @@
*/ */
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef cl_khr_int64_base_atomics
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#else
/**
 * Software fallback for a 64 bit atomic add, compiled only when
 * cl_khr_int64_base_atomics is not defined.  The 64 bit target is treated as
 * two 32 bit words, each updated with its own 32 bit atomic_add.
 *
 * The two word updates are separate atomic operations, so the 64 bit value is
 * not modified in one indivisible step.  Nothing is returned.
 *
 * @param p    the 64 bit value in global memory to add to
 * @param val  the amount to add
 */
void atom_add(volatile __global unsigned long* p, long unsigned val) {
    // Reinterpret the 64 bit target as a pair of 32 bit words.
    volatile __global unsigned int* halves = (volatile __global unsigned int*) p;
    // Which of the two words holds the low 32 bits depends on byte order.
#ifdef __ENDIAN_LITTLE__
    const int lowWord = 0;
    const int highWord = 1;
#else
    const int lowWord = 1;
    const int highWord = 0;
#endif
    unsigned int addLow = val;
    unsigned int addHigh = val >> 32;
    // atomic_add() yields the value the low word held before the addition.
    unsigned int previousLow = atomic_add(&halves[lowWord], addLow);
    // Do the sum in 64 bits: reaching 2^32 means the low word overflowed, so
    // one more unit has to be carried into the high word.
    if ((unsigned long) previousLow + addLow >= 0x100000000)
        addHigh += 1;
    // Skip the second atomic when there is nothing to add to the high word.
    if (addHigh != 0)
        atomic_add(&halves[highWord], addHigh);
}
#endif #endif
#define KERNEL __kernel #define KERNEL __kernel
......
...@@ -235,7 +235,7 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi ...@@ -235,7 +235,7 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi
unsigned int tilesToStore = neighborsInBuffer/TILE_SIZE; unsigned int tilesToStore = neighborsInBuffer/TILE_SIZE;
if (indexInWarp == 0) if (indexInWarp == 0)
*tileStartIndex = atom_add(interactionCount, tilesToStore); *tileStartIndex = ATOMIC_ADD(interactionCount, tilesToStore);
SYNC_WARPS; SYNC_WARPS;
unsigned int newTileStartIndex = *tileStartIndex; unsigned int newTileStartIndex = *tileStartIndex;
if (newTileStartIndex+tilesToStore <= maxTiles) { if (newTileStartIndex+tilesToStore <= maxTiles) {
...@@ -260,7 +260,7 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi ...@@ -260,7 +260,7 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi
if (neighborsInBuffer > 0) { if (neighborsInBuffer > 0) {
unsigned int tilesToStore = (neighborsInBuffer+TILE_SIZE-1)/TILE_SIZE; unsigned int tilesToStore = (neighborsInBuffer+TILE_SIZE-1)/TILE_SIZE;
if (indexInWarp == 0) if (indexInWarp == 0)
*tileStartIndex = atom_add(interactionCount, tilesToStore); *tileStartIndex = ATOMIC_ADD(interactionCount, tilesToStore);
SYNC_WARPS; SYNC_WARPS;
unsigned int newTileStartIndex = *tileStartIndex; unsigned int newTileStartIndex = *tileStartIndex;
if (newTileStartIndex+tilesToStore <= maxTiles) { if (newTileStartIndex+tilesToStore <= maxTiles) {
...@@ -406,7 +406,7 @@ void storeInteractionData(int x, __local int* buffer, __local int* sum, __local ...@@ -406,7 +406,7 @@ void storeInteractionData(int x, __local int* buffer, __local int* sum, __local
int tilesToStore = (storePartialTile ? (atomsToStore+TILE_SIZE-1)/TILE_SIZE : atomsToStore/TILE_SIZE); int tilesToStore = (storePartialTile ? (atomsToStore+TILE_SIZE-1)/TILE_SIZE : atomsToStore/TILE_SIZE);
if (tilesToStore > 0) { if (tilesToStore > 0) {
if (get_local_id(0) == 0) if (get_local_id(0) == 0)
*baseIndex = atom_add(interactionCount, tilesToStore); *baseIndex = ATOMIC_ADD(interactionCount, tilesToStore);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) == 0) if (get_local_id(0) == 0)
*numAtoms = atomsToStore-tilesToStore*TILE_SIZE; *numAtoms = atomsToStore-tilesToStore*TILE_SIZE;
...@@ -432,7 +432,7 @@ void storeInteractionData(int x, __local int* buffer, __local int* sum, __local ...@@ -432,7 +432,7 @@ void storeInteractionData(int x, __local int* buffer, __local int* sum, __local
// previous call to this function. Save them now. // previous call to this function. Save them now.
if (get_local_id(0) == 0) if (get_local_id(0) == 0)
*baseIndex = atom_add(interactionCount, 1); *baseIndex = ATOMIC_ADD(interactionCount, 1);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (*baseIndex < maxTiles) { if (*baseIndex < maxTiles) {
if (get_local_id(0) == 0) if (get_local_id(0) == 0)
......
...@@ -125,7 +125,7 @@ void storeInteractionData(int x, int* buffer, int* atoms, int* numAtoms, int num ...@@ -125,7 +125,7 @@ void storeInteractionData(int x, int* buffer, int* atoms, int* numAtoms, int num
// The atoms buffer is full, so store it to global memory. // The atoms buffer is full, so store it to global memory.
int tilesToStore = BUFFER_SIZE/TILE_SIZE; int tilesToStore = BUFFER_SIZE/TILE_SIZE;
int baseIndex = atom_add(interactionCount, tilesToStore); int baseIndex = ATOMIC_ADD(interactionCount, tilesToStore);
if (baseIndex+tilesToStore <= maxTiles) { if (baseIndex+tilesToStore <= maxTiles) {
for (int i = 0; i < tilesToStore; i++) { for (int i = 0; i < tilesToStore; i++) {
interactingTiles[baseIndex+i] = x; interactingTiles[baseIndex+i] = x;
...@@ -142,7 +142,7 @@ void storeInteractionData(int x, int* buffer, int* atoms, int* numAtoms, int num ...@@ -142,7 +142,7 @@ void storeInteractionData(int x, int* buffer, int* atoms, int* numAtoms, int num
// There are some leftover atoms, so save them now. // There are some leftover atoms, so save them now.
int tilesToStore = (*numAtoms+TILE_SIZE-1)/TILE_SIZE; int tilesToStore = (*numAtoms+TILE_SIZE-1)/TILE_SIZE;
int baseIndex = atom_add(interactionCount, tilesToStore); int baseIndex = ATOMIC_ADD(interactionCount, tilesToStore);
if (baseIndex+tilesToStore <= maxTiles) { if (baseIndex+tilesToStore <= maxTiles) {
for (int i = 0; i < tilesToStore; i++) { for (int i = 0; i < tilesToStore; i++) {
interactingTiles[baseIndex+i] = x; interactingTiles[baseIndex+i] = x;
......
#ifdef SUPPORTS_64_BIT_ATOMICS
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#endif
#define WARPS_PER_GROUP (FORCE_WORK_GROUP_SIZE/TILE_SIZE) #define WARPS_PER_GROUP (FORCE_WORK_GROUP_SIZE/TILE_SIZE)
typedef struct { typedef struct {
...@@ -17,11 +14,7 @@ typedef struct { ...@@ -17,11 +14,7 @@ typedef struct {
* Compute nonbonded interactions. * Compute nonbonded interactions.
*/ */
__kernel void computeNonbonded( __kernel void computeNonbonded(
#ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict forceBuffers, __global long* restrict forceBuffers,
#else
__global real4* restrict forceBuffers,
#endif
__global mixed* restrict energyBuffer, __global const real4* restrict posq, __global const unsigned int* restrict exclusions, __global mixed* restrict energyBuffer, __global const real4* restrict posq, __global const unsigned int* restrict exclusions,
__global const int2* restrict exclusionTiles, unsigned int startTileIndex, unsigned long numTileIndices __global const int2* restrict exclusionTiles, unsigned int startTileIndex, unsigned long numTileIndices
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
...@@ -176,24 +169,16 @@ __kernel void computeNonbonded( ...@@ -176,24 +169,16 @@ __kernel void computeNonbonded(
// Write results. // Write results.
#ifdef INCLUDE_FORCES #ifdef INCLUDE_FORCES
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset = x*TILE_SIZE + tgx; unsigned int offset = x*TILE_SIZE + tgx;
atom_add(&forceBuffers[offset], realToFixedPoint(force.x)); ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) realToFixedPoint(force.x));
atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], realToFixedPoint(force.y)); ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y));
atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], realToFixedPoint(force.z)); ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.z));
if (x != y) { if (x != y) {
offset = y*TILE_SIZE + tgx; offset = y*TILE_SIZE + tgx;
atom_add(&forceBuffers[offset], realToFixedPoint(localData[get_local_id(0)].fx)); ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) realToFixedPoint(localData[get_local_id(0)].fx));
atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], realToFixedPoint(localData[get_local_id(0)].fy)); ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(localData[get_local_id(0)].fy));
atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], realToFixedPoint(localData[get_local_id(0)].fz)); ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(localData[get_local_id(0)].fz));
} }
#else
unsigned int offset1 = x*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
unsigned int offset2 = y*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
forceBuffers[offset1].xyz += force.xyz;
if (x != y)
forceBuffers[offset2] += (real4) (localData[get_local_id(0)].fx, localData[get_local_id(0)].fy, localData[get_local_id(0)].fz, 0.0f);
#endif
#endif #endif
} }
...@@ -409,22 +394,14 @@ __kernel void computeNonbonded( ...@@ -409,22 +394,14 @@ __kernel void computeNonbonded(
#else #else
unsigned int atom2 = y*TILE_SIZE + tgx; unsigned int atom2 = y*TILE_SIZE + tgx;
#endif #endif
#ifdef SUPPORTS_64_BIT_ATOMICS ATOMIC_ADD(&forceBuffers[atom1], (mm_ulong) realToFixedPoint(force.x));
atom_add(&forceBuffers[atom1], realToFixedPoint(force.x)); ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y));
atom_add(&forceBuffers[atom1+PADDED_NUM_ATOMS], realToFixedPoint(force.y)); ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.z));
atom_add(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], realToFixedPoint(force.z));
if (atom2 < PADDED_NUM_ATOMS) { if (atom2 < PADDED_NUM_ATOMS) {
atom_add(&forceBuffers[atom2], realToFixedPoint(localData[get_local_id(0)].fx)); ATOMIC_ADD(&forceBuffers[atom2], (mm_ulong) realToFixedPoint(localData[get_local_id(0)].fx));
atom_add(&forceBuffers[atom2+PADDED_NUM_ATOMS], realToFixedPoint(localData[get_local_id(0)].fy)); ATOMIC_ADD(&forceBuffers[atom2+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(localData[get_local_id(0)].fy));
atom_add(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], realToFixedPoint(localData[get_local_id(0)].fz)); ATOMIC_ADD(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(localData[get_local_id(0)].fz));
} }
#else
unsigned int offset1 = atom1 + warp*PADDED_NUM_ATOMS;
unsigned int offset2 = atom2 + warp*PADDED_NUM_ATOMS;
forceBuffers[offset1].xyz += force.xyz;
if (atom2 < PADDED_NUM_ATOMS)
forceBuffers[offset2] += (real4) (localData[get_local_id(0)].fx, localData[get_local_id(0)].fy, localData[get_local_id(0)].fz, 0.0f);
#endif
#endif #endif
} }
pos++; pos++;
......
#ifdef SUPPORTS_64_BIT_ATOMICS
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#endif
typedef struct { typedef struct {
real x, y, z; real x, y, z;
real q; real q;
...@@ -107,9 +103,9 @@ __kernel void computeNonbonded( ...@@ -107,9 +103,9 @@ __kernel void computeNonbonded(
// Write results. // Write results.
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&forceBuffers[atom1], realToFixedPoint(force.x)); ATOMIC_ADD(&forceBuffers[atom1], (mm_ulong) realToFixedPoint(force.x));
atom_add(&forceBuffers[atom1+PADDED_NUM_ATOMS], realToFixedPoint(force.y)); ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y));
atom_add(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], realToFixedPoint(force.z)); ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.z));
#else #else
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz; forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz;
...@@ -183,9 +179,9 @@ __kernel void computeNonbonded( ...@@ -183,9 +179,9 @@ __kernel void computeNonbonded(
// Write results for atom1. // Write results for atom1.
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&forceBuffers[atom1], realToFixedPoint(force.x)); ATOMIC_ADD(&forceBuffers[atom1], (mm_ulong) realToFixedPoint(force.x));
atom_add(&forceBuffers[atom1+PADDED_NUM_ATOMS], realToFixedPoint(force.y)); ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y));
atom_add(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], realToFixedPoint(force.z)); ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.z));
#else #else
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz; forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz;
...@@ -197,9 +193,9 @@ __kernel void computeNonbonded( ...@@ -197,9 +193,9 @@ __kernel void computeNonbonded(
for (int tgx = 0; tgx < TILE_SIZE; tgx++) { for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset = y*TILE_SIZE + tgx; unsigned int offset = y*TILE_SIZE + tgx;
atom_add(&forceBuffers[offset], realToFixedPoint(localData[tgx].fx)); ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) realToFixedPoint(localData[tgx].fx));
atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], realToFixedPoint(localData[tgx].fy)); ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(localData[tgx].fy));
atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], realToFixedPoint(localData[tgx].fz)); ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(localData[tgx].fz));
#else #else
unsigned int offset = y*TILE_SIZE+tgx + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset = y*TILE_SIZE+tgx + get_group_id(0)*PADDED_NUM_ATOMS;
real4 f = forceBuffers[offset]; real4 f = forceBuffers[offset];
...@@ -342,9 +338,9 @@ __kernel void computeNonbonded( ...@@ -342,9 +338,9 @@ __kernel void computeNonbonded(
// Write results for atom1. // Write results for atom1.
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&forceBuffers[atom1], realToFixedPoint(force.x)); ATOMIC_ADD(&forceBuffers[atom1], (mm_ulong) realToFixedPoint(force.x));
atom_add(&forceBuffers[atom1+PADDED_NUM_ATOMS], realToFixedPoint(force.y)); ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y));
atom_add(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], realToFixedPoint(force.z)); ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.z));
#else #else
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz; forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz;
...@@ -409,9 +405,9 @@ __kernel void computeNonbonded( ...@@ -409,9 +405,9 @@ __kernel void computeNonbonded(
// Write results for atom1. // Write results for atom1.
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&forceBuffers[atom1], realToFixedPoint(force.x)); ATOMIC_ADD(&forceBuffers[atom1], (mm_ulong) realToFixedPoint(force.x));
atom_add(&forceBuffers[atom1+PADDED_NUM_ATOMS], realToFixedPoint(force.y)); ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y));
atom_add(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], realToFixedPoint(force.z)); ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.z));
#else #else
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz; forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz;
...@@ -429,9 +425,9 @@ __kernel void computeNonbonded( ...@@ -429,9 +425,9 @@ __kernel void computeNonbonded(
#endif #endif
if (atom2 < PADDED_NUM_ATOMS) { if (atom2 < PADDED_NUM_ATOMS) {
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&forceBuffers[atom2], realToFixedPoint(localData[tgx].fx)); ATOMIC_ADD(&forceBuffers[atom2], (mm_ulong) realToFixedPoint(localData[tgx].fx));
atom_add(&forceBuffers[atom2+PADDED_NUM_ATOMS], realToFixedPoint(localData[tgx].fy)); ATOMIC_ADD(&forceBuffers[atom2+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(localData[tgx].fy));
atom_add(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], realToFixedPoint(localData[tgx].fz)); ATOMIC_ADD(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(localData[tgx].fz));
#else #else
unsigned int offset = atom2 + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset = atom2 + get_group_id(0)*PADDED_NUM_ATOMS;
real4 f = forceBuffers[offset]; real4 f = forceBuffers[offset];
......
...@@ -218,8 +218,6 @@ CommonCalcAmoebaMultipoleForceKernel::~CommonCalcAmoebaMultipoleForceKernel() { ...@@ -218,8 +218,6 @@ CommonCalcAmoebaMultipoleForceKernel::~CommonCalcAmoebaMultipoleForceKernel() {
void CommonCalcAmoebaMultipoleForceKernel::initialize(const System& system, const AmoebaMultipoleForce& force) { void CommonCalcAmoebaMultipoleForceKernel::initialize(const System& system, const AmoebaMultipoleForce& force) {
ContextSelector selector(cc); ContextSelector selector(cc);
if (!cc.getSupports64BitGlobalAtomics())
throw OpenMMException("AmoebaMultipoleForce requires a device that supports 64 bit atomic operations");
// Initialize multipole parameters. // Initialize multipole parameters.
...@@ -2367,8 +2365,6 @@ CommonCalcHippoNonbondedForceKernel::CommonCalcHippoNonbondedForceKernel(const s ...@@ -2367,8 +2365,6 @@ CommonCalcHippoNonbondedForceKernel::CommonCalcHippoNonbondedForceKernel(const s
void CommonCalcHippoNonbondedForceKernel::initialize(const System& system, const HippoNonbondedForce& force) { void CommonCalcHippoNonbondedForceKernel::initialize(const System& system, const HippoNonbondedForce& force) {
ContextSelector selector(cc); ContextSelector selector(cc);
if (!cc.getSupports64BitGlobalAtomics())
throw OpenMMException("HippoNonbondedForce requires a device that supports 64 bit atomic operations");
extrapolationCoefficients = force.getExtrapolationCoefficients(); extrapolationCoefficients = force.getExtrapolationCoefficients();
usePME = (force.getNonbondedMethod() == HippoNonbondedForce::PME); usePME = (force.getNonbondedMethod() == HippoNonbondedForce::PME);
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment