Unverified Commit ae686364 authored by Peter Eastman's avatar Peter Eastman Committed by GitHub
Browse files

Improved support for devices without 64 bit atomics (#3737)

parent 48664a1f
......@@ -496,8 +496,6 @@ OpenCLContext::~OpenCLContext() {
void OpenCLContext::initialize() {
bonded->initialize(system);
numForceBuffers = std::max(numForceBuffers, (int) platformData.contexts.size());
numForceBuffers = std::max(numForceBuffers, bonded->getNumForceBuffers());
numForceBuffers = std::max(numForceBuffers, nonbonded->getNumForceBuffers());
int energyBufferSize = max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers());
if (useDoublePrecision) {
forceBuffers.initialize<mm_double4>(*this, paddedNumAtoms*numForceBuffers, "forceBuffers");
......
......@@ -799,10 +799,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
}
pmeGrid1.initialize(cl, gridElements, 2*elementSize, "pmeGrid1");
pmeGrid2.initialize(cl, gridElements, 2*elementSize, "pmeGrid2");
if (cl.getSupports64BitGlobalAtomics())
cl.addAutoclearBuffer(pmeGrid2);
else
cl.addAutoclearBuffer(pmeGrid1);
cl.addAutoclearBuffer(pmeGrid2);
pmeBsplineModuliX.initialize(cl, gridSizeX, elementSize, "pmeBsplineModuliX");
pmeBsplineModuliY.initialize(cl, gridSizeY, elementSize, "pmeBsplineModuliY");
pmeBsplineModuliZ.initialize(cl, gridSizeZ, elementSize, "pmeBsplineModuliZ");
......@@ -823,7 +820,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
dispersionFft = new OpenCLFFT3D(cl, dispersionGridSizeX, dispersionGridSizeY, dispersionGridSizeZ, true);
string vendor = cl.getDevice().getInfo<CL_DEVICE_VENDOR>();
bool isNvidia = (vendor.size() >= 6 && vendor.substr(0, 6) == "NVIDIA");
usePmeQueue = (!cl.getPlatformData().disablePmeStream && !cl.getPlatformData().useCpuPme && cl.getSupports64BitGlobalAtomics() && isNvidia);
usePmeQueue = (!cl.getPlatformData().disablePmeStream && !cl.getPlatformData().useCpuPme && isNvidia);
if (usePmeQueue) {
pmeDefines["USE_PME_STREAM"] = "1";
pmeQueue = cl::CommandQueue(cl.getContext(), cl.getDevice());
......@@ -1082,7 +1079,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
cl::Program program = cl.createProgram(CommonKernelSources::nonbondedParameters, paramsDefines);
computeParamsKernel = cl::Kernel(program, "computeParameters");
computeExclusionParamsKernel = cl::Kernel(program, "computeExclusionParameters");
info = new ForceInfo(cl.getNonbondedUtilities().getNumForceBuffers(), force);
info = new ForceInfo(0, force);
cl.addForce(info);
}
......@@ -1138,35 +1135,10 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
int elementSize = (cl.getUseDoublePrecision() ? sizeof(mm_double4) : sizeof(mm_float4));
pmeGridIndexKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
pmeGridIndexKernel.setArg<cl::Buffer>(1, pmeAtomGridIndex.getDeviceBuffer());
if (!cl.getSupports64BitGlobalAtomics()) {
pmeGridIndexKernel.setArg<cl::Buffer>(10, pmeBsplineTheta.getDeviceBuffer());
pmeGridIndexKernel.setArg(11, OpenCLContext::ThreadBlockSize*PmeOrder*elementSize, NULL);
pmeGridIndexKernel.setArg<cl::Buffer>(12, charges.getDeviceBuffer());
pmeAtomRangeKernel = cl::Kernel(program, "findAtomRangeForGrid");
pmeZIndexKernel = cl::Kernel(program, "recordZIndex");
pmeAtomRangeKernel.setArg<cl::Buffer>(0, pmeAtomGridIndex.getDeviceBuffer());
pmeAtomRangeKernel.setArg<cl::Buffer>(1, pmeAtomRange.getDeviceBuffer());
pmeAtomRangeKernel.setArg<cl::Buffer>(2, cl.getPosq().getDeviceBuffer());
pmeZIndexKernel.setArg<cl::Buffer>(0, pmeAtomGridIndex.getDeviceBuffer());
pmeZIndexKernel.setArg<cl::Buffer>(1, cl.getPosq().getDeviceBuffer());
}
pmeSpreadChargeKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
if (cl.getSupports64BitGlobalAtomics())
pmeSpreadChargeKernel.setArg<cl::Buffer>(1, pmeGrid2.getDeviceBuffer());
else
pmeSpreadChargeKernel.setArg<cl::Buffer>(1, pmeGrid1.getDeviceBuffer());
if (cl.getSupports64BitGlobalAtomics()) {
pmeSpreadChargeKernel.setArg<cl::Buffer>(10, pmeAtomGridIndex.getDeviceBuffer());
pmeSpreadChargeKernel.setArg<cl::Buffer>(11, charges.getDeviceBuffer());
}
else if (deviceIsCpu)
pmeSpreadChargeKernel.setArg<cl::Buffer>(10, charges.getDeviceBuffer());
else {
pmeSpreadChargeKernel.setArg<cl::Buffer>(2, pmeAtomGridIndex.getDeviceBuffer());
pmeSpreadChargeKernel.setArg<cl::Buffer>(3, pmeAtomRange.getDeviceBuffer());
pmeSpreadChargeKernel.setArg<cl::Buffer>(4, pmeBsplineTheta.getDeviceBuffer());
pmeSpreadChargeKernel.setArg<cl::Buffer>(5, charges.getDeviceBuffer());
}
pmeSpreadChargeKernel.setArg<cl::Buffer>(1, pmeGrid2.getDeviceBuffer());
pmeSpreadChargeKernel.setArg<cl::Buffer>(10, pmeAtomGridIndex.getDeviceBuffer());
pmeSpreadChargeKernel.setArg<cl::Buffer>(11, charges.getDeviceBuffer());
pmeConvolutionKernel.setArg<cl::Buffer>(0, pmeGrid2.getDeviceBuffer());
pmeConvolutionKernel.setArg<cl::Buffer>(1, pmeBsplineModuliX.getDeviceBuffer());
pmeConvolutionKernel.setArg<cl::Buffer>(2, pmeBsplineModuliY.getDeviceBuffer());
......@@ -1181,11 +1153,9 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
pmeInterpolateForceKernel.setArg<cl::Buffer>(2, pmeGrid1.getDeviceBuffer());
pmeInterpolateForceKernel.setArg<cl::Buffer>(11, pmeAtomGridIndex.getDeviceBuffer());
pmeInterpolateForceKernel.setArg<cl::Buffer>(12, charges.getDeviceBuffer());
if (cl.getSupports64BitGlobalAtomics()) {
pmeFinishSpreadChargeKernel = cl::Kernel(program, "finishSpreadCharge");
pmeFinishSpreadChargeKernel.setArg<cl::Buffer>(0, pmeGrid2.getDeviceBuffer());
pmeFinishSpreadChargeKernel.setArg<cl::Buffer>(1, pmeGrid1.getDeviceBuffer());
}
pmeFinishSpreadChargeKernel = cl::Kernel(program, "finishSpreadCharge");
pmeFinishSpreadChargeKernel.setArg<cl::Buffer>(0, pmeGrid2.getDeviceBuffer());
pmeFinishSpreadChargeKernel.setArg<cl::Buffer>(1, pmeGrid1.getDeviceBuffer());
if (usePmeQueue)
syncQueue->setKernel(cl::Kernel(program, "addEnergy"));
......@@ -1206,38 +1176,12 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
pmeDispersionConvolutionKernel = cl::Kernel(program, "reciprocalConvolution");
pmeDispersionEvalEnergyKernel = cl::Kernel(program, "gridEvaluateEnergy");
pmeDispersionInterpolateForceKernel = cl::Kernel(program, "gridInterpolateForce");
int elementSize = (cl.getUseDoublePrecision() ? sizeof(mm_double4) : sizeof(mm_float4));
pmeDispersionGridIndexKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
pmeDispersionGridIndexKernel.setArg<cl::Buffer>(1, pmeAtomGridIndex.getDeviceBuffer());
if (!cl.getSupports64BitGlobalAtomics()) {
pmeDispersionGridIndexKernel.setArg<cl::Buffer>(10, pmeBsplineTheta.getDeviceBuffer());
pmeDispersionGridIndexKernel.setArg(11, OpenCLContext::ThreadBlockSize*PmeOrder*elementSize, NULL);
pmeDispersionGridIndexKernel.setArg<cl::Buffer>(12, sigmaEpsilon.getDeviceBuffer());
pmeDispersionAtomRangeKernel = cl::Kernel(program, "findAtomRangeForGrid");
pmeDispersionZIndexKernel = cl::Kernel(program, "recordZIndex");
pmeDispersionAtomRangeKernel.setArg<cl::Buffer>(0, pmeAtomGridIndex.getDeviceBuffer());
pmeDispersionAtomRangeKernel.setArg<cl::Buffer>(1, pmeAtomRange.getDeviceBuffer());
pmeDispersionAtomRangeKernel.setArg<cl::Buffer>(2, cl.getPosq().getDeviceBuffer());
pmeDispersionZIndexKernel.setArg<cl::Buffer>(0, pmeAtomGridIndex.getDeviceBuffer());
pmeDispersionZIndexKernel.setArg<cl::Buffer>(1, cl.getPosq().getDeviceBuffer());
}
pmeDispersionSpreadChargeKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
if (cl.getSupports64BitGlobalAtomics())
pmeDispersionSpreadChargeKernel.setArg<cl::Buffer>(1, pmeGrid2.getDeviceBuffer());
else
pmeDispersionSpreadChargeKernel.setArg<cl::Buffer>(1, pmeGrid1.getDeviceBuffer());
if (cl.getSupports64BitGlobalAtomics()) {
pmeDispersionSpreadChargeKernel.setArg<cl::Buffer>(10, pmeAtomGridIndex.getDeviceBuffer());
pmeDispersionSpreadChargeKernel.setArg<cl::Buffer>(11, sigmaEpsilon.getDeviceBuffer());
}
else if (deviceIsCpu)
pmeDispersionSpreadChargeKernel.setArg<cl::Buffer>(10, sigmaEpsilon.getDeviceBuffer());
else {
pmeDispersionSpreadChargeKernel.setArg<cl::Buffer>(2, pmeAtomGridIndex.getDeviceBuffer());
pmeDispersionSpreadChargeKernel.setArg<cl::Buffer>(3, pmeAtomRange.getDeviceBuffer());
pmeDispersionSpreadChargeKernel.setArg<cl::Buffer>(4, pmeBsplineTheta.getDeviceBuffer());
pmeDispersionSpreadChargeKernel.setArg<cl::Buffer>(5, sigmaEpsilon.getDeviceBuffer());
}
pmeDispersionSpreadChargeKernel.setArg<cl::Buffer>(1, pmeGrid2.getDeviceBuffer());
pmeDispersionSpreadChargeKernel.setArg<cl::Buffer>(10, pmeAtomGridIndex.getDeviceBuffer());
pmeDispersionSpreadChargeKernel.setArg<cl::Buffer>(11, sigmaEpsilon.getDeviceBuffer());
pmeDispersionConvolutionKernel.setArg<cl::Buffer>(0, pmeGrid2.getDeviceBuffer());
pmeDispersionConvolutionKernel.setArg<cl::Buffer>(1, pmeDispersionBsplineModuliX.getDeviceBuffer());
pmeDispersionConvolutionKernel.setArg<cl::Buffer>(2, pmeDispersionBsplineModuliY.getDeviceBuffer());
......@@ -1252,11 +1196,9 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
pmeDispersionInterpolateForceKernel.setArg<cl::Buffer>(2, pmeGrid1.getDeviceBuffer());
pmeDispersionInterpolateForceKernel.setArg<cl::Buffer>(11, pmeAtomGridIndex.getDeviceBuffer());
pmeDispersionInterpolateForceKernel.setArg<cl::Buffer>(12, sigmaEpsilon.getDeviceBuffer());
if (cl.getSupports64BitGlobalAtomics()) {
pmeDispersionFinishSpreadChargeKernel = cl::Kernel(program, "finishSpreadCharge");
pmeDispersionFinishSpreadChargeKernel.setArg<cl::Buffer>(0, pmeGrid2.getDeviceBuffer());
pmeDispersionFinishSpreadChargeKernel.setArg<cl::Buffer>(1, pmeGrid1.getDeviceBuffer());
}
pmeDispersionFinishSpreadChargeKernel = cl::Kernel(program, "finishSpreadCharge");
pmeDispersionFinishSpreadChargeKernel.setArg<cl::Buffer>(0, pmeGrid2.getDeviceBuffer());
pmeDispersionFinishSpreadChargeKernel.setArg<cl::Buffer>(1, pmeGrid1.getDeviceBuffer());
}
}
}
......@@ -1339,48 +1281,20 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
pmeGridIndexKernel.setArg<mm_float4>(9, recipBoxVectorsFloat[2]);
}
cl.executeKernel(pmeGridIndexKernel, cl.getNumAtoms());
if (deviceIsCpu && !cl.getSupports64BitGlobalAtomics()) {
setPeriodicBoxArgs(cl, pmeSpreadChargeKernel, 2);
if (cl.getUseDoublePrecision()) {
pmeSpreadChargeKernel.setArg<mm_double4>(7, recipBoxVectors[0]);
pmeSpreadChargeKernel.setArg<mm_double4>(8, recipBoxVectors[1]);
pmeSpreadChargeKernel.setArg<mm_double4>(9, recipBoxVectors[2]);
}
else {
pmeSpreadChargeKernel.setArg<mm_float4>(7, recipBoxVectorsFloat[0]);
pmeSpreadChargeKernel.setArg<mm_float4>(8, recipBoxVectorsFloat[1]);
pmeSpreadChargeKernel.setArg<mm_float4>(9, recipBoxVectorsFloat[2]);
}
cl.executeKernel(pmeSpreadChargeKernel, 2*cl.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>(), 1);
sort->sort(pmeAtomGridIndex);
setPeriodicBoxArgs(cl, pmeSpreadChargeKernel, 2);
if (cl.getUseDoublePrecision()) {
pmeSpreadChargeKernel.setArg<mm_double4>(7, recipBoxVectors[0]);
pmeSpreadChargeKernel.setArg<mm_double4>(8, recipBoxVectors[1]);
pmeSpreadChargeKernel.setArg<mm_double4>(9, recipBoxVectors[2]);
}
else {
sort->sort(pmeAtomGridIndex);
if (cl.getSupports64BitGlobalAtomics()) {
setPeriodicBoxArgs(cl, pmeSpreadChargeKernel, 2);
if (cl.getUseDoublePrecision()) {
pmeSpreadChargeKernel.setArg<mm_double4>(7, recipBoxVectors[0]);
pmeSpreadChargeKernel.setArg<mm_double4>(8, recipBoxVectors[1]);
pmeSpreadChargeKernel.setArg<mm_double4>(9, recipBoxVectors[2]);
}
else {
pmeSpreadChargeKernel.setArg<mm_float4>(7, recipBoxVectorsFloat[0]);
pmeSpreadChargeKernel.setArg<mm_float4>(8, recipBoxVectorsFloat[1]);
pmeSpreadChargeKernel.setArg<mm_float4>(9, recipBoxVectorsFloat[2]);
}
cl.executeKernel(pmeSpreadChargeKernel, cl.getNumAtoms());
cl.executeKernel(pmeFinishSpreadChargeKernel, gridSizeX*gridSizeY*gridSizeZ);
}
else {
cl.executeKernel(pmeAtomRangeKernel, cl.getNumAtoms());
setPeriodicBoxSizeArg(cl, pmeZIndexKernel, 2);
if (cl.getUseDoublePrecision())
pmeZIndexKernel.setArg<mm_double4>(3, recipBoxVectors[2]);
else
pmeZIndexKernel.setArg<mm_float4>(3, recipBoxVectorsFloat[2]);
cl.executeKernel(pmeZIndexKernel, cl.getNumAtoms());
cl.executeKernel(pmeSpreadChargeKernel, cl.getNumAtoms());
}
pmeSpreadChargeKernel.setArg<mm_float4>(7, recipBoxVectorsFloat[0]);
pmeSpreadChargeKernel.setArg<mm_float4>(8, recipBoxVectorsFloat[1]);
pmeSpreadChargeKernel.setArg<mm_float4>(9, recipBoxVectorsFloat[2]);
}
cl.executeKernel(pmeSpreadChargeKernel, cl.getNumAtoms());
cl.executeKernel(pmeFinishSpreadChargeKernel, gridSizeX*gridSizeY*gridSizeZ);
fft->execFFT(pmeGrid1, pmeGrid2, true);
mm_double4 boxSize = cl.getPeriodicBoxSizeDouble();
if (cl.getUseDoublePrecision()) {
......@@ -1433,55 +1347,23 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
pmeDispersionGridIndexKernel.setArg<mm_float4>(9, recipBoxVectorsFloat[2]);
}
cl.executeKernel(pmeDispersionGridIndexKernel, cl.getNumAtoms());
if (deviceIsCpu && !cl.getSupports64BitGlobalAtomics()) {
cl.clearBuffer(pmeGrid1);
setPeriodicBoxArgs(cl, pmeDispersionSpreadChargeKernel, 2);
if (cl.getUseDoublePrecision()) {
pmeDispersionSpreadChargeKernel.setArg<mm_double4>(7, recipBoxVectors[0]);
pmeDispersionSpreadChargeKernel.setArg<mm_double4>(8, recipBoxVectors[1]);
pmeDispersionSpreadChargeKernel.setArg<mm_double4>(9, recipBoxVectors[2]);
}
else {
pmeDispersionSpreadChargeKernel.setArg<mm_float4>(7, recipBoxVectorsFloat[0]);
pmeDispersionSpreadChargeKernel.setArg<mm_float4>(8, recipBoxVectorsFloat[1]);
pmeDispersionSpreadChargeKernel.setArg<mm_float4>(9, recipBoxVectorsFloat[2]);
}
cl.executeKernel(pmeDispersionSpreadChargeKernel, 2*cl.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>(), 1);
if (!hasCoulomb)
sort->sort(pmeAtomGridIndex);
cl.clearBuffer(pmeGrid2);
setPeriodicBoxArgs(cl, pmeDispersionSpreadChargeKernel, 2);
if (cl.getUseDoublePrecision()) {
pmeDispersionSpreadChargeKernel.setArg<mm_double4>(7, recipBoxVectors[0]);
pmeDispersionSpreadChargeKernel.setArg<mm_double4>(8, recipBoxVectors[1]);
pmeDispersionSpreadChargeKernel.setArg<mm_double4>(9, recipBoxVectors[2]);
}
else {
if (cl.getSupports64BitGlobalAtomics()) {
if (!hasCoulomb)
sort->sort(pmeAtomGridIndex);
cl.clearBuffer(pmeGrid2);
setPeriodicBoxArgs(cl, pmeDispersionSpreadChargeKernel, 2);
if (cl.getUseDoublePrecision()) {
pmeDispersionSpreadChargeKernel.setArg<mm_double4>(7, recipBoxVectors[0]);
pmeDispersionSpreadChargeKernel.setArg<mm_double4>(8, recipBoxVectors[1]);
pmeDispersionSpreadChargeKernel.setArg<mm_double4>(9, recipBoxVectors[2]);
}
else {
pmeDispersionSpreadChargeKernel.setArg<mm_float4>(7, recipBoxVectorsFloat[0]);
pmeDispersionSpreadChargeKernel.setArg<mm_float4>(8, recipBoxVectorsFloat[1]);
pmeDispersionSpreadChargeKernel.setArg<mm_float4>(9, recipBoxVectorsFloat[2]);
}
cl.executeKernel(pmeDispersionSpreadChargeKernel, cl.getNumAtoms());
cl.executeKernel(pmeDispersionFinishSpreadChargeKernel, gridSizeX*gridSizeY*gridSizeZ);
}
else {
sort->sort(pmeAtomGridIndex);
cl.clearBuffer(pmeGrid1);
cl.executeKernel(pmeDispersionAtomRangeKernel, cl.getNumAtoms());
setPeriodicBoxSizeArg(cl, pmeDispersionZIndexKernel, 2);
if (cl.getUseDoublePrecision())
pmeDispersionZIndexKernel.setArg<mm_double4>(3, recipBoxVectors[2]);
else
pmeDispersionZIndexKernel.setArg<mm_float4>(3, recipBoxVectorsFloat[2]);
cl.executeKernel(pmeDispersionZIndexKernel, cl.getNumAtoms());
cl.executeKernel(pmeDispersionSpreadChargeKernel, cl.getNumAtoms());
}
pmeDispersionSpreadChargeKernel.setArg<mm_float4>(7, recipBoxVectorsFloat[0]);
pmeDispersionSpreadChargeKernel.setArg<mm_float4>(8, recipBoxVectorsFloat[1]);
pmeDispersionSpreadChargeKernel.setArg<mm_float4>(9, recipBoxVectorsFloat[2]);
}
cl.executeKernel(pmeDispersionSpreadChargeKernel, cl.getNumAtoms());
cl.executeKernel(pmeDispersionFinishSpreadChargeKernel, gridSizeX*gridSizeY*gridSizeZ);
dispersionFft->execFFT(pmeGrid1, pmeGrid2, true);
mm_double4 boxSize = cl.getPeriodicBoxSizeDouble();
if (cl.getUseDoublePrecision()) {
pmeDispersionConvolutionKernel.setArg<mm_double4>(4, recipBoxVectors[0]);
pmeDispersionConvolutionKernel.setArg<mm_double4>(5, recipBoxVectors[1]);
......
......@@ -56,38 +56,21 @@ private:
};
OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : context(context), useCutoff(false), usePeriodic(false), anyExclusions(false), usePadding(true),
numForceBuffers(0), blockSorter(NULL), pinnedCountBuffer(NULL), pinnedCountMemory(NULL), forceRebuildNeighborList(true), lastCutoff(0.0), groupFlags(0) {
blockSorter(NULL), pinnedCountBuffer(NULL), pinnedCountMemory(NULL), forceRebuildNeighborList(true), lastCutoff(0.0), groupFlags(0) {
// Decide how many thread blocks and force buffers to use.
deviceIsCpu = (context.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU);
if (deviceIsCpu) {
numForceThreadBlocks = context.getNumThreadBlocks();
forceThreadBlockSize = 1;
numForceBuffers = numForceThreadBlocks;
}
else if (context.getSIMDWidth() == 32) {
if (context.getSupports64BitGlobalAtomics()) {
numForceThreadBlocks = 4*context.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
forceThreadBlockSize = 256;
// Even though using longForceBuffer, still need a single forceBuffer for the reduceForces kernel to convert the long results into float4 which will be used by later kernels.
numForceBuffers = 1;
}
else {
numForceThreadBlocks = 3*context.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
forceThreadBlockSize = 256;
numForceBuffers = numForceThreadBlocks*forceThreadBlockSize/OpenCLContext::TileSize;
}
}
else {
numForceThreadBlocks = context.getNumThreadBlocks();
forceThreadBlockSize = (context.getSIMDWidth() >= 32 ? OpenCLContext::ThreadBlockSize : 32);
if (context.getSupports64BitGlobalAtomics()) {
// Even though using longForceBuffer, still need a single forceBuffer for the reduceForces kernel to convert the long results into float4 which will be used by later kernels.
numForceBuffers = 1;
}
else {
numForceBuffers = numForceThreadBlocks*forceThreadBlockSize/OpenCLContext::TileSize;
}
}
pinnedCountBuffer = new cl::Buffer(context.getContext(), CL_MEM_ALLOC_HOST_PTR, sizeof(unsigned int));
pinnedCountMemory = (unsigned int*) context.getQueue().enqueueMapBuffer(*pinnedCountBuffer, CL_TRUE, CL_MAP_READ, 0, sizeof(int));
......@@ -724,10 +707,7 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
// Set arguments to the Kernel.
int index = 0;
if (context.getSupports64BitGlobalAtomics())
kernel.setArg<cl::Memory>(index++, context.getLongForceBuffer().getDeviceBuffer());
else
kernel.setArg<cl::Buffer>(index++, context.getForceBuffers().getDeviceBuffer());
kernel.setArg<cl::Memory>(index++, context.getLongForceBuffer().getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, context.getEnergyBuffer().getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, context.getPosq().getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, exclusions.getDeviceBuffer());
......
......@@ -4,8 +4,24 @@
*/
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#ifdef SUPPORTS_64_BIT_ATOMICS
#ifdef cl_khr_int64_base_atomics
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#else
void atom_add(volatile __global unsigned long* p, long unsigned val) {
volatile __global unsigned int* word = (volatile __global unsigned int*) p;
#ifdef __ENDIAN_LITTLE__
int lowIndex = 0;
#else
int lowIndex = 1;
#endif
unsigned int lower = val;
unsigned int upper = val >> 32;
unsigned int result = atomic_add(&word[lowIndex], lower);
int carry = (lower + (unsigned long) result >= 0x100000000 ? 1 : 0);
upper += carry;
if (upper != 0)
atomic_add(&word[1-lowIndex], upper);
}
#endif
#define KERNEL __kernel
......
......@@ -235,7 +235,7 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi
unsigned int tilesToStore = neighborsInBuffer/TILE_SIZE;
if (indexInWarp == 0)
*tileStartIndex = atom_add(interactionCount, tilesToStore);
*tileStartIndex = ATOMIC_ADD(interactionCount, tilesToStore);
SYNC_WARPS;
unsigned int newTileStartIndex = *tileStartIndex;
if (newTileStartIndex+tilesToStore <= maxTiles) {
......@@ -260,7 +260,7 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi
if (neighborsInBuffer > 0) {
unsigned int tilesToStore = (neighborsInBuffer+TILE_SIZE-1)/TILE_SIZE;
if (indexInWarp == 0)
*tileStartIndex = atom_add(interactionCount, tilesToStore);
*tileStartIndex = ATOMIC_ADD(interactionCount, tilesToStore);
SYNC_WARPS;
unsigned int newTileStartIndex = *tileStartIndex;
if (newTileStartIndex+tilesToStore <= maxTiles) {
......@@ -406,7 +406,7 @@ void storeInteractionData(int x, __local int* buffer, __local int* sum, __local
int tilesToStore = (storePartialTile ? (atomsToStore+TILE_SIZE-1)/TILE_SIZE : atomsToStore/TILE_SIZE);
if (tilesToStore > 0) {
if (get_local_id(0) == 0)
*baseIndex = atom_add(interactionCount, tilesToStore);
*baseIndex = ATOMIC_ADD(interactionCount, tilesToStore);
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) == 0)
*numAtoms = atomsToStore-tilesToStore*TILE_SIZE;
......@@ -432,7 +432,7 @@ void storeInteractionData(int x, __local int* buffer, __local int* sum, __local
// previous call to this function. Save them now.
if (get_local_id(0) == 0)
*baseIndex = atom_add(interactionCount, 1);
*baseIndex = ATOMIC_ADD(interactionCount, 1);
barrier(CLK_LOCAL_MEM_FENCE);
if (*baseIndex < maxTiles) {
if (get_local_id(0) == 0)
......
......@@ -125,7 +125,7 @@ void storeInteractionData(int x, int* buffer, int* atoms, int* numAtoms, int num
// The atoms buffer is full, so store it to global memory.
int tilesToStore = BUFFER_SIZE/TILE_SIZE;
int baseIndex = atom_add(interactionCount, tilesToStore);
int baseIndex = ATOMIC_ADD(interactionCount, tilesToStore);
if (baseIndex+tilesToStore <= maxTiles) {
for (int i = 0; i < tilesToStore; i++) {
interactingTiles[baseIndex+i] = x;
......@@ -142,7 +142,7 @@ void storeInteractionData(int x, int* buffer, int* atoms, int* numAtoms, int num
// There are some leftover atoms, so save them now.
int tilesToStore = (*numAtoms+TILE_SIZE-1)/TILE_SIZE;
int baseIndex = atom_add(interactionCount, tilesToStore);
int baseIndex = ATOMIC_ADD(interactionCount, tilesToStore);
if (baseIndex+tilesToStore <= maxTiles) {
for (int i = 0; i < tilesToStore; i++) {
interactingTiles[baseIndex+i] = x;
......
#ifdef SUPPORTS_64_BIT_ATOMICS
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#endif
#define WARPS_PER_GROUP (FORCE_WORK_GROUP_SIZE/TILE_SIZE)
typedef struct {
......@@ -17,11 +14,7 @@ typedef struct {
* Compute nonbonded interactions.
*/
__kernel void computeNonbonded(
#ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict forceBuffers,
#else
__global real4* restrict forceBuffers,
#endif
__global mixed* restrict energyBuffer, __global const real4* restrict posq, __global const unsigned int* restrict exclusions,
__global const int2* restrict exclusionTiles, unsigned int startTileIndex, unsigned long numTileIndices
#ifdef USE_CUTOFF
......@@ -176,24 +169,16 @@ __kernel void computeNonbonded(
// Write results.
#ifdef INCLUDE_FORCES
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset = x*TILE_SIZE + tgx;
atom_add(&forceBuffers[offset], realToFixedPoint(force.x));
atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], realToFixedPoint(force.y));
atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], realToFixedPoint(force.z));
ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) realToFixedPoint(force.x));
ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y));
ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.z));
if (x != y) {
offset = y*TILE_SIZE + tgx;
atom_add(&forceBuffers[offset], realToFixedPoint(localData[get_local_id(0)].fx));
atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], realToFixedPoint(localData[get_local_id(0)].fy));
atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], realToFixedPoint(localData[get_local_id(0)].fz));
ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) realToFixedPoint(localData[get_local_id(0)].fx));
ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(localData[get_local_id(0)].fy));
ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(localData[get_local_id(0)].fz));
}
#else
unsigned int offset1 = x*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
unsigned int offset2 = y*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
forceBuffers[offset1].xyz += force.xyz;
if (x != y)
forceBuffers[offset2] += (real4) (localData[get_local_id(0)].fx, localData[get_local_id(0)].fy, localData[get_local_id(0)].fz, 0.0f);
#endif
#endif
}
......@@ -409,22 +394,14 @@ __kernel void computeNonbonded(
#else
unsigned int atom2 = y*TILE_SIZE + tgx;
#endif
#ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&forceBuffers[atom1], realToFixedPoint(force.x));
atom_add(&forceBuffers[atom1+PADDED_NUM_ATOMS], realToFixedPoint(force.y));
atom_add(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], realToFixedPoint(force.z));
ATOMIC_ADD(&forceBuffers[atom1], (mm_ulong) realToFixedPoint(force.x));
ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y));
ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.z));
if (atom2 < PADDED_NUM_ATOMS) {
atom_add(&forceBuffers[atom2], realToFixedPoint(localData[get_local_id(0)].fx));
atom_add(&forceBuffers[atom2+PADDED_NUM_ATOMS], realToFixedPoint(localData[get_local_id(0)].fy));
atom_add(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], realToFixedPoint(localData[get_local_id(0)].fz));
ATOMIC_ADD(&forceBuffers[atom2], (mm_ulong) realToFixedPoint(localData[get_local_id(0)].fx));
ATOMIC_ADD(&forceBuffers[atom2+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(localData[get_local_id(0)].fy));
ATOMIC_ADD(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(localData[get_local_id(0)].fz));
}
#else
unsigned int offset1 = atom1 + warp*PADDED_NUM_ATOMS;
unsigned int offset2 = atom2 + warp*PADDED_NUM_ATOMS;
forceBuffers[offset1].xyz += force.xyz;
if (atom2 < PADDED_NUM_ATOMS)
forceBuffers[offset2] += (real4) (localData[get_local_id(0)].fx, localData[get_local_id(0)].fy, localData[get_local_id(0)].fz, 0.0f);
#endif
#endif
}
pos++;
......
#ifdef SUPPORTS_64_BIT_ATOMICS
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#endif
typedef struct {
real x, y, z;
real q;
......@@ -107,9 +103,9 @@ __kernel void computeNonbonded(
// Write results.
#ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&forceBuffers[atom1], realToFixedPoint(force.x));
atom_add(&forceBuffers[atom1+PADDED_NUM_ATOMS], realToFixedPoint(force.y));
atom_add(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], realToFixedPoint(force.z));
ATOMIC_ADD(&forceBuffers[atom1], (mm_ulong) realToFixedPoint(force.x));
ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y));
ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.z));
#else
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz;
......@@ -183,9 +179,9 @@ __kernel void computeNonbonded(
// Write results for atom1.
#ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&forceBuffers[atom1], realToFixedPoint(force.x));
atom_add(&forceBuffers[atom1+PADDED_NUM_ATOMS], realToFixedPoint(force.y));
atom_add(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], realToFixedPoint(force.z));
ATOMIC_ADD(&forceBuffers[atom1], (mm_ulong) realToFixedPoint(force.x));
ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y));
ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.z));
#else
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz;
......@@ -197,9 +193,9 @@ __kernel void computeNonbonded(
for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset = y*TILE_SIZE + tgx;
atom_add(&forceBuffers[offset], realToFixedPoint(localData[tgx].fx));
atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], realToFixedPoint(localData[tgx].fy));
atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], realToFixedPoint(localData[tgx].fz));
ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) realToFixedPoint(localData[tgx].fx));
ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(localData[tgx].fy));
ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(localData[tgx].fz));
#else
unsigned int offset = y*TILE_SIZE+tgx + get_group_id(0)*PADDED_NUM_ATOMS;
real4 f = forceBuffers[offset];
......@@ -342,9 +338,9 @@ __kernel void computeNonbonded(
// Write results for atom1.
#ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&forceBuffers[atom1], realToFixedPoint(force.x));
atom_add(&forceBuffers[atom1+PADDED_NUM_ATOMS], realToFixedPoint(force.y));
atom_add(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], realToFixedPoint(force.z));
ATOMIC_ADD(&forceBuffers[atom1], (mm_ulong) realToFixedPoint(force.x));
ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y));
ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.z));
#else
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz;
......@@ -409,9 +405,9 @@ __kernel void computeNonbonded(
// Write results for atom1.
#ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&forceBuffers[atom1], realToFixedPoint(force.x));
atom_add(&forceBuffers[atom1+PADDED_NUM_ATOMS], realToFixedPoint(force.y));
atom_add(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], realToFixedPoint(force.z));
ATOMIC_ADD(&forceBuffers[atom1], (mm_ulong) realToFixedPoint(force.x));
ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y));
ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.z));
#else
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz;
......@@ -429,9 +425,9 @@ __kernel void computeNonbonded(
#endif
if (atom2 < PADDED_NUM_ATOMS) {
#ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&forceBuffers[atom2], realToFixedPoint(localData[tgx].fx));
atom_add(&forceBuffers[atom2+PADDED_NUM_ATOMS], realToFixedPoint(localData[tgx].fy));
atom_add(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], realToFixedPoint(localData[tgx].fz));
ATOMIC_ADD(&forceBuffers[atom2], (mm_ulong) realToFixedPoint(localData[tgx].fx));
ATOMIC_ADD(&forceBuffers[atom2+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(localData[tgx].fy));
ATOMIC_ADD(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(localData[tgx].fz));
#else
unsigned int offset = atom2 + get_group_id(0)*PADDED_NUM_ATOMS;
real4 f = forceBuffers[offset];
......
......@@ -218,8 +218,6 @@ CommonCalcAmoebaMultipoleForceKernel::~CommonCalcAmoebaMultipoleForceKernel() {
void CommonCalcAmoebaMultipoleForceKernel::initialize(const System& system, const AmoebaMultipoleForce& force) {
ContextSelector selector(cc);
if (!cc.getSupports64BitGlobalAtomics())
throw OpenMMException("AmoebaMultipoleForce requires a device that supports 64 bit atomic operations");
// Initialize multipole parameters.
......@@ -2367,8 +2365,6 @@ CommonCalcHippoNonbondedForceKernel::CommonCalcHippoNonbondedForceKernel(const s
void CommonCalcHippoNonbondedForceKernel::initialize(const System& system, const HippoNonbondedForce& force) {
ContextSelector selector(cc);
if (!cc.getSupports64BitGlobalAtomics())
throw OpenMMException("HippoNonbondedForce requires a device that supports 64 bit atomic operations");
extrapolationCoefficients = force.getExtrapolationCoefficients();
usePME = (force.getNonbondedMethod() == HippoNonbondedForce::PME);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment