Commit 7943a339 authored by Peter Eastman's avatar Peter Eastman
Browse files

Restructured the use of force buffers in a new way that hopefully really works everywhere.

parent 13ef0ee8
...@@ -1738,6 +1738,8 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF ...@@ -1738,6 +1738,8 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
defines["NUM_ATOMS"] = intToString(cl.getNumAtoms()); defines["NUM_ATOMS"] = intToString(cl.getNumAtoms());
defines["PADDED_NUM_ATOMS"] = intToString(cl.getPaddedNumAtoms()); defines["PADDED_NUM_ATOMS"] = intToString(cl.getPaddedNumAtoms());
defines["NUM_BLOCKS"] = OpenCLExpressionUtilities::intToString(cl.getNumAtomBlocks()); defines["NUM_BLOCKS"] = OpenCLExpressionUtilities::intToString(cl.getNumAtomBlocks());
if (cl.getSIMDWidth() == 32)
defines["WARPS_PER_GROUP"] = OpenCLExpressionUtilities::intToString(cl.getNonbondedUtilities().getForceThreadBlockSize()/OpenCLContext::TileSize);
string file; string file;
if (deviceIsCpu) if (deviceIsCpu)
file = OpenCLKernelSources::gbsaObc_cpu; file = OpenCLKernelSources::gbsaObc_cpu;
...@@ -1753,7 +1755,6 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF ...@@ -1753,7 +1755,6 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
computeBornSumKernel.setArg<cl::Buffer>(index++, params->getDeviceBuffer()); computeBornSumKernel.setArg<cl::Buffer>(index++, params->getDeviceBuffer());
computeBornSumKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*13*sizeof(cl_float), NULL); computeBornSumKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*13*sizeof(cl_float), NULL);
computeBornSumKernel.setArg(index++, (deviceIsCpu ? 1 : nb.getForceThreadBlockSize())*sizeof(cl_float), NULL); computeBornSumKernel.setArg(index++, (deviceIsCpu ? 1 : nb.getForceThreadBlockSize())*sizeof(cl_float), NULL);
computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getForceBufferFlags().getDeviceBuffer());
if (nb.getUseCutoff()) { if (nb.getUseCutoff()) {
computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer()); computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer());
computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer()); computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
...@@ -1773,7 +1774,6 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF ...@@ -1773,7 +1774,6 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
force1Kernel.setArg<cl::Buffer>(index++, bornForce->getDeviceBuffer()); force1Kernel.setArg<cl::Buffer>(index++, bornForce->getDeviceBuffer());
force1Kernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*13*sizeof(cl_float), NULL); force1Kernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*13*sizeof(cl_float), NULL);
force1Kernel.setArg(index++, (deviceIsCpu ? 1 : nb.getForceThreadBlockSize())*sizeof(mm_float4), NULL); force1Kernel.setArg(index++, (deviceIsCpu ? 1 : nb.getForceThreadBlockSize())*sizeof(mm_float4), NULL);
force1Kernel.setArg<cl::Buffer>(index++, nb.getForceBufferFlags().getDeviceBuffer());
if (nb.getUseCutoff()) { if (nb.getUseCutoff()) {
force1Kernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer()); force1Kernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer());
force1Kernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer()); force1Kernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
...@@ -1805,14 +1805,14 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF ...@@ -1805,14 +1805,14 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
reduceBornForceKernel.setArg<cl::Buffer>(6, obcChain->getDeviceBuffer()); reduceBornForceKernel.setArg<cl::Buffer>(6, obcChain->getDeviceBuffer());
} }
if (nb.getUseCutoff()) { if (nb.getUseCutoff()) {
computeBornSumKernel.setArg<mm_float4>(8, cl.getPeriodicBoxSize()); computeBornSumKernel.setArg<mm_float4>(7, cl.getPeriodicBoxSize());
computeBornSumKernel.setArg<mm_float4>(9, cl.getInvPeriodicBoxSize()); computeBornSumKernel.setArg<mm_float4>(8, cl.getInvPeriodicBoxSize());
force1Kernel.setArg<mm_float4>(10, cl.getPeriodicBoxSize()); force1Kernel.setArg<mm_float4>(9, cl.getPeriodicBoxSize());
force1Kernel.setArg<mm_float4>(11, cl.getInvPeriodicBoxSize()); force1Kernel.setArg<mm_float4>(10, cl.getInvPeriodicBoxSize());
if (maxTiles < nb.getInteractingTiles().getSize()) { if (maxTiles < nb.getInteractingTiles().getSize()) {
maxTiles = nb.getInteractingTiles().getSize(); maxTiles = nb.getInteractingTiles().getSize();
computeBornSumKernel.setArg<cl_uint>(10, maxTiles); computeBornSumKernel.setArg<cl_uint>(10, maxTiles);
force1Kernel.setArg<cl_uint>(12, maxTiles); force1Kernel.setArg<cl_uint>(11, maxTiles);
} }
} }
cl.executeKernel(computeBornSumKernel, nb.getNumForceThreadBlocks()*nb.getForceThreadBlockSize(), nb.getForceThreadBlockSize()); cl.executeKernel(computeBornSumKernel, nb.getNumForceThreadBlocks()*nb.getForceThreadBlockSize(), nb.getForceThreadBlockSize());
...@@ -2148,7 +2148,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo ...@@ -2148,7 +2148,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
} }
map<string, string> replacements; map<string, string> replacements;
replacements["COMPUTE_INTERACTION"] = n2EnergySource.str(); replacements["COMPUTE_INTERACTION"] = n2EnergySource.str();
stringstream extraArgs, loadLocal1, loadLocal2, clearLocal, load1, load2, recordDeriv, storeDerivs1, storeDerivs2, declareTemps, setTemps; stringstream extraArgs, loadLocal1, loadLocal2, clearLocal, load1, load2, declare1, recordDeriv, storeDerivs1, storeDerivs2, declareTemps, setTemps;
if (force.getNumGlobalParameters() > 0) if (force.getNumGlobalParameters() > 0)
extraArgs << ", __constant float* globals"; extraArgs << ", __constant float* globals";
for (int i = 0; i < (int) params->getBuffers().size(); i++) { for (int i = 0; i < (int) params->getBuffers().size(); i++) {
...@@ -2174,7 +2174,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo ...@@ -2174,7 +2174,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
string index = intToString(i+1); string index = intToString(i+1);
extraArgs << ", __global " << buffer.getType() << "* derivBuffers" << index << ", __local " << buffer.getType() << "* local_deriv" << index; extraArgs << ", __global " << buffer.getType() << "* derivBuffers" << index << ", __local " << buffer.getType() << "* local_deriv" << index;
clearLocal << "local_deriv" << index << "[localAtomIndex] = 0.0f;\n"; clearLocal << "local_deriv" << index << "[localAtomIndex] = 0.0f;\n";
load1 << buffer.getType() << " deriv" << index << "_1 = 0.0f;\n"; declare1 << buffer.getType() << " deriv" << index << "_1 = 0.0f;\n";
load2 << buffer.getType() << " deriv" << index << "_2 = 0.0f;\n"; load2 << buffer.getType() << " deriv" << index << "_2 = 0.0f;\n";
recordDeriv << "local_deriv" << index << "[atom2] += deriv" << index << "_2;\n"; recordDeriv << "local_deriv" << index << "[atom2] += deriv" << index << "_2;\n";
storeDerivs1 << "STORE_DERIVATIVE_1(" << index << ")"; storeDerivs1 << "STORE_DERIVATIVE_1(" << index << ")";
...@@ -2188,6 +2188,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo ...@@ -2188,6 +2188,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
replacements["CLEAR_LOCAL_DERIVATIVES"] = clearLocal.str(); replacements["CLEAR_LOCAL_DERIVATIVES"] = clearLocal.str();
replacements["LOAD_ATOM1_PARAMETERS"] = load1.str(); replacements["LOAD_ATOM1_PARAMETERS"] = load1.str();
replacements["LOAD_ATOM2_PARAMETERS"] = load2.str(); replacements["LOAD_ATOM2_PARAMETERS"] = load2.str();
replacements["DECLARE_ATOM1_DERIVATIVES"] = declare1.str();
replacements["RECORD_DERIVATIVE_2"] = recordDeriv.str(); replacements["RECORD_DERIVATIVE_2"] = recordDeriv.str();
replacements["STORE_DERIVATIVES_1"] = storeDerivs1.str(); replacements["STORE_DERIVATIVES_1"] = storeDerivs1.str();
replacements["STORE_DERIVATIVES_2"] = storeDerivs2.str(); replacements["STORE_DERIVATIVES_2"] = storeDerivs2.str();
...@@ -2482,7 +2483,6 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include ...@@ -2482,7 +2483,6 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
pairValueKernel.setArg<cl::Buffer>(index++, valueBuffers->getDeviceBuffer()); pairValueKernel.setArg<cl::Buffer>(index++, valueBuffers->getDeviceBuffer());
pairValueKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*sizeof(cl_float), NULL); pairValueKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*sizeof(cl_float), NULL);
pairValueKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*sizeof(cl_float), NULL); pairValueKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*sizeof(cl_float), NULL);
pairValueKernel.setArg<cl::Buffer>(index++, nb.getForceBufferFlags().getDeviceBuffer());
if (nb.getUseCutoff()) { if (nb.getUseCutoff()) {
pairValueKernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer()); pairValueKernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer());
pairValueKernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer()); pairValueKernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
...@@ -2531,7 +2531,6 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include ...@@ -2531,7 +2531,6 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionIndices().getDeviceBuffer()); pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionIndices().getDeviceBuffer());
pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionRowIndices().getDeviceBuffer()); pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionRowIndices().getDeviceBuffer());
pairEnergyKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*sizeof(cl_float4), NULL); pairEnergyKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*sizeof(cl_float4), NULL);
pairEnergyKernel.setArg<cl::Buffer>(index++, nb.getForceBufferFlags().getDeviceBuffer());
if (nb.getUseCutoff()) { if (nb.getUseCutoff()) {
pairEnergyKernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer()); pairEnergyKernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer());
pairEnergyKernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer()); pairEnergyKernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
...@@ -2609,14 +2608,14 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include ...@@ -2609,14 +2608,14 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
globals->upload(globalParamValues); globals->upload(globalParamValues);
} }
if (nb.getUseCutoff()) { if (nb.getUseCutoff()) {
pairValueKernel.setArg<mm_float4>(11, cl.getPeriodicBoxSize()); pairValueKernel.setArg<mm_float4>(10, cl.getPeriodicBoxSize());
pairValueKernel.setArg<mm_float4>(12, cl.getInvPeriodicBoxSize()); pairValueKernel.setArg<mm_float4>(11, cl.getInvPeriodicBoxSize());
pairEnergyKernel.setArg<mm_float4>(12, cl.getPeriodicBoxSize()); pairEnergyKernel.setArg<mm_float4>(11, cl.getPeriodicBoxSize());
pairEnergyKernel.setArg<mm_float4>(13, cl.getInvPeriodicBoxSize()); pairEnergyKernel.setArg<mm_float4>(12, cl.getInvPeriodicBoxSize());
if (maxTiles < nb.getInteractingTiles().getSize()) { if (maxTiles < nb.getInteractingTiles().getSize()) {
maxTiles = nb.getInteractingTiles().getSize(); maxTiles = nb.getInteractingTiles().getSize();
pairValueKernel.setArg<cl_uint>(13, maxTiles); pairValueKernel.setArg<cl_uint>(12, maxTiles);
pairEnergyKernel.setArg<cl_uint>(14, maxTiles); pairEnergyKernel.setArg<cl_uint>(13, maxTiles);
} }
} }
cl.executeKernel(pairValueKernel, nb.getNumForceThreadBlocks()*nb.getForceThreadBlockSize(), nb.getForceThreadBlockSize()); cl.executeKernel(pairValueKernel, nb.getNumForceThreadBlocks()*nb.getForceThreadBlockSize(), nb.getForceThreadBlockSize());
......
...@@ -37,7 +37,7 @@ using namespace std; ...@@ -37,7 +37,7 @@ using namespace std;
OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : context(context), cutoff(-1.0), useCutoff(false), OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : context(context), cutoff(-1.0), useCutoff(false),
numForceBuffers(0), exclusionIndices(NULL), exclusionRowIndices(NULL), exclusions(NULL), interactingTiles(NULL), interactionFlags(NULL), numForceBuffers(0), exclusionIndices(NULL), exclusionRowIndices(NULL), exclusions(NULL), interactingTiles(NULL), interactionFlags(NULL),
interactionCount(NULL), blockCenter(NULL), blockBoundingBox(NULL), forceBufferFlags(NULL) { interactionCount(NULL), blockCenter(NULL), blockBoundingBox(NULL) {
// Decide how many thread blocks and force buffers to use. // Decide how many thread blocks and force buffers to use.
deviceIsCpu = (context.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU); deviceIsCpu = (context.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU);
...@@ -48,8 +48,8 @@ OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : con ...@@ -48,8 +48,8 @@ OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : con
numForceBuffers = numForceThreadBlocks; numForceBuffers = numForceThreadBlocks;
} }
else if (context.getSIMDWidth() == 32) { else if (context.getSIMDWidth() == 32) {
numForceThreadBlocks = 2*context.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>(); numForceThreadBlocks = 4*context.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
forceThreadBlockSize = 256; forceThreadBlockSize = 128;
numForceBuffers = numForceThreadBlocks; numForceBuffers = numForceThreadBlocks;
} }
else { else {
...@@ -82,8 +82,6 @@ OpenCLNonbondedUtilities::~OpenCLNonbondedUtilities() { ...@@ -82,8 +82,6 @@ OpenCLNonbondedUtilities::~OpenCLNonbondedUtilities() {
delete blockCenter; delete blockCenter;
if (blockBoundingBox != NULL) if (blockBoundingBox != NULL)
delete blockBoundingBox; delete blockBoundingBox;
if (forceBufferFlags != NULL)
delete forceBufferFlags;
} }
void OpenCLNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector<vector<int> >& exclusionList, const string& kernel) { void OpenCLNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector<vector<int> >& exclusionList, const string& kernel) {
...@@ -239,12 +237,6 @@ void OpenCLNonbondedUtilities::initialize(const System& system) { ...@@ -239,12 +237,6 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
interactionCount->upload(); interactionCount->upload();
} }
// Create the flags for reserving force buffers.
forceBufferFlags = new OpenCLArray<cl_uint>(context, numAtomBlocks*numForceThreadBlocks, "forceBufferFlags", false);
vector<cl_uint> forceBufferFlagsVec(forceBufferFlags->getSize(), 0);
forceBufferFlags->upload(forceBufferFlagsVec);
// Create kernels. // Create kernels.
forceKernel = createInteractionKernel(kernelSource, parameters, arguments, true, true); forceKernel = createInteractionKernel(kernelSource, parameters, arguments, true, true);
...@@ -320,8 +312,8 @@ void OpenCLNonbondedUtilities::prepareInteractions() { ...@@ -320,8 +312,8 @@ void OpenCLNonbondedUtilities::prepareInteractions() {
void OpenCLNonbondedUtilities::computeInteractions() { void OpenCLNonbondedUtilities::computeInteractions() {
if (cutoff != -1.0) { if (cutoff != -1.0) {
if (useCutoff) { if (useCutoff) {
forceKernel.setArg<mm_float4>(13, context.getPeriodicBoxSize()); forceKernel.setArg<mm_float4>(12, context.getPeriodicBoxSize());
forceKernel.setArg<mm_float4>(14, context.getInvPeriodicBoxSize()); forceKernel.setArg<mm_float4>(13, context.getInvPeriodicBoxSize());
} }
context.executeKernel(forceKernel, numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize); context.executeKernel(forceKernel, numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
} }
...@@ -343,14 +335,14 @@ void OpenCLNonbondedUtilities::updateNeighborListSize() { ...@@ -343,14 +335,14 @@ void OpenCLNonbondedUtilities::updateNeighborListSize() {
newSize = numTiles; newSize = numTiles;
delete interactingTiles; delete interactingTiles;
interactingTiles = new OpenCLArray<mm_ushort2>(context, newSize, "interactingTiles"); interactingTiles = new OpenCLArray<mm_ushort2>(context, newSize, "interactingTiles");
forceKernel.setArg<cl::Buffer>(11, interactingTiles->getDeviceBuffer()); forceKernel.setArg<cl::Buffer>(10, interactingTiles->getDeviceBuffer());
forceKernel.setArg<cl_uint>(15, newSize); forceKernel.setArg<cl_uint>(14, newSize);
findInteractingBlocksKernel.setArg<cl::Buffer>(6, interactingTiles->getDeviceBuffer()); findInteractingBlocksKernel.setArg<cl::Buffer>(6, interactingTiles->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl_uint>(9, newSize); findInteractingBlocksKernel.setArg<cl_uint>(9, newSize);
if (context.getSIMDWidth() == 32 || deviceIsCpu) { if (context.getSIMDWidth() == 32 || deviceIsCpu) {
delete interactionFlags; delete interactionFlags;
interactionFlags = new OpenCLArray<cl_uint>(context, deviceIsCpu ? 2*newSize : newSize, "interactionFlags"); interactionFlags = new OpenCLArray<cl_uint>(context, deviceIsCpu ? 2*newSize : newSize, "interactionFlags");
forceKernel.setArg<cl::Buffer>(16, interactionFlags->getDeviceBuffer()); forceKernel.setArg<cl::Buffer>(15, interactionFlags->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(7, interactionFlags->getDeviceBuffer()); findInteractingBlocksKernel.setArg<cl::Buffer>(7, interactionFlags->getDeviceBuffer());
findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(4, interactingTiles->getDeviceBuffer()); findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(4, interactingTiles->getDeviceBuffer());
findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(7, interactionFlags->getDeviceBuffer()); findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(7, interactionFlags->getDeviceBuffer());
...@@ -503,7 +495,6 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc ...@@ -503,7 +495,6 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
kernel.setArg(index++, 4*forceThreadBlockSize*sizeof(cl_float), NULL); kernel.setArg(index++, 4*forceThreadBlockSize*sizeof(cl_float), NULL);
kernel.setArg<cl_uint>(index++, startTileIndex); kernel.setArg<cl_uint>(index++, startTileIndex);
kernel.setArg<cl_uint>(index++, startTileIndex+numTiles); kernel.setArg<cl_uint>(index++, startTileIndex+numTiles);
kernel.setArg<cl::Buffer>(index++, forceBufferFlags->getDeviceBuffer());
if (useCutoff) { if (useCutoff) {
kernel.setArg<cl::Buffer>(index++, interactingTiles->getDeviceBuffer()); kernel.setArg<cl::Buffer>(index++, interactingTiles->getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, interactionCount->getDeviceBuffer()); kernel.setArg<cl::Buffer>(index++, interactionCount->getDeviceBuffer());
......
...@@ -196,12 +196,6 @@ public: ...@@ -196,12 +196,6 @@ public:
OpenCLArray<cl_uint>& getExclusionRowIndices() { OpenCLArray<cl_uint>& getExclusionRowIndices() {
return *exclusionRowIndices; return *exclusionRowIndices;
} }
/**
* Get the array which contains flags for reserving force buffers.
*/
OpenCLArray<cl_uint>& getForceBufferFlags() {
return *forceBufferFlags;
}
/** /**
* Get the index of the first tile this context is responsible for processing. * Get the index of the first tile this context is responsible for processing.
*/ */
...@@ -245,7 +239,6 @@ private: ...@@ -245,7 +239,6 @@ private:
OpenCLArray<cl_uint>* interactionCount; OpenCLArray<cl_uint>* interactionCount;
OpenCLArray<mm_float4>* blockCenter; OpenCLArray<mm_float4>* blockCenter;
OpenCLArray<mm_float4>* blockBoundingBox; OpenCLArray<mm_float4>* blockBoundingBox;
OpenCLArray<cl_uint>* forceBufferFlags;
std::vector<std::vector<int> > atomExclusions; std::vector<std::vector<int> > atomExclusions;
std::vector<ParameterInfo> parameters; std::vector<ParameterInfo> parameters;
std::vector<ParameterInfo> arguments; std::vector<ParameterInfo> arguments;
......
...@@ -8,7 +8,7 @@ ...@@ -8,7 +8,7 @@
__kernel void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer, __local float4* local_force, __kernel void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer, __local float4* local_force,
__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions, __global unsigned int* exclusionIndices, __global float4* posq, __local float4* local_posq, __global unsigned int* exclusions, __global unsigned int* exclusionIndices,
__global unsigned int* exclusionRowIndices, __local float4* tempBuffer, __global unsigned int* forceBufferFlags, __global unsigned int* exclusionRowIndices, __local float4* tempBuffer,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags
#else #else
......
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1))) __kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer, __local float4* local_force, void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer, __local float4* local_force,
__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions, __global unsigned int* exclusionIndices, __global float4* posq, __local float4* local_posq, __global unsigned int* exclusions, __global unsigned int* exclusionIndices,
__global unsigned int* exclusionRowIndices, __local float4* tempForceBuffer, __global unsigned int* forceBufferFlags, __global unsigned int* exclusionRowIndices, __local float4* tempForceBuffer,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles
#else #else
......
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#define TILE_SIZE 32 #define TILE_SIZE 32
#define STORE_DERIVATIVE_1(INDEX) derivBuffers##INDEX[offset1] += deriv##INDEX##_1; #define STORE_DERIVATIVE_1(INDEX) derivBuffers##INDEX[offset] += deriv##INDEX##_1;
#define STORE_DERIVATIVE_2(INDEX) derivBuffers##INDEX[offset2] += local_deriv##INDEX[get_local_id(0)]; #define STORE_DERIVATIVE_2(INDEX) derivBuffers##INDEX[offset] += local_deriv##INDEX[get_local_id(0)];
/**
* Mark that a block in the force buffer is in use.
*/
void reserveBuffer(unsigned int block, __global unsigned int* forceBufferFlags) {
if ((get_local_id(0)&(TILE_SIZE-1)) == 0)
while (atom_cmpxchg(&forceBufferFlags[block+NUM_BLOCKS*get_group_id(0)], 0, 1) != 0)
;
mem_fence(CLK_GLOBAL_MEM_FENCE);
}
/**
* Mark that a block in the force buffer is no longer in use.
*/
void releaseBuffer(unsigned int block, __global unsigned int* forceBufferFlags) {
mem_fence(CLK_GLOBAL_MEM_FENCE);
if ((get_local_id(0)&(TILE_SIZE-1)) == 0)
forceBufferFlags[block+NUM_BLOCKS*get_group_id(0)] = 0;
}
/** /**
* Compute a force based on pair interactions. * Compute a force based on pair interactions.
*/ */
__kernel void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer, __local float4* local_force, __kernel void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer, __local float4* local_force,
__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions, __global unsigned int* exclusionIndices, __global float4* posq, __local float4* local_posq, __global unsigned int* exclusions, __global unsigned int* exclusionIndices,
__global unsigned int* exclusionRowIndices, __local float4* tempBuffer, __global unsigned int* forceBufferFlags, __global unsigned int* exclusionRowIndices, __local float4* tempBuffer,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags
#else #else
...@@ -48,132 +29,67 @@ __kernel void computeN2Energy(__global float4* forceBuffers, __global float* ene ...@@ -48,132 +29,67 @@ __kernel void computeN2Energy(__global float4* forceBuffers, __global float* ene
unsigned int lasty = 0xFFFFFFFF; unsigned int lasty = 0xFFFFFFFF;
__local unsigned int exclusionRange[2*WARPS_PER_GROUP]; __local unsigned int exclusionRange[2*WARPS_PER_GROUP];
__local int exclusionIndex[WARPS_PER_GROUP]; __local int exclusionIndex[WARPS_PER_GROUP];
__local int2* reservedBlocks = (__local int2*) exclusionRange;
while (pos < end) {
do {
// Extract the coordinates of this tile // Extract the coordinates of this tile
unsigned int x, y;
#ifdef USE_CUTOFF
if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos];
x = tileIndices.x;
y = tileIndices.y;
}
else
#endif
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
const unsigned int tbx = get_local_id(0) - tgx; const unsigned int tbx = get_local_id(0) - tgx;
const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE; const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
unsigned int atom1 = x*TILE_SIZE + tgx; unsigned int x, y;
float4 force = 0.0f; float4 force = 0.0f;
float4 posq1 = posq[atom1]; DECLARE_ATOM1_DERIVATIVES
LOAD_ATOM1_PARAMETERS if (pos < end) {
// Locate the exclusion data for this tile.
#ifdef USE_EXCLUSIONS
if (tgx < 2)
exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
if (tgx == 0)
exclusionIndex[localGroupIndex] = -1;
for (int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
if (exclusionIndices[i] == y)
exclusionIndex[localGroupIndex] = i*TILE_SIZE;
bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
#else
bool hasExclusions = false;
#endif
if (x == y) {
// This tile is on the diagonal.
const unsigned int localAtomIndex = get_local_id(0);
local_posq[localAtomIndex] = posq1;
LOAD_LOCAL_PARAMETERS_FROM_1
#ifdef USE_EXCLUSIONS
unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx];
#endif
for (unsigned int j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
#endif
int atom2 = tbx+j;
float4 posq2 = local_posq[atom2];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) { if (numTiles <= maxTiles) {
#endif ushort2 tileIndices = tiles[pos];
float r = SQRT(r2); x = tileIndices.x;
LOAD_ATOM2_PARAMETERS y = tileIndices.y;
atom2 = y*TILE_SIZE+j; }
float dEdR = 0.0f; else
float tempEnergy = 0.0f;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
COMPUTE_INTERACTION
dEdR /= -r;
}
energy += 0.5f*tempEnergy;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
#ifdef USE_CUTOFF
}
#endif
#ifdef USE_EXCLUSIONS
excl >>= 1;
#endif #endif
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
} }
unsigned int atom1 = x*TILE_SIZE + tgx;
float4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
// Write results // Locate the exclusion data for this tile.
reserveBuffer(x, forceBufferFlags);
unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset1].xyz += force.xyz;
STORE_DERIVATIVES_1
releaseBuffer(x, forceBufferFlags);
}
else {
// This is an off-diagonal tile.
const unsigned int localAtomIndex = get_local_id(0); #ifdef USE_EXCLUSIONS
if (lasty != y) { if (tgx < 2)
unsigned int j = y*TILE_SIZE + tgx; exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
local_posq[localAtomIndex] = posq[j]; if (tgx == 0)
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL exclusionIndex[localGroupIndex] = -1;
} for (int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
local_force[localAtomIndex] = 0.0f; if (exclusionIndices[i] == y)
CLEAR_LOCAL_DERIVATIVES exclusionIndex[localGroupIndex] = i*TILE_SIZE;
#ifdef USE_CUTOFF bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF); #else
if (!hasExclusions && flags == 0) { bool hasExclusions = false;
// No interactions in this tile.
}
else
#endif #endif
{ if (pos >= end)
// Compute the full set of interactions in this tile. ; // This warp is done.
else if (x == y) {
// This tile is on the diagonal.
const unsigned int localAtomIndex = get_local_id(0);
local_posq[localAtomIndex] = posq1;
LOAD_LOCAL_PARAMETERS_FROM_1
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF); unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx];
excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
#endif #endif
unsigned int tj = tgx;
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1); bool isExcluded = !(excl & 0x1);
#endif #endif
int atom2 = tbx+tj; int atom2 = tbx+j;
float4 posq2 = local_posq[atom2]; float4 posq2 = local_posq[atom2];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f); float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
...@@ -187,44 +103,149 @@ __kernel void computeN2Energy(__global float4* forceBuffers, __global float* ene ...@@ -187,44 +103,149 @@ __kernel void computeN2Energy(__global float4* forceBuffers, __global float* ene
#endif #endif
float r = SQRT(r2); float r = SQRT(r2);
LOAD_ATOM2_PARAMETERS LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+tj; atom2 = y*TILE_SIZE+j;
float dEdR = 0.0f; float dEdR = 0.0f;
float tempEnergy = 0.0f; float tempEnergy = 0.0f;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) { if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
COMPUTE_INTERACTION COMPUTE_INTERACTION
dEdR /= -r; dEdR /= -r;
} }
energy += tempEnergy; energy += 0.5f*tempEnergy;
delta.xyz *= dEdR; delta.xyz *= dEdR;
force.xyz -= delta.xyz; force.xyz -= delta.xyz;
atom2 = tbx+tj;
local_force[atom2].xyz += delta.xyz;
RECORD_DERIVATIVE_2
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
} }
#endif #endif
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
excl >>= 1; excl >>= 1;
#endif #endif
tj = (tj + 1) & (TILE_SIZE - 1);
} }
} }
else {
// This is an off-diagonal tile.
// Write results const unsigned int localAtomIndex = get_local_id(0);
if (lasty != y) {
unsigned int j = y*TILE_SIZE + tgx;
local_posq[localAtomIndex] = posq[j];
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
local_force[localAtomIndex] = 0.0f;
CLEAR_LOCAL_DERIVATIVES
#ifdef USE_CUTOFF
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
if (!hasExclusions && flags == 0) {
// No interactions in this tile.
}
else
#endif
{
// Compute the full set of interactions in this tile.
reserveBuffer(x, forceBufferFlags); #ifdef USE_EXCLUSIONS
unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF);
forceBuffers[offset1].xyz += force.xyz; excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
STORE_DERIVATIVES_1 #endif
releaseBuffer(x, forceBufferFlags); unsigned int tj = tgx;
reserveBuffer(y, forceBufferFlags); for (unsigned int j = 0; j < TILE_SIZE; j++) {
unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS; #ifdef USE_EXCLUSIONS
forceBuffers[offset2].xyz += local_force[get_local_id(0)].xyz; bool isExcluded = !(excl & 0x1);
STORE_DERIVATIVES_2 #endif
releaseBuffer(y, forceBufferFlags); int atom2 = tbx+tj;
float4 posq2 = local_posq[atom2];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
float r = SQRT(r2);
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+tj;
float dEdR = 0.0f;
float tempEnergy = 0.0f;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
COMPUTE_INTERACTION
dEdR /= -r;
}
energy += tempEnergy;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
atom2 = tbx+tj;
local_force[atom2].xyz += delta.xyz;
RECORD_DERIVATIVE_2
#ifdef USE_CUTOFF
}
#endif
#ifdef USE_EXCLUSIONS
excl >>= 1;
#endif
tj = (tj + 1) & (TILE_SIZE - 1);
}
}
}
} }
lasty = y; lasty = y;
// Write results. We need to coordinate between warps to make sure no two of them
// ever try to write to the same piece of memory at the same time.
int writeX = (pos < end ? x : -1);
int writeY = (pos < end && x != y ? y : -1);
if (tgx == 0)
reservedBlocks[localGroupIndex] = (int2)(writeX, writeY);
bool done = false;
int doneIndex = 0;
int checkIndex = 0;
while (true) {
// See if any warp still needs to write its data.
bool allDone = true;
barrier(CLK_LOCAL_MEM_FENCE);
while (doneIndex < WARPS_PER_GROUP && allDone) {
if (reservedBlocks[doneIndex].x != -1)
allDone = false;
else
doneIndex++;
}
if (allDone)
break;
if (!done) {
// See whether this warp can write its data. This requires that no previous warp
// is trying to write to the same block of the buffer.
bool canWrite = (writeX != -1);
while (checkIndex < localGroupIndex && canWrite) {
if ((reservedBlocks[checkIndex].x == x || reservedBlocks[checkIndex].y == x) ||
(writeY != -1 && (reservedBlocks[checkIndex].x == y || reservedBlocks[checkIndex].y == y)))
canWrite = false;
else
checkIndex++;
}
if (canWrite) {
// Write the data to global memory, then mark this warp as done.
if (writeX > -1) {
const unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz += force.xyz;
STORE_DERIVATIVES_1
}
if (writeY > -1) {
const unsigned int offset = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz += local_force[get_local_id(0)].xyz;
STORE_DERIVATIVES_2
}
done = true;
if (tgx == 0)
reservedBlocks[localGroupIndex] = (int2)(-1, -1);
}
}
}
pos++; pos++;
} } while (pos < end);
energyBuffer[get_global_id(0)] += energy; energyBuffer[get_global_id(0)] += energy;
} }
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
__kernel void computeN2Value(__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions, __kernel void computeN2Value(__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions,
__global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __global float* global_value, __local float* local_value, __global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __global float* global_value, __local float* local_value,
__local float* tempBuffer, __global unsigned int* forceBufferFlags, __local float* tempBuffer,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags
#else #else
......
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1))) __kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeN2Value(__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions, void computeN2Value(__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions,
__global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __global float* global_value, __local float* local_value, __global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __global float* global_value, __local float* local_value,
__local float* tempBuffer, __global unsigned int* forceBufferFlags, __local float* tempBuffer,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles
#else #else
......
...@@ -15,7 +15,7 @@ typedef struct { ...@@ -15,7 +15,7 @@ typedef struct {
*/ */
__kernel void computeBornSum(__global float* global_bornSum, __global float4* posq, __global float2* global_params, __kernel void computeBornSum(__global float* global_bornSum, __global float4* posq, __global float2* global_params,
__local AtomData* localData, __local float* tempBuffer, __global unsigned int* forceBufferFlags, __local AtomData* localData, __local float* tempBuffer,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags) { __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags) {
#else #else
...@@ -192,7 +192,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __global float4* po ...@@ -192,7 +192,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __global float4* po
__kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuffer, __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuffer,
__global float4* posq, __global float* global_bornRadii, __global float* global_bornForce, __global float4* posq, __global float* global_bornRadii, __global float* global_bornForce,
__local AtomData* localData, __local float4* tempBuffer, __global unsigned int* forceBufferFlags, __local AtomData* localData, __local float4* tempBuffer,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags) { __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags) {
#else #else
......
...@@ -16,7 +16,7 @@ typedef struct { ...@@ -16,7 +16,7 @@ typedef struct {
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1))) __kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeBornSum(__global float* global_bornSum, __global float4* posq, __global float2* global_params, void computeBornSum(__global float* global_bornSum, __global float4* posq, __global float2* global_params,
__local AtomData* localData, __local float* tempBuffer, __global unsigned int* forceBufferFlags, __local AtomData* localData, __local float* tempBuffer,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles) { __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles) {
#else #else
...@@ -203,7 +203,7 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo ...@@ -203,7 +203,7 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1))) __kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuffer, void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuffer,
__global float4* posq, __global float* global_bornRadii, __global float* global_bornForce, __global float4* posq, __global float* global_bornRadii, __global float* global_bornForce,
__local AtomData* localData, __local float4* tempBuffer, __global unsigned int* forceBufferFlags, __local AtomData* localData, __local float4* tempBuffer,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles) { __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles) {
#else #else
......
...@@ -13,7 +13,7 @@ typedef struct { ...@@ -13,7 +13,7 @@ typedef struct {
__kernel void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions, __kernel void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions,
__global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __local AtomData* localData, __local float4* tempBuffer, __global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __local AtomData* localData, __local float4* tempBuffer,
unsigned int startTileIndex, unsigned int endTileIndex, __global unsigned int* forceBufferFlags, unsigned int startTileIndex, unsigned int endTileIndex,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags
#else #else
......
...@@ -14,7 +14,7 @@ typedef struct { ...@@ -14,7 +14,7 @@ typedef struct {
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1))) __kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions, void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions,
__global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __local AtomData* localData, __local float4* tempBuffer, __global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __local AtomData* localData, __local float4* tempBuffer,
unsigned int startTileIndex, unsigned int endTileIndex, __global unsigned int* forceBufferFlags, unsigned int startTileIndex, unsigned int endTileIndex,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags
#else #else
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment