Commit 7943a339 authored by Peter Eastman's avatar Peter Eastman
Browse files

Restructured the use of force buffers in a new way that hopefully really works everywhere.

parent 13ef0ee8
...@@ -1738,6 +1738,8 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF ...@@ -1738,6 +1738,8 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
defines["NUM_ATOMS"] = intToString(cl.getNumAtoms()); defines["NUM_ATOMS"] = intToString(cl.getNumAtoms());
defines["PADDED_NUM_ATOMS"] = intToString(cl.getPaddedNumAtoms()); defines["PADDED_NUM_ATOMS"] = intToString(cl.getPaddedNumAtoms());
defines["NUM_BLOCKS"] = OpenCLExpressionUtilities::intToString(cl.getNumAtomBlocks()); defines["NUM_BLOCKS"] = OpenCLExpressionUtilities::intToString(cl.getNumAtomBlocks());
if (cl.getSIMDWidth() == 32)
defines["WARPS_PER_GROUP"] = OpenCLExpressionUtilities::intToString(cl.getNonbondedUtilities().getForceThreadBlockSize()/OpenCLContext::TileSize);
string file; string file;
if (deviceIsCpu) if (deviceIsCpu)
file = OpenCLKernelSources::gbsaObc_cpu; file = OpenCLKernelSources::gbsaObc_cpu;
...@@ -1753,7 +1755,6 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF ...@@ -1753,7 +1755,6 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
computeBornSumKernel.setArg<cl::Buffer>(index++, params->getDeviceBuffer()); computeBornSumKernel.setArg<cl::Buffer>(index++, params->getDeviceBuffer());
computeBornSumKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*13*sizeof(cl_float), NULL); computeBornSumKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*13*sizeof(cl_float), NULL);
computeBornSumKernel.setArg(index++, (deviceIsCpu ? 1 : nb.getForceThreadBlockSize())*sizeof(cl_float), NULL); computeBornSumKernel.setArg(index++, (deviceIsCpu ? 1 : nb.getForceThreadBlockSize())*sizeof(cl_float), NULL);
computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getForceBufferFlags().getDeviceBuffer());
if (nb.getUseCutoff()) { if (nb.getUseCutoff()) {
computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer()); computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer());
computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer()); computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
...@@ -1773,7 +1774,6 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF ...@@ -1773,7 +1774,6 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
force1Kernel.setArg<cl::Buffer>(index++, bornForce->getDeviceBuffer()); force1Kernel.setArg<cl::Buffer>(index++, bornForce->getDeviceBuffer());
force1Kernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*13*sizeof(cl_float), NULL); force1Kernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*13*sizeof(cl_float), NULL);
force1Kernel.setArg(index++, (deviceIsCpu ? 1 : nb.getForceThreadBlockSize())*sizeof(mm_float4), NULL); force1Kernel.setArg(index++, (deviceIsCpu ? 1 : nb.getForceThreadBlockSize())*sizeof(mm_float4), NULL);
force1Kernel.setArg<cl::Buffer>(index++, nb.getForceBufferFlags().getDeviceBuffer());
if (nb.getUseCutoff()) { if (nb.getUseCutoff()) {
force1Kernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer()); force1Kernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer());
force1Kernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer()); force1Kernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
...@@ -1805,14 +1805,14 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF ...@@ -1805,14 +1805,14 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
reduceBornForceKernel.setArg<cl::Buffer>(6, obcChain->getDeviceBuffer()); reduceBornForceKernel.setArg<cl::Buffer>(6, obcChain->getDeviceBuffer());
} }
if (nb.getUseCutoff()) { if (nb.getUseCutoff()) {
computeBornSumKernel.setArg<mm_float4>(8, cl.getPeriodicBoxSize()); computeBornSumKernel.setArg<mm_float4>(7, cl.getPeriodicBoxSize());
computeBornSumKernel.setArg<mm_float4>(9, cl.getInvPeriodicBoxSize()); computeBornSumKernel.setArg<mm_float4>(8, cl.getInvPeriodicBoxSize());
force1Kernel.setArg<mm_float4>(10, cl.getPeriodicBoxSize()); force1Kernel.setArg<mm_float4>(9, cl.getPeriodicBoxSize());
force1Kernel.setArg<mm_float4>(11, cl.getInvPeriodicBoxSize()); force1Kernel.setArg<mm_float4>(10, cl.getInvPeriodicBoxSize());
if (maxTiles < nb.getInteractingTiles().getSize()) { if (maxTiles < nb.getInteractingTiles().getSize()) {
maxTiles = nb.getInteractingTiles().getSize(); maxTiles = nb.getInteractingTiles().getSize();
computeBornSumKernel.setArg<cl_uint>(10, maxTiles); computeBornSumKernel.setArg<cl_uint>(10, maxTiles);
force1Kernel.setArg<cl_uint>(12, maxTiles); force1Kernel.setArg<cl_uint>(11, maxTiles);
} }
} }
cl.executeKernel(computeBornSumKernel, nb.getNumForceThreadBlocks()*nb.getForceThreadBlockSize(), nb.getForceThreadBlockSize()); cl.executeKernel(computeBornSumKernel, nb.getNumForceThreadBlocks()*nb.getForceThreadBlockSize(), nb.getForceThreadBlockSize());
...@@ -2148,7 +2148,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo ...@@ -2148,7 +2148,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
} }
map<string, string> replacements; map<string, string> replacements;
replacements["COMPUTE_INTERACTION"] = n2EnergySource.str(); replacements["COMPUTE_INTERACTION"] = n2EnergySource.str();
stringstream extraArgs, loadLocal1, loadLocal2, clearLocal, load1, load2, recordDeriv, storeDerivs1, storeDerivs2, declareTemps, setTemps; stringstream extraArgs, loadLocal1, loadLocal2, clearLocal, load1, load2, declare1, recordDeriv, storeDerivs1, storeDerivs2, declareTemps, setTemps;
if (force.getNumGlobalParameters() > 0) if (force.getNumGlobalParameters() > 0)
extraArgs << ", __constant float* globals"; extraArgs << ", __constant float* globals";
for (int i = 0; i < (int) params->getBuffers().size(); i++) { for (int i = 0; i < (int) params->getBuffers().size(); i++) {
...@@ -2174,7 +2174,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo ...@@ -2174,7 +2174,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
string index = intToString(i+1); string index = intToString(i+1);
extraArgs << ", __global " << buffer.getType() << "* derivBuffers" << index << ", __local " << buffer.getType() << "* local_deriv" << index; extraArgs << ", __global " << buffer.getType() << "* derivBuffers" << index << ", __local " << buffer.getType() << "* local_deriv" << index;
clearLocal << "local_deriv" << index << "[localAtomIndex] = 0.0f;\n"; clearLocal << "local_deriv" << index << "[localAtomIndex] = 0.0f;\n";
load1 << buffer.getType() << " deriv" << index << "_1 = 0.0f;\n"; declare1 << buffer.getType() << " deriv" << index << "_1 = 0.0f;\n";
load2 << buffer.getType() << " deriv" << index << "_2 = 0.0f;\n"; load2 << buffer.getType() << " deriv" << index << "_2 = 0.0f;\n";
recordDeriv << "local_deriv" << index << "[atom2] += deriv" << index << "_2;\n"; recordDeriv << "local_deriv" << index << "[atom2] += deriv" << index << "_2;\n";
storeDerivs1 << "STORE_DERIVATIVE_1(" << index << ")"; storeDerivs1 << "STORE_DERIVATIVE_1(" << index << ")";
...@@ -2188,6 +2188,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo ...@@ -2188,6 +2188,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
replacements["CLEAR_LOCAL_DERIVATIVES"] = clearLocal.str(); replacements["CLEAR_LOCAL_DERIVATIVES"] = clearLocal.str();
replacements["LOAD_ATOM1_PARAMETERS"] = load1.str(); replacements["LOAD_ATOM1_PARAMETERS"] = load1.str();
replacements["LOAD_ATOM2_PARAMETERS"] = load2.str(); replacements["LOAD_ATOM2_PARAMETERS"] = load2.str();
replacements["DECLARE_ATOM1_DERIVATIVES"] = declare1.str();
replacements["RECORD_DERIVATIVE_2"] = recordDeriv.str(); replacements["RECORD_DERIVATIVE_2"] = recordDeriv.str();
replacements["STORE_DERIVATIVES_1"] = storeDerivs1.str(); replacements["STORE_DERIVATIVES_1"] = storeDerivs1.str();
replacements["STORE_DERIVATIVES_2"] = storeDerivs2.str(); replacements["STORE_DERIVATIVES_2"] = storeDerivs2.str();
...@@ -2482,7 +2483,6 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include ...@@ -2482,7 +2483,6 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
pairValueKernel.setArg<cl::Buffer>(index++, valueBuffers->getDeviceBuffer()); pairValueKernel.setArg<cl::Buffer>(index++, valueBuffers->getDeviceBuffer());
pairValueKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*sizeof(cl_float), NULL); pairValueKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*sizeof(cl_float), NULL);
pairValueKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*sizeof(cl_float), NULL); pairValueKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*sizeof(cl_float), NULL);
pairValueKernel.setArg<cl::Buffer>(index++, nb.getForceBufferFlags().getDeviceBuffer());
if (nb.getUseCutoff()) { if (nb.getUseCutoff()) {
pairValueKernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer()); pairValueKernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer());
pairValueKernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer()); pairValueKernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
...@@ -2531,7 +2531,6 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include ...@@ -2531,7 +2531,6 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionIndices().getDeviceBuffer()); pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionIndices().getDeviceBuffer());
pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionRowIndices().getDeviceBuffer()); pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionRowIndices().getDeviceBuffer());
pairEnergyKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*sizeof(cl_float4), NULL); pairEnergyKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*sizeof(cl_float4), NULL);
pairEnergyKernel.setArg<cl::Buffer>(index++, nb.getForceBufferFlags().getDeviceBuffer());
if (nb.getUseCutoff()) { if (nb.getUseCutoff()) {
pairEnergyKernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer()); pairEnergyKernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer());
pairEnergyKernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer()); pairEnergyKernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
...@@ -2609,14 +2608,14 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include ...@@ -2609,14 +2608,14 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
globals->upload(globalParamValues); globals->upload(globalParamValues);
} }
if (nb.getUseCutoff()) { if (nb.getUseCutoff()) {
pairValueKernel.setArg<mm_float4>(11, cl.getPeriodicBoxSize()); pairValueKernel.setArg<mm_float4>(10, cl.getPeriodicBoxSize());
pairValueKernel.setArg<mm_float4>(12, cl.getInvPeriodicBoxSize()); pairValueKernel.setArg<mm_float4>(11, cl.getInvPeriodicBoxSize());
pairEnergyKernel.setArg<mm_float4>(12, cl.getPeriodicBoxSize()); pairEnergyKernel.setArg<mm_float4>(11, cl.getPeriodicBoxSize());
pairEnergyKernel.setArg<mm_float4>(13, cl.getInvPeriodicBoxSize()); pairEnergyKernel.setArg<mm_float4>(12, cl.getInvPeriodicBoxSize());
if (maxTiles < nb.getInteractingTiles().getSize()) { if (maxTiles < nb.getInteractingTiles().getSize()) {
maxTiles = nb.getInteractingTiles().getSize(); maxTiles = nb.getInteractingTiles().getSize();
pairValueKernel.setArg<cl_uint>(13, maxTiles); pairValueKernel.setArg<cl_uint>(12, maxTiles);
pairEnergyKernel.setArg<cl_uint>(14, maxTiles); pairEnergyKernel.setArg<cl_uint>(13, maxTiles);
} }
} }
cl.executeKernel(pairValueKernel, nb.getNumForceThreadBlocks()*nb.getForceThreadBlockSize(), nb.getForceThreadBlockSize()); cl.executeKernel(pairValueKernel, nb.getNumForceThreadBlocks()*nb.getForceThreadBlockSize(), nb.getForceThreadBlockSize());
......
...@@ -37,7 +37,7 @@ using namespace std; ...@@ -37,7 +37,7 @@ using namespace std;
OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : context(context), cutoff(-1.0), useCutoff(false), OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : context(context), cutoff(-1.0), useCutoff(false),
numForceBuffers(0), exclusionIndices(NULL), exclusionRowIndices(NULL), exclusions(NULL), interactingTiles(NULL), interactionFlags(NULL), numForceBuffers(0), exclusionIndices(NULL), exclusionRowIndices(NULL), exclusions(NULL), interactingTiles(NULL), interactionFlags(NULL),
interactionCount(NULL), blockCenter(NULL), blockBoundingBox(NULL), forceBufferFlags(NULL) { interactionCount(NULL), blockCenter(NULL), blockBoundingBox(NULL) {
// Decide how many thread blocks and force buffers to use. // Decide how many thread blocks and force buffers to use.
deviceIsCpu = (context.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU); deviceIsCpu = (context.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU);
...@@ -48,8 +48,8 @@ OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : con ...@@ -48,8 +48,8 @@ OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : con
numForceBuffers = numForceThreadBlocks; numForceBuffers = numForceThreadBlocks;
} }
else if (context.getSIMDWidth() == 32) { else if (context.getSIMDWidth() == 32) {
numForceThreadBlocks = 2*context.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>(); numForceThreadBlocks = 4*context.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
forceThreadBlockSize = 256; forceThreadBlockSize = 128;
numForceBuffers = numForceThreadBlocks; numForceBuffers = numForceThreadBlocks;
} }
else { else {
...@@ -82,8 +82,6 @@ OpenCLNonbondedUtilities::~OpenCLNonbondedUtilities() { ...@@ -82,8 +82,6 @@ OpenCLNonbondedUtilities::~OpenCLNonbondedUtilities() {
delete blockCenter; delete blockCenter;
if (blockBoundingBox != NULL) if (blockBoundingBox != NULL)
delete blockBoundingBox; delete blockBoundingBox;
if (forceBufferFlags != NULL)
delete forceBufferFlags;
} }
void OpenCLNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector<vector<int> >& exclusionList, const string& kernel) { void OpenCLNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector<vector<int> >& exclusionList, const string& kernel) {
...@@ -239,12 +237,6 @@ void OpenCLNonbondedUtilities::initialize(const System& system) { ...@@ -239,12 +237,6 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
interactionCount->upload(); interactionCount->upload();
} }
// Create the flags for reserving force buffers.
forceBufferFlags = new OpenCLArray<cl_uint>(context, numAtomBlocks*numForceThreadBlocks, "forceBufferFlags", false);
vector<cl_uint> forceBufferFlagsVec(forceBufferFlags->getSize(), 0);
forceBufferFlags->upload(forceBufferFlagsVec);
// Create kernels. // Create kernels.
forceKernel = createInteractionKernel(kernelSource, parameters, arguments, true, true); forceKernel = createInteractionKernel(kernelSource, parameters, arguments, true, true);
...@@ -320,8 +312,8 @@ void OpenCLNonbondedUtilities::prepareInteractions() { ...@@ -320,8 +312,8 @@ void OpenCLNonbondedUtilities::prepareInteractions() {
void OpenCLNonbondedUtilities::computeInteractions() { void OpenCLNonbondedUtilities::computeInteractions() {
if (cutoff != -1.0) { if (cutoff != -1.0) {
if (useCutoff) { if (useCutoff) {
forceKernel.setArg<mm_float4>(13, context.getPeriodicBoxSize()); forceKernel.setArg<mm_float4>(12, context.getPeriodicBoxSize());
forceKernel.setArg<mm_float4>(14, context.getInvPeriodicBoxSize()); forceKernel.setArg<mm_float4>(13, context.getInvPeriodicBoxSize());
} }
context.executeKernel(forceKernel, numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize); context.executeKernel(forceKernel, numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
} }
...@@ -343,14 +335,14 @@ void OpenCLNonbondedUtilities::updateNeighborListSize() { ...@@ -343,14 +335,14 @@ void OpenCLNonbondedUtilities::updateNeighborListSize() {
newSize = numTiles; newSize = numTiles;
delete interactingTiles; delete interactingTiles;
interactingTiles = new OpenCLArray<mm_ushort2>(context, newSize, "interactingTiles"); interactingTiles = new OpenCLArray<mm_ushort2>(context, newSize, "interactingTiles");
forceKernel.setArg<cl::Buffer>(11, interactingTiles->getDeviceBuffer()); forceKernel.setArg<cl::Buffer>(10, interactingTiles->getDeviceBuffer());
forceKernel.setArg<cl_uint>(15, newSize); forceKernel.setArg<cl_uint>(14, newSize);
findInteractingBlocksKernel.setArg<cl::Buffer>(6, interactingTiles->getDeviceBuffer()); findInteractingBlocksKernel.setArg<cl::Buffer>(6, interactingTiles->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl_uint>(9, newSize); findInteractingBlocksKernel.setArg<cl_uint>(9, newSize);
if (context.getSIMDWidth() == 32 || deviceIsCpu) { if (context.getSIMDWidth() == 32 || deviceIsCpu) {
delete interactionFlags; delete interactionFlags;
interactionFlags = new OpenCLArray<cl_uint>(context, deviceIsCpu ? 2*newSize : newSize, "interactionFlags"); interactionFlags = new OpenCLArray<cl_uint>(context, deviceIsCpu ? 2*newSize : newSize, "interactionFlags");
forceKernel.setArg<cl::Buffer>(16, interactionFlags->getDeviceBuffer()); forceKernel.setArg<cl::Buffer>(15, interactionFlags->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(7, interactionFlags->getDeviceBuffer()); findInteractingBlocksKernel.setArg<cl::Buffer>(7, interactionFlags->getDeviceBuffer());
findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(4, interactingTiles->getDeviceBuffer()); findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(4, interactingTiles->getDeviceBuffer());
findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(7, interactionFlags->getDeviceBuffer()); findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(7, interactionFlags->getDeviceBuffer());
...@@ -503,7 +495,6 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc ...@@ -503,7 +495,6 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
kernel.setArg(index++, 4*forceThreadBlockSize*sizeof(cl_float), NULL); kernel.setArg(index++, 4*forceThreadBlockSize*sizeof(cl_float), NULL);
kernel.setArg<cl_uint>(index++, startTileIndex); kernel.setArg<cl_uint>(index++, startTileIndex);
kernel.setArg<cl_uint>(index++, startTileIndex+numTiles); kernel.setArg<cl_uint>(index++, startTileIndex+numTiles);
kernel.setArg<cl::Buffer>(index++, forceBufferFlags->getDeviceBuffer());
if (useCutoff) { if (useCutoff) {
kernel.setArg<cl::Buffer>(index++, interactingTiles->getDeviceBuffer()); kernel.setArg<cl::Buffer>(index++, interactingTiles->getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, interactionCount->getDeviceBuffer()); kernel.setArg<cl::Buffer>(index++, interactionCount->getDeviceBuffer());
......
...@@ -196,12 +196,6 @@ public: ...@@ -196,12 +196,6 @@ public:
OpenCLArray<cl_uint>& getExclusionRowIndices() { OpenCLArray<cl_uint>& getExclusionRowIndices() {
return *exclusionRowIndices; return *exclusionRowIndices;
} }
/**
* Get the array which contains flags for reserving force buffers.
*/
OpenCLArray<cl_uint>& getForceBufferFlags() {
return *forceBufferFlags;
}
/** /**
* Get the index of the first tile this context is responsible for processing. * Get the index of the first tile this context is responsible for processing.
*/ */
...@@ -245,7 +239,6 @@ private: ...@@ -245,7 +239,6 @@ private:
OpenCLArray<cl_uint>* interactionCount; OpenCLArray<cl_uint>* interactionCount;
OpenCLArray<mm_float4>* blockCenter; OpenCLArray<mm_float4>* blockCenter;
OpenCLArray<mm_float4>* blockBoundingBox; OpenCLArray<mm_float4>* blockBoundingBox;
OpenCLArray<cl_uint>* forceBufferFlags;
std::vector<std::vector<int> > atomExclusions; std::vector<std::vector<int> > atomExclusions;
std::vector<ParameterInfo> parameters; std::vector<ParameterInfo> parameters;
std::vector<ParameterInfo> arguments; std::vector<ParameterInfo> arguments;
......
...@@ -8,7 +8,7 @@ ...@@ -8,7 +8,7 @@
__kernel void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer, __local float4* local_force, __kernel void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer, __local float4* local_force,
__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions, __global unsigned int* exclusionIndices, __global float4* posq, __local float4* local_posq, __global unsigned int* exclusions, __global unsigned int* exclusionIndices,
__global unsigned int* exclusionRowIndices, __local float4* tempBuffer, __global unsigned int* forceBufferFlags, __global unsigned int* exclusionRowIndices, __local float4* tempBuffer,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags
#else #else
......
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1))) __kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer, __local float4* local_force, void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer, __local float4* local_force,
__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions, __global unsigned int* exclusionIndices, __global float4* posq, __local float4* local_posq, __global unsigned int* exclusions, __global unsigned int* exclusionIndices,
__global unsigned int* exclusionRowIndices, __local float4* tempForceBuffer, __global unsigned int* forceBufferFlags, __global unsigned int* exclusionRowIndices, __local float4* tempForceBuffer,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles
#else #else
......
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#define TILE_SIZE 32 #define TILE_SIZE 32
#define STORE_DERIVATIVE_1(INDEX) derivBuffers##INDEX[offset1] += deriv##INDEX##_1; #define STORE_DERIVATIVE_1(INDEX) derivBuffers##INDEX[offset] += deriv##INDEX##_1;
#define STORE_DERIVATIVE_2(INDEX) derivBuffers##INDEX[offset2] += local_deriv##INDEX[get_local_id(0)]; #define STORE_DERIVATIVE_2(INDEX) derivBuffers##INDEX[offset] += local_deriv##INDEX[get_local_id(0)];
/**
* Mark that a block in the force buffer is in use.
*/
void reserveBuffer(unsigned int block, __global unsigned int* forceBufferFlags) {
if ((get_local_id(0)&(TILE_SIZE-1)) == 0)
while (atom_cmpxchg(&forceBufferFlags[block+NUM_BLOCKS*get_group_id(0)], 0, 1) != 0)
;
mem_fence(CLK_GLOBAL_MEM_FENCE);
}
/**
* Mark that a block in the force buffer is no longer in use.
*/
void releaseBuffer(unsigned int block, __global unsigned int* forceBufferFlags) {
mem_fence(CLK_GLOBAL_MEM_FENCE);
if ((get_local_id(0)&(TILE_SIZE-1)) == 0)
forceBufferFlags[block+NUM_BLOCKS*get_group_id(0)] = 0;
}
/** /**
* Compute a force based on pair interactions. * Compute a force based on pair interactions.
*/ */
__kernel void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer, __local float4* local_force, __kernel void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer, __local float4* local_force,
__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions, __global unsigned int* exclusionIndices, __global float4* posq, __local float4* local_posq, __global unsigned int* exclusions, __global unsigned int* exclusionIndices,
__global unsigned int* exclusionRowIndices, __local float4* tempBuffer, __global unsigned int* forceBufferFlags, __global unsigned int* exclusionRowIndices, __local float4* tempBuffer,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags
#else #else
...@@ -48,132 +29,67 @@ __kernel void computeN2Energy(__global float4* forceBuffers, __global float* ene ...@@ -48,132 +29,67 @@ __kernel void computeN2Energy(__global float4* forceBuffers, __global float* ene
unsigned int lasty = 0xFFFFFFFF; unsigned int lasty = 0xFFFFFFFF;
__local unsigned int exclusionRange[2*WARPS_PER_GROUP]; __local unsigned int exclusionRange[2*WARPS_PER_GROUP];
__local int exclusionIndex[WARPS_PER_GROUP]; __local int exclusionIndex[WARPS_PER_GROUP];
__local int2* reservedBlocks = (__local int2*) exclusionRange;
while (pos < end) {
do {
// Extract the coordinates of this tile // Extract the coordinates of this tile
unsigned int x, y;
#ifdef USE_CUTOFF
if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos];
x = tileIndices.x;
y = tileIndices.y;
}
else
#endif
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
const unsigned int tbx = get_local_id(0) - tgx; const unsigned int tbx = get_local_id(0) - tgx;
const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE; const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
unsigned int atom1 = x*TILE_SIZE + tgx; unsigned int x, y;
float4 force = 0.0f; float4 force = 0.0f;
float4 posq1 = posq[atom1]; DECLARE_ATOM1_DERIVATIVES
LOAD_ATOM1_PARAMETERS if (pos < end) {
// Locate the exclusion data for this tile.
#ifdef USE_EXCLUSIONS
if (tgx < 2)
exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
if (tgx == 0)
exclusionIndex[localGroupIndex] = -1;
for (int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
if (exclusionIndices[i] == y)
exclusionIndex[localGroupIndex] = i*TILE_SIZE;
bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
#else
bool hasExclusions = false;
#endif
if (x == y) {
// This tile is on the diagonal.
const unsigned int localAtomIndex = get_local_id(0);
local_posq[localAtomIndex] = posq1;
LOAD_LOCAL_PARAMETERS_FROM_1
#ifdef USE_EXCLUSIONS
unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx];
#endif
for (unsigned int j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
#endif
int atom2 = tbx+j;
float4 posq2 = local_posq[atom2];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) { if (numTiles <= maxTiles) {
#endif ushort2 tileIndices = tiles[pos];
float r = SQRT(r2); x = tileIndices.x;
LOAD_ATOM2_PARAMETERS y = tileIndices.y;
atom2 = y*TILE_SIZE+j; }
float dEdR = 0.0f; else
float tempEnergy = 0.0f;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
COMPUTE_INTERACTION
dEdR /= -r;
}
energy += 0.5f*tempEnergy;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
#ifdef USE_CUTOFF
}
#endif
#ifdef USE_EXCLUSIONS
excl >>= 1;
#endif #endif
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
} }
unsigned int atom1 = x*TILE_SIZE + tgx;
float4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
// Write results // Locate the exclusion data for this tile.
reserveBuffer(x, forceBufferFlags);
unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset1].xyz += force.xyz;
STORE_DERIVATIVES_1
releaseBuffer(x, forceBufferFlags);
}
else {
// This is an off-diagonal tile.
const unsigned int localAtomIndex = get_local_id(0); #ifdef USE_EXCLUSIONS
if (lasty != y) { if (tgx < 2)
unsigned int j = y*TILE_SIZE + tgx; exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
local_posq[localAtomIndex] = posq[j]; if (tgx == 0)
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL exclusionIndex[localGroupIndex] = -1;
} for (int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
local_force[localAtomIndex] = 0.0f; if (exclusionIndices[i] == y)
CLEAR_LOCAL_DERIVATIVES exclusionIndex[localGroupIndex] = i*TILE_SIZE;
#ifdef USE_CUTOFF bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF); #else
if (!hasExclusions && flags == 0) { bool hasExclusions = false;
// No interactions in this tile.
}
else
#endif #endif
{ if (pos >= end)
// Compute the full set of interactions in this tile. ; // This warp is done.
else if (x == y) {
// This tile is on the diagonal.
const unsigned int localAtomIndex = get_local_id(0);
local_posq[localAtomIndex] = posq1;
LOAD_LOCAL_PARAMETERS_FROM_1
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF); unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx];
excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
#endif #endif
unsigned int tj = tgx;
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1); bool isExcluded = !(excl & 0x1);
#endif #endif
int atom2 = tbx+tj; int atom2 = tbx+j;
float4 posq2 = local_posq[atom2]; float4 posq2 = local_posq[atom2];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f); float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
...@@ -187,44 +103,149 @@ __kernel void computeN2Energy(__global float4* forceBuffers, __global float* ene ...@@ -187,44 +103,149 @@ __kernel void computeN2Energy(__global float4* forceBuffers, __global float* ene
#endif #endif
float r = SQRT(r2); float r = SQRT(r2);
LOAD_ATOM2_PARAMETERS LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+tj; atom2 = y*TILE_SIZE+j;
float dEdR = 0.0f; float dEdR = 0.0f;
float tempEnergy = 0.0f; float tempEnergy = 0.0f;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) { if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
COMPUTE_INTERACTION COMPUTE_INTERACTION
dEdR /= -r; dEdR /= -r;
} }
energy += tempEnergy; energy += 0.5f*tempEnergy;
delta.xyz *= dEdR; delta.xyz *= dEdR;
force.xyz -= delta.xyz; force.xyz -= delta.xyz;
atom2 = tbx+tj;
local_force[atom2].xyz += delta.xyz;
RECORD_DERIVATIVE_2
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
} }
#endif #endif
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
excl >>= 1; excl >>= 1;
#endif #endif
tj = (tj + 1) & (TILE_SIZE - 1);
} }
} }
else {
// This is an off-diagonal tile.
// Write results const unsigned int localAtomIndex = get_local_id(0);
if (lasty != y) {
unsigned int j = y*TILE_SIZE + tgx;
local_posq[localAtomIndex] = posq[j];
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
local_force[localAtomIndex] = 0.0f;
CLEAR_LOCAL_DERIVATIVES
#ifdef USE_CUTOFF
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
if (!hasExclusions && flags == 0) {
// No interactions in this tile.
}
else
#endif
{
// Compute the full set of interactions in this tile.
reserveBuffer(x, forceBufferFlags); #ifdef USE_EXCLUSIONS
unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF);
forceBuffers[offset1].xyz += force.xyz; excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
STORE_DERIVATIVES_1 #endif
releaseBuffer(x, forceBufferFlags); unsigned int tj = tgx;
reserveBuffer(y, forceBufferFlags); for (unsigned int j = 0; j < TILE_SIZE; j++) {
unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS; #ifdef USE_EXCLUSIONS
forceBuffers[offset2].xyz += local_force[get_local_id(0)].xyz; bool isExcluded = !(excl & 0x1);
STORE_DERIVATIVES_2 #endif
releaseBuffer(y, forceBufferFlags); int atom2 = tbx+tj;
float4 posq2 = local_posq[atom2];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
float r = SQRT(r2);
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+tj;
float dEdR = 0.0f;
float tempEnergy = 0.0f;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
COMPUTE_INTERACTION
dEdR /= -r;
}
energy += tempEnergy;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
atom2 = tbx+tj;
local_force[atom2].xyz += delta.xyz;
RECORD_DERIVATIVE_2
#ifdef USE_CUTOFF
}
#endif
#ifdef USE_EXCLUSIONS
excl >>= 1;
#endif
tj = (tj + 1) & (TILE_SIZE - 1);
}
}
}
} }
lasty = y; lasty = y;
// Write results. We need to coordinate between warps to make sure no two of them
// ever try to write to the same piece of memory at the same time.
int writeX = (pos < end ? x : -1);
int writeY = (pos < end && x != y ? y : -1);
if (tgx == 0)
reservedBlocks[localGroupIndex] = (int2)(writeX, writeY);
bool done = false;
int doneIndex = 0;
int checkIndex = 0;
while (true) {
// See if any warp still needs to write its data.
bool allDone = true;
barrier(CLK_LOCAL_MEM_FENCE);
while (doneIndex < WARPS_PER_GROUP && allDone) {
if (reservedBlocks[doneIndex].x != -1)
allDone = false;
else
doneIndex++;
}
if (allDone)
break;
if (!done) {
// See whether this warp can write its data. This requires that no previous warp
// is trying to write to the same block of the buffer.
bool canWrite = (writeX != -1);
while (checkIndex < localGroupIndex && canWrite) {
if ((reservedBlocks[checkIndex].x == x || reservedBlocks[checkIndex].y == x) ||
(writeY != -1 && (reservedBlocks[checkIndex].x == y || reservedBlocks[checkIndex].y == y)))
canWrite = false;
else
checkIndex++;
}
if (canWrite) {
// Write the data to global memory, then mark this warp as done.
if (writeX > -1) {
const unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz += force.xyz;
STORE_DERIVATIVES_1
}
if (writeY > -1) {
const unsigned int offset = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz += local_force[get_local_id(0)].xyz;
STORE_DERIVATIVES_2
}
done = true;
if (tgx == 0)
reservedBlocks[localGroupIndex] = (int2)(-1, -1);
}
}
}
pos++; pos++;
} } while (pos < end);
energyBuffer[get_global_id(0)] += energy; energyBuffer[get_global_id(0)] += energy;
} }
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
__kernel void computeN2Value(__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions, __kernel void computeN2Value(__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions,
__global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __global float* global_value, __local float* local_value, __global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __global float* global_value, __local float* local_value,
__local float* tempBuffer, __global unsigned int* forceBufferFlags, __local float* tempBuffer,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags
#else #else
......
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1))) __kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeN2Value(__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions, void computeN2Value(__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions,
__global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __global float* global_value, __local float* local_value, __global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __global float* global_value, __local float* local_value,
__local float* tempBuffer, __global unsigned int* forceBufferFlags, __local float* tempBuffer,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles
#else #else
......
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#define TILE_SIZE 32 #define TILE_SIZE 32
/**
* Mark that a block in the value buffer is in use.
*/
void reserveBuffer(unsigned int block, __global unsigned int* forceBufferFlags) {
if ((get_local_id(0)&(TILE_SIZE-1)) == 0)
while (atom_cmpxchg(&forceBufferFlags[block+NUM_BLOCKS*get_group_id(0)], 0, 1) != 0)
;
mem_fence(CLK_GLOBAL_MEM_FENCE);
}
/**
* Mark that a block in the value buffer is no longer in use.
*/
void releaseBuffer(unsigned int block, __global unsigned int* forceBufferFlags) {
mem_fence(CLK_GLOBAL_MEM_FENCE);
if ((get_local_id(0)&(TILE_SIZE-1)) == 0)
forceBufferFlags[block+NUM_BLOCKS*get_group_id(0)] = 0;
}
/** /**
* Compute a value based on pair interactions. * Compute a value based on pair interactions.
*/ */
__kernel void computeN2Value(__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions, __kernel void computeN2Value(__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions,
__global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __global float* global_value, __local float* local_value, __global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __global float* global_value, __local float* local_value,
__local float* tempBuffer, __global unsigned int* forceBufferFlags, __local float* tempBuffer,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags
#else #else
...@@ -46,222 +27,262 @@ __kernel void computeN2Value(__global float4* posq, __local float4* local_posq, ...@@ -46,222 +27,262 @@ __kernel void computeN2Value(__global float4* posq, __local float4* local_posq,
unsigned int lasty = 0xFFFFFFFF; unsigned int lasty = 0xFFFFFFFF;
__local unsigned int exclusionRange[2*WARPS_PER_GROUP]; __local unsigned int exclusionRange[2*WARPS_PER_GROUP];
__local int exclusionIndex[WARPS_PER_GROUP]; __local int exclusionIndex[WARPS_PER_GROUP];
__local int2* reservedBlocks = (__local int2*) exclusionRange;
while (pos < end) {
do {
// Extract the coordinates of this tile // Extract the coordinates of this tile
const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
const unsigned int tbx = get_local_id(0) - tgx;
const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
unsigned int x, y; unsigned int x, y;
float value = 0.0f;
if (pos < end) {
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (numTiles <= maxTiles) { if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos]; ushort2 tileIndices = tiles[pos];
x = tileIndices.x; x = tileIndices.x;
y = tileIndices.y; y = tileIndices.y;
} }
else else
#endif #endif
{ {
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos)); y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2); x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
} }
} unsigned int atom1 = x*TILE_SIZE + tgx;
const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); float4 posq1 = posq[atom1];
const unsigned int tbx = get_local_id(0) - tgx; LOAD_ATOM1_PARAMETERS
const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
unsigned int atom1 = x*TILE_SIZE + tgx;
float value = 0.0f;
float4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
// Locate the exclusion data for this tile. // Locate the exclusion data for this tile.
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
if (tgx < 2) if (tgx < 2)
exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx]; exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
if (tgx == 0) if (tgx == 0)
exclusionIndex[localGroupIndex] = -1; exclusionIndex[localGroupIndex] = -1;
for (int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE) for (int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
if (exclusionIndices[i] == y) if (exclusionIndices[i] == y)
exclusionIndex[localGroupIndex] = i*TILE_SIZE; exclusionIndex[localGroupIndex] = i*TILE_SIZE;
bool hasExclusions = (exclusionIndex[localGroupIndex] > -1); bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
#else #else
bool hasExclusions = false; bool hasExclusions = false;
#endif #endif
if (x == y) { if (pos >= end)
// This tile is on the diagonal. ; // This warp is done.
else if (x == y) {
// This tile is on the diagonal.
const unsigned int localAtomIndex = get_local_id(0); const unsigned int localAtomIndex = get_local_id(0);
local_posq[localAtomIndex] = posq1; local_posq[localAtomIndex] = posq1;
LOAD_LOCAL_PARAMETERS_FROM_1 LOAD_LOCAL_PARAMETERS_FROM_1
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx]; unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx];
#endif #endif
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1); bool isExcluded = !(excl & 0x1);
#endif #endif
int atom2 = tbx+j; int atom2 = tbx+j;
float4 posq2 = local_posq[atom2]; float4 posq2 = local_posq[atom2];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f); float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z; delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif #endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) { if (r2 < CUTOFF_SQUARED) {
#endif #endif
float r = SQRT(r2); float r = SQRT(r2);
LOAD_ATOM2_PARAMETERS LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j; atom2 = y*TILE_SIZE+j;
float tempValue1 = 0.0f; float tempValue1 = 0.0f;
float tempValue2 = 0.0f; float tempValue2 = 0.0f;
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
if (!isExcluded && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) { if (!isExcluded && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
#else #else
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) { if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
#endif #endif
COMPUTE_VALUE COMPUTE_VALUE
} }
value += tempValue1; value += tempValue1;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
} }
#endif #endif
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
excl >>= 1; excl >>= 1;
#endif #endif
}
} }
else {
// This is an off-diagonal tile.
// Write results if (lasty != y) {
unsigned int j = y*TILE_SIZE + tgx;
reserveBuffer(x, forceBufferFlags); local_posq[get_local_id(0)] = posq[j];
unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS; const unsigned int localAtomIndex = get_local_id(0);
global_value[offset] += value; LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
releaseBuffer(x, forceBufferFlags);
}
else {
// This is an off-diagonal tile.
if (lasty != y) {
unsigned int j = y*TILE_SIZE + tgx;
local_posq[get_local_id(0)] = posq[j];
const unsigned int localAtomIndex = get_local_id(0);
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
local_value[get_local_id(0)] = 0.0f;
#ifdef USE_CUTOFF
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
if (!hasExclusions && flags != 0xFFFFFFFF) {
if (flags == 0) {
// No interactions in this tile.
} }
else { local_value[get_local_id(0)] = 0.0f;
// Compute only a subset of the interactions in this tile. #ifdef USE_CUTOFF
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
if (!hasExclusions && flags != 0xFFFFFFFF) {
if (flags == 0) {
// No interactions in this tile.
}
else {
// Compute only a subset of the interactions in this tile.
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
if ((flags&(1<<j)) != 0) { if ((flags&(1<<j)) != 0) {
int atom2 = tbx+j; int atom2 = tbx+j;
float4 posq2 = local_posq[atom2]; float4 posq2 = local_posq[atom2];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f); float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z; delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif #endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float tempValue1 = 0.0f; float tempValue1 = 0.0f;
float tempValue2 = 0.0f; float tempValue2 = 0.0f;
if (r2 < CUTOFF_SQUARED) { if (r2 < CUTOFF_SQUARED) {
float r = SQRT(r2); float r = SQRT(r2);
LOAD_ATOM2_PARAMETERS LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j; atom2 = y*TILE_SIZE+j;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) { if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
COMPUTE_VALUE COMPUTE_VALUE
}
value += tempValue1;
} }
value += tempValue1; tempBuffer[get_local_id(0)] = tempValue2;
}
tempBuffer[get_local_id(0)] = tempValue2;
// Sum the forces on atom2. // Sum the forces on atom2.
if (tgx % 2 == 0) if (tgx % 2 == 0)
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+1]; tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+1];
if (tgx % 4 == 0) if (tgx % 4 == 0)
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+2]; tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+2];
if (tgx % 8 == 0) if (tgx % 8 == 0)
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+4]; tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+4];
if (tgx % 16 == 0) if (tgx % 16 == 0)
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+8]; tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+8];
if (tgx == 0) if (tgx == 0)
local_value[tbx+j] += tempBuffer[get_local_id(0)] + tempBuffer[get_local_id(0)+16]; local_value[tbx+j] += tempBuffer[get_local_id(0)] + tempBuffer[get_local_id(0)+16];
}
} }
} }
} }
} else
else
#endif #endif
{ {
// Compute the full set of interactions in this tile. // Compute the full set of interactions in this tile.
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF); unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF);
excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx)); excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
#endif #endif
unsigned int tj = tgx; unsigned int tj = tgx;
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1); bool isExcluded = !(excl & 0x1);
#endif #endif
int atom2 = tbx+tj; int atom2 = tbx+tj;
float4 posq2 = local_posq[atom2]; float4 posq2 = local_posq[atom2];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f); float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z; delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif #endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) { if (r2 < CUTOFF_SQUARED) {
#endif #endif
float r = SQRT(r2); float r = SQRT(r2);
LOAD_ATOM2_PARAMETERS LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+tj; atom2 = y*TILE_SIZE+tj;
float tempValue1 = 0.0f; float tempValue1 = 0.0f;
float tempValue2 = 0.0f; float tempValue2 = 0.0f;
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
if (!isExcluded && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) { if (!isExcluded && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
#else #else
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) { if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
#endif #endif
COMPUTE_VALUE COMPUTE_VALUE
} }
value += tempValue1; value += tempValue1;
local_value[tbx+tj] += tempValue2; local_value[tbx+tj] += tempValue2;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
} }
#endif #endif
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
excl >>= 1; excl >>= 1;
#endif #endif
tj = (tj + 1) & (TILE_SIZE - 1); tj = (tj + 1) & (TILE_SIZE - 1);
}
} }
} }
}
// Write results. We need to coordinate between warps to make sure no two of them
// ever try to write to the same piece of memory at the same time.
int writeX = (pos < end ? x : -1);
int writeY = (pos < end && x != y ? y : -1);
if (tgx == 0)
reservedBlocks[localGroupIndex] = (int2)(writeX, writeY);
bool done = false;
int doneIndex = 0;
int checkIndex = 0;
while (true) {
// See if any warp still needs to write its data.
// Write results bool allDone = true;
barrier(CLK_LOCAL_MEM_FENCE);
while (doneIndex < WARPS_PER_GROUP && allDone) {
if (reservedBlocks[doneIndex].x != -1)
allDone = false;
else
doneIndex++;
}
if (allDone)
break;
if (!done) {
// See whether this warp can write its data. This requires that no previous warp
// is trying to write to the same block of the buffer.
reserveBuffer(x, forceBufferFlags); bool canWrite = (writeX != -1);
unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS; while (checkIndex < localGroupIndex && canWrite) {
global_value[offset1] += value; if ((reservedBlocks[checkIndex].x == x || reservedBlocks[checkIndex].y == x) ||
releaseBuffer(x, forceBufferFlags); (writeY != -1 && (reservedBlocks[checkIndex].x == y || reservedBlocks[checkIndex].y == y)))
reserveBuffer(y, forceBufferFlags); canWrite = false;
unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS; else
global_value[offset2] += local_value[get_local_id(0)]; checkIndex++;
releaseBuffer(y, forceBufferFlags); }
if (canWrite) {
// Write the data to global memory, then mark this warp as done.
if (writeX > -1) {
const unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
global_value[offset] += value;
}
if (writeY > -1) {
const unsigned int offset = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
global_value[offset] += local_value[get_local_id(0)];
}
done = true;
if (tgx == 0)
reservedBlocks[localGroupIndex] = (int2)(-1, -1);
}
}
} }
lasty = y; lasty = y;
pos++; pos++;
} } while (pos < end);
} }
...@@ -15,7 +15,7 @@ typedef struct { ...@@ -15,7 +15,7 @@ typedef struct {
*/ */
__kernel void computeBornSum(__global float* global_bornSum, __global float4* posq, __global float2* global_params, __kernel void computeBornSum(__global float* global_bornSum, __global float4* posq, __global float2* global_params,
__local AtomData* localData, __local float* tempBuffer, __global unsigned int* forceBufferFlags, __local AtomData* localData, __local float* tempBuffer,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags) { __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags) {
#else #else
...@@ -192,7 +192,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __global float4* po ...@@ -192,7 +192,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __global float4* po
__kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuffer, __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuffer,
__global float4* posq, __global float* global_bornRadii, __global float* global_bornForce, __global float4* posq, __global float* global_bornRadii, __global float* global_bornForce,
__local AtomData* localData, __local float4* tempBuffer, __global unsigned int* forceBufferFlags, __local AtomData* localData, __local float4* tempBuffer,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags) { __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags) {
#else #else
......
...@@ -16,7 +16,7 @@ typedef struct { ...@@ -16,7 +16,7 @@ typedef struct {
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1))) __kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeBornSum(__global float* global_bornSum, __global float4* posq, __global float2* global_params, void computeBornSum(__global float* global_bornSum, __global float4* posq, __global float2* global_params,
__local AtomData* localData, __local float* tempBuffer, __global unsigned int* forceBufferFlags, __local AtomData* localData, __local float* tempBuffer,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles) { __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles) {
#else #else
...@@ -203,7 +203,7 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo ...@@ -203,7 +203,7 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1))) __kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuffer, void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuffer,
__global float4* posq, __global float* global_bornRadii, __global float* global_bornForce, __global float4* posq, __global float* global_bornRadii, __global float* global_bornForce,
__local AtomData* localData, __local float4* tempBuffer, __global unsigned int* forceBufferFlags, __local AtomData* localData, __local float4* tempBuffer,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles) { __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles) {
#else #else
......
...@@ -11,30 +11,11 @@ typedef struct { ...@@ -11,30 +11,11 @@ typedef struct {
float bornForce; float bornForce;
} AtomData; } AtomData;
/**
* Mark that a block in the force buffer is in use.
*/
void reserveBuffer(unsigned int block, __global unsigned int* forceBufferFlags) {
if ((get_local_id(0)&(TILE_SIZE-1)) == 0)
while (atom_cmpxchg(&forceBufferFlags[block+NUM_BLOCKS*get_group_id(0)], 0, 1) != 0)
;
mem_fence(CLK_GLOBAL_MEM_FENCE);
}
/**
* Mark that a block in the force buffer is no longer in use.
*/
void releaseBuffer(unsigned int block, __global unsigned int* forceBufferFlags) {
mem_fence(CLK_GLOBAL_MEM_FENCE);
if ((get_local_id(0)&(TILE_SIZE-1)) == 0)
forceBufferFlags[block+NUM_BLOCKS*get_group_id(0)] = 0;
}
/** /**
* Compute the Born sum. * Compute the Born sum.
*/ */
__kernel void computeBornSum(__global float* global_bornSum, __global float4* posq, __global float2* global_params, __kernel void computeBornSum(__global float* global_bornSum, __global float4* posq, __global float2* global_params,
__local AtomData* localData, __local float* tempBuffer, __global unsigned int* forceBufferFlags, __local AtomData* localData, __local float* tempBuffer,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags) { __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags) {
#else #else
...@@ -51,231 +32,271 @@ __kernel void computeBornSum(__global float* global_bornSum, __global float4* po ...@@ -51,231 +32,271 @@ __kernel void computeBornSum(__global float* global_bornSum, __global float4* po
unsigned int end = (warp+1)*numTiles/totalWarps; unsigned int end = (warp+1)*numTiles/totalWarps;
#endif #endif
unsigned int lasty = 0xFFFFFFFF; unsigned int lasty = 0xFFFFFFFF;
__local int2 reservedBlocks[WARPS_PER_GROUP];
while (pos < end) {
do {
// Extract the coordinates of this tile // Extract the coordinates of this tile
const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
const unsigned int tbx = get_local_id(0) - tgx;
const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
unsigned int x, y; unsigned int x, y;
float bornSum = 0.0f;
if (pos < end) {
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (numTiles <= maxTiles) { if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos]; ushort2 tileIndices = tiles[pos];
x = tileIndices.x; x = tileIndices.x;
y = tileIndices.y; y = tileIndices.y;
} }
else else
#endif #endif
{ {
y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos)); y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2); x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
} }
} unsigned int atom1 = x*TILE_SIZE + tgx;
const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); float4 posq1 = posq[atom1];
const unsigned int tbx = get_local_id(0) - tgx; float2 params1 = global_params[atom1];
const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE; if (pos >= end)
unsigned int atom1 = x*TILE_SIZE + tgx; ; // This warp is done.
float bornSum = 0.0f; else if (x == y) {
float4 posq1 = posq[atom1]; // This tile is on the diagonal.
float2 params1 = global_params[atom1];
if (x == y) {
// This tile is on the diagonal.
localData[get_local_id(0)].x = posq1.x; localData[get_local_id(0)].x = posq1.x;
localData[get_local_id(0)].y = posq1.y; localData[get_local_id(0)].y = posq1.y;
localData[get_local_id(0)].z = posq1.z; localData[get_local_id(0)].z = posq1.z;
localData[get_local_id(0)].q = posq1.w; localData[get_local_id(0)].q = posq1.w;
localData[get_local_id(0)].radius = params1.x; localData[get_local_id(0)].radius = params1.x;
localData[get_local_id(0)].scaledRadius = params1.y; localData[get_local_id(0)].scaledRadius = params1.y;
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
float4 delta = (float4) (localData[tbx+j].x-posq1.x, localData[tbx+j].y-posq1.y, localData[tbx+j].z-posq1.z, 0.0f); float4 delta = (float4) (localData[tbx+j].x-posq1.x, localData[tbx+j].y-posq1.y, localData[tbx+j].z-posq1.z, 0.0f);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z; delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif #endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) { if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else #else
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) { if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
#endif #endif
float invR = RSQRT(r2); float invR = RSQRT(r2);
float r = RECIP(invR); float r = RECIP(invR);
float2 params2 = (float2) (localData[tbx+j].radius, localData[tbx+j].scaledRadius); float2 params2 = (float2) (localData[tbx+j].radius, localData[tbx+j].scaledRadius);
float rScaledRadiusJ = r+params2.y; float rScaledRadiusJ = r+params2.y;
if ((j != tgx) && (params1.x < rScaledRadiusJ)) { if ((j != tgx) && (params1.x < rScaledRadiusJ)) {
float l_ij = RECIP(max(params1.x, fabs(r-params2.y))); float l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
float u_ij = RECIP(rScaledRadiusJ); float u_ij = RECIP(rScaledRadiusJ);
float l_ij2 = l_ij*l_ij; float l_ij2 = l_ij*l_ij;
float u_ij2 = u_ij*u_ij; float u_ij2 = u_ij*u_ij;
float ratio = LOG(u_ij * RECIP(l_ij)); float ratio = LOG(u_ij * RECIP(l_ij));
bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) + bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2); (0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2);
if (params1.x < params2.x-r) if (params1.x < params2.x-r)
bornSum += 2.0f*(RECIP(params1.x)-l_ij); bornSum += 2.0f*(RECIP(params1.x)-l_ij);
}
} }
} }
} }
else {
// This is an off-diagonal tile.
// Write results if (lasty != y) {
unsigned int j = y*TILE_SIZE + tgx;
reserveBuffer(x, forceBufferFlags); float4 tempPosq = posq[j];
unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS; localData[get_local_id(0)].x = tempPosq.x;
global_bornSum[offset] += bornSum; localData[get_local_id(0)].y = tempPosq.y;
releaseBuffer(x, forceBufferFlags); localData[get_local_id(0)].z = tempPosq.z;
} localData[get_local_id(0)].q = tempPosq.w;
else { float2 tempParams = global_params[j];
// This is an off-diagonal tile. localData[get_local_id(0)].radius = tempParams.x;
localData[get_local_id(0)].scaledRadius = tempParams.y;
if (lasty != y) {
unsigned int j = y*TILE_SIZE + tgx;
float4 tempPosq = posq[j];
localData[get_local_id(0)].x = tempPosq.x;
localData[get_local_id(0)].y = tempPosq.y;
localData[get_local_id(0)].z = tempPosq.z;
localData[get_local_id(0)].q = tempPosq.w;
float2 tempParams = global_params[j];
localData[get_local_id(0)].radius = tempParams.x;
localData[get_local_id(0)].scaledRadius = tempParams.y;
}
localData[get_local_id(0)].bornSum = 0.0f;
#ifdef USE_CUTOFF
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
if (flags != 0xFFFFFFFF && false) { // TODO: Fix this: should be checking for exclusions
if (flags == 0) {
// No interactions in this tile.
} }
else { localData[get_local_id(0)].bornSum = 0.0f;
// Compute only a subset of the interactions in this tile. #ifdef USE_CUTOFF
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
if (flags != 0xFFFFFFFF && false) { // TODO: Fix this: should be checking for exclusions
if (flags == 0) {
// No interactions in this tile.
}
else {
// Compute only a subset of the interactions in this tile.
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
if ((flags&(1<<j)) != 0) { if ((flags&(1<<j)) != 0) {
float4 delta = (float4) (localData[tbx+j].x-posq1.x, localData[tbx+j].y-posq1.y, localData[tbx+j].z-posq1.z, 0.0f); float4 delta = (float4) (localData[tbx+j].x-posq1.x, localData[tbx+j].y-posq1.y, localData[tbx+j].z-posq1.z, 0.0f);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z; delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif #endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
tempBuffer[get_local_id(0)] = 0.0f; tempBuffer[get_local_id(0)] = 0.0f;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) { if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else #else
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) { if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
#endif #endif
float invR = RSQRT(r2); float invR = RSQRT(r2);
float r = RECIP(invR); float r = RECIP(invR);
float2 params2 = (float2) (localData[tbx+j].radius, localData[tbx+j].scaledRadius); float2 params2 = (float2) (localData[tbx+j].radius, localData[tbx+j].scaledRadius);
float rScaledRadiusJ = r+params2.y; float rScaledRadiusJ = r+params2.y;
if (params1.x < rScaledRadiusJ) { if (params1.x < rScaledRadiusJ) {
float l_ij = RECIP(max(params1.x, fabs(r-params2.y))); float l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
float u_ij = RECIP(rScaledRadiusJ); float u_ij = RECIP(rScaledRadiusJ);
float l_ij2 = l_ij*l_ij; float l_ij2 = l_ij*l_ij;
float u_ij2 = u_ij*u_ij; float u_ij2 = u_ij*u_ij;
float ratio = LOG(u_ij * RECIP(l_ij)); float ratio = LOG(u_ij * RECIP(l_ij));
bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) + bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2); (0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2);
if (params1.x < params2.x-r) if (params1.x < params2.x-r)
bornSum += 2.0f*(RECIP(params1.x)-l_ij); bornSum += 2.0f*(RECIP(params1.x)-l_ij);
}
float rScaledRadiusI = r+params1.y;
if (params2.x < rScaledRadiusI) {
float l_ij = RECIP(max(params2.x, fabs(r-params1.y)));
float u_ij = RECIP(rScaledRadiusI);
float l_ij2 = l_ij*l_ij;
float u_ij2 = u_ij*u_ij;
float ratio = LOG(u_ij * RECIP(l_ij));
float term = l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params1.y*params1.y*invR)*(l_ij2-u_ij2);
if (params2.x < params1.x-r)
term += 2.0f*(RECIP(params2.x)-l_ij);
tempBuffer[get_local_id(0)] = term;
}
} }
float rScaledRadiusI = r+params1.y;
if (params2.x < rScaledRadiusI) {
float l_ij = RECIP(max(params2.x, fabs(r-params1.y)));
float u_ij = RECIP(rScaledRadiusI);
float l_ij2 = l_ij*l_ij;
float u_ij2 = u_ij*u_ij;
float ratio = LOG(u_ij * RECIP(l_ij));
float term = l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params1.y*params1.y*invR)*(l_ij2-u_ij2);
if (params2.x < params1.x-r)
term += 2.0f*(RECIP(params2.x)-l_ij);
tempBuffer[get_local_id(0)] = term;
}
}
// Sum the forces on atom j. // Sum the forces on atom j.
if (tgx % 2 == 0) if (tgx % 2 == 0)
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+1]; tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+1];
if (tgx % 4 == 0) if (tgx % 4 == 0)
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+2]; tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+2];
if (tgx % 8 == 0) if (tgx % 8 == 0)
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+4]; tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+4];
if (tgx % 16 == 0) if (tgx % 16 == 0)
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+8]; tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+8];
if (tgx == 0) if (tgx == 0)
localData[tbx+j].bornSum += tempBuffer[get_local_id(0)] + tempBuffer[get_local_id(0)+16]; localData[tbx+j].bornSum += tempBuffer[get_local_id(0)] + tempBuffer[get_local_id(0)+16];
}
} }
} }
} }
} else
else
#endif #endif
{ {
// Compute the full set of interactions in this tile. // Compute the full set of interactions in this tile.
unsigned int tj = tgx; unsigned int tj = tgx;
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
float4 delta = (float4) (localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z, 0.0f); float4 delta = (float4) (localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z, 0.0f);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z; delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif #endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS && r2 < CUTOFF_SQUARED) { if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else #else
if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS) { if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS) {
#endif #endif
float invR = RSQRT(r2); float invR = RSQRT(r2);
float r = RECIP(invR); float r = RECIP(invR);
float2 params2 = (float2) (localData[tbx+tj].radius, localData[tbx+tj].scaledRadius); float2 params2 = (float2) (localData[tbx+tj].radius, localData[tbx+tj].scaledRadius);
float rScaledRadiusJ = r+params2.y; float rScaledRadiusJ = r+params2.y;
if (params1.x < rScaledRadiusJ) { if (params1.x < rScaledRadiusJ) {
float l_ij = RECIP(max(params1.x, fabs(r-params2.y))); float l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
float u_ij = RECIP(rScaledRadiusJ); float u_ij = RECIP(rScaledRadiusJ);
float l_ij2 = l_ij*l_ij; float l_ij2 = l_ij*l_ij;
float u_ij2 = u_ij*u_ij; float u_ij2 = u_ij*u_ij;
float ratio = LOG(u_ij * RECIP(l_ij)); float ratio = LOG(u_ij * RECIP(l_ij));
bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) + bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2); (0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2);
if (params1.x < params2.x-r) if (params1.x < params2.x-r)
bornSum += 2.0f*(RECIP(params1.x)-l_ij); bornSum += 2.0f*(RECIP(params1.x)-l_ij);
} }
float rScaledRadiusI = r+params1.y; float rScaledRadiusI = r+params1.y;
if (params2.x < rScaledRadiusI) { if (params2.x < rScaledRadiusI) {
float l_ij = RECIP(max(params2.x, fabs(r-params1.y))); float l_ij = RECIP(max(params2.x, fabs(r-params1.y)));
float u_ij = RECIP(rScaledRadiusI); float u_ij = RECIP(rScaledRadiusI);
float l_ij2 = l_ij*l_ij; float l_ij2 = l_ij*l_ij;
float u_ij2 = u_ij*u_ij; float u_ij2 = u_ij*u_ij;
float ratio = LOG(u_ij * RECIP(l_ij)); float ratio = LOG(u_ij * RECIP(l_ij));
float term = l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) + float term = l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params1.y*params1.y*invR)*(l_ij2-u_ij2); (0.25f*params1.y*params1.y*invR)*(l_ij2-u_ij2);
if (params2.x < params1.x-r) if (params2.x < params1.x-r)
term += 2.0f*(RECIP(params2.x)-l_ij); term += 2.0f*(RECIP(params2.x)-l_ij);
localData[tbx+tj].bornSum += term; localData[tbx+tj].bornSum += term;
}
} }
tj = (tj + 1) & (TILE_SIZE - 1);
} }
tj = (tj + 1) & (TILE_SIZE - 1);
} }
} }
}
// Write results. We need to coordinate between warps to make sure no two of them
// ever try to write to the same piece of memory at the same time.
int writeX = (pos < end ? x : -1);
int writeY = (pos < end && x != y ? y : -1);
if (tgx == 0)
reservedBlocks[localGroupIndex] = (int2)(writeX, writeY);
bool done = false;
int doneIndex = 0;
int checkIndex = 0;
while (true) {
// See if any warp still needs to write its data.
bool allDone = true;
barrier(CLK_LOCAL_MEM_FENCE);
while (doneIndex < WARPS_PER_GROUP && allDone) {
if (reservedBlocks[doneIndex].x != -1)
allDone = false;
else
doneIndex++;
}
if (allDone)
break;
if (!done) {
// See whether this warp can write its data. This requires that no previous warp
// is trying to write to the same block of the buffer.
// Write results bool canWrite = (writeX != -1);
while (checkIndex < localGroupIndex && canWrite) {
if ((reservedBlocks[checkIndex].x == x || reservedBlocks[checkIndex].y == x) ||
(writeY != -1 && (reservedBlocks[checkIndex].x == y || reservedBlocks[checkIndex].y == y)))
canWrite = false;
else
checkIndex++;
}
if (canWrite) {
// Write the data to global memory, then mark this warp as done.
reserveBuffer(x, forceBufferFlags); if (writeX > -1) {
unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS; const unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
global_bornSum[offset1] += bornSum; global_bornSum[offset] += bornSum;
releaseBuffer(x, forceBufferFlags); }
reserveBuffer(y, forceBufferFlags); if (writeY > -1) {
unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS; const unsigned int offset = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
global_bornSum[offset2] += localData[get_local_id(0)].bornSum; global_bornSum[offset] += localData[get_local_id(0)].bornSum;
releaseBuffer(y, forceBufferFlags); }
done = true;
if (tgx == 0)
reservedBlocks[localGroupIndex] = (int2)(-1, -1);
}
}
} }
lasty = y; lasty = y;
pos++; pos++;
} } while (pos < end);
} }
/** /**
...@@ -284,7 +305,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __global float4* po ...@@ -284,7 +305,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __global float4* po
__kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuffer, __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuffer,
__global float4* posq, __global float* global_bornRadii, __global float* global_bornForce, __global float4* posq, __global float* global_bornRadii, __global float* global_bornForce,
__local AtomData* localData, __local float4* tempBuffer, __global unsigned int* forceBufferFlags, __local AtomData* localData, __local float4* tempBuffer,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags) { __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags) {
#else #else
...@@ -302,113 +323,171 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e ...@@ -302,113 +323,171 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
#endif #endif
float energy = 0.0f; float energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF; unsigned int lasty = 0xFFFFFFFF;
__local int2 reservedBlocks[WARPS_PER_GROUP];
while (pos < end) {
do {
// Extract the coordinates of this tile // Extract the coordinates of this tile
const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
const unsigned int tbx = get_local_id(0) - tgx;
const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
unsigned int x, y; unsigned int x, y;
float4 force = 0.0f;
if (pos < end) {
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (numTiles <= maxTiles) { if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos]; ushort2 tileIndices = tiles[pos];
x = tileIndices.x; x = tileIndices.x;
y = tileIndices.y; y = tileIndices.y;
} }
else else
#endif #endif
{ {
y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos)); y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2); x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
} }
} unsigned int atom1 = x*TILE_SIZE + tgx;
const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); float4 posq1 = posq[atom1];
const unsigned int tbx = get_local_id(0) - tgx; float bornRadius1 = global_bornRadii[atom1];
const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE; if (x == y) {
unsigned int atom1 = x*TILE_SIZE + tgx; // This tile is on the diagonal.
float4 force = 0.0f;
float4 posq1 = posq[atom1];
float bornRadius1 = global_bornRadii[atom1];
if (x == y) {
// This tile is on the diagonal.
localData[get_local_id(0)].x = posq1.x; localData[get_local_id(0)].x = posq1.x;
localData[get_local_id(0)].y = posq1.y; localData[get_local_id(0)].y = posq1.y;
localData[get_local_id(0)].z = posq1.z; localData[get_local_id(0)].z = posq1.z;
localData[get_local_id(0)].q = posq1.w; localData[get_local_id(0)].q = posq1.w;
localData[get_local_id(0)].bornRadius = bornRadius1; localData[get_local_id(0)].bornRadius = bornRadius1;
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) { if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
float4 posq2 = (float4) (localData[tbx+j].x, localData[tbx+j].y, localData[tbx+j].z, localData[tbx+j].q); float4 posq2 = (float4) (localData[tbx+j].x, localData[tbx+j].y, localData[tbx+j].z, localData[tbx+j].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f); float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z; delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif #endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float invR = RSQRT(r2); float invR = RSQRT(r2);
float r = RECIP(invR); float r = RECIP(invR);
float bornRadius2 = localData[tbx+j].bornRadius; float bornRadius2 = localData[tbx+j].bornRadius;
float alpha2_ij = bornRadius1*bornRadius2; float alpha2_ij = bornRadius1*bornRadius2;
float D_ij = r2*RECIP(4.0f*alpha2_ij); float D_ij = r2*RECIP(4.0f*alpha2_ij);
float expTerm = EXP(-D_ij); float expTerm = EXP(-D_ij);
float denominator2 = r2 + alpha2_ij*expTerm; float denominator2 = r2 + alpha2_ij*expTerm;
float denominator = SQRT(denominator2); float denominator = SQRT(denominator2);
float tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator); float tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
float Gpol = tempEnergy*RECIP(denominator2); float Gpol = tempEnergy*RECIP(denominator2);
float dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij); float dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
float dEdR = Gpol*(1.0f - 0.25f*expTerm); float dEdR = Gpol*(1.0f - 0.25f*expTerm);
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (r2 > CUTOFF_SQUARED) { if (r2 > CUTOFF_SQUARED) {
dEdR = 0.0f; dEdR = 0.0f;
tempEnergy = 0.0f; tempEnergy = 0.0f;
dGpol_dalpha2_ij = 0.0f; dGpol_dalpha2_ij = 0.0f;
} }
#endif #endif
force.w += dGpol_dalpha2_ij*bornRadius2; force.w += dGpol_dalpha2_ij*bornRadius2;
energy += 0.5f*tempEnergy; energy += 0.5f*tempEnergy;
delta.xyz *= dEdR; delta.xyz *= dEdR;
force.xyz -= delta.xyz; force.xyz -= delta.xyz;
}
} }
} }
else {
// This is an off-diagonal tile.
// Write results if (lasty != y) {
unsigned int j = y*TILE_SIZE + tgx;
reserveBuffer(x, forceBufferFlags); float4 tempPosq = posq[j];
unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS; localData[get_local_id(0)].x = tempPosq.x;
forceBuffers[offset].xyz += force.xyz; localData[get_local_id(0)].y = tempPosq.y;
global_bornForce[offset] += force.w; localData[get_local_id(0)].z = tempPosq.z;
releaseBuffer(x, forceBufferFlags); localData[get_local_id(0)].q = tempPosq.w;
} localData[get_local_id(0)].bornRadius = global_bornRadii[j];
else { }
// This is an off-diagonal tile. localData[get_local_id(0)].fx = 0.0f;
localData[get_local_id(0)].fy = 0.0f;
localData[get_local_id(0)].fz = 0.0f;
localData[get_local_id(0)].fw = 0.0f;
#ifdef USE_CUTOFF
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
if (flags != 0xFFFFFFFF && false) { // TODO: Fix this: should be checking for exclusions
if (flags == 0) {
// No interactions in this tile.
}
else {
// Compute only a subset of the interactions in this tile.
if (lasty != y) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
unsigned int j = y*TILE_SIZE + tgx; if ((flags&(1<<j)) != 0) {
float4 tempPosq = posq[j]; float4 posq2 = (float4) (localData[tbx+j].x, localData[tbx+j].y, localData[tbx+j].z, localData[tbx+j].q);
localData[get_local_id(0)].x = tempPosq.x; float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
localData[get_local_id(0)].y = tempPosq.y; #ifdef USE_PERIODIC
localData[get_local_id(0)].z = tempPosq.z; delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
localData[get_local_id(0)].q = tempPosq.w; delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
localData[get_local_id(0)].bornRadius = global_bornRadii[j]; delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
} #endif
localData[get_local_id(0)].fx = 0.0f; float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
localData[get_local_id(0)].fy = 0.0f; float invR = RSQRT(r2);
localData[get_local_id(0)].fz = 0.0f; float r = RECIP(invR);
localData[get_local_id(0)].fw = 0.0f; float bornRadius2 = localData[tbx+j].bornRadius;
float alpha2_ij = bornRadius1*bornRadius2;
float D_ij = r2*RECIP(4.0f*alpha2_ij);
float expTerm = EXP(-D_ij);
float denominator2 = r2 + alpha2_ij*expTerm;
float denominator = SQRT(denominator2);
float tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
float Gpol = tempEnergy*RECIP(denominator2);
float dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
float dEdR = Gpol*(1.0f - 0.25f*expTerm);
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF); if (atom1 >= NUM_ATOMS || y*TILE_SIZE+j >= NUM_ATOMS || r2 > CUTOFF_SQUARED) {
if (flags != 0xFFFFFFFF && false) { // TODO: Fix this: should be checking for exclusions #else
if (flags == 0) { if (atom1 >= NUM_ATOMS || y*TILE_SIZE+j >= NUM_ATOMS) {
// No interactions in this tile. #endif
dEdR = 0.0f;
dGpol_dalpha2_ij = 0.0f;
tempEnergy = 0.0f;
}
energy += tempEnergy;
force.w += dGpol_dalpha2_ij*bornRadius2;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
tempBuffer[get_local_id(0)] = (float4) (delta.xyz, dGpol_dalpha2_ij*bornRadius1);
// Sum the forces on atom j.
if (tgx % 2 == 0)
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+1];
if (tgx % 4 == 0)
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+2];
if (tgx % 8 == 0)
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+4];
if (tgx % 16 == 0)
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+8];
if (tgx == 0) {
float4 sum = tempBuffer[get_local_id(0)] + tempBuffer[get_local_id(0)+16];
localData[tbx+j].fx += sum.x;
localData[tbx+j].fy += sum.y;
localData[tbx+j].fz += sum.z;
localData[tbx+j].fw += sum.w;
}
}
}
}
} }
else { else
// Compute only a subset of the interactions in this tile. #endif
{
// Compute the full set of interactions in this tile.
unsigned int tj = tgx;
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
if ((flags&(1<<j)) != 0) { if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS) {
float4 posq2 = (float4) (localData[tbx+j].x, localData[tbx+j].y, localData[tbx+j].z, localData[tbx+j].q); float4 posq2 = (float4) (localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z, localData[tbx+tj].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f); float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
...@@ -418,7 +497,7 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e ...@@ -418,7 +497,7 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float invR = RSQRT(r2); float invR = RSQRT(r2);
float r = RECIP(invR); float r = RECIP(invR);
float bornRadius2 = localData[tbx+j].bornRadius; float bornRadius2 = localData[tbx+tj].bornRadius;
float alpha2_ij = bornRadius1*bornRadius2; float alpha2_ij = bornRadius1*bornRadius2;
float D_ij = r2*RECIP(4.0f*alpha2_ij); float D_ij = r2*RECIP(4.0f*alpha2_ij);
float expTerm = EXP(-D_ij); float expTerm = EXP(-D_ij);
...@@ -429,104 +508,83 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e ...@@ -429,104 +508,83 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
float dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij); float dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
float dEdR = Gpol*(1.0f - 0.25f*expTerm); float dEdR = Gpol*(1.0f - 0.25f*expTerm);
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (atom1 >= NUM_ATOMS || y*TILE_SIZE+j >= NUM_ATOMS || r2 > CUTOFF_SQUARED) { if (r2 > CUTOFF_SQUARED) {
#else
if (atom1 >= NUM_ATOMS || y*TILE_SIZE+j >= NUM_ATOMS) {
#endif
dEdR = 0.0f; dEdR = 0.0f;
tempEnergy = 0.0f;
dGpol_dalpha2_ij = 0.0f; dGpol_dalpha2_ij = 0.0f;
tempEnergy = 0.0f;
} }
energy += tempEnergy; #endif
force.w += dGpol_dalpha2_ij*bornRadius2; force.w += dGpol_dalpha2_ij*bornRadius2;
energy += tempEnergy;
delta.xyz *= dEdR; delta.xyz *= dEdR;
force.xyz -= delta.xyz; force.xyz -= delta.xyz;
tempBuffer[get_local_id(0)] = (float4) (delta.xyz, dGpol_dalpha2_ij*bornRadius1); localData[tbx+tj].fx += delta.x;
localData[tbx+tj].fy += delta.y;
// Sum the forces on atom j. localData[tbx+tj].fz += delta.z;
localData[tbx+tj].fw += dGpol_dalpha2_ij*bornRadius1;
if (tgx % 2 == 0)
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+1];
if (tgx % 4 == 0)
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+2];
if (tgx % 8 == 0)
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+4];
if (tgx % 16 == 0)
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+8];
if (tgx == 0) {
float4 sum = tempBuffer[get_local_id(0)] + tempBuffer[get_local_id(0)+16];
localData[tbx+j].fx += sum.x;
localData[tbx+j].fy += sum.y;
localData[tbx+j].fz += sum.z;
localData[tbx+j].fw += sum.w;
}
} }
tj = (tj + 1) & (TILE_SIZE - 1);
} }
} }
} }
else }
#endif
{ // Write results. We need to coordinate between warps to make sure no two of them
// Compute the full set of interactions in this tile. // ever try to write to the same piece of memory at the same time.
int writeX = (pos < end ? x : -1);
int writeY = (pos < end && x != y ? y : -1);
if (tgx == 0)
reservedBlocks[localGroupIndex] = (int2)(writeX, writeY);
bool done = false;
int doneIndex = 0;
int checkIndex = 0;
while (true) {
// See if any warp still needs to write its data.
unsigned int tj = tgx; bool allDone = true;
for (unsigned int j = 0; j < TILE_SIZE; j++) { barrier(CLK_LOCAL_MEM_FENCE);
if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS) { while (doneIndex < WARPS_PER_GROUP && allDone) {
float4 posq2 = (float4) (localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z, localData[tbx+tj].q); if (reservedBlocks[doneIndex].x != -1)
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f); allDone = false;
#ifdef USE_PERIODIC else
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; doneIndex++;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float invR = RSQRT(r2);
float r = RECIP(invR);
float bornRadius2 = localData[tbx+tj].bornRadius;
float alpha2_ij = bornRadius1*bornRadius2;
float D_ij = r2*RECIP(4.0f*alpha2_ij);
float expTerm = EXP(-D_ij);
float denominator2 = r2 + alpha2_ij*expTerm;
float denominator = SQRT(denominator2);
float tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
float Gpol = tempEnergy*RECIP(denominator2);
float dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
float dEdR = Gpol*(1.0f - 0.25f*expTerm);
#ifdef USE_CUTOFF
if (r2 > CUTOFF_SQUARED) {
dEdR = 0.0f;
tempEnergy = 0.0f;
dGpol_dalpha2_ij = 0.0f;
}
#endif
force.w += dGpol_dalpha2_ij*bornRadius2;
energy += tempEnergy;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
localData[tbx+tj].fx += delta.x;
localData[tbx+tj].fy += delta.y;
localData[tbx+tj].fz += delta.z;
localData[tbx+tj].fw += dGpol_dalpha2_ij*bornRadius1;
}
tj = (tj + 1) & (TILE_SIZE - 1);
}
} }
if (allDone)
break;
if (!done) {
// See whether this warp can write its data. This requires that no previous warp
// is trying to write to the same block of the buffer.
// Write results bool canWrite = (writeX != -1);
while (checkIndex < localGroupIndex && canWrite) {
if ((reservedBlocks[checkIndex].x == x || reservedBlocks[checkIndex].y == x) ||
(writeY != -1 && (reservedBlocks[checkIndex].x == y || reservedBlocks[checkIndex].y == y)))
canWrite = false;
else
checkIndex++;
}
if (canWrite) {
// Write the data to global memory, then mark this warp as done.
reserveBuffer(x, forceBufferFlags); if (writeX > -1) {
unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS; const unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset1].xyz += force.xyz; forceBuffers[offset].xyz += force.xyz;
global_bornForce[offset1] += force.w; global_bornForce[offset] += force.w;
releaseBuffer(x, forceBufferFlags); }
reserveBuffer(y, forceBufferFlags); if (writeY > -1) {
unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS; const unsigned int offset = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset2] += (float4) (localData[get_local_id(0)].fx, localData[get_local_id(0)].fy, localData[get_local_id(0)].fz, 0); forceBuffers[offset] += (float4) (localData[get_local_id(0)].fx, localData[get_local_id(0)].fy, localData[get_local_id(0)].fz, 0.0f);
global_bornForce[offset2] += localData[get_local_id(0)].fw; global_bornForce[offset] += localData[get_local_id(0)].fw;
releaseBuffer(y, forceBufferFlags); }
done = true;
if (tgx == 0)
reservedBlocks[localGroupIndex] = (int2)(-1, -1);
}
}
} }
lasty = y; lasty = y;
pos++; pos++;
} } while (pos < end);
energyBuffer[get_global_id(0)] += energy; energyBuffer[get_global_id(0)] += energy;
} }
...@@ -13,7 +13,7 @@ typedef struct { ...@@ -13,7 +13,7 @@ typedef struct {
__kernel void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions, __kernel void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions,
__global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __local AtomData* localData, __local float4* tempBuffer, __global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __local AtomData* localData, __local float4* tempBuffer,
unsigned int startTileIndex, unsigned int endTileIndex, __global unsigned int* forceBufferFlags, unsigned int startTileIndex, unsigned int endTileIndex,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags
#else #else
......
...@@ -14,7 +14,7 @@ typedef struct { ...@@ -14,7 +14,7 @@ typedef struct {
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1))) __kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions, void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions,
__global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __local AtomData* localData, __local float4* tempBuffer, __global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __local AtomData* localData, __local float4* tempBuffer,
unsigned int startTileIndex, unsigned int endTileIndex, __global unsigned int* forceBufferFlags, unsigned int startTileIndex, unsigned int endTileIndex,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags
#else #else
......
...@@ -8,31 +8,12 @@ typedef struct { ...@@ -8,31 +8,12 @@ typedef struct {
ATOM_PARAMETER_DATA ATOM_PARAMETER_DATA
} AtomData; } AtomData;
/**
* Mark that a block in the force buffer is in use.
*/
void reserveBuffer(unsigned int block, __global unsigned int* forceBufferFlags) {
if ((get_local_id(0)&(TILE_SIZE-1)) == 0)
while (atom_cmpxchg(&forceBufferFlags[block+NUM_BLOCKS*get_group_id(0)], 0, 1) != 0)
;
mem_fence(CLK_GLOBAL_MEM_FENCE);
}
/**
* Mark that a block in the force buffer is no longer in use.
*/
void releaseBuffer(unsigned int block, __global unsigned int* forceBufferFlags) {
mem_fence(CLK_GLOBAL_MEM_FENCE);
if ((get_local_id(0)&(TILE_SIZE-1)) == 0)
forceBufferFlags[block+NUM_BLOCKS*get_group_id(0)] = 0;
}
/** /**
* Compute nonbonded interactions. * Compute nonbonded interactions.
*/ */
__kernel void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions, __kernel void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions,
__global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __local AtomData* localData, __local float* tempBuffer, __global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __local AtomData* localData, __local float* tempBuffer,
unsigned int startTileIndex, unsigned int endTileIndex, __global unsigned int* forceBufferFlags, unsigned int startTileIndex, unsigned int endTileIndex,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags
#else #else
...@@ -53,264 +34,306 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en ...@@ -53,264 +34,306 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
unsigned int lasty = 0xFFFFFFFF; unsigned int lasty = 0xFFFFFFFF;
__local unsigned int exclusionRange[2*WARPS_PER_GROUP]; __local unsigned int exclusionRange[2*WARPS_PER_GROUP];
__local int exclusionIndex[WARPS_PER_GROUP]; __local int exclusionIndex[WARPS_PER_GROUP];
__local int2* reservedBlocks = (__local int2*) exclusionRange;
while (pos < end) {
do {
// Extract the coordinates of this tile // Extract the coordinates of this tile
const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
const unsigned int tbx = get_local_id(0) - tgx;
const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
unsigned int x, y; unsigned int x, y;
float4 force = 0.0f;
if (pos < end) {
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (numTiles <= maxTiles) { if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos]; ushort2 tileIndices = tiles[pos];
x = tileIndices.x; x = tileIndices.x;
y = tileIndices.y; y = tileIndices.y;
} }
else else
#endif #endif
{ {
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos)); y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2); x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
} }
} unsigned int atom1 = x*TILE_SIZE + tgx;
const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); float4 posq1 = posq[atom1];
const unsigned int tbx = get_local_id(0) - tgx; LOAD_ATOM1_PARAMETERS
const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
unsigned int atom1 = x*TILE_SIZE + tgx;
float4 force = 0.0f;
float4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
// Locate the exclusion data for this tile. // Locate the exclusion data for this tile.
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
if (tgx < 2) if (tgx < 2)
exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx]; exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
if (tgx == 0) if (tgx == 0)
exclusionIndex[localGroupIndex] = -1; exclusionIndex[localGroupIndex] = -1;
for (int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE) for (int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
if (exclusionIndices[i] == y) if (exclusionIndices[i] == y)
exclusionIndex[localGroupIndex] = i*TILE_SIZE; exclusionIndex[localGroupIndex] = i*TILE_SIZE;
bool hasExclusions = (exclusionIndex[localGroupIndex] > -1); bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
#else #else
bool hasExclusions = false; bool hasExclusions = false;
#endif #endif
if (x == y) { if (pos >= end)
// This tile is on the diagonal. ; // This warp is done.
else if (x == y) {
// This tile is on the diagonal.
const unsigned int localAtomIndex = get_local_id(0); const unsigned int localAtomIndex = get_local_id(0);
localData[localAtomIndex].x = posq1.x; localData[localAtomIndex].x = posq1.x;
localData[localAtomIndex].y = posq1.y; localData[localAtomIndex].y = posq1.y;
localData[localAtomIndex].z = posq1.z; localData[localAtomIndex].z = posq1.z;
localData[localAtomIndex].q = posq1.w; localData[localAtomIndex].q = posq1.w;
LOAD_LOCAL_PARAMETERS_FROM_1 LOAD_LOCAL_PARAMETERS_FROM_1
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx]; unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx];
#endif #endif
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1); bool isExcluded = !(excl & 0x1);
#endif #endif
int atom2 = tbx+j; int atom2 = tbx+j;
float4 posq2 = (float4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q); float4 posq2 = (float4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f); float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z; delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif #endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float invR = RSQRT(r2); float invR = RSQRT(r2);
float r = RECIP(invR); float r = RECIP(invR);
LOAD_ATOM2_PARAMETERS LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j; atom2 = y*TILE_SIZE+j;
#ifdef USE_SYMMETRIC #ifdef USE_SYMMETRIC
float dEdR = 0.0f; float dEdR = 0.0f;
#else #else
float4 dEdR1 = (float4) 0.0f; float4 dEdR1 = (float4) 0.0f;
float4 dEdR2 = (float4) 0.0f; float4 dEdR2 = (float4) 0.0f;
#endif #endif
float tempEnergy = 0.0f; float tempEnergy = 0.0f;
COMPUTE_INTERACTION COMPUTE_INTERACTION
energy += 0.5f*tempEnergy; energy += 0.5f*tempEnergy;
#ifdef USE_SYMMETRIC #ifdef USE_SYMMETRIC
force.xyz -= delta.xyz*dEdR; force.xyz -= delta.xyz*dEdR;
#else #else
force.xyz -= dEdR1.xyz; force.xyz -= dEdR1.xyz;
#endif #endif
excl >>= 1; #ifdef USE_EXCLUSIONS
excl >>= 1;
#endif
}
} }
else {
// This is an off-diagonal tile.
// Write results const unsigned int localAtomIndex = get_local_id(0);
if (lasty != y) {
reserveBuffer(x, forceBufferFlags); unsigned int j = y*TILE_SIZE + tgx;
unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS; float4 tempPosq = posq[j];
forceBuffers[offset].xyz += force.xyz; localData[localAtomIndex].x = tempPosq.x;
releaseBuffer(x, forceBufferFlags); localData[localAtomIndex].y = tempPosq.y;
} localData[localAtomIndex].z = tempPosq.z;
else { localData[localAtomIndex].q = tempPosq.w;
// This is an off-diagonal tile. LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
const unsigned int localAtomIndex = get_local_id(0);
if (lasty != y) {
unsigned int j = y*TILE_SIZE + tgx;
float4 tempPosq = posq[j];
localData[localAtomIndex].x = tempPosq.x;
localData[localAtomIndex].y = tempPosq.y;
localData[localAtomIndex].z = tempPosq.z;
localData[localAtomIndex].q = tempPosq.w;
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
localData[localAtomIndex].fx = 0.0f;
localData[localAtomIndex].fy = 0.0f;
localData[localAtomIndex].fz = 0.0f;
#ifdef USE_CUTOFF
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
if (!hasExclusions && flags != 0xFFFFFFFF) {
if (flags == 0) {
// No interactions in this tile.
} }
else { localData[localAtomIndex].fx = 0.0f;
// Compute only a subset of the interactions in this tile. localData[localAtomIndex].fy = 0.0f;
localData[localAtomIndex].fz = 0.0f;
#ifdef USE_CUTOFF
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
if (!hasExclusions && flags != 0xFFFFFFFF) {
if (flags == 0) {
// No interactions in this tile.
}
else {
// Compute only a subset of the interactions in this tile.
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
if ((flags&(1<<j)) != 0) { if ((flags&(1<<j)) != 0) {
bool isExcluded = false; bool isExcluded = false;
int atom2 = tbx+j; int atom2 = tbx+j;
float4 posq2 = (float4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q); float4 posq2 = (float4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f); float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z; delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif #endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float invR = RSQRT(r2); float invR = RSQRT(r2);
float r = RECIP(invR); float r = RECIP(invR);
LOAD_ATOM2_PARAMETERS LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j; atom2 = y*TILE_SIZE+j;
#ifdef USE_SYMMETRIC #ifdef USE_SYMMETRIC
float dEdR = 0.0f; float dEdR = 0.0f;
#else #else
float4 dEdR1 = (float4) 0.0f; float4 dEdR1 = (float4) 0.0f;
float4 dEdR2 = (float4) 0.0f; float4 dEdR2 = (float4) 0.0f;
#endif #endif
float tempEnergy = 0.0f; float tempEnergy = 0.0f;
COMPUTE_INTERACTION COMPUTE_INTERACTION
energy += tempEnergy; energy += tempEnergy;
int bufferIndex = 3*get_local_id(0); int bufferIndex = 3*get_local_id(0);
#ifdef USE_SYMMETRIC #ifdef USE_SYMMETRIC
delta.xyz *= dEdR; delta.xyz *= dEdR;
force.xyz -= delta.xyz; force.xyz -= delta.xyz;
tempBuffer[bufferIndex] = delta.x; tempBuffer[bufferIndex] = delta.x;
tempBuffer[bufferIndex+1] = delta.y; tempBuffer[bufferIndex+1] = delta.y;
tempBuffer[bufferIndex+2] = delta.z; tempBuffer[bufferIndex+2] = delta.z;
#else #else
force.xyz -= dEdR1.xyz; force.xyz -= dEdR1.xyz;
tempBuffer[bufferIndex] = dEdR2.x; tempBuffer[bufferIndex] = dEdR2.x;
tempBuffer[bufferIndex+1] = dEdR2.y; tempBuffer[bufferIndex+1] = dEdR2.y;
tempBuffer[bufferIndex+2] = dEdR2.z; tempBuffer[bufferIndex+2] = dEdR2.z;
#endif #endif
// Sum the forces on atom2. // Sum the forces on atom2.
if (tgx % 2 == 0) { if (tgx % 2 == 0) {
tempBuffer[bufferIndex] += tempBuffer[bufferIndex+3]; tempBuffer[bufferIndex] += tempBuffer[bufferIndex+3];
tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+4]; tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+4];
tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+5]; tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+5];
} }
if (tgx % 4 == 0) { if (tgx % 4 == 0) {
tempBuffer[bufferIndex] += tempBuffer[bufferIndex+6]; tempBuffer[bufferIndex] += tempBuffer[bufferIndex+6];
tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+7]; tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+7];
tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+8]; tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+8];
} }
if (tgx % 8 == 0) { if (tgx % 8 == 0) {
tempBuffer[bufferIndex] += tempBuffer[bufferIndex+12]; tempBuffer[bufferIndex] += tempBuffer[bufferIndex+12];
tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+13]; tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+13];
tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+14]; tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+14];
} }
if (tgx % 16 == 0) { if (tgx % 16 == 0) {
tempBuffer[bufferIndex] += tempBuffer[bufferIndex+24]; tempBuffer[bufferIndex] += tempBuffer[bufferIndex+24];
tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+25]; tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+25];
tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+26]; tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+26];
} }
if (tgx == 0) { if (tgx == 0) {
localData[tbx+j].fx += tempBuffer[bufferIndex] + tempBuffer[bufferIndex+48]; localData[tbx+j].fx += tempBuffer[bufferIndex] + tempBuffer[bufferIndex+48];
localData[tbx+j].fy += tempBuffer[bufferIndex+1] + tempBuffer[bufferIndex+49]; localData[tbx+j].fy += tempBuffer[bufferIndex+1] + tempBuffer[bufferIndex+49];
localData[tbx+j].fz += tempBuffer[bufferIndex+2] + tempBuffer[bufferIndex+50]; localData[tbx+j].fz += tempBuffer[bufferIndex+2] + tempBuffer[bufferIndex+50];
}
} }
} }
} }
} }
} else
else
#endif #endif
{ {
// Compute the full set of interactions in this tile. // Compute the full set of interactions in this tile.
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF); unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF);
excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx)); excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
#endif #endif
unsigned int tj = tgx; unsigned int tj = tgx;
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1); bool isExcluded = !(excl & 0x1);
#endif #endif
int atom2 = tbx+tj; int atom2 = tbx+tj;
float4 posq2 = (float4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q); float4 posq2 = (float4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f); float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z; delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif #endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float invR = RSQRT(r2); float invR = RSQRT(r2);
float r = RECIP(invR); float r = RECIP(invR);
LOAD_ATOM2_PARAMETERS LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+tj; atom2 = y*TILE_SIZE+tj;
#ifdef USE_SYMMETRIC #ifdef USE_SYMMETRIC
float dEdR = 0.0f; float dEdR = 0.0f;
#else #else
float4 dEdR1 = (float4) 0.0f; float4 dEdR1 = (float4) 0.0f;
float4 dEdR2 = (float4) 0.0f; float4 dEdR2 = (float4) 0.0f;
#endif #endif
float tempEnergy = 0.0f; float tempEnergy = 0.0f;
COMPUTE_INTERACTION COMPUTE_INTERACTION
energy += tempEnergy; energy += tempEnergy;
#ifdef USE_SYMMETRIC #ifdef USE_SYMMETRIC
delta.xyz *= dEdR; delta.xyz *= dEdR;
force.xyz -= delta.xyz; force.xyz -= delta.xyz;
localData[tbx+tj].fx += delta.x; localData[tbx+tj].fx += delta.x;
localData[tbx+tj].fy += delta.y; localData[tbx+tj].fy += delta.y;
localData[tbx+tj].fz += delta.z; localData[tbx+tj].fz += delta.z;
#else #else
force.xyz -= dEdR1.xyz; force.xyz -= dEdR1.xyz;
localData[tbx+tj].fx += dEdR2.x; localData[tbx+tj].fx += dEdR2.x;
localData[tbx+tj].fy += dEdR2.y; localData[tbx+tj].fy += dEdR2.y;
localData[tbx+tj].fz += dEdR2.z; localData[tbx+tj].fz += dEdR2.z;
#endif #endif
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
excl >>= 1; excl >>= 1;
#endif #endif
tj = (tj + 1) & (TILE_SIZE - 1); tj = (tj + 1) & (TILE_SIZE - 1);
}
} }
} }
}
// Write results. We need to coordinate between warps to make sure no two of them
// ever try to write to the same piece of memory at the same time.
int writeX = (pos < end ? x : -1);
int writeY = (pos < end && x != y ? y : -1);
if (tgx == 0)
reservedBlocks[localGroupIndex] = (int2)(writeX, writeY);
bool done = false;
int doneIndex = 0;
int checkIndex = 0;
while (true) {
// See if any warp still needs to write its data.
// Write results bool allDone = true;
barrier(CLK_LOCAL_MEM_FENCE);
while (doneIndex < WARPS_PER_GROUP && allDone) {
if (reservedBlocks[doneIndex].x != -1)
allDone = false;
else
doneIndex++;
}
if (allDone)
break;
if (!done) {
// See whether this warp can write its data. This requires that no previous warp
// is trying to write to the same block of the buffer.
reserveBuffer(x, forceBufferFlags); bool canWrite = (writeX != -1);
unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS; while (checkIndex < localGroupIndex && canWrite) {
forceBuffers[offset1].xyz += force.xyz; if ((reservedBlocks[checkIndex].x == x || reservedBlocks[checkIndex].y == x) ||
releaseBuffer(x, forceBufferFlags); (writeY != -1 && (reservedBlocks[checkIndex].x == y || reservedBlocks[checkIndex].y == y)))
reserveBuffer(y, forceBufferFlags); canWrite = false;
unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS; else
forceBuffers[offset2] += (float4) (localData[get_local_id(0)].fx, localData[get_local_id(0)].fy, localData[get_local_id(0)].fz, 0.0f); checkIndex++;
releaseBuffer(y, forceBufferFlags); }
if (canWrite) {
// Write the data to global memory, then mark this warp as done.
if (writeX > -1) {
const unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz += force.xyz;
}
if (writeY > -1) {
const unsigned int offset = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset] += (float4) (localData[get_local_id(0)].fx, localData[get_local_id(0)].fy, localData[get_local_id(0)].fz, 0.0f);
}
done = true;
if (tgx == 0)
reservedBlocks[localGroupIndex] = (int2)(-1, -1);
}
}
} }
lasty = y; lasty = y;
pos++; pos++;
} } while (pos < end);
energyBuffer[get_global_id(0)] += energy; energyBuffer[get_global_id(0)] += energy;
} }
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment