Commit 7943a339 authored by Peter Eastman
Browse files

Restructured the use of force buffers in a new way that hopefully really works everywhere.

parent 13ef0ee8
......@@ -1738,6 +1738,8 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
defines["NUM_ATOMS"] = intToString(cl.getNumAtoms());
defines["PADDED_NUM_ATOMS"] = intToString(cl.getPaddedNumAtoms());
defines["NUM_BLOCKS"] = OpenCLExpressionUtilities::intToString(cl.getNumAtomBlocks());
if (cl.getSIMDWidth() == 32)
defines["WARPS_PER_GROUP"] = OpenCLExpressionUtilities::intToString(cl.getNonbondedUtilities().getForceThreadBlockSize()/OpenCLContext::TileSize);
string file;
if (deviceIsCpu)
file = OpenCLKernelSources::gbsaObc_cpu;
......@@ -1753,7 +1755,6 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
computeBornSumKernel.setArg<cl::Buffer>(index++, params->getDeviceBuffer());
computeBornSumKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*13*sizeof(cl_float), NULL);
computeBornSumKernel.setArg(index++, (deviceIsCpu ? 1 : nb.getForceThreadBlockSize())*sizeof(cl_float), NULL);
computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getForceBufferFlags().getDeviceBuffer());
if (nb.getUseCutoff()) {
computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer());
computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
......@@ -1773,7 +1774,6 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
force1Kernel.setArg<cl::Buffer>(index++, bornForce->getDeviceBuffer());
force1Kernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*13*sizeof(cl_float), NULL);
force1Kernel.setArg(index++, (deviceIsCpu ? 1 : nb.getForceThreadBlockSize())*sizeof(mm_float4), NULL);
force1Kernel.setArg<cl::Buffer>(index++, nb.getForceBufferFlags().getDeviceBuffer());
if (nb.getUseCutoff()) {
force1Kernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer());
force1Kernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
......@@ -1805,14 +1805,14 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
reduceBornForceKernel.setArg<cl::Buffer>(6, obcChain->getDeviceBuffer());
}
if (nb.getUseCutoff()) {
computeBornSumKernel.setArg<mm_float4>(8, cl.getPeriodicBoxSize());
computeBornSumKernel.setArg<mm_float4>(9, cl.getInvPeriodicBoxSize());
force1Kernel.setArg<mm_float4>(10, cl.getPeriodicBoxSize());
force1Kernel.setArg<mm_float4>(11, cl.getInvPeriodicBoxSize());
computeBornSumKernel.setArg<mm_float4>(7, cl.getPeriodicBoxSize());
computeBornSumKernel.setArg<mm_float4>(8, cl.getInvPeriodicBoxSize());
force1Kernel.setArg<mm_float4>(9, cl.getPeriodicBoxSize());
force1Kernel.setArg<mm_float4>(10, cl.getInvPeriodicBoxSize());
if (maxTiles < nb.getInteractingTiles().getSize()) {
maxTiles = nb.getInteractingTiles().getSize();
computeBornSumKernel.setArg<cl_uint>(10, maxTiles);
force1Kernel.setArg<cl_uint>(12, maxTiles);
force1Kernel.setArg<cl_uint>(11, maxTiles);
}
}
cl.executeKernel(computeBornSumKernel, nb.getNumForceThreadBlocks()*nb.getForceThreadBlockSize(), nb.getForceThreadBlockSize());
......@@ -2148,7 +2148,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
}
map<string, string> replacements;
replacements["COMPUTE_INTERACTION"] = n2EnergySource.str();
stringstream extraArgs, loadLocal1, loadLocal2, clearLocal, load1, load2, recordDeriv, storeDerivs1, storeDerivs2, declareTemps, setTemps;
stringstream extraArgs, loadLocal1, loadLocal2, clearLocal, load1, load2, declare1, recordDeriv, storeDerivs1, storeDerivs2, declareTemps, setTemps;
if (force.getNumGlobalParameters() > 0)
extraArgs << ", __constant float* globals";
for (int i = 0; i < (int) params->getBuffers().size(); i++) {
......@@ -2174,7 +2174,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
string index = intToString(i+1);
extraArgs << ", __global " << buffer.getType() << "* derivBuffers" << index << ", __local " << buffer.getType() << "* local_deriv" << index;
clearLocal << "local_deriv" << index << "[localAtomIndex] = 0.0f;\n";
load1 << buffer.getType() << " deriv" << index << "_1 = 0.0f;\n";
declare1 << buffer.getType() << " deriv" << index << "_1 = 0.0f;\n";
load2 << buffer.getType() << " deriv" << index << "_2 = 0.0f;\n";
recordDeriv << "local_deriv" << index << "[atom2] += deriv" << index << "_2;\n";
storeDerivs1 << "STORE_DERIVATIVE_1(" << index << ")";
......@@ -2188,6 +2188,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
replacements["CLEAR_LOCAL_DERIVATIVES"] = clearLocal.str();
replacements["LOAD_ATOM1_PARAMETERS"] = load1.str();
replacements["LOAD_ATOM2_PARAMETERS"] = load2.str();
replacements["DECLARE_ATOM1_DERIVATIVES"] = declare1.str();
replacements["RECORD_DERIVATIVE_2"] = recordDeriv.str();
replacements["STORE_DERIVATIVES_1"] = storeDerivs1.str();
replacements["STORE_DERIVATIVES_2"] = storeDerivs2.str();
......@@ -2482,7 +2483,6 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
pairValueKernel.setArg<cl::Buffer>(index++, valueBuffers->getDeviceBuffer());
pairValueKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*sizeof(cl_float), NULL);
pairValueKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*sizeof(cl_float), NULL);
pairValueKernel.setArg<cl::Buffer>(index++, nb.getForceBufferFlags().getDeviceBuffer());
if (nb.getUseCutoff()) {
pairValueKernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer());
pairValueKernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
......@@ -2531,7 +2531,6 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionIndices().getDeviceBuffer());
pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionRowIndices().getDeviceBuffer());
pairEnergyKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*sizeof(cl_float4), NULL);
pairEnergyKernel.setArg<cl::Buffer>(index++, nb.getForceBufferFlags().getDeviceBuffer());
if (nb.getUseCutoff()) {
pairEnergyKernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer());
pairEnergyKernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
......@@ -2609,14 +2608,14 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
globals->upload(globalParamValues);
}
if (nb.getUseCutoff()) {
pairValueKernel.setArg<mm_float4>(11, cl.getPeriodicBoxSize());
pairValueKernel.setArg<mm_float4>(12, cl.getInvPeriodicBoxSize());
pairEnergyKernel.setArg<mm_float4>(12, cl.getPeriodicBoxSize());
pairEnergyKernel.setArg<mm_float4>(13, cl.getInvPeriodicBoxSize());
pairValueKernel.setArg<mm_float4>(10, cl.getPeriodicBoxSize());
pairValueKernel.setArg<mm_float4>(11, cl.getInvPeriodicBoxSize());
pairEnergyKernel.setArg<mm_float4>(11, cl.getPeriodicBoxSize());
pairEnergyKernel.setArg<mm_float4>(12, cl.getInvPeriodicBoxSize());
if (maxTiles < nb.getInteractingTiles().getSize()) {
maxTiles = nb.getInteractingTiles().getSize();
pairValueKernel.setArg<cl_uint>(13, maxTiles);
pairEnergyKernel.setArg<cl_uint>(14, maxTiles);
pairValueKernel.setArg<cl_uint>(12, maxTiles);
pairEnergyKernel.setArg<cl_uint>(13, maxTiles);
}
}
cl.executeKernel(pairValueKernel, nb.getNumForceThreadBlocks()*nb.getForceThreadBlockSize(), nb.getForceThreadBlockSize());
......
......@@ -37,7 +37,7 @@ using namespace std;
OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : context(context), cutoff(-1.0), useCutoff(false),
numForceBuffers(0), exclusionIndices(NULL), exclusionRowIndices(NULL), exclusions(NULL), interactingTiles(NULL), interactionFlags(NULL),
interactionCount(NULL), blockCenter(NULL), blockBoundingBox(NULL), forceBufferFlags(NULL) {
interactionCount(NULL), blockCenter(NULL), blockBoundingBox(NULL) {
// Decide how many thread blocks and force buffers to use.
deviceIsCpu = (context.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU);
......@@ -48,8 +48,8 @@ OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : con
numForceBuffers = numForceThreadBlocks;
}
else if (context.getSIMDWidth() == 32) {
numForceThreadBlocks = 2*context.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
forceThreadBlockSize = 256;
numForceThreadBlocks = 4*context.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
forceThreadBlockSize = 128;
numForceBuffers = numForceThreadBlocks;
}
else {
......@@ -82,8 +82,6 @@ OpenCLNonbondedUtilities::~OpenCLNonbondedUtilities() {
delete blockCenter;
if (blockBoundingBox != NULL)
delete blockBoundingBox;
if (forceBufferFlags != NULL)
delete forceBufferFlags;
}
void OpenCLNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector<vector<int> >& exclusionList, const string& kernel) {
......@@ -239,12 +237,6 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
interactionCount->upload();
}
// Create the flags for reserving force buffers.
forceBufferFlags = new OpenCLArray<cl_uint>(context, numAtomBlocks*numForceThreadBlocks, "forceBufferFlags", false);
vector<cl_uint> forceBufferFlagsVec(forceBufferFlags->getSize(), 0);
forceBufferFlags->upload(forceBufferFlagsVec);
// Create kernels.
forceKernel = createInteractionKernel(kernelSource, parameters, arguments, true, true);
......@@ -320,8 +312,8 @@ void OpenCLNonbondedUtilities::prepareInteractions() {
void OpenCLNonbondedUtilities::computeInteractions() {
if (cutoff != -1.0) {
if (useCutoff) {
forceKernel.setArg<mm_float4>(13, context.getPeriodicBoxSize());
forceKernel.setArg<mm_float4>(14, context.getInvPeriodicBoxSize());
forceKernel.setArg<mm_float4>(12, context.getPeriodicBoxSize());
forceKernel.setArg<mm_float4>(13, context.getInvPeriodicBoxSize());
}
context.executeKernel(forceKernel, numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
}
......@@ -343,14 +335,14 @@ void OpenCLNonbondedUtilities::updateNeighborListSize() {
newSize = numTiles;
delete interactingTiles;
interactingTiles = new OpenCLArray<mm_ushort2>(context, newSize, "interactingTiles");
forceKernel.setArg<cl::Buffer>(11, interactingTiles->getDeviceBuffer());
forceKernel.setArg<cl_uint>(15, newSize);
forceKernel.setArg<cl::Buffer>(10, interactingTiles->getDeviceBuffer());
forceKernel.setArg<cl_uint>(14, newSize);
findInteractingBlocksKernel.setArg<cl::Buffer>(6, interactingTiles->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl_uint>(9, newSize);
if (context.getSIMDWidth() == 32 || deviceIsCpu) {
delete interactionFlags;
interactionFlags = new OpenCLArray<cl_uint>(context, deviceIsCpu ? 2*newSize : newSize, "interactionFlags");
forceKernel.setArg<cl::Buffer>(16, interactionFlags->getDeviceBuffer());
forceKernel.setArg<cl::Buffer>(15, interactionFlags->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(7, interactionFlags->getDeviceBuffer());
findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(4, interactingTiles->getDeviceBuffer());
findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(7, interactionFlags->getDeviceBuffer());
......@@ -503,7 +495,6 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
kernel.setArg(index++, 4*forceThreadBlockSize*sizeof(cl_float), NULL);
kernel.setArg<cl_uint>(index++, startTileIndex);
kernel.setArg<cl_uint>(index++, startTileIndex+numTiles);
kernel.setArg<cl::Buffer>(index++, forceBufferFlags->getDeviceBuffer());
if (useCutoff) {
kernel.setArg<cl::Buffer>(index++, interactingTiles->getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, interactionCount->getDeviceBuffer());
......
......@@ -196,12 +196,6 @@ public:
OpenCLArray<cl_uint>& getExclusionRowIndices() {
return *exclusionRowIndices;
}
/**
 * Get the array which contains flags for reserving force buffers.
 *
 * @return a reference obtained by dereferencing the forceBufferFlags pointer
 */
OpenCLArray<cl_uint>& getForceBufferFlags() {
// NOTE(review): no null check is performed here; presumably this is only valid
// once forceBufferFlags has been allocated - confirm against callers.
return *forceBufferFlags;
}
/**
* Get the index of the first tile this context is responsible for processing.
*/
......@@ -245,7 +239,6 @@ private:
OpenCLArray<cl_uint>* interactionCount;
OpenCLArray<mm_float4>* blockCenter;
OpenCLArray<mm_float4>* blockBoundingBox;
OpenCLArray<cl_uint>* forceBufferFlags;
std::vector<std::vector<int> > atomExclusions;
std::vector<ParameterInfo> parameters;
std::vector<ParameterInfo> arguments;
......
......@@ -8,7 +8,7 @@
__kernel void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer, __local float4* local_force,
__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions, __global unsigned int* exclusionIndices,
__global unsigned int* exclusionRowIndices, __local float4* tempBuffer, __global unsigned int* forceBufferFlags,
__global unsigned int* exclusionRowIndices, __local float4* tempBuffer,
#ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags
#else
......
......@@ -9,7 +9,7 @@
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer, __local float4* local_force,
__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions, __global unsigned int* exclusionIndices,
__global unsigned int* exclusionRowIndices, __local float4* tempForceBuffer, __global unsigned int* forceBufferFlags,
__global unsigned int* exclusionRowIndices, __local float4* tempForceBuffer,
#ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles
#else
......
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#define TILE_SIZE 32
#define STORE_DERIVATIVE_1(INDEX) derivBuffers##INDEX[offset1] += deriv##INDEX##_1;
#define STORE_DERIVATIVE_2(INDEX) derivBuffers##INDEX[offset2] += local_deriv##INDEX[get_local_id(0)];
/**
 * Mark that a block in the force buffer is in use.
 *
 * The work-item for which (get_local_id(0) & (TILE_SIZE-1)) == 0 spins on an
 * atomic compare-and-exchange until it changes the flag at index
 * block + NUM_BLOCKS*get_group_id(0) from 0 to 1.  All other work-items skip
 * the spin loop and go straight to the fence.
 *
 * @param block             index of the block to reserve
 * @param forceBufferFlags  global array of flags, indexed per block and per work-group
 */
void reserveBuffer(unsigned int block, __global unsigned int* forceBufferFlags) {
// Only one work-item out of every TILE_SIZE consecutive local ids takes the flag.
if ((get_local_id(0)&(TILE_SIZE-1)) == 0)
// Busy-wait: atom_cmpxchg returns the old value, so a non-zero result means the flag was already set.
while (atom_cmpxchg(&forceBufferFlags[block+NUM_BLOCKS*get_group_id(0)], 0, 1) != 0)
;
// NOTE(review): work-items that did not spin are not made to wait for the one that did;
// presumably this relies on the TILE_SIZE work-items executing in lockstep - confirm.
mem_fence(CLK_GLOBAL_MEM_FENCE);
}
/**
 * Mark that a block in the force buffer is no longer in use.
 *
 * Every calling work-item first executes a global-memory fence.  The work-item
 * for which (get_local_id(0) & (TILE_SIZE-1)) == 0 then resets the flag at index
 * block + NUM_BLOCKS*get_group_id(0) to 0 using a plain (non-atomic) store.
 *
 * @param block             index of the block to release
 * @param forceBufferFlags  global array of flags, indexed per block and per work-group
 */
void releaseBuffer(unsigned int block, __global unsigned int* forceBufferFlags) {
// Fence before clearing the flag; presumably so that earlier global writes are
// ordered ahead of the release - confirm against the OpenCL memory model.
mem_fence(CLK_GLOBAL_MEM_FENCE);
// Same work-item selection as in reserveBuffer(): one per TILE_SIZE consecutive local ids.
if ((get_local_id(0)&(TILE_SIZE-1)) == 0)
forceBufferFlags[block+NUM_BLOCKS*get_group_id(0)] = 0;
}
#define STORE_DERIVATIVE_1(INDEX) derivBuffers##INDEX[offset] += deriv##INDEX##_1;
#define STORE_DERIVATIVE_2(INDEX) derivBuffers##INDEX[offset] += local_deriv##INDEX[get_local_id(0)];
/**
* Compute a force based on pair interactions.
*/
__kernel void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer, __local float4* local_force,
__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions, __global unsigned int* exclusionIndices,
__global unsigned int* exclusionRowIndices, __local float4* tempBuffer, __global unsigned int* forceBufferFlags,
__global unsigned int* exclusionRowIndices, __local float4* tempBuffer,
#ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags
#else
......@@ -48,10 +29,17 @@ __kernel void computeN2Energy(__global float4* forceBuffers, __global float* ene
unsigned int lasty = 0xFFFFFFFF;
__local unsigned int exclusionRange[2*WARPS_PER_GROUP];
__local int exclusionIndex[WARPS_PER_GROUP];
__local int2* reservedBlocks = (__local int2*) exclusionRange;
while (pos < end) {
do {
// Extract the coordinates of this tile
const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
const unsigned int tbx = get_local_id(0) - tgx;
const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
unsigned int x, y;
float4 force = 0.0f;
DECLARE_ATOM1_DERIVATIVES
if (pos < end) {
#ifdef USE_CUTOFF
if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos];
......@@ -68,11 +56,7 @@ __kernel void computeN2Energy(__global float4* forceBuffers, __global float* ene
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
const unsigned int tbx = get_local_id(0) - tgx;
const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
unsigned int atom1 = x*TILE_SIZE + tgx;
float4 force = 0.0f;
float4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
......@@ -90,7 +74,9 @@ __kernel void computeN2Energy(__global float4* forceBuffers, __global float* ene
#else
bool hasExclusions = false;
#endif
if (x == y) {
if (pos >= end)
; // This warp is done.
else if (x == y) {
// This tile is on the diagonal.
const unsigned int localAtomIndex = get_local_id(0);
......@@ -134,14 +120,6 @@ __kernel void computeN2Energy(__global float4* forceBuffers, __global float* ene
excl >>= 1;
#endif
}
// Write results
reserveBuffer(x, forceBufferFlags);
unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset1].xyz += force.xyz;
STORE_DERIVATIVES_1
releaseBuffer(x, forceBufferFlags);
}
else {
// This is an off-diagonal tile.
......@@ -209,22 +187,65 @@ __kernel void computeN2Energy(__global float4* forceBuffers, __global float* ene
tj = (tj + 1) & (TILE_SIZE - 1);
}
}
}
}
lasty = y;
// Write results. We need to coordinate between warps to make sure no two of them
// ever try to write to the same piece of memory at the same time.
int writeX = (pos < end ? x : -1);
int writeY = (pos < end && x != y ? y : -1);
if (tgx == 0)
reservedBlocks[localGroupIndex] = (int2)(writeX, writeY);
bool done = false;
int doneIndex = 0;
int checkIndex = 0;
while (true) {
// See if any warp still needs to write its data.
bool allDone = true;
barrier(CLK_LOCAL_MEM_FENCE);
while (doneIndex < WARPS_PER_GROUP && allDone) {
if (reservedBlocks[doneIndex].x != -1)
allDone = false;
else
doneIndex++;
}
if (allDone)
break;
if (!done) {
// See whether this warp can write its data. This requires that no previous warp
// is trying to write to the same block of the buffer.
// Write results
bool canWrite = (writeX != -1);
while (checkIndex < localGroupIndex && canWrite) {
if ((reservedBlocks[checkIndex].x == x || reservedBlocks[checkIndex].y == x) ||
(writeY != -1 && (reservedBlocks[checkIndex].x == y || reservedBlocks[checkIndex].y == y)))
canWrite = false;
else
checkIndex++;
}
if (canWrite) {
// Write the data to global memory, then mark this warp as done.
reserveBuffer(x, forceBufferFlags);
unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset1].xyz += force.xyz;
if (writeX > -1) {
const unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz += force.xyz;
STORE_DERIVATIVES_1
releaseBuffer(x, forceBufferFlags);
reserveBuffer(y, forceBufferFlags);
unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset2].xyz += local_force[get_local_id(0)].xyz;
}
if (writeY > -1) {
const unsigned int offset = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz += local_force[get_local_id(0)].xyz;
STORE_DERIVATIVES_2
releaseBuffer(y, forceBufferFlags);
}
lasty = y;
pos++;
done = true;
if (tgx == 0)
reservedBlocks[localGroupIndex] = (int2)(-1, -1);
}
}
}
pos++;
} while (pos < end);
energyBuffer[get_global_id(0)] += energy;
}
......@@ -6,7 +6,7 @@
__kernel void computeN2Value(__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions,
__global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __global float* global_value, __local float* local_value,
__local float* tempBuffer, __global unsigned int* forceBufferFlags,
__local float* tempBuffer,
#ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags
#else
......
......@@ -7,7 +7,7 @@
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeN2Value(__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions,
__global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __global float* global_value, __local float* local_value,
__local float* tempBuffer, __global unsigned int* forceBufferFlags,
__local float* tempBuffer,
#ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles
#else
......
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#define TILE_SIZE 32
/**
 * Mark that a block in the value buffer is in use.
 *
 * The work-item for which (get_local_id(0) & (TILE_SIZE-1)) == 0 spins on an
 * atomic compare-and-exchange until it changes the flag at index
 * block + NUM_BLOCKS*get_group_id(0) from 0 to 1.  All other work-items skip
 * the spin loop and go straight to the fence.
 *
 * @param block             index of the block to reserve
 * @param forceBufferFlags  global array of flags, indexed per block and per work-group
 */
void reserveBuffer(unsigned int block, __global unsigned int* forceBufferFlags) {
// Only one work-item out of every TILE_SIZE consecutive local ids takes the flag.
if ((get_local_id(0)&(TILE_SIZE-1)) == 0)
// Busy-wait: atom_cmpxchg returns the old value, so a non-zero result means the flag was already set.
while (atom_cmpxchg(&forceBufferFlags[block+NUM_BLOCKS*get_group_id(0)], 0, 1) != 0)
;
// NOTE(review): work-items that did not spin are not made to wait for the one that did;
// presumably this relies on the TILE_SIZE work-items executing in lockstep - confirm.
mem_fence(CLK_GLOBAL_MEM_FENCE);
}
/**
 * Mark that a block in the value buffer is no longer in use.
 *
 * Every calling work-item first executes a global-memory fence.  The work-item
 * for which (get_local_id(0) & (TILE_SIZE-1)) == 0 then resets the flag at index
 * block + NUM_BLOCKS*get_group_id(0) to 0 using a plain (non-atomic) store.
 *
 * @param block             index of the block to release
 * @param forceBufferFlags  global array of flags, indexed per block and per work-group
 */
void releaseBuffer(unsigned int block, __global unsigned int* forceBufferFlags) {
// Fence before clearing the flag; presumably so that earlier global writes are
// ordered ahead of the release - confirm against the OpenCL memory model.
mem_fence(CLK_GLOBAL_MEM_FENCE);
// Same work-item selection as in reserveBuffer(): one per TILE_SIZE consecutive local ids.
if ((get_local_id(0)&(TILE_SIZE-1)) == 0)
forceBufferFlags[block+NUM_BLOCKS*get_group_id(0)] = 0;
}
/**
* Compute a value based on pair interactions.
*/
__kernel void computeN2Value(__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions,
__global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __global float* global_value, __local float* local_value,
__local float* tempBuffer, __global unsigned int* forceBufferFlags,
__local float* tempBuffer,
#ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags
#else
......@@ -46,10 +27,16 @@ __kernel void computeN2Value(__global float4* posq, __local float4* local_posq,
unsigned int lasty = 0xFFFFFFFF;
__local unsigned int exclusionRange[2*WARPS_PER_GROUP];
__local int exclusionIndex[WARPS_PER_GROUP];
__local int2* reservedBlocks = (__local int2*) exclusionRange;
while (pos < end) {
do {
// Extract the coordinates of this tile
const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
const unsigned int tbx = get_local_id(0) - tgx;
const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
unsigned int x, y;
float value = 0.0f;
if (pos < end) {
#ifdef USE_CUTOFF
if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos];
......@@ -66,11 +53,7 @@ __kernel void computeN2Value(__global float4* posq, __local float4* local_posq,
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
const unsigned int tbx = get_local_id(0) - tgx;
const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
unsigned int atom1 = x*TILE_SIZE + tgx;
float value = 0.0f;
float4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
......@@ -88,7 +71,9 @@ __kernel void computeN2Value(__global float4* posq, __local float4* local_posq,
#else
bool hasExclusions = false;
#endif
if (x == y) {
if (pos >= end)
; // This warp is done.
else if (x == y) {
// This tile is on the diagonal.
const unsigned int localAtomIndex = get_local_id(0);
......@@ -133,13 +118,6 @@ __kernel void computeN2Value(__global float4* posq, __local float4* local_posq,
excl >>= 1;
#endif
}
// Write results
reserveBuffer(x, forceBufferFlags);
unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
global_value[offset] += value;
releaseBuffer(x, forceBufferFlags);
}
else {
// This is an off-diagonal tile.
......@@ -249,19 +227,62 @@ __kernel void computeN2Value(__global float4* posq, __local float4* local_posq,
tj = (tj + 1) & (TILE_SIZE - 1);
}
}
}
}
// Write results. We need to coordinate between warps to make sure no two of them
// ever try to write to the same piece of memory at the same time.
int writeX = (pos < end ? x : -1);
int writeY = (pos < end && x != y ? y : -1);
if (tgx == 0)
reservedBlocks[localGroupIndex] = (int2)(writeX, writeY);
bool done = false;
int doneIndex = 0;
int checkIndex = 0;
while (true) {
// See if any warp still needs to write its data.
bool allDone = true;
barrier(CLK_LOCAL_MEM_FENCE);
while (doneIndex < WARPS_PER_GROUP && allDone) {
if (reservedBlocks[doneIndex].x != -1)
allDone = false;
else
doneIndex++;
}
if (allDone)
break;
if (!done) {
// See whether this warp can write its data. This requires that no previous warp
// is trying to write to the same block of the buffer.
// Write results
bool canWrite = (writeX != -1);
while (checkIndex < localGroupIndex && canWrite) {
if ((reservedBlocks[checkIndex].x == x || reservedBlocks[checkIndex].y == x) ||
(writeY != -1 && (reservedBlocks[checkIndex].x == y || reservedBlocks[checkIndex].y == y)))
canWrite = false;
else
checkIndex++;
}
if (canWrite) {
// Write the data to global memory, then mark this warp as done.
reserveBuffer(x, forceBufferFlags);
unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
global_value[offset1] += value;
releaseBuffer(x, forceBufferFlags);
reserveBuffer(y, forceBufferFlags);
unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
global_value[offset2] += local_value[get_local_id(0)];
releaseBuffer(y, forceBufferFlags);
if (writeX > -1) {
const unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
global_value[offset] += value;
}
if (writeY > -1) {
const unsigned int offset = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
global_value[offset] += local_value[get_local_id(0)];
}
done = true;
if (tgx == 0)
reservedBlocks[localGroupIndex] = (int2)(-1, -1);
}
}
}
lasty = y;
pos++;
}
} while (pos < end);
}
......@@ -15,7 +15,7 @@ typedef struct {
*/
__kernel void computeBornSum(__global float* global_bornSum, __global float4* posq, __global float2* global_params,
__local AtomData* localData, __local float* tempBuffer, __global unsigned int* forceBufferFlags,
__local AtomData* localData, __local float* tempBuffer,
#ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags) {
#else
......@@ -192,7 +192,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __global float4* po
__kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuffer,
__global float4* posq, __global float* global_bornRadii, __global float* global_bornForce,
__local AtomData* localData, __local float4* tempBuffer, __global unsigned int* forceBufferFlags,
__local AtomData* localData, __local float4* tempBuffer,
#ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags) {
#else
......
......@@ -16,7 +16,7 @@ typedef struct {
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeBornSum(__global float* global_bornSum, __global float4* posq, __global float2* global_params,
__local AtomData* localData, __local float* tempBuffer, __global unsigned int* forceBufferFlags,
__local AtomData* localData, __local float* tempBuffer,
#ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles) {
#else
......@@ -203,7 +203,7 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuffer,
__global float4* posq, __global float* global_bornRadii, __global float* global_bornForce,
__local AtomData* localData, __local float4* tempBuffer, __global unsigned int* forceBufferFlags,
__local AtomData* localData, __local float4* tempBuffer,
#ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles) {
#else
......
......@@ -11,30 +11,11 @@ typedef struct {
float bornForce;
} AtomData;
/**
 * Mark that a block in the force buffer is in use.
 *
 * The work-item for which (get_local_id(0) & (TILE_SIZE-1)) == 0 spins on an
 * atomic compare-and-exchange until it changes the flag at index
 * block + NUM_BLOCKS*get_group_id(0) from 0 to 1.  All other work-items skip
 * the spin loop and go straight to the fence.
 *
 * @param block             index of the block to reserve
 * @param forceBufferFlags  global array of flags, indexed per block and per work-group
 */
void reserveBuffer(unsigned int block, __global unsigned int* forceBufferFlags) {
// Only one work-item out of every TILE_SIZE consecutive local ids takes the flag
// (assuming TILE_SIZE is a power of two, as the mask implies).
if ((get_local_id(0)&(TILE_SIZE-1)) == 0)
// Busy-wait: atom_cmpxchg returns the old value, so a non-zero result means the flag was already set.
while (atom_cmpxchg(&forceBufferFlags[block+NUM_BLOCKS*get_group_id(0)], 0, 1) != 0)
;
// NOTE(review): work-items that did not spin are not made to wait for the one that did;
// presumably this relies on the TILE_SIZE work-items executing in lockstep - confirm.
mem_fence(CLK_GLOBAL_MEM_FENCE);
}
/**
 * Mark that a block in the force buffer is no longer in use.
 *
 * Every calling work-item first executes a global-memory fence.  The work-item
 * for which (get_local_id(0) & (TILE_SIZE-1)) == 0 then resets the flag at index
 * block + NUM_BLOCKS*get_group_id(0) to 0 using a plain (non-atomic) store.
 *
 * @param block             index of the block to release
 * @param forceBufferFlags  global array of flags, indexed per block and per work-group
 */
void releaseBuffer(unsigned int block, __global unsigned int* forceBufferFlags) {
// Fence before clearing the flag; presumably so that earlier global writes are
// ordered ahead of the release - confirm against the OpenCL memory model.
mem_fence(CLK_GLOBAL_MEM_FENCE);
// Same work-item selection as in reserveBuffer(): one per TILE_SIZE consecutive local ids.
if ((get_local_id(0)&(TILE_SIZE-1)) == 0)
forceBufferFlags[block+NUM_BLOCKS*get_group_id(0)] = 0;
}
/**
* Compute the Born sum.
*/
__kernel void computeBornSum(__global float* global_bornSum, __global float4* posq, __global float2* global_params,
__local AtomData* localData, __local float* tempBuffer, __global unsigned int* forceBufferFlags,
__local AtomData* localData, __local float* tempBuffer,
#ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags) {
#else
......@@ -51,10 +32,16 @@ __kernel void computeBornSum(__global float* global_bornSum, __global float4* po
unsigned int end = (warp+1)*numTiles/totalWarps;
#endif
unsigned int lasty = 0xFFFFFFFF;
__local int2 reservedBlocks[WARPS_PER_GROUP];
while (pos < end) {
do {
// Extract the coordinates of this tile
const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
const unsigned int tbx = get_local_id(0) - tgx;
const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
unsigned int x, y;
float bornSum = 0.0f;
if (pos < end) {
#ifdef USE_CUTOFF
if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos];
......@@ -71,14 +58,12 @@ __kernel void computeBornSum(__global float* global_bornSum, __global float4* po
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
const unsigned int tbx = get_local_id(0) - tgx;
const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
unsigned int atom1 = x*TILE_SIZE + tgx;
float bornSum = 0.0f;
float4 posq1 = posq[atom1];
float2 params1 = global_params[atom1];
if (x == y) {
if (pos >= end)
; // This warp is done.
else if (x == y) {
// This tile is on the diagonal.
localData[get_local_id(0)].x = posq1.x;
......@@ -117,13 +102,6 @@ __kernel void computeBornSum(__global float* global_bornSum, __global float4* po
}
}
}
// Write results
reserveBuffer(x, forceBufferFlags);
unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
global_bornSum[offset] += bornSum;
releaseBuffer(x, forceBufferFlags);
}
else {
// This is an off-diagonal tile.
......@@ -261,21 +239,64 @@ __kernel void computeBornSum(__global float* global_bornSum, __global float4* po
tj = (tj + 1) & (TILE_SIZE - 1);
}
}
}
}
// Write results
// Write results. We need to coordinate between warps to make sure no two of them
// ever try to write to the same piece of memory at the same time.
reserveBuffer(x, forceBufferFlags);
unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
global_bornSum[offset1] += bornSum;
releaseBuffer(x, forceBufferFlags);
reserveBuffer(y, forceBufferFlags);
unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
global_bornSum[offset2] += localData[get_local_id(0)].bornSum;
releaseBuffer(y, forceBufferFlags);
// Decide which blocks of the buffer this warp wants to update. -1 is the "nothing to
// write" sentinel: both values are -1 once pos has reached end, and writeY is -1 for a
// diagonal tile (x == y) so that the same block is not written twice.
int writeX = (pos < end ? x : -1);
int writeY = (pos < end && x != y ? y : -1);
// One lane per warp (tgx == 0) publishes the pair in this warp's slot of reservedBlocks.
if (tgx == 0)
reservedBlocks[localGroupIndex] = (int2)(writeX, writeY);
bool done = false;
// doneIndex and checkIndex are never reset inside the loop below, so a slot that has
// already been seen as finished (doneIndex) or as non-conflicting (checkIndex) is not
// examined again on later passes.
int doneIndex = 0;
int checkIndex = 0;
while (true) {
// See if any warp still needs to write its data.
bool allDone = true;
// Every pass begins with a work-group barrier, so the slot stores made before the loop
// and the (-1, -1) stores made at the bottom of the previous pass precede the reads below.
// NOTE(review): the (-1, -1) store at the bottom of a pass is issued in the same barrier
// interval in which other warps read reservedBlocks; it is not evident from this span
// that all warps always agree on allDone in the same pass -- confirm.
barrier(CLK_LOCAL_MEM_FENCE);
// Only .x is tested: writeY is non-negative only when writeX is, so .x == -1 means the
// slot has nothing pending.
while (doneIndex < WARPS_PER_GROUP && allDone) {
if (reservedBlocks[doneIndex].x != -1)
allDone = false;
else
doneIndex++;
}
if (allDone)
break;
if (!done) {
// See whether this warp can write its data. This requires that no previous warp
// is trying to write to the same block of the buffer.
// "Previous" means a slot index below localGroupIndex. A warp with nothing to write
// starts with canWrite == false and therefore never enters the write branch.
bool canWrite = (writeX != -1);
while (checkIndex < localGroupIndex && canWrite) {
if ((reservedBlocks[checkIndex].x == x || reservedBlocks[checkIndex].y == x) ||
(writeY != -1 && (reservedBlocks[checkIndex].x == y || reservedBlocks[checkIndex].y == y)))
canWrite = false;
else
checkIndex++;
}
if (canWrite) {
// Write the data to global memory, then mark this warp as done.
// Each lane updates one element of block x with its own bornSum accumulator.
if (writeX > -1) {
const unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
global_bornSum[offset] += bornSum;
}
// For an off-diagonal tile, block y receives the value held in this work-item's
// localData entry.
if (writeY > -1) {
const unsigned int offset = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
global_bornSum[offset] += localData[get_local_id(0)].bornSum;
}
done = true;
// Clearing the slot lets later warps that were waiting on these blocks proceed.
if (tgx == 0)
reservedBlocks[localGroupIndex] = (int2)(-1, -1);
}
}
}
lasty = y;
pos++;
}
} while (pos < end);
}
/**
......@@ -284,7 +305,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __global float4* po
__kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuffer,
__global float4* posq, __global float* global_bornRadii, __global float* global_bornForce,
__local AtomData* localData, __local float4* tempBuffer, __global unsigned int* forceBufferFlags,
__local AtomData* localData, __local float4* tempBuffer,
#ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags) {
#else
......@@ -302,10 +323,16 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
#endif
float energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF;
__local int2 reservedBlocks[WARPS_PER_GROUP];
while (pos < end) {
do {
// Extract the coordinates of this tile
const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
const unsigned int tbx = get_local_id(0) - tgx;
const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
unsigned int x, y;
float4 force = 0.0f;
if (pos < end) {
#ifdef USE_CUTOFF
if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos];
......@@ -322,11 +349,7 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
const unsigned int tbx = get_local_id(0) - tgx;
const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
unsigned int atom1 = x*TILE_SIZE + tgx;
float4 force = 0.0f;
float4 posq1 = posq[atom1];
float bornRadius1 = global_bornRadii[atom1];
if (x == y) {
......@@ -372,14 +395,6 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
force.xyz -= delta.xyz;
}
}
// Write results
reserveBuffer(x, forceBufferFlags);
unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz += force.xyz;
global_bornForce[offset] += force.w;
releaseBuffer(x, forceBufferFlags);
}
else {
// This is an off-diagonal tile.
......@@ -511,22 +526,65 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
tj = (tj + 1) & (TILE_SIZE - 1);
}
}
}
}
// Write results
// Write results. We need to coordinate between warps to make sure no two of them
// ever try to write to the same piece of memory at the same time.
reserveBuffer(x, forceBufferFlags);
unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset1].xyz += force.xyz;
global_bornForce[offset1] += force.w;
releaseBuffer(x, forceBufferFlags);
reserveBuffer(y, forceBufferFlags);
unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset2] += (float4) (localData[get_local_id(0)].fx, localData[get_local_id(0)].fy, localData[get_local_id(0)].fz, 0);
global_bornForce[offset2] += localData[get_local_id(0)].fw;
releaseBuffer(y, forceBufferFlags);
// Decide which blocks of the buffers this warp wants to update. -1 is the "nothing to
// write" sentinel: both values are -1 once pos has reached end, and writeY is -1 for a
// diagonal tile (x == y) so that the same block is not written twice.
int writeX = (pos < end ? x : -1);
int writeY = (pos < end && x != y ? y : -1);
// One lane per warp (tgx == 0) publishes the pair in this warp's slot of reservedBlocks.
if (tgx == 0)
reservedBlocks[localGroupIndex] = (int2)(writeX, writeY);
bool done = false;
// doneIndex and checkIndex are never reset inside the loop below, so a slot that has
// already been seen as finished (doneIndex) or as non-conflicting (checkIndex) is not
// examined again on later passes.
int doneIndex = 0;
int checkIndex = 0;
while (true) {
// See if any warp still needs to write its data.
bool allDone = true;
// Every pass begins with a work-group barrier, so the slot stores made before the loop
// and the (-1, -1) stores made at the bottom of the previous pass precede the reads below.
// NOTE(review): the (-1, -1) store at the bottom of a pass is issued in the same barrier
// interval in which other warps read reservedBlocks; it is not evident from this span
// that all warps always agree on allDone in the same pass -- confirm.
barrier(CLK_LOCAL_MEM_FENCE);
// Only .x is tested: writeY is non-negative only when writeX is, so .x == -1 means the
// slot has nothing pending.
while (doneIndex < WARPS_PER_GROUP && allDone) {
if (reservedBlocks[doneIndex].x != -1)
allDone = false;
else
doneIndex++;
}
if (allDone)
break;
if (!done) {
// See whether this warp can write its data. This requires that no previous warp
// is trying to write to the same block of the buffer.
// "Previous" means a slot index below localGroupIndex. A warp with nothing to write
// starts with canWrite == false and therefore never enters the write branch.
bool canWrite = (writeX != -1);
while (checkIndex < localGroupIndex && canWrite) {
if ((reservedBlocks[checkIndex].x == x || reservedBlocks[checkIndex].y == x) ||
(writeY != -1 && (reservedBlocks[checkIndex].x == y || reservedBlocks[checkIndex].y == y)))
canWrite = false;
else
checkIndex++;
}
if (canWrite) {
// Write the data to global memory, then mark this warp as done.
// Block x: force.xyz is added to forceBuffers and force.w to global_bornForce.
if (writeX > -1) {
const unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz += force.xyz;
global_bornForce[offset] += force.w;
}
// Block y (off-diagonal tiles only): the values come from this work-item's localData
// entry; the w component added to forceBuffers is zero and fw goes to global_bornForce.
if (writeY > -1) {
const unsigned int offset = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset] += (float4) (localData[get_local_id(0)].fx, localData[get_local_id(0)].fy, localData[get_local_id(0)].fz, 0.0f);
global_bornForce[offset] += localData[get_local_id(0)].fw;
}
done = true;
// Clearing the slot lets later warps that were waiting on these blocks proceed.
if (tgx == 0)
reservedBlocks[localGroupIndex] = (int2)(-1, -1);
}
}
}
lasty = y;
pos++;
}
} while (pos < end);
energyBuffer[get_global_id(0)] += energy;
}
......@@ -13,7 +13,7 @@ typedef struct {
__kernel void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions,
__global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __local AtomData* localData, __local float4* tempBuffer,
unsigned int startTileIndex, unsigned int endTileIndex, __global unsigned int* forceBufferFlags,
unsigned int startTileIndex, unsigned int endTileIndex,
#ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags
#else
......
......@@ -14,7 +14,7 @@ typedef struct {
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions,
__global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __local AtomData* localData, __local float4* tempBuffer,
unsigned int startTileIndex, unsigned int endTileIndex, __global unsigned int* forceBufferFlags,
unsigned int startTileIndex, unsigned int endTileIndex,
#ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags
#else
......
......@@ -8,31 +8,12 @@ typedef struct {
ATOM_PARAMETER_DATA
} AtomData;
/**
 * Mark that a block in the force buffer is in use.
 *
 * Only work-items whose local id has its low bits clear (local id & (TILE_SIZE-1) == 0)
 * touch the flag: such a work-item spins on atom_cmpxchg until it changes the flag
 * from 0 to 1. Every work-item then executes a global memory fence.
 *
 * NOTE(review): the other work-items do not wait on the flag themselves, so this appears
 * to rely on them not running ahead of the spinning work-item -- confirm for each
 * target device.
 *
 * @param block             index of the block to reserve; the flag used is
 *                          forceBufferFlags[block + NUM_BLOCKS*get_group_id(0)]
 * @param forceBufferFlags  global array of flags (0 = free, 1 = in use)
 */
void reserveBuffer(unsigned int block, __global unsigned int* forceBufferFlags) {
// Presumably TILE_SIZE is a power of two, making this "first lane of each tile" -- confirm.
if ((get_local_id(0)&(TILE_SIZE-1)) == 0)
// Busy-wait: atom_cmpxchg returns the previous value, so a non-zero result means the
// flag was already set and the attempt must be repeated.
while (atom_cmpxchg(&forceBufferFlags[block+NUM_BLOCKS*get_group_id(0)], 0, 1) != 0)
;
// Fence placed after the flag has been acquired and before the caller touches the buffer.
mem_fence(CLK_GLOBAL_MEM_FENCE);
}
/**
 * Mark that a block in the force buffer is no longer in use.
 *
 * Every work-item first executes a global memory fence; then the work-items whose local
 * id has its low bits clear (local id & (TILE_SIZE-1) == 0) reset the flag to 0 with a
 * plain, non-atomic store.
 *
 * @param block             index of the block to release; the flag used is
 *                          forceBufferFlags[block + NUM_BLOCKS*get_group_id(0)]
 * @param forceBufferFlags  global array of flags (0 = free, 1 = in use)
 */
void releaseBuffer(unsigned int block, __global unsigned int* forceBufferFlags) {
// Fence placed after the caller's buffer writes and before the flag is cleared.
mem_fence(CLK_GLOBAL_MEM_FENCE);
// Same work-item selection as in reserveBuffer, so the work-item that set the flag clears it.
if ((get_local_id(0)&(TILE_SIZE-1)) == 0)
forceBufferFlags[block+NUM_BLOCKS*get_group_id(0)] = 0;
}
/**
* Compute nonbonded interactions.
*/
__kernel void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions,
__global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __local AtomData* localData, __local float* tempBuffer,
unsigned int startTileIndex, unsigned int endTileIndex, __global unsigned int* forceBufferFlags,
unsigned int startTileIndex, unsigned int endTileIndex,
#ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags
#else
......@@ -53,10 +34,16 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
unsigned int lasty = 0xFFFFFFFF;
__local unsigned int exclusionRange[2*WARPS_PER_GROUP];
__local int exclusionIndex[WARPS_PER_GROUP];
__local int2* reservedBlocks = (__local int2*) exclusionRange;
while (pos < end) {
do {
// Extract the coordinates of this tile
const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
const unsigned int tbx = get_local_id(0) - tgx;
const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
unsigned int x, y;
float4 force = 0.0f;
if (pos < end) {
#ifdef USE_CUTOFF
if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos];
......@@ -73,11 +60,7 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
const unsigned int tbx = get_local_id(0) - tgx;
const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
unsigned int atom1 = x*TILE_SIZE + tgx;
float4 force = 0.0f;
float4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
......@@ -95,7 +78,9 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
#else
bool hasExclusions = false;
#endif
if (x == y) {
if (pos >= end)
; // This warp is done.
else if (x == y) {
// This tile is on the diagonal.
const unsigned int localAtomIndex = get_local_id(0);
......@@ -138,15 +123,10 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
#else
force.xyz -= dEdR1.xyz;
#endif
#ifdef USE_EXCLUSIONS
excl >>= 1;
#endif
}
// Write results
reserveBuffer(x, forceBufferFlags);
unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz += force.xyz;
releaseBuffer(x, forceBufferFlags);
}
else {
// This is an off-diagonal tile.
......@@ -297,20 +277,63 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
tj = (tj + 1) & (TILE_SIZE - 1);
}
}
}
}
// Write results. We need to coordinate between warps to make sure no two of them
// ever try to write to the same piece of memory at the same time.
// Write results
int writeX = (pos < end ? x : -1);
int writeY = (pos < end && x != y ? y : -1);
if (tgx == 0)
reservedBlocks[localGroupIndex] = (int2)(writeX, writeY);
bool done = false;
int doneIndex = 0;
int checkIndex = 0;
while (true) {
// See if any warp still needs to write its data.
reserveBuffer(x, forceBufferFlags);
unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset1].xyz += force.xyz;
releaseBuffer(x, forceBufferFlags);
reserveBuffer(y, forceBufferFlags);
unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset2] += (float4) (localData[get_local_id(0)].fx, localData[get_local_id(0)].fy, localData[get_local_id(0)].fz, 0.0f);
releaseBuffer(y, forceBufferFlags);
bool allDone = true;
barrier(CLK_LOCAL_MEM_FENCE);
while (doneIndex < WARPS_PER_GROUP && allDone) {
if (reservedBlocks[doneIndex].x != -1)
allDone = false;
else
doneIndex++;
}
if (allDone)
break;
if (!done) {
// See whether this warp can write its data. This requires that no previous warp
// is trying to write to the same block of the buffer.
bool canWrite = (writeX != -1);
while (checkIndex < localGroupIndex && canWrite) {
if ((reservedBlocks[checkIndex].x == x || reservedBlocks[checkIndex].y == x) ||
(writeY != -1 && (reservedBlocks[checkIndex].x == y || reservedBlocks[checkIndex].y == y)))
canWrite = false;
else
checkIndex++;
}
if (canWrite) {
// Write the data to global memory, then mark this warp as done.
if (writeX > -1) {
const unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz += force.xyz;
}
if (writeY > -1) {
const unsigned int offset = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset] += (float4) (localData[get_local_id(0)].fx, localData[get_local_id(0)].fy, localData[get_local_id(0)].fz, 0.0f);
}
done = true;
if (tgx == 0)
reservedBlocks[localGroupIndex] = (int2)(-1, -1);
}
}
}
lasty = y;
pos++;
}
} while (pos < end);
energyBuffer[get_global_id(0)] += energy;
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment