Commit 4ea10969 authored by peastman's avatar peastman
Browse files

Finished OpenCL implementation of interaction groups

parent 54c0ca3f
......@@ -2178,11 +2178,19 @@ void OpenCLCalcCustomNonbondedForceKernel::initInteractionGroups(const CustomNon
vector<mm_int4> groupData;
for (int tileSet = 0; tileSet < numTileSets; tileSet++) {
int indexInTileSet = 0;
int minSize = 0;
if (cl.getSIMDWidth() < 32) {
// We need to include a barrier inside the inner loop, so ensure that all
// threads will loop the same number of times.
for (int i = tileSetStart[tileSet]; i < tileSetStart[tileSet+1]; i++)
minSize = max(minSize, (int) atomLists[tiles[tileOrder[i].second].first].size());
}
for (int i = tileSetStart[tileSet]; i < tileSetStart[tileSet+1]; i++) {
int tile = tileOrder[i].second;
vector<int>& atoms1 = atomLists[tiles[tile].first];
vector<int>& atoms2 = atomLists[tiles[tile].second];
int range = indexInTileSet + ((indexInTileSet+atoms1.size())<<16);
int range = indexInTileSet + ((indexInTileSet+max(minSize, (int) atoms1.size()))<<16);
int allFlags = (1<<atoms2.size())-1;
for (int j = 0; j < (int) atoms1.size(); j++) {
int a1 = atoms1[j];
......@@ -2193,7 +2201,7 @@ void OpenCLCalcCustomNonbondedForceKernel::initInteractionGroups(const CustomNon
indexInTileSet += atoms1.size();
}
for (; indexInTileSet < 32; indexInTileSet++)
groupData.push_back(mm_int4(0, 0, 0, 0));
groupData.push_back(mm_int4(0, 0, minSize<<16, 0));
}
interactionGroupData = OpenCLArray::create<mm_int4>(cl, groupData.size(), "interactionGroupData");
interactionGroupData->upload(groupData);
......@@ -2242,7 +2250,7 @@ void OpenCLCalcCustomNonbondedForceKernel::initInteractionGroups(const CustomNon
if (buffers[i].getNumComponents() == 1)
load2<<buffers[i].getType()<<" params"<<(i+1)<<"2 = localData[localIndex].params"<<(i+1)<<";\n";
else {
load2<<buffers[i].getType()<<" params"<<(i+1)<<"2 = make_"<<buffers[i].getType()<<"(";
load2<<buffers[i].getType()<<" params"<<(i+1)<<"2 = ("<<buffers[i].getType()<<") (";
for (int j = 0; j < buffers[i].getNumComponents(); ++j) {
if (j > 0)
load2<<", ";
......@@ -2299,7 +2307,8 @@ double OpenCLCalcCustomNonbondedForceKernel::execute(ContextImpl& context, bool
if (!hasInitializedKernel) {
hasInitializedKernel = true;
int index = 0;
interactionGroupKernel.setArg<cl::Buffer>(index++, cl.getLongForceBuffer().getDeviceBuffer());
bool useLong = cl.getSupports64BitGlobalAtomics();
interactionGroupKernel.setArg<cl::Buffer>(index++, (useLong ? cl.getLongForceBuffer() : cl.getForceBuffers()).getDeviceBuffer());
interactionGroupKernel.setArg<cl::Buffer>(index++, cl.getEnergyBuffer().getDeviceBuffer());
interactionGroupKernel.setArg<cl::Buffer>(index++, cl.getPosq().getDeviceBuffer());
interactionGroupKernel.setArg<cl::Buffer>(index++, interactionGroupData->getDeviceBuffer());
......@@ -2310,7 +2319,7 @@ double OpenCLCalcCustomNonbondedForceKernel::execute(ContextImpl& context, bool
if (globals != NULL)
interactionGroupKernel.setArg<cl::Buffer>(index++, globals->getDeviceBuffer());
}
int forceThreadBlockSize = cl.getNonbondedUtilities().getForceThreadBlockSize();
int forceThreadBlockSize = max(32, cl.getNonbondedUtilities().getForceThreadBlockSize());
cl.executeKernel(interactionGroupKernel, numGroupThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
}
mm_double4 boxSize = cl.getPeriodicBoxSizeDouble();
......
......@@ -14,8 +14,37 @@ typedef struct {
#endif
} AtomData;
/**
* This function is used on devices that don't support 64 bit atomics. Multiple threads within
* a single tile might have computed forces on the same atom. This loops over them and makes sure
* that only one thread updates the force on any given atom.
*/
void writeForces(__global real4* forceBuffers,__local AtomData* localData, int atomIndex) {
localData[get_local_id(0)].x = atomIndex;
SYNC_WARPS;
real4 forceSum = (real4) 0;
int start = (get_local_id(0)/TILE_SIZE)*TILE_SIZE;
int end = start+32;
bool isFirst = true;
for (int i = start; i < end; i++)
if (localData[i].x == atomIndex) {
forceSum += (real4) (localData[i].fx, localData[i].fy, localData[i].fz, 0);
isFirst &= (i >= get_local_id(0));
}
const unsigned int warp = get_global_id(0)/TILE_SIZE;
unsigned int offset = atomIndex + warp*PADDED_NUM_ATOMS;
if (isFirst)
forceBuffers[offset] += forceSum;
SYNC_WARPS;
}
__kernel void computeInteractionGroups(
__global long* restrict forceBuffers, __global real* restrict energyBuffer, __global const real4* restrict posq,
#ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict forceBuffers,
#else
__global real4* restrict forceBuffers,
#endif
__global real* restrict energyBuffer, __global const real4* restrict posq,
__global const int4* restrict groupData, real4 periodicBoxSize, real4 invPeriodicBoxSize
PARAMETER_ARGUMENTS) {
const unsigned int totalWarps = get_global_size(0)/TILE_SIZE;
......@@ -78,6 +107,7 @@ __kernel void computeInteractionGroups(
tj = (tj == rangeEnd-1 ? rangeStart : tj+1);
SYNC_WARPS;
}
#ifdef SUPPORTS_64_BIT_ATOMICS
if (exclusions != 0) {
atom_add(&forceBuffers[atom1], (long) (force.x*0x100000000));
atom_add(&forceBuffers[atom1+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
......@@ -86,6 +116,13 @@ __kernel void computeInteractionGroups(
atom_add(&forceBuffers[atom2+PADDED_NUM_ATOMS], (long) (localData[get_local_id(0)].fy*0x100000000));
atom_add(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (long) (localData[get_local_id(0)].fz*0x100000000));
}
#else
writeForces(forceBuffers, localData, atom2);
localData[get_local_id(0)].fx = force.x;
localData[get_local_id(0)].fy = force.y;
localData[get_local_id(0)].fz = force.z;
writeForces(forceBuffers, localData, atom1);
#endif
}
energyBuffer[get_global_id(0)] += energy;
}
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment