"platforms/cuda/vscode:/vscode.git/clone" did not exist on "f6c70d6e22d71da652480a2cb1fc38dc84ace8a6"
Commit 4ea10969 authored by peastman's avatar peastman
Browse files

Finished OpenCL implementation of interaction groups

parent 54c0ca3f
...@@ -2178,11 +2178,19 @@ void OpenCLCalcCustomNonbondedForceKernel::initInteractionGroups(const CustomNon ...@@ -2178,11 +2178,19 @@ void OpenCLCalcCustomNonbondedForceKernel::initInteractionGroups(const CustomNon
vector<mm_int4> groupData; vector<mm_int4> groupData;
for (int tileSet = 0; tileSet < numTileSets; tileSet++) { for (int tileSet = 0; tileSet < numTileSets; tileSet++) {
int indexInTileSet = 0; int indexInTileSet = 0;
int minSize = 0;
if (cl.getSIMDWidth() < 32) {
// We need to include a barrier inside the inner loop, so ensure that all
// threads will loop the same number of times.
for (int i = tileSetStart[tileSet]; i < tileSetStart[tileSet+1]; i++)
minSize = max(minSize, (int) atomLists[tiles[tileOrder[i].second].first].size());
}
for (int i = tileSetStart[tileSet]; i < tileSetStart[tileSet+1]; i++) { for (int i = tileSetStart[tileSet]; i < tileSetStart[tileSet+1]; i++) {
int tile = tileOrder[i].second; int tile = tileOrder[i].second;
vector<int>& atoms1 = atomLists[tiles[tile].first]; vector<int>& atoms1 = atomLists[tiles[tile].first];
vector<int>& atoms2 = atomLists[tiles[tile].second]; vector<int>& atoms2 = atomLists[tiles[tile].second];
int range = indexInTileSet + ((indexInTileSet+atoms1.size())<<16); int range = indexInTileSet + ((indexInTileSet+max(minSize, (int) atoms1.size()))<<16);
int allFlags = (1<<atoms2.size())-1; int allFlags = (1<<atoms2.size())-1;
for (int j = 0; j < (int) atoms1.size(); j++) { for (int j = 0; j < (int) atoms1.size(); j++) {
int a1 = atoms1[j]; int a1 = atoms1[j];
...@@ -2193,7 +2201,7 @@ void OpenCLCalcCustomNonbondedForceKernel::initInteractionGroups(const CustomNon ...@@ -2193,7 +2201,7 @@ void OpenCLCalcCustomNonbondedForceKernel::initInteractionGroups(const CustomNon
indexInTileSet += atoms1.size(); indexInTileSet += atoms1.size();
} }
for (; indexInTileSet < 32; indexInTileSet++) for (; indexInTileSet < 32; indexInTileSet++)
groupData.push_back(mm_int4(0, 0, 0, 0)); groupData.push_back(mm_int4(0, 0, minSize<<16, 0));
} }
interactionGroupData = OpenCLArray::create<mm_int4>(cl, groupData.size(), "interactionGroupData"); interactionGroupData = OpenCLArray::create<mm_int4>(cl, groupData.size(), "interactionGroupData");
interactionGroupData->upload(groupData); interactionGroupData->upload(groupData);
...@@ -2242,7 +2250,7 @@ void OpenCLCalcCustomNonbondedForceKernel::initInteractionGroups(const CustomNon ...@@ -2242,7 +2250,7 @@ void OpenCLCalcCustomNonbondedForceKernel::initInteractionGroups(const CustomNon
if (buffers[i].getNumComponents() == 1) if (buffers[i].getNumComponents() == 1)
load2<<buffers[i].getType()<<" params"<<(i+1)<<"2 = localData[localIndex].params"<<(i+1)<<";\n"; load2<<buffers[i].getType()<<" params"<<(i+1)<<"2 = localData[localIndex].params"<<(i+1)<<";\n";
else { else {
load2<<buffers[i].getType()<<" params"<<(i+1)<<"2 = make_"<<buffers[i].getType()<<"("; load2<<buffers[i].getType()<<" params"<<(i+1)<<"2 = ("<<buffers[i].getType()<<") (";
for (int j = 0; j < buffers[i].getNumComponents(); ++j) { for (int j = 0; j < buffers[i].getNumComponents(); ++j) {
if (j > 0) if (j > 0)
load2<<", "; load2<<", ";
...@@ -2299,7 +2307,8 @@ double OpenCLCalcCustomNonbondedForceKernel::execute(ContextImpl& context, bool ...@@ -2299,7 +2307,8 @@ double OpenCLCalcCustomNonbondedForceKernel::execute(ContextImpl& context, bool
if (!hasInitializedKernel) { if (!hasInitializedKernel) {
hasInitializedKernel = true; hasInitializedKernel = true;
int index = 0; int index = 0;
interactionGroupKernel.setArg<cl::Buffer>(index++, cl.getLongForceBuffer().getDeviceBuffer()); bool useLong = cl.getSupports64BitGlobalAtomics();
interactionGroupKernel.setArg<cl::Buffer>(index++, (useLong ? cl.getLongForceBuffer() : cl.getForceBuffers()).getDeviceBuffer());
interactionGroupKernel.setArg<cl::Buffer>(index++, cl.getEnergyBuffer().getDeviceBuffer()); interactionGroupKernel.setArg<cl::Buffer>(index++, cl.getEnergyBuffer().getDeviceBuffer());
interactionGroupKernel.setArg<cl::Buffer>(index++, cl.getPosq().getDeviceBuffer()); interactionGroupKernel.setArg<cl::Buffer>(index++, cl.getPosq().getDeviceBuffer());
interactionGroupKernel.setArg<cl::Buffer>(index++, interactionGroupData->getDeviceBuffer()); interactionGroupKernel.setArg<cl::Buffer>(index++, interactionGroupData->getDeviceBuffer());
...@@ -2310,7 +2319,7 @@ double OpenCLCalcCustomNonbondedForceKernel::execute(ContextImpl& context, bool ...@@ -2310,7 +2319,7 @@ double OpenCLCalcCustomNonbondedForceKernel::execute(ContextImpl& context, bool
if (globals != NULL) if (globals != NULL)
interactionGroupKernel.setArg<cl::Buffer>(index++, globals->getDeviceBuffer()); interactionGroupKernel.setArg<cl::Buffer>(index++, globals->getDeviceBuffer());
} }
int forceThreadBlockSize = cl.getNonbondedUtilities().getForceThreadBlockSize(); int forceThreadBlockSize = max(32, cl.getNonbondedUtilities().getForceThreadBlockSize());
cl.executeKernel(interactionGroupKernel, numGroupThreadBlocks*forceThreadBlockSize, forceThreadBlockSize); cl.executeKernel(interactionGroupKernel, numGroupThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
} }
mm_double4 boxSize = cl.getPeriodicBoxSizeDouble(); mm_double4 boxSize = cl.getPeriodicBoxSizeDouble();
......
...@@ -14,8 +14,37 @@ typedef struct { ...@@ -14,8 +14,37 @@ typedef struct {
#endif #endif
} AtomData; } AtomData;
/**
* This function is used on devices that don't support 64 bit atomics. Multiple threads within
* a single tile might have computed forces on the same atom. This loops over them and makes sure
* that only one thread updates the force on any given atom.
*/
void writeForces(__global real4* forceBuffers,__local AtomData* localData, int atomIndex) {
localData[get_local_id(0)].x = atomIndex;
SYNC_WARPS;
real4 forceSum = (real4) 0;
int start = (get_local_id(0)/TILE_SIZE)*TILE_SIZE;
int end = start+32;
bool isFirst = true;
for (int i = start; i < end; i++)
if (localData[i].x == atomIndex) {
forceSum += (real4) (localData[i].fx, localData[i].fy, localData[i].fz, 0);
isFirst &= (i >= get_local_id(0));
}
const unsigned int warp = get_global_id(0)/TILE_SIZE;
unsigned int offset = atomIndex + warp*PADDED_NUM_ATOMS;
if (isFirst)
forceBuffers[offset] += forceSum;
SYNC_WARPS;
}
__kernel void computeInteractionGroups( __kernel void computeInteractionGroups(
__global long* restrict forceBuffers, __global real* restrict energyBuffer, __global const real4* restrict posq, #ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict forceBuffers,
#else
__global real4* restrict forceBuffers,
#endif
__global real* restrict energyBuffer, __global const real4* restrict posq,
__global const int4* restrict groupData, real4 periodicBoxSize, real4 invPeriodicBoxSize __global const int4* restrict groupData, real4 periodicBoxSize, real4 invPeriodicBoxSize
PARAMETER_ARGUMENTS) { PARAMETER_ARGUMENTS) {
const unsigned int totalWarps = get_global_size(0)/TILE_SIZE; const unsigned int totalWarps = get_global_size(0)/TILE_SIZE;
...@@ -78,6 +107,7 @@ __kernel void computeInteractionGroups( ...@@ -78,6 +107,7 @@ __kernel void computeInteractionGroups(
tj = (tj == rangeEnd-1 ? rangeStart : tj+1); tj = (tj == rangeEnd-1 ? rangeStart : tj+1);
SYNC_WARPS; SYNC_WARPS;
} }
#ifdef SUPPORTS_64_BIT_ATOMICS
if (exclusions != 0) { if (exclusions != 0) {
atom_add(&forceBuffers[atom1], (long) (force.x*0x100000000)); atom_add(&forceBuffers[atom1], (long) (force.x*0x100000000));
atom_add(&forceBuffers[atom1+PADDED_NUM_ATOMS], (long) (force.y*0x100000000)); atom_add(&forceBuffers[atom1+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
...@@ -86,6 +116,13 @@ __kernel void computeInteractionGroups( ...@@ -86,6 +116,13 @@ __kernel void computeInteractionGroups(
atom_add(&forceBuffers[atom2+PADDED_NUM_ATOMS], (long) (localData[get_local_id(0)].fy*0x100000000)); atom_add(&forceBuffers[atom2+PADDED_NUM_ATOMS], (long) (localData[get_local_id(0)].fy*0x100000000));
atom_add(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (long) (localData[get_local_id(0)].fz*0x100000000)); atom_add(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (long) (localData[get_local_id(0)].fz*0x100000000));
} }
#else
writeForces(forceBuffers, localData, atom2);
localData[get_local_id(0)].fx = force.x;
localData[get_local_id(0)].fy = force.y;
localData[get_local_id(0)].fz = force.z;
writeForces(forceBuffers, localData, atom1);
#endif
} }
energyBuffer[get_global_id(0)] += energy; energyBuffer[get_global_id(0)] += energy;
} }
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment