"platforms/opencl/tests/TestOpenCLNonbondedForce.cpp" did not exist on "644cc275ab5a549a82cce4d49680da729c2051e6"
Commit dd352ee5 authored by Peter Eastman's avatar Peter Eastman
Browse files

Added dynamic load balancing between GPUs

parent 6e3526b4
......@@ -233,8 +233,6 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
if (useCutoff) {
map<string, string> defines;
defines["NUM_BLOCKS"] = OpenCLExpressionUtilities::intToString(context.getNumAtomBlocks());
defines["START_TILE_INDEX"] = OpenCLExpressionUtilities::intToString(startTileIndex);
defines["END_TILE_INDEX"] = OpenCLExpressionUtilities::intToString(startTileIndex+numTiles);
if (forceBufferPerAtomBlock)
defines["USE_OUTPUT_BUFFER_PER_BLOCK"] = "1";
if (usePeriodic)
......@@ -256,6 +254,8 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
findInteractingBlocksKernel.setArg<cl::Buffer>(7, interactionFlags->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(8, context.getPosq().getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl_uint>(9, interactingTiles->getSize());
findInteractingBlocksKernel.setArg<cl_uint>(10, startTileIndex);
findInteractingBlocksKernel.setArg<cl_uint>(11, startTileIndex+numTiles);
if (context.getSIMDWidth() == 32 && !deviceIsCpu) {
findInteractionsWithinBlocksKernel = cl::Kernel(interactingBlocksProgram, "findInteractionsWithinBlocks");
findInteractionsWithinBlocksKernel.setArg<cl_float>(0, (cl_float) (cutoff*cutoff));
......@@ -302,8 +302,8 @@ void OpenCLNonbondedUtilities::prepareInteractions() {
void OpenCLNonbondedUtilities::computeInteractions() {
if (cutoff != -1.0) {
if (useCutoff) {
forceKernel.setArg<mm_float4>(10, context.getPeriodicBoxSize());
forceKernel.setArg<mm_float4>(11, context.getInvPeriodicBoxSize());
forceKernel.setArg<mm_float4>(12, context.getPeriodicBoxSize());
forceKernel.setArg<mm_float4>(13, context.getInvPeriodicBoxSize());
}
context.executeKernel(forceKernel, (context.getNumAtomBlocks()*(context.getNumAtomBlocks()+1)/2)*OpenCLContext::TileSize, deviceIsCpu ? 1 : -1);
}
......@@ -325,14 +325,14 @@ void OpenCLNonbondedUtilities::updateNeighborListSize() {
newSize = numTiles;
delete interactingTiles;
interactingTiles = new OpenCLArray<mm_ushort2>(context, newSize, "interactingTiles");
forceKernel.setArg<cl::Buffer>(8, interactingTiles->getDeviceBuffer());
forceKernel.setArg<cl_uint>(12, newSize);
forceKernel.setArg<cl::Buffer>(10, interactingTiles->getDeviceBuffer());
forceKernel.setArg<cl_uint>(14, newSize);
findInteractingBlocksKernel.setArg<cl::Buffer>(6, interactingTiles->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl_uint>(9, newSize);
if (context.getSIMDWidth() == 32 || deviceIsCpu) {
delete interactionFlags;
interactionFlags = new OpenCLArray<cl_uint>(context, deviceIsCpu ? 2*newSize : newSize, "interactionFlags");
forceKernel.setArg<cl::Buffer>(13, interactionFlags->getDeviceBuffer());
forceKernel.setArg<cl::Buffer>(15, interactionFlags->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(7, interactionFlags->getDeviceBuffer());
findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(4, interactingTiles->getDeviceBuffer());
findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(7, interactionFlags->getDeviceBuffer());
......@@ -340,6 +340,19 @@ void OpenCLNonbondedUtilities::updateNeighborListSize() {
}
}
void OpenCLNonbondedUtilities::setTileRange(int startTileIndex, int numTiles) {
this->startTileIndex = startTileIndex;
this->numTiles = numTiles;
if (cutoff == -1.0)
return; // There are no nonbonded interactions in the System.
forceKernel.setArg<cl_uint>(8, startTileIndex);
forceKernel.setArg<cl_uint>(9, startTileIndex+numTiles);
if (useCutoff) {
findInteractingBlocksKernel.setArg<cl_uint>(10, startTileIndex);
findInteractingBlocksKernel.setArg<cl_uint>(11, startTileIndex+numTiles);
}
}
cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& source, const vector<ParameterInfo>& params, const vector<ParameterInfo>& arguments, bool useExclusions, bool isSymmetric) const {
map<string, string> replacements;
replacements["COMPUTE_INTERACTION"] = source;
......@@ -447,8 +460,6 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
defines["NUM_ATOMS"] = OpenCLExpressionUtilities::intToString(context.getNumAtoms());
defines["PADDED_NUM_ATOMS"] = OpenCLExpressionUtilities::intToString(context.getPaddedNumAtoms());
defines["NUM_BLOCKS"] = OpenCLExpressionUtilities::intToString(context.getNumAtomBlocks());
defines["START_TILE_INDEX"] = OpenCLExpressionUtilities::intToString(startTileIndex);
defines["END_TILE_INDEX"] = OpenCLExpressionUtilities::intToString(startTileIndex+numTiles);
string file;
if (deviceIsCpu)
file = OpenCLKernelSources::nonbonded_cpu;
......@@ -470,6 +481,8 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
kernel.setArg<cl::Buffer>(index++, exclusionRowIndices->getDeviceBuffer());
kernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize*localDataSize : OpenCLContext::ThreadBlockSize*localDataSize), NULL);
kernel.setArg(index++, 4*OpenCLContext::ThreadBlockSize*sizeof(cl_float), NULL);
kernel.setArg<cl_uint>(index++, startTileIndex);
kernel.setArg<cl_uint>(index++, startTileIndex+numTiles);
if (useCutoff) {
kernel.setArg<cl::Buffer>(index++, interactingTiles->getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, interactionCount->getDeviceBuffer());
......
......@@ -190,6 +190,10 @@ public:
int getNumTiles() const {
return numTiles;
}
/**
* Set the range of tiles that should be processed by this context.
*/
void setTileRange(int startTileIndex, int numTiles);
/**
* Create a Kernel for evaluating a nonbonded interaction. Cutoffs and periodic boundary conditions
* are assumed to be the same as those for the default interaction Kernel, since this kernel will use
......
......@@ -29,6 +29,28 @@
using namespace OpenMM;
using namespace std;
/**
* Get the current clock time, measured in microseconds.
*/
#ifdef _MSC_VER
#include <Windows.h>
static long getTime() {
FILETIME ft;
GetSystemTimeAsFileTime(&ft); // 100-nanoseconds since 1-1-1601
ULARGE_INTEGER result;
result.LowPart = ft.dwLowDateTime;
result.HighPart = ft.dwHighDateTime;
return result/10;
}
#else
#include <sys/time.h>
static long getTime() {
struct timeval tod;
gettimeofday(&tod, 0);
return 1000000*tod.tv_sec+tod.tv_usec;
}
#endif
class OpenCLParallelCalcForcesAndEnergyKernel::BeginComputationTask : public OpenCLContext::WorkTask {
public:
BeginComputationTask(ContextImpl& context, OpenCLContext& cl, OpenCLCalcForcesAndEnergyKernel& kernel,
......@@ -52,8 +74,8 @@ private:
class OpenCLParallelCalcForcesAndEnergyKernel::FinishComputationTask : public OpenCLContext::WorkTask {
public:
FinishComputationTask(ContextImpl& context, OpenCLContext& cl, OpenCLCalcForcesAndEnergyKernel& kernel,
bool includeForce, bool includeEnergy, double& energy) : context(context), cl(cl), kernel(kernel),
includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
bool includeForce, bool includeEnergy, double& energy, long& completionTime) : context(context), cl(cl), kernel(kernel),
includeForce(includeForce), includeEnergy(includeEnergy), energy(energy), completionTime(completionTime) {
}
void execute() {
// Execute the kernel, then download forces.
......@@ -61,7 +83,7 @@ public:
energy += kernel.finishComputation(context, includeForce, includeEnergy);
if (includeForce)
cl.getForce().download();
mm_float4 f = cl.getForce()[0];
completionTime = getTime();
}
private:
ContextImpl& context;
......@@ -69,10 +91,11 @@ private:
OpenCLCalcForcesAndEnergyKernel& kernel;
bool includeForce, includeEnergy;
double& energy;
long& completionTime;
};
OpenCLParallelCalcForcesAndEnergyKernel::OpenCLParallelCalcForcesAndEnergyKernel(string name, const Platform& platform, OpenCLPlatform::PlatformData& data) :
CalcForcesAndEnergyKernel(name, platform), data(data) {
CalcForcesAndEnergyKernel(name, platform), data(data), completionTimes(data.contexts.size()), contextTiles(data.contexts.size()) {
for (int i = 0; i < (int) data.contexts.size(); i++)
kernels.push_back(Kernel(new OpenCLCalcForcesAndEnergyKernel(name, platform, *data.contexts[i])));
}
......@@ -98,7 +121,7 @@ double OpenCLParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& c
for (int i = 0; i < (int) data.contexts.size(); i++) {
OpenCLContext& cl = *data.contexts[i];
OpenCLContext::WorkThread& thread = cl.getWorkThread();
thread.addTask(new FinishComputationTask(context, cl, getKernel(i), includeForce, includeEnergy, data.contextEnergy[i]));
thread.addTask(new FinishComputationTask(context, cl, getKernel(i), includeForce, includeEnergy, data.contextEnergy[i], completionTimes[i]));
}
data.syncContexts();
double energy = 0.0;
......@@ -107,8 +130,6 @@ double OpenCLParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& c
if (includeForce) {
// Sum the forces from all devices.
for (int i = 0; i < (int) data.contexts.size(); i++)
data.contexts[i]->getForce().download();
OpenCLArray<mm_float4>& forces = data.contexts[0]->getForce();
for (int i = 1; i < (int) data.contexts.size(); i++) {
OpenCLArray<mm_float4>& contextForces = data.contexts[i]->getForce();
......@@ -121,6 +142,32 @@ double OpenCLParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& c
}
}
forces.upload();
// Balance work between the contexts by transferring a few nonbonded tiles from the context that
// finished last to the one that finished first.
int firstIndex = 0, lastIndex = 0;
int totalTiles = 0;
for (int i = 0; i < (int) completionTimes.size(); i++) {
if (completionTimes[i] < completionTimes[firstIndex])
firstIndex = i;
if (completionTimes[i] > completionTimes[lastIndex])
lastIndex = i;
contextTiles[i] = data.contexts[i]->getNonbondedUtilities().getNumTiles();
totalTiles += contextTiles[i];
}
int tilesToTransfer = totalTiles/1000;
if (tilesToTransfer < 1)
tilesToTransfer = 1;
if (tilesToTransfer > contextTiles[lastIndex])
tilesToTransfer = contextTiles[lastIndex];
contextTiles[firstIndex] += tilesToTransfer;
contextTiles[lastIndex] -= tilesToTransfer;
int startIndex = 0;
for (int i = 0; i < (int) contextTiles.size(); i++) {
data.contexts[i]->getNonbondedUtilities().setTileRange(startIndex, contextTiles[i]);
startIndex += contextTiles[i];
}
}
return energy;
}
......
......@@ -76,6 +76,8 @@ private:
class FinishComputationTask;
OpenCLPlatform::PlatformData& data;
std::vector<Kernel> kernels;
std::vector<long> completionTimes;
std::vector<int> contextTiles;
};
/**
......
......@@ -159,7 +159,8 @@ void storeInteractionData(__local ushort2* buffer, __local int* valid, __local s
*/
__kernel void findBlocksWithInteractions(float cutoffSquared, float4 periodicBoxSize, float4 invPeriodicBoxSize, __global float4* blockCenter,
__global float4* blockBoundingBox, __global unsigned int* interactionCount, __global ushort2* interactingTiles,
__global unsigned int* interactionFlags, __global float4* posq, unsigned int maxTiles) {
__global unsigned int* interactionFlags, __global float4* posq, unsigned int maxTiles, unsigned int startTileIndex,
unsigned int endTileIndex) {
__local ushort2 buffer[BUFFER_SIZE];
__local int valid[BUFFER_SIZE];
__local short sum[BUFFER_SIZE];
......@@ -172,11 +173,11 @@ __kernel void findBlocksWithInteractions(float cutoffSquared, float4 periodicBox
for (int i = 0; i < BUFFER_GROUPS; ++i)
valid[i*GROUP_SIZE+get_local_id(0)] = false;
barrier(CLK_LOCAL_MEM_FENCE);
for (int baseIndex = START_TILE_INDEX+get_group_id(0)*get_local_size(0); baseIndex < END_TILE_INDEX; baseIndex += get_global_size(0)) {
for (int baseIndex = startTileIndex+get_group_id(0)*get_local_size(0); baseIndex < endTileIndex; baseIndex += get_global_size(0)) {
// Identify the pair of blocks to compare.
int index = baseIndex+get_local_id(0);
if (index < END_TILE_INDEX) {
if (index < endTileIndex) {
unsigned int y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*index));
unsigned int x = (index-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
......
......@@ -123,12 +123,13 @@ void storeInteractionData(ushort2* buffer, int numValid, __global unsigned int*
*/
__kernel void findBlocksWithInteractions(float cutoffSquared, float4 periodicBoxSize, float4 invPeriodicBoxSize, __global float4* blockCenter,
__global float4* blockBoundingBox, __global unsigned int* interactionCount, __global ushort2* interactingTiles,
__global unsigned int* interactionFlags, __global float4* posq, unsigned int maxTiles) {
__global unsigned int* interactionFlags, __global float4* posq, unsigned int maxTiles, unsigned int startTileIndex,
unsigned int endTileIndex) {
ushort2 buffer[BUFFER_SIZE];
int valuesInBuffer = 0;
const int numTiles = END_TILE_INDEX-START_TILE_INDEX;
unsigned int start = START_TILE_INDEX+get_group_id(0)*numTiles/get_num_groups(0);
unsigned int end = START_TILE_INDEX+(get_group_id(0)+1)*numTiles/get_num_groups(0);
const int numTiles = endTileIndex-startTileIndex;
unsigned int start = startTileIndex+get_group_id(0)*numTiles/get_num_groups(0);
unsigned int end = startTileIndex+(get_group_id(0)+1)*numTiles/get_num_groups(0);
for (int index = start; index < end; index++) {
// Identify the pair of blocks to compare.
......
......@@ -13,6 +13,7 @@ typedef struct {
__kernel void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions,
__global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __local AtomData* localData, __local float4* tempBuffer,
unsigned int startTileIndex, unsigned int endTileIndex,
#ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags
#else
......@@ -21,11 +22,11 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
PARAMETER_ARGUMENTS) {
#ifdef USE_CUTOFF
unsigned int numTiles = interactionCount[0];
unsigned int pos = (numTiles > maxTiles ? START_TILE_INDEX+get_group_id(0)*(END_TILE_INDEX-START_TILE_INDEX)/get_num_groups(0) : get_group_id(0)*numTiles/get_num_groups(0));
unsigned int end = (numTiles > maxTiles ? START_TILE_INDEX+(get_group_id(0)+1)*(END_TILE_INDEX-START_TILE_INDEX)/get_num_groups(0) : (get_group_id(0)+1)*numTiles/get_num_groups(0));
unsigned int pos = (numTiles > maxTiles ? startTileIndex+get_group_id(0)*(endTileIndex-startTileIndex)/get_num_groups(0) : get_group_id(0)*numTiles/get_num_groups(0));
unsigned int end = (numTiles > maxTiles ? startTileIndex+(get_group_id(0)+1)*(endTileIndex-startTileIndex)/get_num_groups(0) : (get_group_id(0)+1)*numTiles/get_num_groups(0));
#else
unsigned int pos = START_TILE_INDEX+get_group_id(0)*numTiles/get_num_groups(0);
unsigned int end = START_TILE_INDEX+(get_group_id(0)+1)*numTiles/get_num_groups(0);
unsigned int pos = startTileIndex+get_group_id(0)*numTiles/get_num_groups(0);
unsigned int end = startTileIndex+(get_group_id(0)+1)*numTiles/get_num_groups(0);
#endif
float energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF;
......
......@@ -14,6 +14,7 @@ typedef struct {
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions,
__global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __local AtomData* localData, __local float4* tempBuffer,
unsigned int startTileIndex, unsigned int endTileIndex,
#ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags
#else
......@@ -22,11 +23,11 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
PARAMETER_ARGUMENTS) {
#ifdef USE_CUTOFF
unsigned int numTiles = interactionCount[0];
unsigned int pos = (numTiles > maxTiles ? START_TILE_INDEX+get_group_id(0)*(END_TILE_INDEX-START_TILE_INDEX)/get_num_groups(0) : get_group_id(0)*numTiles/get_num_groups(0));
unsigned int end = (numTiles > maxTiles ? START_TILE_INDEX+(get_group_id(0)+1)*(END_TILE_INDEX-START_TILE_INDEX)/get_num_groups(0) : (get_group_id(0)+1)*numTiles/get_num_groups(0));
unsigned int pos = (numTiles > maxTiles ? startTileIndex+get_group_id(0)*(endTileIndex-startTileIndex)/get_num_groups(0) : get_group_id(0)*numTiles/get_num_groups(0));
unsigned int end = (numTiles > maxTiles ? startTileIndex+(get_group_id(0)+1)*(endTileIndex-startTileIndex)/get_num_groups(0) : (get_group_id(0)+1)*numTiles/get_num_groups(0));
#else
unsigned int pos = START_TILE_INDEX+get_group_id(0)*numTiles/get_num_groups(0);
unsigned int end = START_TILE_INDEX+(get_group_id(0)+1)*numTiles/get_num_groups(0);
unsigned int pos = startTileIndex+get_group_id(0)*numTiles/get_num_groups(0);
unsigned int end = startTileIndex+(get_group_id(0)+1)*numTiles/get_num_groups(0);
#endif
float energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF;
......
......@@ -14,6 +14,7 @@ typedef struct {
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions,
__global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __local AtomData* localData, __local float* tempBuffer,
unsigned int startTileIndex, unsigned int endTileIndex,
#ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags
#else
......@@ -24,11 +25,11 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
unsigned int warp = get_global_id(0)/TILE_SIZE;
#ifdef USE_CUTOFF
unsigned int numTiles = interactionCount[0];
unsigned int pos = (numTiles > maxTiles ? START_TILE_INDEX+warp*(END_TILE_INDEX-START_TILE_INDEX)/totalWarps : warp*numTiles/totalWarps);
unsigned int end = (numTiles > maxTiles ? START_TILE_INDEX+(warp+1)*(END_TILE_INDEX-START_TILE_INDEX)/totalWarps : (warp+1)*numTiles/totalWarps);
unsigned int pos = (numTiles > maxTiles ? startTileIndex+warp*(endTileIndex-startTileIndex)/totalWarps : warp*numTiles/totalWarps);
unsigned int end = (numTiles > maxTiles ? startTileIndex+(warp+1)*(endTileIndex-startTileIndex)/totalWarps : (warp+1)*numTiles/totalWarps);
#else
unsigned int pos = START_TILE_INDEX+warp*numTiles/totalWarps;
unsigned int end = START_TILE_INDEX+(warp+1)*numTiles/totalWarps;
unsigned int pos = startTileIndex+warp*numTiles/totalWarps;
unsigned int end = startTileIndex+(warp+1)*numTiles/totalWarps;
#endif
float energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment