"serialization/src/CustomVolumeForceProxy.cpp" did not exist on "3d750e8517a73f39fce1de63a5f36d26f6a3d38c"
Commit dd352ee5 authored by Peter Eastman's avatar Peter Eastman
Browse files

Added dynamic load balancing between GPUs

parent 6e3526b4
...@@ -233,8 +233,6 @@ void OpenCLNonbondedUtilities::initialize(const System& system) { ...@@ -233,8 +233,6 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
if (useCutoff) { if (useCutoff) {
map<string, string> defines; map<string, string> defines;
defines["NUM_BLOCKS"] = OpenCLExpressionUtilities::intToString(context.getNumAtomBlocks()); defines["NUM_BLOCKS"] = OpenCLExpressionUtilities::intToString(context.getNumAtomBlocks());
defines["START_TILE_INDEX"] = OpenCLExpressionUtilities::intToString(startTileIndex);
defines["END_TILE_INDEX"] = OpenCLExpressionUtilities::intToString(startTileIndex+numTiles);
if (forceBufferPerAtomBlock) if (forceBufferPerAtomBlock)
defines["USE_OUTPUT_BUFFER_PER_BLOCK"] = "1"; defines["USE_OUTPUT_BUFFER_PER_BLOCK"] = "1";
if (usePeriodic) if (usePeriodic)
...@@ -256,6 +254,8 @@ void OpenCLNonbondedUtilities::initialize(const System& system) { ...@@ -256,6 +254,8 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
findInteractingBlocksKernel.setArg<cl::Buffer>(7, interactionFlags->getDeviceBuffer()); findInteractingBlocksKernel.setArg<cl::Buffer>(7, interactionFlags->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(8, context.getPosq().getDeviceBuffer()); findInteractingBlocksKernel.setArg<cl::Buffer>(8, context.getPosq().getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl_uint>(9, interactingTiles->getSize()); findInteractingBlocksKernel.setArg<cl_uint>(9, interactingTiles->getSize());
findInteractingBlocksKernel.setArg<cl_uint>(10, startTileIndex);
findInteractingBlocksKernel.setArg<cl_uint>(11, startTileIndex+numTiles);
if (context.getSIMDWidth() == 32 && !deviceIsCpu) { if (context.getSIMDWidth() == 32 && !deviceIsCpu) {
findInteractionsWithinBlocksKernel = cl::Kernel(interactingBlocksProgram, "findInteractionsWithinBlocks"); findInteractionsWithinBlocksKernel = cl::Kernel(interactingBlocksProgram, "findInteractionsWithinBlocks");
findInteractionsWithinBlocksKernel.setArg<cl_float>(0, (cl_float) (cutoff*cutoff)); findInteractionsWithinBlocksKernel.setArg<cl_float>(0, (cl_float) (cutoff*cutoff));
...@@ -302,8 +302,8 @@ void OpenCLNonbondedUtilities::prepareInteractions() { ...@@ -302,8 +302,8 @@ void OpenCLNonbondedUtilities::prepareInteractions() {
void OpenCLNonbondedUtilities::computeInteractions() { void OpenCLNonbondedUtilities::computeInteractions() {
if (cutoff != -1.0) { if (cutoff != -1.0) {
if (useCutoff) { if (useCutoff) {
forceKernel.setArg<mm_float4>(10, context.getPeriodicBoxSize()); forceKernel.setArg<mm_float4>(12, context.getPeriodicBoxSize());
forceKernel.setArg<mm_float4>(11, context.getInvPeriodicBoxSize()); forceKernel.setArg<mm_float4>(13, context.getInvPeriodicBoxSize());
} }
context.executeKernel(forceKernel, (context.getNumAtomBlocks()*(context.getNumAtomBlocks()+1)/2)*OpenCLContext::TileSize, deviceIsCpu ? 1 : -1); context.executeKernel(forceKernel, (context.getNumAtomBlocks()*(context.getNumAtomBlocks()+1)/2)*OpenCLContext::TileSize, deviceIsCpu ? 1 : -1);
} }
...@@ -325,14 +325,14 @@ void OpenCLNonbondedUtilities::updateNeighborListSize() { ...@@ -325,14 +325,14 @@ void OpenCLNonbondedUtilities::updateNeighborListSize() {
newSize = numTiles; newSize = numTiles;
delete interactingTiles; delete interactingTiles;
interactingTiles = new OpenCLArray<mm_ushort2>(context, newSize, "interactingTiles"); interactingTiles = new OpenCLArray<mm_ushort2>(context, newSize, "interactingTiles");
forceKernel.setArg<cl::Buffer>(8, interactingTiles->getDeviceBuffer()); forceKernel.setArg<cl::Buffer>(10, interactingTiles->getDeviceBuffer());
forceKernel.setArg<cl_uint>(12, newSize); forceKernel.setArg<cl_uint>(14, newSize);
findInteractingBlocksKernel.setArg<cl::Buffer>(6, interactingTiles->getDeviceBuffer()); findInteractingBlocksKernel.setArg<cl::Buffer>(6, interactingTiles->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl_uint>(9, newSize); findInteractingBlocksKernel.setArg<cl_uint>(9, newSize);
if (context.getSIMDWidth() == 32 || deviceIsCpu) { if (context.getSIMDWidth() == 32 || deviceIsCpu) {
delete interactionFlags; delete interactionFlags;
interactionFlags = new OpenCLArray<cl_uint>(context, deviceIsCpu ? 2*newSize : newSize, "interactionFlags"); interactionFlags = new OpenCLArray<cl_uint>(context, deviceIsCpu ? 2*newSize : newSize, "interactionFlags");
forceKernel.setArg<cl::Buffer>(13, interactionFlags->getDeviceBuffer()); forceKernel.setArg<cl::Buffer>(15, interactionFlags->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(7, interactionFlags->getDeviceBuffer()); findInteractingBlocksKernel.setArg<cl::Buffer>(7, interactionFlags->getDeviceBuffer());
findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(4, interactingTiles->getDeviceBuffer()); findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(4, interactingTiles->getDeviceBuffer());
findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(7, interactionFlags->getDeviceBuffer()); findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(7, interactionFlags->getDeviceBuffer());
...@@ -340,6 +340,19 @@ void OpenCLNonbondedUtilities::updateNeighborListSize() { ...@@ -340,6 +340,19 @@ void OpenCLNonbondedUtilities::updateNeighborListSize() {
} }
} }
void OpenCLNonbondedUtilities::setTileRange(int startTileIndex, int numTiles) {
this->startTileIndex = startTileIndex;
this->numTiles = numTiles;
if (cutoff == -1.0)
return; // There are no nonbonded interactions in the System.
forceKernel.setArg<cl_uint>(8, startTileIndex);
forceKernel.setArg<cl_uint>(9, startTileIndex+numTiles);
if (useCutoff) {
findInteractingBlocksKernel.setArg<cl_uint>(10, startTileIndex);
findInteractingBlocksKernel.setArg<cl_uint>(11, startTileIndex+numTiles);
}
}
cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& source, const vector<ParameterInfo>& params, const vector<ParameterInfo>& arguments, bool useExclusions, bool isSymmetric) const { cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& source, const vector<ParameterInfo>& params, const vector<ParameterInfo>& arguments, bool useExclusions, bool isSymmetric) const {
map<string, string> replacements; map<string, string> replacements;
replacements["COMPUTE_INTERACTION"] = source; replacements["COMPUTE_INTERACTION"] = source;
...@@ -447,8 +460,6 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc ...@@ -447,8 +460,6 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
defines["NUM_ATOMS"] = OpenCLExpressionUtilities::intToString(context.getNumAtoms()); defines["NUM_ATOMS"] = OpenCLExpressionUtilities::intToString(context.getNumAtoms());
defines["PADDED_NUM_ATOMS"] = OpenCLExpressionUtilities::intToString(context.getPaddedNumAtoms()); defines["PADDED_NUM_ATOMS"] = OpenCLExpressionUtilities::intToString(context.getPaddedNumAtoms());
defines["NUM_BLOCKS"] = OpenCLExpressionUtilities::intToString(context.getNumAtomBlocks()); defines["NUM_BLOCKS"] = OpenCLExpressionUtilities::intToString(context.getNumAtomBlocks());
defines["START_TILE_INDEX"] = OpenCLExpressionUtilities::intToString(startTileIndex);
defines["END_TILE_INDEX"] = OpenCLExpressionUtilities::intToString(startTileIndex+numTiles);
string file; string file;
if (deviceIsCpu) if (deviceIsCpu)
file = OpenCLKernelSources::nonbonded_cpu; file = OpenCLKernelSources::nonbonded_cpu;
...@@ -470,6 +481,8 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc ...@@ -470,6 +481,8 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
kernel.setArg<cl::Buffer>(index++, exclusionRowIndices->getDeviceBuffer()); kernel.setArg<cl::Buffer>(index++, exclusionRowIndices->getDeviceBuffer());
kernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize*localDataSize : OpenCLContext::ThreadBlockSize*localDataSize), NULL); kernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize*localDataSize : OpenCLContext::ThreadBlockSize*localDataSize), NULL);
kernel.setArg(index++, 4*OpenCLContext::ThreadBlockSize*sizeof(cl_float), NULL); kernel.setArg(index++, 4*OpenCLContext::ThreadBlockSize*sizeof(cl_float), NULL);
kernel.setArg<cl_uint>(index++, startTileIndex);
kernel.setArg<cl_uint>(index++, startTileIndex+numTiles);
if (useCutoff) { if (useCutoff) {
kernel.setArg<cl::Buffer>(index++, interactingTiles->getDeviceBuffer()); kernel.setArg<cl::Buffer>(index++, interactingTiles->getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, interactionCount->getDeviceBuffer()); kernel.setArg<cl::Buffer>(index++, interactionCount->getDeviceBuffer());
......
...@@ -190,6 +190,10 @@ public: ...@@ -190,6 +190,10 @@ public:
int getNumTiles() const { int getNumTiles() const {
return numTiles; return numTiles;
} }
/**
 * Set the range of tiles that should be processed by this context.
 *
 * @param startTileIndex  the index of the first tile this context should process
 * @param numTiles        the number of tiles, starting at startTileIndex, that this context should process
 */
void setTileRange(int startTileIndex, int numTiles);
/** /**
* Create a Kernel for evaluating a nonbonded interaction. Cutoffs and periodic boundary conditions * Create a Kernel for evaluating a nonbonded interaction. Cutoffs and periodic boundary conditions
* are assumed to be the same as those for the default interaction Kernel, since this kernel will use * are assumed to be the same as those for the default interaction Kernel, since this kernel will use
......
...@@ -29,6 +29,28 @@ ...@@ -29,6 +29,28 @@
using namespace OpenMM; using namespace OpenMM;
using namespace std; using namespace std;
/**
 * Get the current clock time, measured in microseconds.
 *
 * The returned value is intended for comparing timestamps taken during the same run
 * (e.g. deciding which of two tasks finished first).  It is narrowed to a long, so on
 * platforms where long is 32 bits only the low bits of the microsecond count are kept
 * and the absolute value should not be interpreted as a date.
 *
 * @return the current wall clock time in microseconds
 */
#ifdef _MSC_VER
    #include <Windows.h>
    static long getTime() {
        FILETIME ft;
        GetSystemTimeAsFileTime(&ft); // 100-nanoseconds since 1-1-1601
        ULARGE_INTEGER result;
        result.LowPart = ft.dwLowDateTime;
        result.HighPart = ft.dwHighDateTime;
        // ULARGE_INTEGER is a union, so the arithmetic has to be done on its 64 bit
        // QuadPart member.  Dividing by 10 converts 100-nanosecond units to microseconds.
        return (long) (result.QuadPart/10);
    }
#else
    #include <sys/time.h>
    static long getTime() {
        struct timeval tod;
        gettimeofday(&tod, 0);
        // Convert to long before multiplying so the arithmetic is done at the width of the return type.
        return 1000000L*(long) tod.tv_sec+(long) tod.tv_usec;
    }
#endif
class OpenCLParallelCalcForcesAndEnergyKernel::BeginComputationTask : public OpenCLContext::WorkTask { class OpenCLParallelCalcForcesAndEnergyKernel::BeginComputationTask : public OpenCLContext::WorkTask {
public: public:
BeginComputationTask(ContextImpl& context, OpenCLContext& cl, OpenCLCalcForcesAndEnergyKernel& kernel, BeginComputationTask(ContextImpl& context, OpenCLContext& cl, OpenCLCalcForcesAndEnergyKernel& kernel,
...@@ -52,8 +74,8 @@ private: ...@@ -52,8 +74,8 @@ private:
class OpenCLParallelCalcForcesAndEnergyKernel::FinishComputationTask : public OpenCLContext::WorkTask { class OpenCLParallelCalcForcesAndEnergyKernel::FinishComputationTask : public OpenCLContext::WorkTask {
public: public:
FinishComputationTask(ContextImpl& context, OpenCLContext& cl, OpenCLCalcForcesAndEnergyKernel& kernel, FinishComputationTask(ContextImpl& context, OpenCLContext& cl, OpenCLCalcForcesAndEnergyKernel& kernel,
bool includeForce, bool includeEnergy, double& energy) : context(context), cl(cl), kernel(kernel), bool includeForce, bool includeEnergy, double& energy, long& completionTime) : context(context), cl(cl), kernel(kernel),
includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) { includeForce(includeForce), includeEnergy(includeEnergy), energy(energy), completionTime(completionTime) {
} }
void execute() { void execute() {
// Execute the kernel, then download forces. // Execute the kernel, then download forces.
...@@ -61,7 +83,7 @@ public: ...@@ -61,7 +83,7 @@ public:
energy += kernel.finishComputation(context, includeForce, includeEnergy); energy += kernel.finishComputation(context, includeForce, includeEnergy);
if (includeForce) if (includeForce)
cl.getForce().download(); cl.getForce().download();
mm_float4 f = cl.getForce()[0]; completionTime = getTime();
} }
private: private:
ContextImpl& context; ContextImpl& context;
...@@ -69,10 +91,11 @@ private: ...@@ -69,10 +91,11 @@ private:
OpenCLCalcForcesAndEnergyKernel& kernel; OpenCLCalcForcesAndEnergyKernel& kernel;
bool includeForce, includeEnergy; bool includeForce, includeEnergy;
double& energy; double& energy;
long& completionTime;
}; };
OpenCLParallelCalcForcesAndEnergyKernel::OpenCLParallelCalcForcesAndEnergyKernel(string name, const Platform& platform, OpenCLPlatform::PlatformData& data) : OpenCLParallelCalcForcesAndEnergyKernel::OpenCLParallelCalcForcesAndEnergyKernel(string name, const Platform& platform, OpenCLPlatform::PlatformData& data) :
CalcForcesAndEnergyKernel(name, platform), data(data) { CalcForcesAndEnergyKernel(name, platform), data(data), completionTimes(data.contexts.size()), contextTiles(data.contexts.size()) {
for (int i = 0; i < (int) data.contexts.size(); i++) for (int i = 0; i < (int) data.contexts.size(); i++)
kernels.push_back(Kernel(new OpenCLCalcForcesAndEnergyKernel(name, platform, *data.contexts[i]))); kernels.push_back(Kernel(new OpenCLCalcForcesAndEnergyKernel(name, platform, *data.contexts[i])));
} }
...@@ -98,7 +121,7 @@ double OpenCLParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& c ...@@ -98,7 +121,7 @@ double OpenCLParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& c
for (int i = 0; i < (int) data.contexts.size(); i++) { for (int i = 0; i < (int) data.contexts.size(); i++) {
OpenCLContext& cl = *data.contexts[i]; OpenCLContext& cl = *data.contexts[i];
OpenCLContext::WorkThread& thread = cl.getWorkThread(); OpenCLContext::WorkThread& thread = cl.getWorkThread();
thread.addTask(new FinishComputationTask(context, cl, getKernel(i), includeForce, includeEnergy, data.contextEnergy[i])); thread.addTask(new FinishComputationTask(context, cl, getKernel(i), includeForce, includeEnergy, data.contextEnergy[i], completionTimes[i]));
} }
data.syncContexts(); data.syncContexts();
double energy = 0.0; double energy = 0.0;
...@@ -107,8 +130,6 @@ double OpenCLParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& c ...@@ -107,8 +130,6 @@ double OpenCLParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& c
if (includeForce) { if (includeForce) {
// Sum the forces from all devices. // Sum the forces from all devices.
for (int i = 0; i < (int) data.contexts.size(); i++)
data.contexts[i]->getForce().download();
OpenCLArray<mm_float4>& forces = data.contexts[0]->getForce(); OpenCLArray<mm_float4>& forces = data.contexts[0]->getForce();
for (int i = 1; i < (int) data.contexts.size(); i++) { for (int i = 1; i < (int) data.contexts.size(); i++) {
OpenCLArray<mm_float4>& contextForces = data.contexts[i]->getForce(); OpenCLArray<mm_float4>& contextForces = data.contexts[i]->getForce();
...@@ -121,6 +142,32 @@ double OpenCLParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& c ...@@ -121,6 +142,32 @@ double OpenCLParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& c
} }
} }
forces.upload(); forces.upload();
// Balance work between the contexts by transferring a few nonbonded tiles from the context that
// finished last to the one that finished first.
int firstIndex = 0, lastIndex = 0;
int totalTiles = 0;
for (int i = 0; i < (int) completionTimes.size(); i++) {
if (completionTimes[i] < completionTimes[firstIndex])
firstIndex = i;
if (completionTimes[i] > completionTimes[lastIndex])
lastIndex = i;
contextTiles[i] = data.contexts[i]->getNonbondedUtilities().getNumTiles();
totalTiles += contextTiles[i];
}
int tilesToTransfer = totalTiles/1000;
if (tilesToTransfer < 1)
tilesToTransfer = 1;
if (tilesToTransfer > contextTiles[lastIndex])
tilesToTransfer = contextTiles[lastIndex];
contextTiles[firstIndex] += tilesToTransfer;
contextTiles[lastIndex] -= tilesToTransfer;
int startIndex = 0;
for (int i = 0; i < (int) contextTiles.size(); i++) {
data.contexts[i]->getNonbondedUtilities().setTileRange(startIndex, contextTiles[i]);
startIndex += contextTiles[i];
}
} }
return energy; return energy;
} }
......
...@@ -76,6 +76,8 @@ private: ...@@ -76,6 +76,8 @@ private:
class FinishComputationTask; class FinishComputationTask;
OpenCLPlatform::PlatformData& data; OpenCLPlatform::PlatformData& data;
std::vector<Kernel> kernels; std::vector<Kernel> kernels;
std::vector<long> completionTimes;
std::vector<int> contextTiles;
}; };
/** /**
......
...@@ -159,7 +159,8 @@ void storeInteractionData(__local ushort2* buffer, __local int* valid, __local s ...@@ -159,7 +159,8 @@ void storeInteractionData(__local ushort2* buffer, __local int* valid, __local s
*/ */
__kernel void findBlocksWithInteractions(float cutoffSquared, float4 periodicBoxSize, float4 invPeriodicBoxSize, __global float4* blockCenter, __kernel void findBlocksWithInteractions(float cutoffSquared, float4 periodicBoxSize, float4 invPeriodicBoxSize, __global float4* blockCenter,
__global float4* blockBoundingBox, __global unsigned int* interactionCount, __global ushort2* interactingTiles, __global float4* blockBoundingBox, __global unsigned int* interactionCount, __global ushort2* interactingTiles,
__global unsigned int* interactionFlags, __global float4* posq, unsigned int maxTiles) { __global unsigned int* interactionFlags, __global float4* posq, unsigned int maxTiles, unsigned int startTileIndex,
unsigned int endTileIndex) {
__local ushort2 buffer[BUFFER_SIZE]; __local ushort2 buffer[BUFFER_SIZE];
__local int valid[BUFFER_SIZE]; __local int valid[BUFFER_SIZE];
__local short sum[BUFFER_SIZE]; __local short sum[BUFFER_SIZE];
...@@ -172,11 +173,11 @@ __kernel void findBlocksWithInteractions(float cutoffSquared, float4 periodicBox ...@@ -172,11 +173,11 @@ __kernel void findBlocksWithInteractions(float cutoffSquared, float4 periodicBox
for (int i = 0; i < BUFFER_GROUPS; ++i) for (int i = 0; i < BUFFER_GROUPS; ++i)
valid[i*GROUP_SIZE+get_local_id(0)] = false; valid[i*GROUP_SIZE+get_local_id(0)] = false;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
for (int baseIndex = START_TILE_INDEX+get_group_id(0)*get_local_size(0); baseIndex < END_TILE_INDEX; baseIndex += get_global_size(0)) { for (int baseIndex = startTileIndex+get_group_id(0)*get_local_size(0); baseIndex < endTileIndex; baseIndex += get_global_size(0)) {
// Identify the pair of blocks to compare. // Identify the pair of blocks to compare.
int index = baseIndex+get_local_id(0); int index = baseIndex+get_local_id(0);
if (index < END_TILE_INDEX) { if (index < endTileIndex) {
unsigned int y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*index)); unsigned int y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*index));
unsigned int x = (index-y*NUM_BLOCKS+y*(y+1)/2); unsigned int x = (index-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error. if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
......
...@@ -123,12 +123,13 @@ void storeInteractionData(ushort2* buffer, int numValid, __global unsigned int* ...@@ -123,12 +123,13 @@ void storeInteractionData(ushort2* buffer, int numValid, __global unsigned int*
*/ */
__kernel void findBlocksWithInteractions(float cutoffSquared, float4 periodicBoxSize, float4 invPeriodicBoxSize, __global float4* blockCenter, __kernel void findBlocksWithInteractions(float cutoffSquared, float4 periodicBoxSize, float4 invPeriodicBoxSize, __global float4* blockCenter,
__global float4* blockBoundingBox, __global unsigned int* interactionCount, __global ushort2* interactingTiles, __global float4* blockBoundingBox, __global unsigned int* interactionCount, __global ushort2* interactingTiles,
__global unsigned int* interactionFlags, __global float4* posq, unsigned int maxTiles) { __global unsigned int* interactionFlags, __global float4* posq, unsigned int maxTiles, unsigned int startTileIndex,
unsigned int endTileIndex) {
ushort2 buffer[BUFFER_SIZE]; ushort2 buffer[BUFFER_SIZE];
int valuesInBuffer = 0; int valuesInBuffer = 0;
const int numTiles = END_TILE_INDEX-START_TILE_INDEX; const int numTiles = endTileIndex-startTileIndex;
unsigned int start = START_TILE_INDEX+get_group_id(0)*numTiles/get_num_groups(0); unsigned int start = startTileIndex+get_group_id(0)*numTiles/get_num_groups(0);
unsigned int end = START_TILE_INDEX+(get_group_id(0)+1)*numTiles/get_num_groups(0); unsigned int end = startTileIndex+(get_group_id(0)+1)*numTiles/get_num_groups(0);
for (int index = start; index < end; index++) { for (int index = start; index < end; index++) {
// Identify the pair of blocks to compare. // Identify the pair of blocks to compare.
......
...@@ -13,6 +13,7 @@ typedef struct { ...@@ -13,6 +13,7 @@ typedef struct {
__kernel void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions, __kernel void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions,
__global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __local AtomData* localData, __local float4* tempBuffer, __global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __local AtomData* localData, __local float4* tempBuffer,
unsigned int startTileIndex, unsigned int endTileIndex,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags
#else #else
...@@ -21,11 +22,11 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en ...@@ -21,11 +22,11 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
PARAMETER_ARGUMENTS) { PARAMETER_ARGUMENTS) {
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
unsigned int numTiles = interactionCount[0]; unsigned int numTiles = interactionCount[0];
unsigned int pos = (numTiles > maxTiles ? START_TILE_INDEX+get_group_id(0)*(END_TILE_INDEX-START_TILE_INDEX)/get_num_groups(0) : get_group_id(0)*numTiles/get_num_groups(0)); unsigned int pos = (numTiles > maxTiles ? startTileIndex+get_group_id(0)*(endTileIndex-startTileIndex)/get_num_groups(0) : get_group_id(0)*numTiles/get_num_groups(0));
unsigned int end = (numTiles > maxTiles ? START_TILE_INDEX+(get_group_id(0)+1)*(END_TILE_INDEX-START_TILE_INDEX)/get_num_groups(0) : (get_group_id(0)+1)*numTiles/get_num_groups(0)); unsigned int end = (numTiles > maxTiles ? startTileIndex+(get_group_id(0)+1)*(endTileIndex-startTileIndex)/get_num_groups(0) : (get_group_id(0)+1)*numTiles/get_num_groups(0));
#else #else
unsigned int pos = START_TILE_INDEX+get_group_id(0)*numTiles/get_num_groups(0); unsigned int pos = startTileIndex+get_group_id(0)*numTiles/get_num_groups(0);
unsigned int end = START_TILE_INDEX+(get_group_id(0)+1)*numTiles/get_num_groups(0); unsigned int end = startTileIndex+(get_group_id(0)+1)*numTiles/get_num_groups(0);
#endif #endif
float energy = 0.0f; float energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF; unsigned int lasty = 0xFFFFFFFF;
......
...@@ -14,6 +14,7 @@ typedef struct { ...@@ -14,6 +14,7 @@ typedef struct {
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1))) __kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions, void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions,
__global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __local AtomData* localData, __local float4* tempBuffer, __global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __local AtomData* localData, __local float4* tempBuffer,
unsigned int startTileIndex, unsigned int endTileIndex,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags
#else #else
...@@ -22,11 +23,11 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe ...@@ -22,11 +23,11 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
PARAMETER_ARGUMENTS) { PARAMETER_ARGUMENTS) {
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
unsigned int numTiles = interactionCount[0]; unsigned int numTiles = interactionCount[0];
unsigned int pos = (numTiles > maxTiles ? START_TILE_INDEX+get_group_id(0)*(END_TILE_INDEX-START_TILE_INDEX)/get_num_groups(0) : get_group_id(0)*numTiles/get_num_groups(0)); unsigned int pos = (numTiles > maxTiles ? startTileIndex+get_group_id(0)*(endTileIndex-startTileIndex)/get_num_groups(0) : get_group_id(0)*numTiles/get_num_groups(0));
unsigned int end = (numTiles > maxTiles ? START_TILE_INDEX+(get_group_id(0)+1)*(END_TILE_INDEX-START_TILE_INDEX)/get_num_groups(0) : (get_group_id(0)+1)*numTiles/get_num_groups(0)); unsigned int end = (numTiles > maxTiles ? startTileIndex+(get_group_id(0)+1)*(endTileIndex-startTileIndex)/get_num_groups(0) : (get_group_id(0)+1)*numTiles/get_num_groups(0));
#else #else
unsigned int pos = START_TILE_INDEX+get_group_id(0)*numTiles/get_num_groups(0); unsigned int pos = startTileIndex+get_group_id(0)*numTiles/get_num_groups(0);
unsigned int end = START_TILE_INDEX+(get_group_id(0)+1)*numTiles/get_num_groups(0); unsigned int end = startTileIndex+(get_group_id(0)+1)*numTiles/get_num_groups(0);
#endif #endif
float energy = 0.0f; float energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF; unsigned int lasty = 0xFFFFFFFF;
......
...@@ -14,6 +14,7 @@ typedef struct { ...@@ -14,6 +14,7 @@ typedef struct {
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1))) __kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions, void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions,
__global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __local AtomData* localData, __local float* tempBuffer, __global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __local AtomData* localData, __local float* tempBuffer,
unsigned int startTileIndex, unsigned int endTileIndex,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags __global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags
#else #else
...@@ -24,11 +25,11 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe ...@@ -24,11 +25,11 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
unsigned int warp = get_global_id(0)/TILE_SIZE; unsigned int warp = get_global_id(0)/TILE_SIZE;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
unsigned int numTiles = interactionCount[0]; unsigned int numTiles = interactionCount[0];
unsigned int pos = (numTiles > maxTiles ? START_TILE_INDEX+warp*(END_TILE_INDEX-START_TILE_INDEX)/totalWarps : warp*numTiles/totalWarps); unsigned int pos = (numTiles > maxTiles ? startTileIndex+warp*(endTileIndex-startTileIndex)/totalWarps : warp*numTiles/totalWarps);
unsigned int end = (numTiles > maxTiles ? START_TILE_INDEX+(warp+1)*(END_TILE_INDEX-START_TILE_INDEX)/totalWarps : (warp+1)*numTiles/totalWarps); unsigned int end = (numTiles > maxTiles ? startTileIndex+(warp+1)*(endTileIndex-startTileIndex)/totalWarps : (warp+1)*numTiles/totalWarps);
#else #else
unsigned int pos = START_TILE_INDEX+warp*numTiles/totalWarps; unsigned int pos = startTileIndex+warp*numTiles/totalWarps;
unsigned int end = START_TILE_INDEX+(warp+1)*numTiles/totalWarps; unsigned int end = startTileIndex+(warp+1)*numTiles/totalWarps;
#endif #endif
float energy = 0.0f; float energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF; unsigned int lasty = 0xFFFFFFFF;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment