Commit 62f7c44c authored by peastman's avatar peastman
Browse files

Load balancing between GPUs forces the neighbor list to be rebuilt

parent f816d961
...@@ -278,7 +278,7 @@ private: ...@@ -278,7 +278,7 @@ private:
std::string kernelSource; std::string kernelSource;
std::map<std::string, std::string> kernelDefines; std::map<std::string, std::string> kernelDefines;
double cutoff; double cutoff;
bool useCutoff, usePeriodic, anyExclusions, usePadding; bool useCutoff, usePeriodic, anyExclusions, usePadding, forceRebuildNeighborList;
int startTileIndex, numTiles, startBlockIndex, numBlocks, maxTiles, numForceThreadBlocks, forceThreadBlockSize, nonbondedForceGroup, numAtoms; int startTileIndex, numTiles, startBlockIndex, numBlocks, maxTiles, numForceThreadBlocks, forceThreadBlockSize, nonbondedForceGroup, numAtoms;
}; };
......
...@@ -65,7 +65,7 @@ private: ...@@ -65,7 +65,7 @@ private:
CudaNonbondedUtilities::CudaNonbondedUtilities(CudaContext& context) : context(context), cutoff(-1.0), useCutoff(false), anyExclusions(false), usePadding(true), CudaNonbondedUtilities::CudaNonbondedUtilities(CudaContext& context) : context(context), cutoff(-1.0), useCutoff(false), anyExclusions(false), usePadding(true),
exclusionIndices(NULL), exclusionRowIndices(NULL), exclusionTiles(NULL), exclusions(NULL), interactingTiles(NULL), interactingAtoms(NULL), exclusionIndices(NULL), exclusionRowIndices(NULL), exclusionTiles(NULL), exclusions(NULL), interactingTiles(NULL), interactingAtoms(NULL),
interactionCount(NULL), blockCenter(NULL), blockBoundingBox(NULL), sortedBlocks(NULL), sortedBlockCenter(NULL), sortedBlockBoundingBox(NULL), interactionCount(NULL), blockCenter(NULL), blockBoundingBox(NULL), sortedBlocks(NULL), sortedBlockCenter(NULL), sortedBlockBoundingBox(NULL),
oldPositions(NULL), rebuildNeighborList(NULL), blockSorter(NULL), nonbondedForceGroup(0) { oldPositions(NULL), rebuildNeighborList(NULL), blockSorter(NULL), nonbondedForceGroup(0), forceRebuildNeighborList(true) {
// Decide how many thread blocks to use. // Decide how many thread blocks to use.
string errorMessage = "Error initializing nonbonded utilities"; string errorMessage = "Error initializing nonbonded utilities";
...@@ -322,6 +322,7 @@ void CudaNonbondedUtilities::initialize(const System& system) { ...@@ -322,6 +322,7 @@ void CudaNonbondedUtilities::initialize(const System& system) {
sortBoxDataArgs.push_back(&oldPositions->getDevicePointer()); sortBoxDataArgs.push_back(&oldPositions->getDevicePointer());
sortBoxDataArgs.push_back(&interactionCount->getDevicePointer()); sortBoxDataArgs.push_back(&interactionCount->getDevicePointer());
sortBoxDataArgs.push_back(&rebuildNeighborList->getDevicePointer()); sortBoxDataArgs.push_back(&rebuildNeighborList->getDevicePointer());
sortBoxDataArgs.push_back(&forceRebuildNeighborList);
findInteractingBlocksKernel = context.getKernel(interactingBlocksProgram, "findBlocksWithInteractions"); findInteractingBlocksKernel = context.getKernel(interactingBlocksProgram, "findBlocksWithInteractions");
findInteractingBlocksArgs.push_back(context.getPeriodicBoxSizePointer()); findInteractingBlocksArgs.push_back(context.getPeriodicBoxSizePointer());
findInteractingBlocksArgs.push_back(context.getInvPeriodicBoxSizePointer()); findInteractingBlocksArgs.push_back(context.getInvPeriodicBoxSizePointer());
...@@ -363,6 +364,7 @@ void CudaNonbondedUtilities::prepareInteractions() { ...@@ -363,6 +364,7 @@ void CudaNonbondedUtilities::prepareInteractions() {
blockSorter->sort(*sortedBlocks); blockSorter->sort(*sortedBlocks);
context.executeKernel(sortBoxDataKernel, &sortBoxDataArgs[0], context.getNumAtoms()); context.executeKernel(sortBoxDataKernel, &sortBoxDataArgs[0], context.getNumAtoms());
context.executeKernel(findInteractingBlocksKernel, &findInteractingBlocksArgs[0], context.getNumAtoms(), 256); context.executeKernel(findInteractingBlocksKernel, &findInteractingBlocksArgs[0], context.getNumAtoms(), 256);
forceRebuildNeighborList = false;
} }
void CudaNonbondedUtilities::computeInteractions() { void CudaNonbondedUtilities::computeInteractions() {
...@@ -419,8 +421,9 @@ void CudaNonbondedUtilities::setAtomBlockRange(double startFraction, double endF ...@@ -419,8 +421,9 @@ void CudaNonbondedUtilities::setAtomBlockRange(double startFraction, double endF
startBlockIndex = (int) (startFraction*numAtomBlocks); startBlockIndex = (int) (startFraction*numAtomBlocks);
numBlocks = (int) (endFraction*numAtomBlocks)-startBlockIndex; numBlocks = (int) (endFraction*numAtomBlocks)-startBlockIndex;
int totalTiles = context.getNumAtomBlocks()*(context.getNumAtomBlocks()+1)/2; int totalTiles = context.getNumAtomBlocks()*(context.getNumAtomBlocks()+1)/2;
startTileIndex = (int) (startFraction*totalTiles);; startTileIndex = (int) (startFraction*totalTiles);
numTiles = (int) (endFraction*totalTiles)-startTileIndex; numTiles = (int) (endFraction*totalTiles)-startTileIndex;
forceRebuildNeighborList = true;
} }
CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source, vector<ParameterInfo>& params, vector<ParameterInfo>& arguments, bool useExclusions, bool isSymmetric) { CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source, vector<ParameterInfo>& params, vector<ParameterInfo>& arguments, bool useExclusions, bool isSymmetric) {
......
...@@ -43,7 +43,7 @@ extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize, ...@@ -43,7 +43,7 @@ extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize,
extern "C" __global__ void sortBoxData(const real2* __restrict__ sortedBlock, const real4* __restrict__ blockCenter, extern "C" __global__ void sortBoxData(const real2* __restrict__ sortedBlock, const real4* __restrict__ blockCenter,
const real4* __restrict__ blockBoundingBox, real4* __restrict__ sortedBlockCenter, const real4* __restrict__ blockBoundingBox, real4* __restrict__ sortedBlockCenter,
real4* __restrict__ sortedBlockBoundingBox, const real4* __restrict__ posq, const real4* __restrict__ oldPositions, real4* __restrict__ sortedBlockBoundingBox, const real4* __restrict__ posq, const real4* __restrict__ oldPositions,
unsigned int* __restrict__ interactionCount, int* __restrict__ rebuildNeighborList) { unsigned int* __restrict__ interactionCount, int* __restrict__ rebuildNeighborList, bool forceRebuild) {
for (int i = threadIdx.x+blockIdx.x*blockDim.x; i < NUM_BLOCKS; i += blockDim.x*gridDim.x) { for (int i = threadIdx.x+blockIdx.x*blockDim.x; i < NUM_BLOCKS; i += blockDim.x*gridDim.x) {
int index = (int) sortedBlock[i].y; int index = (int) sortedBlock[i].y;
sortedBlockCenter[i] = blockCenter[index]; sortedBlockCenter[i] = blockCenter[index];
...@@ -52,7 +52,7 @@ extern "C" __global__ void sortBoxData(const real2* __restrict__ sortedBlock, co ...@@ -52,7 +52,7 @@ extern "C" __global__ void sortBoxData(const real2* __restrict__ sortedBlock, co
// Also check whether any atom has moved enough so that we really need to rebuild the neighbor list. // Also check whether any atom has moved enough so that we really need to rebuild the neighbor list.
bool rebuild = false; bool rebuild = forceRebuild;
for (int i = threadIdx.x+blockIdx.x*blockDim.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) { for (int i = threadIdx.x+blockIdx.x*blockDim.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
real4 delta = oldPositions[i]-posq[i]; real4 delta = oldPositions[i]-posq[i];
if (delta.x*delta.x + delta.y*delta.y + delta.z*delta.z > 0.25f*PADDING*PADDING) if (delta.x*delta.x + delta.y*delta.y + delta.z*delta.z > 0.25f*PADDING*PADDING)
......
...@@ -340,6 +340,7 @@ void OpenCLNonbondedUtilities::initialize(const System& system) { ...@@ -340,6 +340,7 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
sortBoxDataKernel.setArg<cl::Buffer>(6, oldPositions->getDeviceBuffer()); sortBoxDataKernel.setArg<cl::Buffer>(6, oldPositions->getDeviceBuffer());
sortBoxDataKernel.setArg<cl::Buffer>(7, interactionCount->getDeviceBuffer()); sortBoxDataKernel.setArg<cl::Buffer>(7, interactionCount->getDeviceBuffer());
sortBoxDataKernel.setArg<cl::Buffer>(8, rebuildNeighborList->getDeviceBuffer()); sortBoxDataKernel.setArg<cl::Buffer>(8, rebuildNeighborList->getDeviceBuffer());
sortBoxDataKernel.setArg<cl_int>(9, true);
findInteractingBlocksKernel = cl::Kernel(interactingBlocksProgram, "findBlocksWithInteractions"); findInteractingBlocksKernel = cl::Kernel(interactingBlocksProgram, "findBlocksWithInteractions");
findInteractingBlocksKernel.setArg<cl::Buffer>(5, interactionCount->getDeviceBuffer()); findInteractingBlocksKernel.setArg<cl::Buffer>(5, interactionCount->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(6, interactingTiles->getDeviceBuffer()); findInteractingBlocksKernel.setArg<cl::Buffer>(6, interactingTiles->getDeviceBuffer());
...@@ -406,6 +407,7 @@ void OpenCLNonbondedUtilities::prepareInteractions() { ...@@ -406,6 +407,7 @@ void OpenCLNonbondedUtilities::prepareInteractions() {
context.executeKernel(sortBoxDataKernel, context.getNumAtoms()); context.executeKernel(sortBoxDataKernel, context.getNumAtoms());
setPeriodicBoxArgs(context, findInteractingBlocksKernel, 0); setPeriodicBoxArgs(context, findInteractingBlocksKernel, 0);
context.executeKernel(findInteractingBlocksKernel, context.getNumAtoms(), interactingBlocksThreadBlockSize); context.executeKernel(findInteractingBlocksKernel, context.getNumAtoms(), interactingBlocksThreadBlockSize);
sortBoxDataKernel.setArg<cl_int>(9, false);
} }
void OpenCLNonbondedUtilities::computeInteractions() { void OpenCLNonbondedUtilities::computeInteractions() {
...@@ -474,6 +476,7 @@ void OpenCLNonbondedUtilities::setAtomBlockRange(double startFraction, double en ...@@ -474,6 +476,7 @@ void OpenCLNonbondedUtilities::setAtomBlockRange(double startFraction, double en
forceKernel.setArg<cl_uint>(6, numTiles); forceKernel.setArg<cl_uint>(6, numTiles);
findInteractingBlocksKernel.setArg<cl_uint>(10, startBlockIndex); findInteractingBlocksKernel.setArg<cl_uint>(10, startBlockIndex);
findInteractingBlocksKernel.setArg<cl_uint>(11, numBlocks); findInteractingBlocksKernel.setArg<cl_uint>(11, numBlocks);
sortBoxDataKernel.setArg<cl_int>(9, true);
} }
} }
......
...@@ -177,6 +177,7 @@ double OpenCLParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& c ...@@ -177,6 +177,7 @@ double OpenCLParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& c
// Balance work between the contexts by transferring a little nonbonded work from the context that // Balance work between the contexts by transferring a little nonbonded work from the context that
// finished last to the one that finished first. // finished last to the one that finished first.
if (cl.getComputeForceCount() < 200) {
int firstIndex = 0, lastIndex = 0; int firstIndex = 0, lastIndex = 0;
for (int i = 0; i < (int) completionTimes.size(); i++) { for (int i = 0; i < (int) completionTimes.size(); i++) {
if (completionTimes[i] < completionTimes[firstIndex]) if (completionTimes[i] < completionTimes[firstIndex])
...@@ -196,6 +197,7 @@ double OpenCLParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& c ...@@ -196,6 +197,7 @@ double OpenCLParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& c
startFraction = endFraction; startFraction = endFraction;
} }
} }
}
return energy; return energy;
} }
......
...@@ -44,7 +44,7 @@ __kernel void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeri ...@@ -44,7 +44,7 @@ __kernel void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeri
__kernel void sortBoxData(__global const real2* restrict sortedBlock, __global const real4* restrict blockCenter, __kernel void sortBoxData(__global const real2* restrict sortedBlock, __global const real4* restrict blockCenter,
__global const real4* restrict blockBoundingBox, __global real4* restrict sortedBlockCenter, __global const real4* restrict blockBoundingBox, __global real4* restrict sortedBlockCenter,
__global real4* restrict sortedBlockBoundingBox, __global const real4* restrict posq, __global const real4* restrict oldPositions, __global real4* restrict sortedBlockBoundingBox, __global const real4* restrict posq, __global const real4* restrict oldPositions,
__global unsigned int* restrict interactionCount, __global int* restrict rebuildNeighborList) { __global unsigned int* restrict interactionCount, __global int* restrict rebuildNeighborList, int forceRebuild) {
for (int i = get_global_id(0); i < NUM_BLOCKS; i += get_global_size(0)) { for (int i = get_global_id(0); i < NUM_BLOCKS; i += get_global_size(0)) {
int index = (int) sortedBlock[i].y; int index = (int) sortedBlock[i].y;
sortedBlockCenter[i] = blockCenter[index]; sortedBlockCenter[i] = blockCenter[index];
...@@ -53,7 +53,7 @@ __kernel void sortBoxData(__global const real2* restrict sortedBlock, __global c ...@@ -53,7 +53,7 @@ __kernel void sortBoxData(__global const real2* restrict sortedBlock, __global c
// Also check whether any atom has moved enough so that we really need to rebuild the neighbor list. // Also check whether any atom has moved enough so that we really need to rebuild the neighbor list.
bool rebuild = false; bool rebuild = forceRebuild;
for (int i = get_global_id(0); i < NUM_ATOMS; i += get_global_size(0)) { for (int i = get_global_id(0); i < NUM_ATOMS; i += get_global_size(0)) {
real4 delta = oldPositions[i]-posq[i]; real4 delta = oldPositions[i]-posq[i];
if (delta.x*delta.x + delta.y*delta.y + delta.z*delta.z > 0.25f*PADDING*PADDING) if (delta.x*delta.x + delta.y*delta.y + delta.z*delta.z > 0.25f*PADDING*PADDING)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment