Commit f7ef2dd0 authored by peastman
Browse files

More bug fixes to multi-GPU

parent bd666c27
...@@ -264,14 +264,6 @@ void CudaNonbondedUtilities::initialize(const System& system) { ...@@ -264,14 +264,6 @@ void CudaNonbondedUtilities::initialize(const System& system) {
sortedBlockCenter = new CudaArray(context, numAtomBlocks+1, 4*elementSize, "sortedBlockCenter"); sortedBlockCenter = new CudaArray(context, numAtomBlocks+1, 4*elementSize, "sortedBlockCenter");
sortedBlockBoundingBox = new CudaArray(context, numAtomBlocks+1, 4*elementSize, "sortedBlockBoundingBox"); sortedBlockBoundingBox = new CudaArray(context, numAtomBlocks+1, 4*elementSize, "sortedBlockBoundingBox");
oldPositions = new CudaArray(context, numAtoms, 4*elementSize, "oldPositions"); oldPositions = new CudaArray(context, numAtoms, 4*elementSize, "oldPositions");
if (context.getUseDoublePrecision()) {
vector<double4> oldPositionsVec(numAtoms, make_double4(1e30, 1e30, 1e30, 0));
oldPositions->upload(oldPositionsVec);
}
else {
vector<float4> oldPositionsVec(numAtoms, make_float4(1e30f, 1e30f, 1e30f, 0));
oldPositions->upload(oldPositionsVec);
}
rebuildNeighborList = CudaArray::create<int>(context, 1, "rebuildNeighborList"); rebuildNeighborList = CudaArray::create<int>(context, 1, "rebuildNeighborList");
blockSorter = new CudaSort(context, new BlockSortTrait(context.getUseDoublePrecision()), numAtomBlocks); blockSorter = new CudaSort(context, new BlockSortTrait(context.getUseDoublePrecision()), numAtomBlocks);
vector<unsigned int> count(1, 0); vector<unsigned int> count(1, 0);
...@@ -402,14 +394,7 @@ void CudaNonbondedUtilities::updateNeighborListSize() { ...@@ -402,14 +394,7 @@ void CudaNonbondedUtilities::updateNeighborListSize() {
if (forceArgs.size() > 0) if (forceArgs.size() > 0)
forceArgs[17] = &interactingAtoms->getDevicePointer(); forceArgs[17] = &interactingAtoms->getDevicePointer();
findInteractingBlocksArgs[7] = &interactingAtoms->getDevicePointer(); findInteractingBlocksArgs[7] = &interactingAtoms->getDevicePointer();
if (context.getUseDoublePrecision()) { forceRebuildNeighborList = true;
vector<double4> oldPositionsVec(numAtoms, make_double4(1e30, 1e30, 1e30, 0));
oldPositions->upload(oldPositionsVec);
}
else {
vector<float4> oldPositionsVec(numAtoms, make_float4(1e30f, 1e30f, 1e30f, 0));
oldPositions->upload(oldPositionsVec);
}
} }
void CudaNonbondedUtilities::setUsePadding(bool padding) { void CudaNonbondedUtilities::setUsePadding(bool padding) {
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2011-2013 Stanford University and the Authors. * * Portions copyright (c) 2011-2015 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -93,9 +93,9 @@ private: ...@@ -93,9 +93,9 @@ private:
class CudaParallelCalcForcesAndEnergyKernel::FinishComputationTask : public CudaContext::WorkTask { class CudaParallelCalcForcesAndEnergyKernel::FinishComputationTask : public CudaContext::WorkTask {
public: public:
FinishComputationTask(ContextImpl& context, CudaContext& cu, CudaCalcForcesAndEnergyKernel& kernel, FinishComputationTask(ContextImpl& context, CudaContext& cu, CudaCalcForcesAndEnergyKernel& kernel,
bool includeForce, bool includeEnergy, int groups, double& energy, long long& completionTime, long long* pinnedMemory, CudaArray& contextForces, bool& valid) : bool includeForce, bool includeEnergy, int groups, double& energy, long long& completionTime, long long* pinnedMemory, CudaArray& contextForces, bool& valid, int& numTiles) :
context(context), cu(cu), kernel(kernel), includeForce(includeForce), includeEnergy(includeEnergy), groups(groups), energy(energy), context(context), cu(cu), kernel(kernel), includeForce(includeForce), includeEnergy(includeEnergy), groups(groups), energy(energy),
completionTime(completionTime), pinnedMemory(pinnedMemory), contextForces(contextForces), valid(valid) { completionTime(completionTime), pinnedMemory(pinnedMemory), contextForces(contextForces), valid(valid), numTiles(numTiles) {
} }
void execute() { void execute() {
// Execute the kernel, then download forces. // Execute the kernel, then download forces.
...@@ -120,6 +120,10 @@ public: ...@@ -120,6 +120,10 @@ public:
cu.getForce().download(&pinnedMemory[(cu.getContextIndex()-1)*numAtoms*3]); cu.getForce().download(&pinnedMemory[(cu.getContextIndex()-1)*numAtoms*3]);
} }
} }
if (cu.getNonbondedUtilities().getUsePeriodic() && numTiles > cu.getNonbondedUtilities().getInteractingTiles().getSize()) {
valid = false;
cu.getNonbondedUtilities().updateNeighborListSize();
}
} }
private: private:
ContextImpl& context; ContextImpl& context;
...@@ -132,6 +136,7 @@ private: ...@@ -132,6 +136,7 @@ private:
long long* pinnedMemory; long long* pinnedMemory;
CudaArray& contextForces; CudaArray& contextForces;
bool& valid; bool& valid;
int& numTiles;
}; };
CudaParallelCalcForcesAndEnergyKernel::CudaParallelCalcForcesAndEnergyKernel(string name, const Platform& platform, CudaPlatform::PlatformData& data) : CudaParallelCalcForcesAndEnergyKernel::CudaParallelCalcForcesAndEnergyKernel(string name, const Platform& platform, CudaPlatform::PlatformData& data) :
...@@ -201,16 +206,9 @@ double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& con ...@@ -201,16 +206,9 @@ double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& con
for (int i = 0; i < (int) data.contexts.size(); i++) { for (int i = 0; i < (int) data.contexts.size(); i++) {
CudaContext& cu = *data.contexts[i]; CudaContext& cu = *data.contexts[i];
CudaContext::WorkThread& thread = cu.getWorkThread(); CudaContext::WorkThread& thread = cu.getWorkThread();
thread.addTask(new FinishComputationTask(context, cu, getKernel(i), includeForce, includeEnergy, groups, data.contextEnergy[i], completionTimes[i], pinnedForceBuffer, *contextForces, valid)); thread.addTask(new FinishComputationTask(context, cu, getKernel(i), includeForce, includeEnergy, groups, data.contextEnergy[i], completionTimes[i], pinnedForceBuffer, *contextForces, valid, tileCounts[i]));
} }
data.syncContexts(); data.syncContexts();
if (data.contexts[0]->getNonbondedUtilities().getUsePeriodic()) {
for (int i = 0; i < (int) tileCounts.size(); i++)
if (tileCounts[i] > data.contexts[i]->getNonbondedUtilities().getInteractingTiles().getSize()) {
valid = false;
data.contexts[i]->getNonbondedUtilities().updateNeighborListSize();
}
}
double energy = 0.0; double energy = 0.0;
for (int i = 0; i < (int) data.contextEnergy.size(); i++) for (int i = 0; i < (int) data.contextEnergy.size(); i++)
energy += data.contextEnergy[i]; energy += data.contextEnergy[i];
......
...@@ -282,14 +282,6 @@ void OpenCLNonbondedUtilities::initialize(const System& system) { ...@@ -282,14 +282,6 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
sortedBlockCenter = new OpenCLArray(context, numAtomBlocks+1, 4*elementSize, "sortedBlockCenter"); sortedBlockCenter = new OpenCLArray(context, numAtomBlocks+1, 4*elementSize, "sortedBlockCenter");
sortedBlockBoundingBox = new OpenCLArray(context, numAtomBlocks+1, 4*elementSize, "sortedBlockBoundingBox"); sortedBlockBoundingBox = new OpenCLArray(context, numAtomBlocks+1, 4*elementSize, "sortedBlockBoundingBox");
oldPositions = new OpenCLArray(context, numAtoms, 4*elementSize, "oldPositions"); oldPositions = new OpenCLArray(context, numAtoms, 4*elementSize, "oldPositions");
if (context.getUseDoublePrecision()) {
vector<mm_double4> oldPositionsVec(numAtoms, mm_double4(1e30, 1e30, 1e30, 0));
oldPositions->upload(oldPositionsVec);
}
else {
vector<mm_float4> oldPositionsVec(numAtoms, mm_float4(1e30f, 1e30f, 1e30f, 0));
oldPositions->upload(oldPositionsVec);
}
rebuildNeighborList = OpenCLArray::create<int>(context, 1, "rebuildNeighborList"); rebuildNeighborList = OpenCLArray::create<int>(context, 1, "rebuildNeighborList");
blockSorter = new OpenCLSort(context, new BlockSortTrait(context.getUseDoublePrecision()), numAtomBlocks); blockSorter = new OpenCLSort(context, new BlockSortTrait(context.getUseDoublePrecision()), numAtomBlocks);
vector<cl_uint> count(1, 0); vector<cl_uint> count(1, 0);
...@@ -447,15 +439,7 @@ void OpenCLNonbondedUtilities::updateNeighborListSize() { ...@@ -447,15 +439,7 @@ void OpenCLNonbondedUtilities::updateNeighborListSize() {
findInteractingBlocksKernel.setArg<cl::Buffer>(6, interactingTiles->getDeviceBuffer()); findInteractingBlocksKernel.setArg<cl::Buffer>(6, interactingTiles->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(7, interactingAtoms->getDeviceBuffer()); findInteractingBlocksKernel.setArg<cl::Buffer>(7, interactingAtoms->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl_uint>(9, maxTiles); findInteractingBlocksKernel.setArg<cl_uint>(9, maxTiles);
int numAtoms = context.getNumAtoms(); sortBoxDataKernel.setArg<cl_int>(9, true);
if (context.getUseDoublePrecision()) {
vector<mm_double4> oldPositionsVec(numAtoms, mm_double4(1e30, 1e30, 1e30, 0));
oldPositions->upload(oldPositionsVec);
}
else {
vector<mm_float4> oldPositionsVec(numAtoms, mm_float4(1e30f, 1e30f, 1e30f, 0));
oldPositions->upload(oldPositionsVec);
}
} }
void OpenCLNonbondedUtilities::setUsePadding(bool padding) { void OpenCLNonbondedUtilities::setUsePadding(bool padding) {
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2011-2013 Stanford University and the Authors. * * Portions copyright (c) 2011-2015 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -79,9 +79,9 @@ private: ...@@ -79,9 +79,9 @@ private:
class OpenCLParallelCalcForcesAndEnergyKernel::FinishComputationTask : public OpenCLContext::WorkTask { class OpenCLParallelCalcForcesAndEnergyKernel::FinishComputationTask : public OpenCLContext::WorkTask {
public: public:
FinishComputationTask(ContextImpl& context, OpenCLContext& cl, OpenCLCalcForcesAndEnergyKernel& kernel, FinishComputationTask(ContextImpl& context, OpenCLContext& cl, OpenCLCalcForcesAndEnergyKernel& kernel,
bool includeForce, bool includeEnergy, int groups, double& energy, long long& completionTime, void* pinnedMemory, bool& valid) : bool includeForce, bool includeEnergy, int groups, double& energy, long long& completionTime, void* pinnedMemory, bool& valid, int& numTiles) :
context(context), cl(cl), kernel(kernel), includeForce(includeForce), includeEnergy(includeEnergy), groups(groups), energy(energy), context(context), cl(cl), kernel(kernel), includeForce(includeForce), includeEnergy(includeEnergy), groups(groups), energy(energy),
completionTime(completionTime), pinnedMemory(pinnedMemory), valid(valid) { completionTime(completionTime), pinnedMemory(pinnedMemory), valid(valid), numTiles(numTiles) {
} }
void execute() { void execute() {
// Execute the kernel, then download forces. // Execute the kernel, then download forces.
...@@ -98,6 +98,10 @@ public: ...@@ -98,6 +98,10 @@ public:
cl.getQueue().finish(); cl.getQueue().finish();
} }
completionTime = getTime(); completionTime = getTime();
if (cl.getNonbondedUtilities().getUsePeriodic() && numTiles > cl.getNonbondedUtilities().getInteractingTiles().getSize()) {
valid = false;
cl.getNonbondedUtilities().updateNeighborListSize();
}
} }
private: private:
ContextImpl& context; ContextImpl& context;
...@@ -109,6 +113,7 @@ private: ...@@ -109,6 +113,7 @@ private:
long long& completionTime; long long& completionTime;
void* pinnedMemory; void* pinnedMemory;
bool& valid; bool& valid;
int& numTiles;
}; };
OpenCLParallelCalcForcesAndEnergyKernel::OpenCLParallelCalcForcesAndEnergyKernel(string name, const Platform& platform, OpenCLPlatform::PlatformData& data) : OpenCLParallelCalcForcesAndEnergyKernel::OpenCLParallelCalcForcesAndEnergyKernel(string name, const Platform& platform, OpenCLPlatform::PlatformData& data) :
...@@ -162,16 +167,9 @@ double OpenCLParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& c ...@@ -162,16 +167,9 @@ double OpenCLParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& c
for (int i = 0; i < (int) data.contexts.size(); i++) { for (int i = 0; i < (int) data.contexts.size(); i++) {
OpenCLContext& cl = *data.contexts[i]; OpenCLContext& cl = *data.contexts[i];
OpenCLContext::WorkThread& thread = cl.getWorkThread(); OpenCLContext::WorkThread& thread = cl.getWorkThread();
thread.addTask(new FinishComputationTask(context, cl, getKernel(i), includeForce, includeEnergy, groups, data.contextEnergy[i], completionTimes[i], pinnedForceMemory, valid)); thread.addTask(new FinishComputationTask(context, cl, getKernel(i), includeForce, includeEnergy, groups, data.contextEnergy[i], completionTimes[i], pinnedForceMemory, valid, tileCounts[i]));
} }
data.syncContexts(); data.syncContexts();
if (data.contexts[0]->getNonbondedUtilities().getUsePeriodic()) {
for (int i = 0; i < (int) tileCounts.size(); i++)
if (tileCounts[i] > data.contexts[i]->getNonbondedUtilities().getInteractingTiles().getSize()) {
valid = false;
data.contexts[i]->getNonbondedUtilities().updateNeighborListSize();
}
}
double energy = 0.0; double energy = 0.0;
for (int i = 0; i < (int) data.contextEnergy.size(); i++) for (int i = 0; i < (int) data.contextEnergy.size(); i++)
energy += data.contextEnergy[i]; energy += data.contextEnergy[i];
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment