Commit c0a43bfc authored by Peter Eastman's avatar Peter Eastman
Browse files

Improved performance of CCMA

parent 022a0190
......@@ -99,7 +99,7 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
posDelta(NULL), settleAtoms(NULL), settleParams(NULL), shakeAtoms(NULL), shakeParams(NULL),
random(NULL), randomSeed(NULL), randomPos(0), stepSize(NULL), ccmaAtoms(NULL), ccmaDistance(NULL),
ccmaReducedMass(NULL), ccmaAtomConstraints(NULL), ccmaNumAtomConstraints(NULL), ccmaConstraintMatrixColumn(NULL),
ccmaConstraintMatrixValue(NULL), ccmaDelta1(NULL), ccmaDelta2(NULL), ccmaConverged(NULL),
ccmaConstraintMatrixValue(NULL), ccmaDelta1(NULL), ccmaDelta2(NULL), ccmaConverged(NULL), ccmaConvergedMemory(NULL),
vsite2AvgAtoms(NULL), vsite2AvgWeights(NULL), vsite3AvgAtoms(NULL), vsite3AvgWeights(NULL),
vsiteOutOfPlaneAtoms(NULL), vsiteOutOfPlaneWeights(NULL) {
// Create workspace arrays.
......@@ -468,6 +468,8 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
ccmaNumAtomConstraints = CudaArray::create<int>(context, numAtoms, "CcmaAtomConstraintsIndex");
ccmaConstraintMatrixColumn = CudaArray::create<int>(context, numCCMA*maxRowElements, "ConstraintMatrixColumn");
ccmaConverged = CudaArray::create<int>(context, 2, "ccmaConverged");
CHECK_RESULT2(cuMemHostAlloc((void**) &ccmaConvergedMemory, sizeof(int), CU_MEMHOSTALLOC_DEVICEMAP), "Error allocating pinned memory");
CHECK_RESULT2(cuMemHostGetDevicePointer(&ccmaConvergedDeviceMemory, ccmaConvergedMemory, 0), "Error getting device address for pinned memory");
vector<int2> atomsVec(ccmaAtoms->getSize());
vector<int> atomConstraintsVec(ccmaAtomConstraints->getSize());
vector<int> numAtomConstraintsVec(ccmaNumAtomConstraints->getSize());
......@@ -681,6 +683,8 @@ CudaIntegrationUtilities::~CudaIntegrationUtilities() {
delete ccmaDelta2;
if (ccmaConverged != NULL)
delete ccmaConverged;
if (ccmaConvergedMemory != NULL)
cuMemFreeHost(ccmaConvergedMemory);
if (vsite2AvgAtoms != NULL)
delete vsite2AvgAtoms;
if (vsite2AvgWeights != NULL)
......@@ -739,7 +743,7 @@ void CudaIntegrationUtilities::applyConstraints(bool constrainVelocities, double
void* forceArgs[] = {&ccmaAtoms->getDevicePointer(), &ccmaDistance->getDevicePointer(),
constrainVelocities ? &context.getVelm().getDevicePointer() : &posDelta->getDevicePointer(),
&ccmaReducedMass->getDevicePointer(), &ccmaDelta1->getDevicePointer(), &ccmaConverged->getDevicePointer(),
tolPointer, &i};
&ccmaConvergedDeviceMemory, tolPointer, &i};
void* multiplyArgs[] = {&ccmaDelta1->getDevicePointer(), &ccmaDelta2->getDevicePointer(),
&ccmaConstraintMatrixColumn->getDevicePointer(), &ccmaConstraintMatrixValue->getDevicePointer(), &ccmaConverged->getDevicePointer(), &i};
void* updateArgs[] = {&ccmaNumAtomConstraints->getDevicePointer(), &ccmaAtomConstraints->getDevicePointer(), &ccmaDistance->getDevicePointer(),
......@@ -747,18 +751,16 @@ void CudaIntegrationUtilities::applyConstraints(bool constrainVelocities, double
&context.getVelm().getDevicePointer(), &ccmaDelta1->getDevicePointer(), &ccmaDelta2->getDevicePointer(),
&ccmaConverged->getDevicePointer(), &i};
const int checkInterval = 4;
int* converged = (int*) context.getPinnedBuffer();
ccmaConvergedMemory[0] = 0;
for (i = 0; i < 150; i++) {
context.executeKernel(ccmaForceKernel, forceArgs, ccmaAtoms->getSize());
if ((i+1)%checkInterval == 0) {
ccmaConverged->download(converged, false);
if ((i+1)%checkInterval == 0)
CHECK_RESULT2(cuEventRecord(ccmaEvent, 0), "Error recording event for CCMA");
}
context.executeKernel(ccmaMultiplyKernel, multiplyArgs, ccmaAtoms->getSize());
context.executeKernel(ccmaUpdateKernel, updateArgs, context.getNumAtoms());
if ((i+1)%checkInterval == 0) {
CHECK_RESULT2(cuEventSynchronize(ccmaEvent), "Error synchronizing on event for CCMA");
if (converged[i%2])
if (ccmaConvergedMemory[0])
break;
}
}
......
......@@ -141,6 +141,8 @@ private:
CudaArray* ccmaDelta1;
CudaArray* ccmaDelta2;
CudaArray* ccmaConverged;
int* ccmaConvergedMemory;
CUdeviceptr ccmaConvergedDeviceMemory;
CUevent ccmaEvent;
CudaArray* vsite2AvgAtoms;
CudaArray* vsite2AvgWeights;
......
......@@ -598,11 +598,13 @@ extern "C" __global__ void computeCCMAConstraintDirections(const int2* __restric
* Compute the force applied by each CCMA position constraint.
*/
extern "C" __global__ void computeCCMAPositionConstraintForce(const int2* __restrict__ constraintAtoms, const mixed4* __restrict__ constraintDistance, const mixed4* __restrict__ atomPositions,
const mixed* __restrict__ reducedMass, mixed* __restrict__ delta1, int* __restrict__ converged, mixed tol, int iteration) {
const mixed* __restrict__ reducedMass, mixed* __restrict__ delta1, int* __restrict__ converged, int* __restrict__ hostConvergedFlag, mixed tol, int iteration) {
__shared__ int groupConverged;
if (converged[1-iteration%2]) {
if (blockIdx.x == 0 && threadIdx.x == 0)
if (blockIdx.x == 0 && threadIdx.x == 0) {
converged[iteration%2] = 1;
hostConvergedFlag[0] = 1;
}
return; // The constraint iteration has already converged.
}
if (threadIdx.x == 0)
......@@ -639,11 +641,13 @@ extern "C" __global__ void computeCCMAPositionConstraintForce(const int2* __rest
* Compute the force applied by each CCMA velocity constraint.
*/
extern "C" __global__ void computeCCMAVelocityConstraintForce(const int2* __restrict__ constraintAtoms, const mixed4* __restrict__ constraintDistance, const mixed4* __restrict__ atomPositions,
const mixed* __restrict__ reducedMass, mixed* __restrict__ delta1, int* __restrict__ converged, mixed tol, int iteration) {
const mixed* __restrict__ reducedMass, mixed* __restrict__ delta1, int* __restrict__ converged, int* __restrict__ hostConvergedFlag, mixed tol, int iteration) {
__shared__ int groupConverged;
if (converged[1-iteration%2]) {
if (blockIdx.x == 0 && threadIdx.x == 0)
if (blockIdx.x == 0 && threadIdx.x == 0) {
converged[iteration%2] = 1;
hostConvergedFlag[0] = 1;
}
return; // The constraint iteration has already converged.
}
if (threadIdx.x == 0)
......
......@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Portions copyright (c) 2009-2013 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -98,7 +98,7 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
posDelta(NULL), settleAtoms(NULL), settleParams(NULL), shakeAtoms(NULL), shakeParams(NULL),
random(NULL), randomSeed(NULL), randomPos(0), stepSize(NULL), ccmaAtoms(NULL), ccmaDistance(NULL),
ccmaReducedMass(NULL), ccmaAtomConstraints(NULL), ccmaNumAtomConstraints(NULL), ccmaConstraintMatrixColumn(NULL),
ccmaConstraintMatrixValue(NULL), ccmaDelta1(NULL), ccmaDelta2(NULL), ccmaConverged(NULL),
ccmaConstraintMatrixValue(NULL), ccmaDelta1(NULL), ccmaDelta2(NULL), ccmaConverged(NULL), ccmaConvergedHostBuffer(NULL),
vsite2AvgAtoms(NULL), vsite2AvgWeights(NULL), vsite3AvgAtoms(NULL), vsite3AvgWeights(NULL),
vsiteOutOfPlaneAtoms(NULL), vsiteOutOfPlaneWeights(NULL), hasInitializedPosConstraintKernels(false), hasInitializedVelConstraintKernels(false) {
// Create workspace arrays.
......@@ -479,6 +479,11 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
ccmaNumAtomConstraints = OpenCLArray::create<cl_int>(context, numAtoms, "CcmaAtomConstraintsIndex");
ccmaConstraintMatrixColumn = OpenCLArray::create<cl_int>(context, numCCMA*maxRowElements, "ConstraintMatrixColumn");
ccmaConverged = OpenCLArray::create<cl_int>(context, 2, "CcmaConverged");
ccmaConvergedHostBuffer = OpenCLArray::create<cl_int>(context, 1, "CcmaConvergedHostBuffer", CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR);
ccmaConvergedHostMemory = (int*) context.getQueue().enqueueMapBuffer(ccmaConvergedHostBuffer->getDeviceBuffer(), CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, sizeof(cl_int));
// Different communication mechanisms give optimal performance on AMD and on NVIDIA.
string vendor = context.getDevice().getInfo<CL_DEVICE_VENDOR>();
ccmaUseDirectBuffer = (vendor.size() >= 28 && vendor.substr(0, 28) == "Advanced Micro Devices, Inc.");
vector<mm_int2> atomsVec(ccmaAtoms->getSize());
vector<cl_int> atomConstraintsVec(ccmaAtomConstraints->getSize());
vector<cl_int> numAtomConstraintsVec(ccmaNumAtomConstraints->getSize());
......@@ -720,6 +725,8 @@ OpenCLIntegrationUtilities::~OpenCLIntegrationUtilities() {
delete ccmaDelta2;
if (ccmaConverged != NULL)
delete ccmaConverged;
if (ccmaConvergedHostBuffer != NULL)
delete ccmaConvergedHostBuffer;
if (vsite2AvgAtoms != NULL)
delete vsite2AvgAtoms;
if (vsite2AvgWeights != NULL)
......@@ -814,6 +821,7 @@ void OpenCLIntegrationUtilities::applyConstraints(bool constrainVelocities, doub
ccmaForceKernel.setArg<cl::Buffer>(3, ccmaReducedMass->getDeviceBuffer());
ccmaForceKernel.setArg<cl::Buffer>(4, ccmaDelta1->getDeviceBuffer());
ccmaForceKernel.setArg<cl::Buffer>(5, ccmaConverged->getDeviceBuffer());
ccmaForceKernel.setArg<cl::Buffer>(6, ccmaConvergedHostBuffer->getDeviceBuffer());
ccmaMultiplyKernel.setArg<cl::Buffer>(0, ccmaDelta1->getDeviceBuffer());
ccmaMultiplyKernel.setArg<cl::Buffer>(1, ccmaDelta2->getDeviceBuffer());
ccmaMultiplyKernel.setArg<cl::Buffer>(2, ccmaConstraintMatrixColumn->getDeviceBuffer());
......@@ -829,26 +837,37 @@ void OpenCLIntegrationUtilities::applyConstraints(bool constrainVelocities, doub
ccmaUpdateKernel.setArg<cl::Buffer>(7, ccmaConverged->getDeviceBuffer());
}
if (context.getUseDoublePrecision() || context.getUseMixedPrecision())
ccmaForceKernel.setArg<cl_double>(6, (cl_double) tol);
ccmaForceKernel.setArg<cl_double>(7, (cl_double) tol);
else
ccmaForceKernel.setArg<cl_float>(6, (cl_float) tol);
ccmaForceKernel.setArg<cl_float>(7, (cl_float) tol);
context.executeKernel(ccmaDirectionsKernel, ccmaAtoms->getSize());
const int checkInterval = 4;
cl::Event event;
int* converged = (int*) context.getPinnedBuffer();
ccmaConvergedHostMemory[0] = 0;
for (int i = 0; i < 150; i++) {
ccmaForceKernel.setArg<cl_int>(7, i);
ccmaForceKernel.setArg<cl_int>(8, i);
context.executeKernel(ccmaForceKernel, ccmaAtoms->getSize());
if ((i+1)%checkInterval == 0)
context.getQueue().enqueueReadBuffer(ccmaConverged->getDeviceBuffer(), CL_FALSE, 0, 2*sizeof(cl_int), converged, NULL, &event);
if ((i+1)%checkInterval == 0) {
if (ccmaUseDirectBuffer)
context.getQueue().enqueueMarker(&event);
else
context.getQueue().enqueueReadBuffer(ccmaConverged->getDeviceBuffer(), CL_FALSE, 0, 2*sizeof(cl_int), converged, NULL, &event);
}
ccmaMultiplyKernel.setArg<cl_int>(5, i);
context.executeKernel(ccmaMultiplyKernel, ccmaAtoms->getSize());
ccmaUpdateKernel.setArg<cl_int>(8, i);
context.executeKernel(ccmaUpdateKernel, context.getNumAtoms());
if ((i+1)%checkInterval == 0) {
event.wait();
if (converged[i%2])
break;
if (ccmaUseDirectBuffer) {
if (ccmaConvergedHostMemory[0])
break;
}
else {
if (converged[i%2])
break;
}
}
}
}
......
......@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Portions copyright (c) 2009-2013 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -141,6 +141,8 @@ private:
OpenCLArray* ccmaDelta1;
OpenCLArray* ccmaDelta2;
OpenCLArray* ccmaConverged;
int* ccmaConvergedHostMemory;
OpenCLArray* ccmaConvergedHostBuffer;
OpenCLArray* vsite2AvgAtoms;
OpenCLArray* vsite2AvgWeights;
OpenCLArray* vsite3AvgAtoms;
......@@ -149,7 +151,7 @@ private:
OpenCLArray* vsiteOutOfPlaneWeights;
int randomPos;
int lastSeed, numVsites;
bool hasInitializedPosConstraintKernels, hasInitializedVelConstraintKernels;
bool hasInitializedPosConstraintKernels, hasInitializedVelConstraintKernels, ccmaUseDirectBuffer;
struct ShakeCluster;
struct ConstraintOrderer;
};
......
......@@ -34,11 +34,13 @@ __kernel void computeConstraintDirections(__global const int2* restrict constrai
* Compute the force applied by each constraint.
*/
__kernel void computeConstraintForce(__global const int2* restrict constraintAtoms, __global const mixed4* restrict constraintDistance, __global const mixed4* restrict atomPositions,
__global const mixed* restrict reducedMass, __global mixed* restrict delta1, __global int* restrict converged, mixed tol, int iteration) {
__global const mixed* restrict reducedMass, __global mixed* restrict delta1, __global int* restrict converged, __global int* restrict hostConvergedFlag, mixed tol, int iteration) {
__local int groupConverged;
if (converged[1-iteration%2]) {
if (get_global_id(0) == 0)
if (get_global_id(0) == 0) {
converged[iteration%2] = 1;
hostConvergedFlag[0] = 1;
}
return; // The constraint iteration has already converged.
}
if (get_local_id(0) == 0)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment