Improved performance of CCMA

c0a43bfc · Peter Eastman · 022a0190 · c0a43bfc · c0a43bfc · c0a43bfc
Commit c0a43bfc authored Apr 05, 2013 by Peter Eastman
6 changed files
--- a/platforms/cuda/src/CudaIntegrationUtilities.cpp
+++ b/platforms/cuda/src/CudaIntegrationUtilities.cpp
@@ -99,7 +99,7 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
        posDelta(NULL), settleAtoms(NULL), settleParams(NULL), shakeAtoms(NULL), shakeParams(NULL),
        random(NULL), randomSeed(NULL), randomPos(0), stepSize(NULL), ccmaAtoms(NULL), ccmaDistance(NULL),
        ccmaReducedMass(NULL), ccmaAtomConstraints(NULL), ccmaNumAtomConstraints(NULL), ccmaConstraintMatrixColumn(NULL),
-        ccmaConstraintMatrixValue(NULL), ccmaDelta1(NULL), ccmaDelta2(NULL), ccmaConverged(NULL),
+        ccmaConstraintMatrixValue(NULL), ccmaDelta1(NULL), ccmaDelta2(NULL), ccmaConverged(NULL), ccmaConvergedMemory(NULL),
        vsite2AvgAtoms(NULL), vsite2AvgWeights(NULL), vsite3AvgAtoms(NULL), vsite3AvgWeights(NULL),
        vsiteOutOfPlaneAtoms(NULL), vsiteOutOfPlaneWeights(NULL) {
    // Create workspace arrays.
@@ -468,6 +468,8 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
        ccmaNumAtomConstraints = CudaArray::create<int>(context, numAtoms, "CcmaAtomConstraintsIndex");
        ccmaConstraintMatrixColumn = CudaArray::create<int>(context, numCCMA*maxRowElements, "ConstraintMatrixColumn");
        ccmaConverged = CudaArray::create<int>(context, 2, "ccmaConverged");
+        CHECK_RESULT2(cuMemHostAlloc((void**) &ccmaConvergedMemory, sizeof(int), CU_MEMHOSTALLOC_DEVICEMAP), "Error allocating pinned memory");
+        CHECK_RESULT2(cuMemHostGetDevicePointer(&ccmaConvergedDeviceMemory, ccmaConvergedMemory, 0), "Error getting device address for pinned memory");
        vector<int2> atomsVec(ccmaAtoms->getSize());
        vector<int> atomConstraintsVec(ccmaAtomConstraints->getSize());
        vector<int> numAtomConstraintsVec(ccmaNumAtomConstraints->getSize());
@@ -681,6 +683,8 @@ CudaIntegrationUtilities::~CudaIntegrationUtilities() {
        delete ccmaDelta2;
    if (ccmaConverged != NULL)
        delete ccmaConverged;
+    if (ccmaConvergedMemory != NULL)
+        cuMemFreeHost(ccmaConvergedMemory);
    if (vsite2AvgAtoms != NULL)
        delete vsite2AvgAtoms;
    if (vsite2AvgWeights != NULL)
@@ -739,7 +743,7 @@ void CudaIntegrationUtilities::applyConstraints(bool constrainVelocities, double
        void* forceArgs[] = {&ccmaAtoms->getDevicePointer(), &ccmaDistance->getDevicePointer(),
                constrainVelocities ? &context.getVelm().getDevicePointer() : &posDelta->getDevicePointer(),
                &ccmaReducedMass->getDevicePointer(), &ccmaDelta1->getDevicePointer(), &ccmaConverged->getDevicePointer(),
-                tolPointer, &i};
+                &ccmaConvergedDeviceMemory, tolPointer, &i};
        void* multiplyArgs[] = {&ccmaDelta1->getDevicePointer(), &ccmaDelta2->getDevicePointer(),
                &ccmaConstraintMatrixColumn->getDevicePointer(), &ccmaConstraintMatrixValue->getDevicePointer(), &ccmaConverged->getDevicePointer(), &i};
        void* updateArgs[] = {&ccmaNumAtomConstraints->getDevicePointer(), &ccmaAtomConstraints->getDevicePointer(), &ccmaDistance->getDevicePointer(),
@@ -747,18 +751,16 @@ void CudaIntegrationUtilities::applyConstraints(bool constrainVelocities, double
                &context.getVelm().getDevicePointer(), &ccmaDelta1->getDevicePointer(), &ccmaDelta2->getDevicePointer(),
                &ccmaConverged->getDevicePointer(), &i};
        const int checkInterval = 4;
-        int* converged = (int*) context.getPinnedBuffer();
+        ccmaConvergedMemory[0] = 0;
        for (i = 0; i < 150; i++) {
            context.executeKernel(ccmaForceKernel, forceArgs, ccmaAtoms->getSize());
-            if ((i+1)%checkInterval == 0) {
-                ccmaConverged->download(converged, false);
+            if ((i+1)%checkInterval == 0)
                CHECK_RESULT2(cuEventRecord(ccmaEvent, 0), "Error recording event for CCMA");
-            }
            context.executeKernel(ccmaMultiplyKernel, multiplyArgs, ccmaAtoms->getSize());
            context.executeKernel(ccmaUpdateKernel, updateArgs, context.getNumAtoms());
            if ((i+1)%checkInterval == 0) {
                CHECK_RESULT2(cuEventSynchronize(ccmaEvent), "Error synchronizing on event for CCMA");
-                if (converged[i%2])
+                if (ccmaConvergedMemory[0])
                    break;
            }
        }

--- a/platforms/cuda/src/CudaIntegrationUtilities.h
+++ b/platforms/cuda/src/CudaIntegrationUtilities.h
@@ -141,6 +141,8 @@ private:
    CudaArray* ccmaDelta1;
    CudaArray* ccmaDelta2;
    CudaArray* ccmaConverged;
+    int* ccmaConvergedMemory;
+    CUdeviceptr ccmaConvergedDeviceMemory;
    CUevent ccmaEvent;
    CudaArray* vsite2AvgAtoms;
    CudaArray* vsite2AvgWeights;

--- a/platforms/cuda/src/kernels/integrationUtilities.cu
+++ b/platforms/cuda/src/kernels/integrationUtilities.cu
@@ -598,11 +598,13 @@ extern "C" __global__ void computeCCMAConstraintDirections(const int2* __restric
 * Compute the force applied by each CCMA position constraint.
 */
 extern "C" __global__ void computeCCMAPositionConstraintForce(const int2* __restrict__ constraintAtoms, const mixed4* __restrict__ constraintDistance, const mixed4* __restrict__ atomPositions,
-        const mixed* __restrict__ reducedMass, mixed* __restrict__ delta1, int* __restrict__ converged, mixed tol, int iteration) {
+        const mixed* __restrict__ reducedMass, mixed* __restrict__ delta1, int* __restrict__ converged, int* __restrict__ hostConvergedFlag, mixed tol, int iteration) {
    __shared__ int groupConverged;
    if (converged[1-iteration%2]) {
-        if (blockIdx.x == 0 && threadIdx.x == 0)
+        if (blockIdx.x == 0 && threadIdx.x == 0) {
            converged[iteration%2] = 1;
+            hostConvergedFlag[0] = 1;
+        }
        return; // The constraint iteration has already converged.
    }
    if (threadIdx.x == 0)
@@ -639,11 +641,13 @@ extern "C" __global__ void computeCCMAPositionConstraintForce(const int2* __rest
 * Compute the force applied by each CCMA velocity constraint.
 */
 extern "C" __global__ void computeCCMAVelocityConstraintForce(const int2* __restrict__ constraintAtoms, const mixed4* __restrict__ constraintDistance, const mixed4* __restrict__ atomPositions,
-        const mixed* __restrict__ reducedMass, mixed* __restrict__ delta1, int* __restrict__ converged, mixed tol, int iteration) {
+        const mixed* __restrict__ reducedMass, mixed* __restrict__ delta1, int* __restrict__ converged, int* __restrict__ hostConvergedFlag, mixed tol, int iteration) {
    __shared__ int groupConverged;
    if (converged[1-iteration%2]) {
-        if (blockIdx.x == 0 && threadIdx.x == 0)
+        if (blockIdx.x == 0 && threadIdx.x == 0) {
            converged[iteration%2] = 1;
+            hostConvergedFlag[0] = 1;
+        }
        return; // The constraint iteration has already converged.
    }
    if (threadIdx.x == 0)

--- a/platforms/opencl/src/OpenCLIntegrationUtilities.cpp
+++ b/platforms/opencl/src/OpenCLIntegrationUtilities.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Portions copyright (c) 2009-2013 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -98,7 +98,7 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
        posDelta(NULL), settleAtoms(NULL), settleParams(NULL), shakeAtoms(NULL), shakeParams(NULL),
        random(NULL), randomSeed(NULL), randomPos(0), stepSize(NULL), ccmaAtoms(NULL), ccmaDistance(NULL),
        ccmaReducedMass(NULL), ccmaAtomConstraints(NULL), ccmaNumAtomConstraints(NULL), ccmaConstraintMatrixColumn(NULL),
-        ccmaConstraintMatrixValue(NULL), ccmaDelta1(NULL), ccmaDelta2(NULL), ccmaConverged(NULL),
+        ccmaConstraintMatrixValue(NULL), ccmaDelta1(NULL), ccmaDelta2(NULL), ccmaConverged(NULL), ccmaConvergedHostBuffer(NULL),
        vsite2AvgAtoms(NULL), vsite2AvgWeights(NULL), vsite3AvgAtoms(NULL), vsite3AvgWeights(NULL),
        vsiteOutOfPlaneAtoms(NULL), vsiteOutOfPlaneWeights(NULL), hasInitializedPosConstraintKernels(false), hasInitializedVelConstraintKernels(false) {
    // Create workspace arrays.
@@ -479,6 +479,11 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
        ccmaNumAtomConstraints = OpenCLArray::create<cl_int>(context, numAtoms, "CcmaAtomConstraintsIndex");
        ccmaConstraintMatrixColumn = OpenCLArray::create<cl_int>(context, numCCMA*maxRowElements, "ConstraintMatrixColumn");
        ccmaConverged = OpenCLArray::create<cl_int>(context, 2, "CcmaConverged");
+        ccmaConvergedHostBuffer = OpenCLArray::create<cl_int>(context, 1, "CcmaConvergedHostBuffer", CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR);
+        ccmaConvergedHostMemory = (int*) context.getQueue().enqueueMapBuffer(ccmaConvergedHostBuffer->getDeviceBuffer(), CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, sizeof(cl_int));
+        // Different communication mechanisms give optimal performance on AMD and on NVIDIA.
+        string vendor = context.getDevice().getInfo<CL_DEVICE_VENDOR>();
+        ccmaUseDirectBuffer = (vendor.size() >= 28 && vendor.substr(0, 28) == "Advanced Micro Devices, Inc.");
        vector<mm_int2> atomsVec(ccmaAtoms->getSize());
        vector<cl_int> atomConstraintsVec(ccmaAtomConstraints->getSize());
        vector<cl_int> numAtomConstraintsVec(ccmaNumAtomConstraints->getSize());
@@ -720,6 +725,8 @@ OpenCLIntegrationUtilities::~OpenCLIntegrationUtilities() {
        delete ccmaDelta2;
    if (ccmaConverged != NULL)
        delete ccmaConverged;
+    if (ccmaConvergedHostBuffer != NULL)
+        delete ccmaConvergedHostBuffer;
    if (vsite2AvgAtoms != NULL)
        delete vsite2AvgAtoms;
    if (vsite2AvgWeights != NULL)
@@ -814,6 +821,7 @@ void OpenCLIntegrationUtilities::applyConstraints(bool constrainVelocities, doub
            ccmaForceKernel.setArg<cl::Buffer>(3, ccmaReducedMass->getDeviceBuffer());
            ccmaForceKernel.setArg<cl::Buffer>(4, ccmaDelta1->getDeviceBuffer());
            ccmaForceKernel.setArg<cl::Buffer>(5, ccmaConverged->getDeviceBuffer());
+            ccmaForceKernel.setArg<cl::Buffer>(6, ccmaConvergedHostBuffer->getDeviceBuffer());
            ccmaMultiplyKernel.setArg<cl::Buffer>(0, ccmaDelta1->getDeviceBuffer());
            ccmaMultiplyKernel.setArg<cl::Buffer>(1, ccmaDelta2->getDeviceBuffer());
            ccmaMultiplyKernel.setArg<cl::Buffer>(2, ccmaConstraintMatrixColumn->getDeviceBuffer());
@@ -829,26 +837,37 @@ void OpenCLIntegrationUtilities::applyConstraints(bool constrainVelocities, doub
            ccmaUpdateKernel.setArg<cl::Buffer>(7, ccmaConverged->getDeviceBuffer());
        }
        if (context.getUseDoublePrecision() || context.getUseMixedPrecision())
-            ccmaForceKernel.setArg<cl_double>(6, (cl_double) tol);
+            ccmaForceKernel.setArg<cl_double>(7, (cl_double) tol);
        else
-            ccmaForceKernel.setArg<cl_float>(6, (cl_float) tol);
+            ccmaForceKernel.setArg<cl_float>(7, (cl_float) tol);
        context.executeKernel(ccmaDirectionsKernel, ccmaAtoms->getSize());
        const int checkInterval = 4;
        cl::Event event;
        int* converged = (int*) context.getPinnedBuffer();
+        ccmaConvergedHostMemory[0] = 0;
        for (int i = 0; i < 150; i++) {
-            ccmaForceKernel.setArg<cl_int>(7, i);
+            ccmaForceKernel.setArg<cl_int>(8, i);
            context.executeKernel(ccmaForceKernel, ccmaAtoms->getSize());
-            if ((i+1)%checkInterval == 0)
-                context.getQueue().enqueueReadBuffer(ccmaConverged->getDeviceBuffer(), CL_FALSE, 0, 2*sizeof(cl_int), converged, NULL, &event);
+            if ((i+1)%checkInterval == 0) {
+                if (ccmaUseDirectBuffer)
+                    context.getQueue().enqueueMarker(&event);
+                else
+                    context.getQueue().enqueueReadBuffer(ccmaConverged->getDeviceBuffer(), CL_FALSE, 0, 2*sizeof(cl_int), converged, NULL, &event);
+            }
            ccmaMultiplyKernel.setArg<cl_int>(5, i);
            context.executeKernel(ccmaMultiplyKernel, ccmaAtoms->getSize());
            ccmaUpdateKernel.setArg<cl_int>(8, i);
            context.executeKernel(ccmaUpdateKernel, context.getNumAtoms());
            if ((i+1)%checkInterval == 0) {
                event.wait();
-                if (converged[i%2])
-                    break;
+                if (ccmaUseDirectBuffer) {
+                    if (ccmaConvergedHostMemory[0])
+                        break;
+                }
+                else {
+                    if (converged[i%2])
+                        break;
+                }
            }
        }
    }

--- a/platforms/opencl/src/OpenCLIntegrationUtilities.h
+++ b/platforms/opencl/src/OpenCLIntegrationUtilities.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2009 Stanford University and the Authors.           *
+ * Portions copyright (c) 2009-2013 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -141,6 +141,8 @@ private:
    OpenCLArray* ccmaDelta1;
    OpenCLArray* ccmaDelta2;
    OpenCLArray* ccmaConverged;
+    int* ccmaConvergedHostMemory;
+    OpenCLArray* ccmaConvergedHostBuffer;
    OpenCLArray* vsite2AvgAtoms;
    OpenCLArray* vsite2AvgWeights;
    OpenCLArray* vsite3AvgAtoms;
@@ -149,7 +151,7 @@ private:
    OpenCLArray* vsiteOutOfPlaneWeights;
    int randomPos;
    int lastSeed, numVsites;
-    bool hasInitializedPosConstraintKernels, hasInitializedVelConstraintKernels;
+    bool hasInitializedPosConstraintKernels, hasInitializedVelConstraintKernels, ccmaUseDirectBuffer;
    struct ShakeCluster;
    struct ConstraintOrderer;
 };

--- a/platforms/opencl/src/kernels/ccma.cl
+++ b/platforms/opencl/src/kernels/ccma.cl
@@ -34,11 +34,13 @@ __kernel void computeConstraintDirections(__global const int2* restrict constrai
 * Compute the force applied by each constraint.
 */
 __kernel void computeConstraintForce(__global const int2* restrict constraintAtoms, __global const mixed4* restrict constraintDistance, __global const mixed4* restrict atomPositions,
-        __global const mixed* restrict reducedMass, __global mixed* restrict delta1, __global int* restrict converged, mixed tol, int iteration) {
+        __global const mixed* restrict reducedMass, __global mixed* restrict delta1, __global int* restrict converged, __global int* restrict hostConvergedFlag, mixed tol, int iteration) {
    __local int groupConverged;
    if (converged[1-iteration%2]) {
-        if (get_global_id(0) == 0)
+        if (get_global_id(0) == 0) {
            converged[iteration%2] = 1;
+            hostConvergedFlag[0] = 1;
+        }
        return; // The constraint iteration has already converged.
    }
    if (get_local_id(0) == 0)