Optimization: use mapped memory to communicate whether CCMA has converged

f0c2e89c · Peter Eastman · a42431fd · f0c2e89c · f0c2e89c
Commit f0c2e89c authored Jun 28, 2012 by Peter Eastman
Showing with 9 additions and 14 deletions

platforms/cuda2/src/CudaIntegrationUtilities.cpp +8 -13

platforms/cuda2/src/CudaIntegrationUtilities.h +1 -1

No files found.
--- a/platforms/cuda2/src/CudaIntegrationUtilities.cpp
+++ b/platforms/cuda2/src/CudaIntegrationUtilities.cpp
@@ -99,8 +99,8 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
        posDelta(NULL), settleAtoms(NULL), settleParams(NULL), shakeAtoms(NULL), shakeParams(NULL),
        random(NULL), randomSeed(NULL), randomPos(0), stepSize(NULL), ccmaAtoms(NULL), ccmaDistance(NULL),
        ccmaReducedMass(NULL), ccmaAtomConstraints(NULL), ccmaNumAtomConstraints(NULL), ccmaConstraintMatrixColumn(NULL),
-        ccmaConstraintMatrixValue(NULL), ccmaDelta1(NULL), ccmaDelta2(NULL), ccmaConverged(NULL),
-        ccmaConvergedMemory(NULL), vsite2AvgAtoms(NULL), vsite2AvgWeights(NULL), vsite3AvgAtoms(NULL), vsite3AvgWeights(NULL),
+        ccmaConstraintMatrixValue(NULL), ccmaDelta1(NULL), ccmaDelta2(NULL), ccmaConvergedMemory(NULL),
+        vsite2AvgAtoms(NULL), vsite2AvgWeights(NULL), vsite3AvgAtoms(NULL), vsite3AvgWeights(NULL),
        vsiteOutOfPlaneAtoms(NULL), vsiteOutOfPlaneWeights(NULL) {
    // Create workspace arrays.

@@ -466,8 +466,8 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
        ccmaAtoms = CudaArray::create<int2>(context, numCCMA, "CcmaAtoms");
        ccmaAtomConstraints = CudaArray::create<int>(context, numAtoms*maxAtomConstraints, "CcmaAtomConstraints");
        ccmaNumAtomConstraints = CudaArray::create<int>(context, numAtoms, "CcmaAtomConstraintsIndex");
-        ccmaConverged = CudaArray::create<int>(context, 2, "CcmaConverged");
-        CHECK_RESULT2(cuMemHostAlloc((void**) &ccmaConvergedMemory, 2*sizeof(int), 0), "Error allocating pinned memory");
+        CHECK_RESULT2(cuMemHostAlloc((void**) &ccmaConvergedMemory, 2*sizeof(int), CU_MEMHOSTALLOC_DEVICEMAP), "Error allocating pinned memory");
+        CHECK_RESULT2(cuMemHostGetDevicePointer(&ccmaConvergedDeviceMemory, ccmaConvergedMemory, 0), "Error getting device address for pinned memory");
        ccmaConstraintMatrixColumn = CudaArray::create<int>(context, numCCMA*maxRowElements, "ConstraintMatrixColumn");
        vector<int2> atomsVec(ccmaAtoms->getSize());
        vector<int> atomConstraintsVec(ccmaAtomConstraints->getSize());
@@ -680,8 +680,6 @@ CudaIntegrationUtilities::~CudaIntegrationUtilities() {
        delete ccmaDelta1;
    if (ccmaDelta2 != NULL)
        delete ccmaDelta2;
-    if (ccmaConverged != NULL)
-        delete ccmaConverged;
    if (ccmaConvergedMemory != NULL)
        cuMemFreeHost(ccmaConvergedMemory);
    if (vsite2AvgAtoms != NULL)
@@ -739,26 +737,23 @@ void CudaIntegrationUtilities::applyConstraints(bool constrainVelocities, double
        int i;
        void* forceArgs[] = {&ccmaAtoms->getDevicePointer(), &ccmaDistance->getDevicePointer(),
                constrainVelocities ? &context.getVelm().getDevicePointer() : &posDelta->getDevicePointer(),
-                &ccmaReducedMass->getDevicePointer(), &ccmaDelta1->getDevicePointer(), &ccmaConverged->getDevicePointer(),
+                &ccmaReducedMass->getDevicePointer(), &ccmaDelta1->getDevicePointer(), &ccmaConvergedDeviceMemory,
                &floatTol, &i};
        void* multiplyArgs[] = {&ccmaDelta1->getDevicePointer(), &ccmaDelta2->getDevicePointer(),
-                &ccmaConstraintMatrixColumn->getDevicePointer(), &ccmaConstraintMatrixValue->getDevicePointer(), &ccmaConverged->getDevicePointer(), &i};
+                &ccmaConstraintMatrixColumn->getDevicePointer(), &ccmaConstraintMatrixValue->getDevicePointer(), &ccmaConvergedDeviceMemory, &i};
        void* updateArgs[] = {&ccmaNumAtomConstraints->getDevicePointer(), &ccmaAtomConstraints->getDevicePointer(), &ccmaDistance->getDevicePointer(),
                constrainVelocities ? &context.getVelm().getDevicePointer() : &posDelta->getDevicePointer(),
                &context.getVelm().getDevicePointer(), &ccmaDelta1->getDevicePointer(), &ccmaDelta2->getDevicePointer(),
-                &ccmaConverged->getDevicePointer(), &i};
+                &ccmaConvergedDeviceMemory, &i};
        const int checkInterval = 4;
        for (i = 0; i < 150; i++) {
            if (i == 0) {
                ccmaConvergedMemory[0] = 1;
                ccmaConvergedMemory[1] = 0;
-                cuMemcpyHtoD(ccmaConverged->getDevicePointer(), ccmaConvergedMemory, 2*sizeof(int));
            }
            context.executeKernel(ccmaForceKernel, forceArgs, ccmaAtoms->getSize());
-            if ((i+1)%checkInterval == 0) {
-                cuMemcpyDtoH(ccmaConvergedMemory, ccmaConverged->getDevicePointer(), 2*sizeof(int));
+            if ((i+1)%checkInterval == 0)
                CHECK_RESULT2(cuEventRecord(ccmaEvent, 0), "Error recording event for CCMA");
-            }
            context.executeKernel(ccmaMultiplyKernel, multiplyArgs, ccmaAtoms->getSize());
            context.executeKernel(ccmaUpdateKernel, updateArgs, context.getNumAtoms());
            if ((i+1)%checkInterval == 0) {

--- a/platforms/cuda2/src/CudaIntegrationUtilities.h
+++ b/platforms/cuda2/src/CudaIntegrationUtilities.h
@@ -133,8 +133,8 @@ private:
    CudaArray* ccmaConstraintMatrixValue;
    CudaArray* ccmaDelta1;
    CudaArray* ccmaDelta2;
-    CudaArray* ccmaConverged;
    int* ccmaConvergedMemory;
+    CUdeviceptr ccmaConvergedDeviceMemory;
    CUevent ccmaEvent;
    CudaArray* vsite2AvgAtoms;
    CudaArray* vsite2AvgWeights;