Optimized LINCS by merging kernels and implementing a syncAllThreads() function.

afd06645 · Peter Eastman · bf7a968c · afd06645 · afd06645 · afd06645
Commit afd06645 authored Apr 16, 2009 by Peter Eastman
8 changed files
--- a/platforms/cuda/cuda-cmake/FindCuda.cmake
+++ b/platforms/cuda/cuda-cmake/FindCuda.cmake
@@ -117,7 +117,7 @@ ELSE(CUDA_BUILD_TYPE MATCHES "Emulation")
 ENDIF(CUDA_BUILD_TYPE MATCHES "Emulation")

 SET(CUDA_BUILD_CUBIN TRUE CACHE BOOL "Generate and parse .cubin files in Device mode.")
-SET(CUDA_NVCC_FLAGS "-maxrregcount=32;-use_fast_math;-O0;-arch=sm_11" CACHE STRING "Semi-colon delimit multiple arguments.")
+SET(CUDA_NVCC_FLAGS "-maxrregcount=32;-use_fast_math;-O0" CACHE STRING "Semi-colon delimit multiple arguments.")

 # Search for the cuda distribution.
 IF(NOT CUDA_INSTALL_PREFIX)

--- a/platforms/cuda/src/kernels/cudatypes.h
+++ b/platforms/cuda/src/kernels/cudatypes.h
@@ -388,7 +388,7 @@ struct cudaGmxSimulation {
    float*          pLincsSolution;                 // Workspace for LINCS
    int*            pLincsAtomConstraints;          // The indices of constraints involving each atom
    int*            pLincsNumAtomConstraints;       // The number of constraints involving each atom
-    unsigned int*   pSyncCounter;                   // Used for global thread synchronization
+    short*          pSyncCounter;                   // Used for global thread synchronization

    // Mutable stuff
    float4*         pPosq;                          // Pointer to atom positions and charges

--- a/platforms/cuda/src/kernels/gpu.cpp
+++ b/platforms/cuda/src/kernels/gpu.cpp
@@ -728,9 +728,9 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
    CUDAStream<float>* psLincsSolution = new CUDAStream<float>(numLincs, 1, "LincsSolution");
    gpu->psLincsSolution             = psLincsSolution;
    gpu->sim.pLincsSolution          = psLincsSolution->_pDevData;
-    CUDAStream<unsigned int>* psSyncCounter = new CUDAStream<unsigned int>(2*lincsTerms+2, 1, "SyncCounter");
-    gpu->psSyncCounter                      = psSyncCounter;
-    gpu->sim.pSyncCounter                   = psSyncCounter->_pDevData;
+    CUDAStream<short>* psSyncCounter = new CUDAStream<short>(2*gpu->sim.blocks, 1, "SyncCounter");
+    gpu->psSyncCounter               = psSyncCounter;
+    gpu->sim.pSyncCounter            = psSyncCounter->_pDevData;
    gpu->sim.lincsConstraints = numLincs;
    for (int i = 0; i < numLincs; i++) {
        int c = lincsConstraints[i];
@@ -743,7 +743,7 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
            (*psLincsConnections)[i+j*numLincs] = linkedConstraints[i][j];
    }
    for (unsigned int i = 0; i < psSyncCounter->_length; i++)
-        (*psSyncCounter)[i] = 0;
+        (*psSyncCounter)[i] = -1;
    for (unsigned int i = 0; i < atomConstraints.size(); i++) {
        (*psLincsNumAtomConstraints)[i] = atomConstraints[i].size();
        for (unsigned int j = 0; j < atomConstraints[i].size(); j++)
@@ -761,8 +761,8 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
    gpu->sim.lincs_threads_per_block = (gpu->sim.lincsConstraints + gpu->sim.blocks - 1) / gpu->sim.blocks;
    if (gpu->sim.lincs_threads_per_block > gpu->sim.max_shake_threads_per_block)
        gpu->sim.lincs_threads_per_block = gpu->sim.max_shake_threads_per_block;
-    if (gpu->sim.lincs_threads_per_block < 1)
-        gpu->sim.lincs_threads_per_block = 1;
+    if (gpu->sim.lincs_threads_per_block < gpu->sim.blocks)
+        gpu->sim.lincs_threads_per_block = gpu->sim.blocks;


    gpu->psLincsNumConnections->Download();

--- a/platforms/cuda/src/kernels/gputypes.h
+++ b/platforms/cuda/src/kernels/gputypes.h
@@ -140,7 +140,7 @@ struct _gpuContext {
    CUDAStream<float>* psLincsRhs1;         // Workspace for LINCS
    CUDAStream<float>* psLincsRhs2;         // Workspace for LINCS
    CUDAStream<float>* psLincsSolution;     // Workspace for LINCS
-    CUDAStream<unsigned int>* psSyncCounter;// Used for global thread synchronization
+    CUDAStream<short>* psSyncCounter;       // Used for global thread synchronization
 };

 typedef struct _gpuContext *gpuContext;

--- a/platforms/cuda/src/kernels/kLincs.cu
+++ b/platforms/cuda/src/kernels/kLincs.cu
@@ -52,8 +52,48 @@ void GetLincsSim(gpuContext gpu)
    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
 }

-__global__ void kUpdateAtomPositions_kernel(float4* atomPositions)
+/**
+ * Synchronize all threads across all blocks.
+ *
+ * Implements a grid-wide barrier by polling one flag slot per block:
+ *   1. A block-level __syncthreads() makes sure every thread of this block
+ *      has arrived before the block announces itself.
+ *   2. Thread 0 of the block stores newCount into syncCounter[blockIdx.x].
+ *   3. Each thread whose threadIdx.x is below gridDim.x spins, reading
+ *      syncCounter[threadIdx.x] through a volatile pointer, until that slot
+ *      equals newCount. Thread i therefore waits for block i.
+ *   4. A second __syncthreads() holds the remaining threads of the block
+ *      until all of the polling threads have seen their slot updated.
+ *
+ * @param syncCounter  array of flag slots indexed by block; this function
+ *                     touches slots 0 .. gridDim.x-1
+ * @param newCount     the value that marks this barrier as reached
+ *
+ * Preconditions that follow from the code:
+ *   - Must be called by every thread of every block (it contains
+ *     __syncthreads() and waits on a slot for each block).
+ *   - blockDim.x must be at least gridDim.x; slots with an index at or
+ *     above blockDim.x would never be polled by this block.
+ *   - No slot may already hold newCount on entry, otherwise the spin loop
+ *     exits immediately for that slot and the barrier is not enforced.
+ *     Callers must pick a value different from the one left by the previous
+ *     use of the same slots.
+ *
+ * NOTE(review): the flag is stored through the non-volatile syncCounter
+ * pointer and no memory fence (e.g. __threadfence()) is issued before it, so
+ * this code alone does not order earlier global-memory writes ahead of the
+ * flag as seen by other blocks - confirm this is acceptable on the targeted
+ * hardware.
+ * NOTE(review): the spin loop presumably requires all blocks of the grid to
+ * be resident at the same time; a block that has not started yet can never
+ * write its slot - verify against the launch configuration.
+ */
+__device__ void kSyncAllThreads_kernel(short* syncCounter, short newCount)
 {
+//    short* syncCounter = &cSim.pSyncCounter[newCount%2 == 0 ? 0 : gridDim.x];
+    __syncthreads();
+    if (threadIdx.x == 0)
+        syncCounter[blockIdx.x] = newCount;
+    if (threadIdx.x < gridDim.x)
+    {
+        volatile short* counter = &syncCounter[threadIdx.x];
+        do
+        {
+        } while (*counter != newCount);
+    }
+    __syncthreads();
+}
+
+__global__ void kSolveLincsMatrix_kernel(float4* atomPositions)
+{
+    for (unsigned int iteration = 0; iteration < cSim.lincsTerms; iteration++) {
+        float* rhs1 = (iteration%2 == 0 ? cSim.pLincsRhs1 : cSim.pLincsRhs2);
+        float* rhs2 = (iteration%2 == 0 ? cSim.pLincsRhs2 : cSim.pLincsRhs1);
+        unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
+        while (pos < cSim.lincsConstraints)
+        {
+            float rhs = 0.0f;
+            int num = cSim.pLincsNumConnections[pos];
+            for (int i = 0; i < num; i++)
+            {
+                int index = pos+i*cSim.lincsConstraints;
+                int otherConstraint = cSim.pLincsConnections[index];
+                rhs += cSim.pLincsCoupling[index]*rhs1[otherConstraint];
+            }
+            rhs2[pos] = rhs;
+            cSim.pLincsSolution[pos] += rhs;
+            pos += blockDim.x * gridDim.x;
+        }
+        kSyncAllThreads_kernel(&cSim.pSyncCounter[iteration%2 == 0 ? 0 : gridDim.x], iteration);
+    }
+
    // Update the atom positions based on the solution to the matrix equations.

    unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
@@ -78,29 +118,6 @@ __global__ void kUpdateAtomPositions_kernel(float4* atomPositions)
    }
 }

-__global__ void kIterateLincsMatrix_kernel(int iteration)
-{
-    // Perform one iteration of inverting the matrix.
-
-    float* rhs1 = (iteration%2 == 0 ? cSim.pLincsRhs1 : cSim.pLincsRhs2);
-    float* rhs2 = (iteration%2 == 0 ? cSim.pLincsRhs2 : cSim.pLincsRhs1);
-    unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
-    while (pos < cSim.lincsConstraints)
-    {
-        float rhs = 0.0f;
-        int num = cSim.pLincsNumConnections[pos];
-        for (int i = 0; i < num; i++)
-        {
-            int index = pos+i*cSim.lincsConstraints;
-            int otherConstraint = cSim.pLincsConnections[index];
-            rhs += cSim.pLincsCoupling[index]*rhs1[otherConstraint];
-        }
-        rhs2[pos] = rhs;
-        cSim.pLincsSolution[pos] += rhs;
-        pos += blockDim.x * gridDim.x;
-    }
-}
-
 __global__ void kApplyLincsPart1_kernel(float4* atomPositions, bool addOldPosition)
 {
   // Calculate the direction of each constraint, along with the initial RHS and solution vectors.
@@ -136,13 +153,11 @@ __global__ void kApplyLincsPart1_kernel(float4* atomPositions, bool addOldPositi
        cSim.pLincsSolution[pos] = diff;
        pos += blockDim.x * gridDim.x;
    }
-}
+    kSyncAllThreads_kernel(cSim.pSyncCounter, cSim.lincsTerms+1);

-__global__ void kApplyLincsPart2_kernel()
-{
    // Build the coupling matrix.

-    unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
+    pos = threadIdx.x + blockIdx.x * blockDim.x;
    while (pos < cSim.lincsConstraints)
    {
        float4 dir1 = cSim.pLincsDistance[pos];
@@ -163,7 +178,7 @@ __global__ void kApplyLincsPart2_kernel()
    }
 }

-__global__ void kApplyLincsPart3_kernel(float4* atomPositions, bool addOldPosition)
+__global__ void kApplyLincsPart2_kernel(float4* atomPositions, bool addOldPosition)
 {
    // Correct for rotational lengthening.

@@ -200,24 +215,12 @@ static void kApplyLincs(gpuContext gpu, float4* atomPositions, bool addOldPositi
 {
    kApplyLincsPart1_kernel<<<gpu->sim.blocks, gpu->sim.lincs_threads_per_block>>>(atomPositions, addOldPosition);
    LAUNCHERROR("kApplyLincsPart1");
-    kApplyLincsPart2_kernel<<<gpu->sim.blocks, gpu->sim.lincs_threads_per_block>>>();
+    kSolveLincsMatrix_kernel<<<gpu->sim.blocks, gpu->sim.lincs_threads_per_block>>>(atomPositions);
+    LAUNCHERROR("kSolveLincsMatrix_kernel");
+    kApplyLincsPart2_kernel<<<gpu->sim.blocks, gpu->sim.lincs_threads_per_block>>>(atomPositions, addOldPosition);
    LAUNCHERROR("kApplyLincsPart2");
-    for (int i = 0; i < gpu->sim.lincsTerms; ++i)
-    {
-        kIterateLincsMatrix_kernel<<<gpu->sim.blocks, gpu->sim.lincs_threads_per_block>>>(i);
-        LAUNCHERROR("kIterateLincsMatrix_kernel");
-    }
-    kUpdateAtomPositions_kernel<<<gpu->sim.blocks, gpu->sim.lincs_threads_per_block>>>(atomPositions);
-    LAUNCHERROR("kUpdateAtomPositions");
-    kApplyLincsPart3_kernel<<<gpu->sim.blocks, gpu->sim.lincs_threads_per_block>>>(atomPositions, addOldPosition);
-    LAUNCHERROR("kApplyLincsPart3");
-    for (int i = 0; i < gpu->sim.lincsTerms; ++i)
-    {
-        kIterateLincsMatrix_kernel<<<gpu->sim.blocks, gpu->sim.lincs_threads_per_block>>>(i);
-        LAUNCHERROR("kIterateLincsMatrix_kernel");
-    }
-    kUpdateAtomPositions_kernel<<<gpu->sim.blocks, gpu->sim.lincs_threads_per_block>>>(atomPositions);
-    LAUNCHERROR("kUpdateAtomPositions");
+    kSolveLincsMatrix_kernel<<<gpu->sim.blocks, gpu->sim.lincs_threads_per_block>>>(atomPositions);
+    LAUNCHERROR("kSolveLincsMatrix_kernel");
 }

 void kApplyFirstLincs(gpuContext gpu)

--- a/platforms/cuda/tests/TestCudaBrownianIntegrator.cpp
+++ b/platforms/cuda/tests/TestCudaBrownianIntegrator.cpp
@@ -165,7 +165,7 @@ void testConstraints() {
            Vec3 p1 = state.getPositions()[particle1];
            Vec3 p2 = state.getPositions()[particle2];
            double dist = std::sqrt((p1[0]-p2[0])*(p1[0]-p2[0])+(p1[1]-p2[1])*(p1[1]-p2[1])+(p1[2]-p2[2])*(p1[2]-p2[2]));
-            ASSERT_EQUAL_TOL(distance, dist, 2e-5);
+            ASSERT_EQUAL_TOL(distance, dist, 1e-4);
        }
        integrator.step(1);
    }

--- a/platforms/cuda/tests/TestCudaLangevinIntegrator.cpp
+++ b/platforms/cuda/tests/TestCudaLangevinIntegrator.cpp
@@ -170,7 +170,7 @@ void testConstraints() {
            Vec3 p1 = state.getPositions()[particle1];
            Vec3 p2 = state.getPositions()[particle2];
            double dist = std::sqrt((p1[0]-p2[0])*(p1[0]-p2[0])+(p1[1]-p2[1])*(p1[1]-p2[1])+(p1[2]-p2[2])*(p1[2]-p2[2]));
-            ASSERT_EQUAL_TOL(distance, dist, 2e-5);
+            ASSERT_EQUAL_TOL(distance, dist, 1e-4);
        }
        integrator.step(1);
    }

--- a/platforms/cuda/tests/TestCudaVerletIntegrator.cpp
+++ b/platforms/cuda/tests/TestCudaVerletIntegrator.cpp
@@ -127,7 +127,7 @@ void testConstraints() {
                Vec3 p1 = state.getPositions()[particle1];
                Vec3 p2 = state.getPositions()[particle2];
                double dist = std::sqrt((p1[0]-p2[0])*(p1[0]-p2[0])+(p1[1]-p2[1])*(p1[1]-p2[1])+(p1[2]-p2[2])*(p1[2]-p2[2]));
-                ASSERT_EQUAL_TOL(distance, dist, 2e-5);
+                ASSERT_EQUAL_TOL(distance, dist, 1e-4);
            }
        double energy = state.getKineticEnergy()+state.getPotentialEnergy();
        if (i == 1)