Solve DIIS matrix on GPU

805bba93 · Peter Eastman · 388bf972 · 805bba93 · 805bba93 · 805bba93
Commit 805bba93 authored Oct 17, 2016 by Peter Eastman
3 changed files
--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
@@ -52,10 +52,10 @@
 using namespace OpenMM;
 using namespace std;

-#define CHECK_RESULT(result) \
+#define CHECK_RESULT(result, prefix) \
    if (result != CUDA_SUCCESS) { \
        std::stringstream m; \
-        m<<errorMessage<<": "<<cu.getErrorString(result)<<" ("<<result<<")"<<" at "<<__FILE__<<":"<<__LINE__; \
+        m<<prefix<<": "<<cu.getErrorString(result)<<" ("<<result<<")"<<" at "<<__FILE__<<":"<<__LINE__; \
        throw OpenMMException(m.str());\
    }

@@ -813,7 +813,7 @@ private:
 };

 CudaCalcAmoebaMultipoleForceKernel::CudaCalcAmoebaMultipoleForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) : 
-        CalcAmoebaMultipoleForceKernel(name, platform), cu(cu), system(system), hasInitializedScaleFactors(false), hasInitializedFFT(false), multipolesAreValid(false),
+        CalcAmoebaMultipoleForceKernel(name, platform), cu(cu), system(system), hasInitializedScaleFactors(false), hasInitializedFFT(false), multipolesAreValid(false), hasCreatedEvent(false),
        multipoleParticles(NULL), molecularDipoles(NULL), molecularQuadrupoles(NULL), labFrameDipoles(NULL), labFrameQuadrupoles(NULL), sphericalDipoles(NULL), sphericalQuadrupoles(NULL),
        fracDipoles(NULL), fracQuadrupoles(NULL), field(NULL), fieldPolar(NULL), inducedField(NULL), inducedFieldPolar(NULL), torque(NULL), dampingAndThole(NULL), inducedDipole(NULL),
        diisCoefficients(NULL), inducedDipolePolar(NULL), inducedDipoleErrors(NULL), prevDipoles(NULL), prevDipolesPolar(NULL), prevDipolesGk(NULL),
@@ -933,6 +933,8 @@ CudaCalcAmoebaMultipoleForceKernel::~CudaCalcAmoebaMultipoleForceKernel() {
        delete sort;
    if (hasInitializedFFT)
        cufftDestroy(fft);
+    if (hasCreatedEvent)
+        cuEventDestroy(syncEvent);
 }

 void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const AmoebaMultipoleForce& force) {
@@ -1019,6 +1021,8 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
        prevErrors = new CudaArray(cu, 3*numMultipoles*MaxPrevDIISDipoles, elementSize, "prevErrors");
        diisMatrix = new CudaArray(cu, MaxPrevDIISDipoles*MaxPrevDIISDipoles, elementSize, "diisMatrix");
        diisCoefficients = new CudaArray(cu, MaxPrevDIISDipoles+1, sizeof(float), "diisMatrix");
+        CHECK_RESULT(cuEventCreate(&syncEvent, CU_EVENT_DISABLE_TIMING), "Error creating event for AmoebaMultipoleForce");
+        hasCreatedEvent = true;
    }
    else if (polarizationType == AmoebaMultipoleForce::Extrapolated) {
        int numOrders = force.getExtrapolationCoefficients().size();
@@ -1210,6 +1214,7 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
        updateInducedFieldKernel = cu.getKernel(module, "updateInducedFieldByDIIS");
        recordDIISDipolesKernel = cu.getKernel(module, "recordInducedDipolesForDIIS");
        buildMatrixKernel = cu.getKernel(module, "computeDIISMatrix");
+        solveMatrixKernel = cu.getKernel(module, "solveDIISMatrix");
        initExtrapolatedKernel = cu.getKernel(module, "initExtrapolatedDipoles");
        iterateExtrapolatedKernel = cu.getKernel(module, "iterateExtrapolatedDipoles");
        computeExtrapolatedKernel = cu.getKernel(module, "computeExtrapolatedDipoles");
@@ -1820,6 +1825,7 @@ bool CudaCalcAmoebaMultipoleForceKernel::iterateDipolesByDIIS(int iteration) {
    cu.executeKernel(recordDIISDipolesKernel, recordDIISDipolesArgs, cu.getNumThreadBlocks()*cu.ThreadBlockSize, cu.ThreadBlockSize, cu.ThreadBlockSize*elementSize*2);
    float2* errors = (float2*) cu.getPinnedBuffer();
    inducedDipoleErrors->download(errors, false);
+    cuEventRecord(syncEvent, cu.getCurrentStream());
    
    // Build the DIIS matrix.
    
@@ -1828,15 +1834,15 @@ bool CudaCalcAmoebaMultipoleForceKernel::iterateDipolesByDIIS(int iteration) {
    int threadBlocks = min(numPrev, cu.getNumThreadBlocks());
    int blockSize = 512;
    cu.executeKernel(buildMatrixKernel, buildMatrixArgs, threadBlocks*blockSize, blockSize, blockSize*elementSize);
-    vector<float> matrixf;
-    vector<double> matrix;
-    if (cu.getUseDoublePrecision())
-        diisMatrix->download(matrix);
-    else
-        diisMatrix->download(matrixf);
+    
+    // Solve the matrix.
+
+    void* solveMatrixArgs[] = {&iteration, &diisMatrix->getDevicePointer(), &diisCoefficients->getDevicePointer()};
+    cu.executeKernel(solveMatrixKernel, solveMatrixArgs, 32, 32);
    
    // Determine whether the iteration has converged.
    
+    cuEventSynchronize(syncEvent);
    double total1 = 0.0, total2 = 0.0;
    for (int j = 0; j < inducedDipoleErrors->getSize(); j++) {
        total1 += errors[j].x;
@@ -1844,42 +1850,6 @@ bool CudaCalcAmoebaMultipoleForceKernel::iterateDipolesByDIIS(int iteration) {
    }
    if (48.033324*sqrt(max(total1, total2)/cu.getNumAtoms()) < inducedEpsilon)
        return true;
-
-    // Compute the coefficients for selecting the new dipoles.
-
-    float* coefficients = (float*) cu.getPinnedBuffer();
-    if (iteration == 0)
-        coefficients[0] = 1;
-    else {
-        int rank = numPrev+1;
-        Array2D<double> b(rank, rank);
-        b[0][0] = 0;
-        for (int i = 1; i < rank; i++)
-            b[i][0] = b[0][i] = -1;
-        if (cu.getUseDoublePrecision()) {
-            for (int i = 0; i < numPrev; i++)
-                for (int j = 0; j < numPrev; j++)
-                    b[i+1][j+1] = matrix[i*MaxPrevDIISDipoles+j];
-        }
-        else {
-            for (int i = 0; i < numPrev; i++)
-                for (int j = 0; j < numPrev; j++)
-                    b[i+1][j+1] = matrixf[i*MaxPrevDIISDipoles+j];
-        }
-
-        // Solve using LU.
-
-        JAMA::LU<double> lu(b);
-        Array1D<double> x(rank, 0.0);
-        x[0] = -1.0;
-        Array1D<double> coeff = lu.solve(x);
-        coefficients[rank-1] = 1.0;
-        for (int i = 0; i < rank-1; i++) {
-            coefficients[i] = coeff[i+1];
-            coefficients[rank-1] -= coefficients[i];
-        }
-    }
-    diisCoefficients->upload(coefficients, false);
    
    // Compute the dipoles.
    

--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.h
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.h
@@ -408,7 +408,7 @@ private:
    int fixedFieldThreads, inducedFieldThreads, electrostaticsThreads;
    int gridSizeX, gridSizeY, gridSizeZ;
    double alpha, inducedEpsilon;
-    bool usePME, hasQuadrupoles, hasInitializedScaleFactors, hasInitializedFFT, multipolesAreValid;
+    bool usePME, hasQuadrupoles, hasInitializedScaleFactors, hasInitializedFFT, multipolesAreValid, hasCreatedEvent;
    AmoebaMultipoleForce::PolarizationType polarizationType;
    CudaContext& cu;
    const System& system;
@@ -471,9 +471,10 @@ private:
    CUfunction computeMomentsKernel, recordInducedDipolesKernel, computeFixedFieldKernel, computeInducedFieldKernel, updateInducedFieldKernel, electrostaticsKernel, mapTorqueKernel;
    CUfunction pmeSpreadFixedMultipolesKernel, pmeSpreadInducedDipolesKernel, pmeFinishSpreadChargeKernel, pmeConvolutionKernel;
    CUfunction pmeFixedPotentialKernel, pmeInducedPotentialKernel, pmeFixedForceKernel, pmeInducedForceKernel, pmeRecordInducedFieldDipolesKernel, computePotentialKernel;
-    CUfunction recordDIISDipolesKernel, buildMatrixKernel;
+    CUfunction recordDIISDipolesKernel, buildMatrixKernel, solveMatrixKernel;
    CUfunction initExtrapolatedKernel, iterateExtrapolatedKernel, computeExtrapolatedKernel, addExtrapolatedGradientKernel;
    CUfunction pmeTransformMultipolesKernel, pmeTransformPotentialKernel;
+    CUevent syncEvent;
    CudaCalcAmoebaGeneralizedKirkwoodForceKernel* gkKernel;
    static const int PmeOrder = 5;
    static const int MaxPrevDIISDipoles = 20;

--- a/plugins/amoeba/platforms/cuda/src/kernels/multipoleInducedField.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/multipoleInducedField.cu
@@ -694,6 +694,117 @@ extern "C" __global__ void computeDIISMatrix(real* __restrict__ prevErrors, int
    }
 }

+extern "C" __global__ void solveDIISMatrix(int iteration, const real* __restrict__ matrix, float* __restrict__ coefficients) {
+    __shared__ real b[MAX_PREV_DIIS_DIPOLES+1][MAX_PREV_DIIS_DIPOLES+1];
+    __shared__ real piv[MAX_PREV_DIIS_DIPOLES+1];
+    __shared__ real x[MAX_PREV_DIIS_DIPOLES+1];
+
+    // On the first iteration we don't need to do any calculation.
+    
+    if (iteration == 0) {
+        if (threadIdx.x == 0)
+            coefficients[0] = 1;
+        return;
+    }
+    
+    // Load the matrix.
+    
+    int numPrev = min(iteration+1, MAX_PREV_DIIS_DIPOLES);
+    int rank = numPrev+1;
+    for (int index = threadIdx.x; index < numPrev*numPrev; index += blockDim.x) {
+        int i = index/numPrev;
+        int j = index-i*numPrev;
+        b[i+1][j+1] = matrix[i*MAX_PREV_DIIS_DIPOLES+j];
+    }
+    for (int i = threadIdx.x; i < rank; i += blockDim.x) {
+        b[i][0] = -1;
+        piv[i] = i;
+    }
+    __syncthreads();
+    
+    // Compute the mean absolute value of the values we just loaded.  We use that for preconditioning it,
+    // which is essential for doing the computation in single precision.
+    
+    if (threadIdx.x == 0) {
+        real mean = 0;
+        for (int i = 0; i < numPrev; i++)
+            for (int j = 0; j < numPrev; j++)
+                mean += b[i+1][j+1];
+        mean /= numPrev*numPrev;
+        b[0][0] = 0;
+        for (int i = 1; i < rank; i++)
+            b[0][i] = -mean;
+
+        // Compute the LU decomposition of the matrix.  This code is adapted from JAMA.
+    
+        int pivsign = 1;
+        for (int j = 0; j < rank; j++) {
+            // Apply previous transformations.
+
+            for (int i = 0; i < rank; i++) {
+                // Most of the time is spent in the following dot product.
+
+                int kmax = min(i, j);
+                real s = 0;
+                for (int k = 0; k < kmax; k++)
+                    s += b[i][k] * b[k][j];
+                b[i][j] -= s;
+            }
+
+            // Find pivot and exchange if necessary.
+
+            int p = j;
+            for (int i = j+1; i < rank; i++)
+                if (abs(b[i][j]) > abs(b[p][j]))
+                    p = i;
+            if (p != j) {
+                int k = 0;
+                for (k = 0; k < rank; k++) {
+                    real t = b[p][k];
+                    b[p][k] = b[j][k];
+                    b[j][k] = t;
+                }
+                k = piv[p];
+                piv[p] = piv[j];
+                piv[j] = k;
+                pivsign = -pivsign;
+            }
+
+            // Compute multipliers.
+
+            if ((j < rank) && (b[j][j] != 0))
+                for (int i = j+1; i < rank; i++)
+                    b[i][j] /= b[j][j];
+        }
+
+        // Solve b*Y = X(piv)
+        
+        for (int i = 0; i < rank; i++) 
+            x[i] = (piv[i] == 0 ? -1 : 0);
+        for (int k = 0; k < rank; k++)
+            for (int i = k+1; i < rank; i++)
+                x[i] -= x[k] * b[i][k];
+
+        // Solve U*X = Y;
+        
+        for (int k = rank-1; k >= 0; k--) {
+            x[k] /= b[k][k];
+            for (int i = 0; i < k; i++)
+                x[i] -= x[k] * b[i][k];
+        }
+        
+        // Record the coefficients.
+        
+        real lastCoeff = 1;
+        for (int i = 0; i < rank-1; i++) {
+            real c = x[i+1]*mean;
+            coefficients[i] = c;
+            lastCoeff -= c;
+        }
+        coefficients[rank-1] = lastCoeff;
+    }
+}
+
 extern "C" __global__ void updateInducedFieldByDIIS(real* __restrict__ inducedDipole, real* __restrict__ inducedDipolePolar, 
        const real* __restrict__ prevDipoles, const real* __restrict__ prevDipolesPolar, const float* __restrict__ coefficients, int numPrev) {
    for (int index = blockIdx.x*blockDim.x + threadIdx.x; index < 3*NUM_ATOMS; index += blockDim.x*gridDim.x) {