Optimization to building DIIS matrix

bbbc3160 · peastman · a346a6db · bbbc3160 · bbbc3160
Commit bbbc3160 authored Jul 21, 2014 by peastman
2 changed files
--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
@@ -978,7 +978,7 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
    prevDipoles = new CudaArray(cu, 3*numMultipoles*MaxPrevDIISDipoles, elementSize, "prevDipoles");
    prevDipolesPolar = new CudaArray(cu, 3*numMultipoles*MaxPrevDIISDipoles, elementSize, "prevDipolesPolar");
    prevErrors = new CudaArray(cu, 3*numMultipoles*MaxPrevDIISDipoles, elementSize, "prevErrors");
-    diisMatrix = new CudaArray(cu, (MaxPrevDIISDipoles+1)*(MaxPrevDIISDipoles+1), elementSize, "diisMatrix");
+    diisMatrix = new CudaArray(cu, MaxPrevDIISDipoles*MaxPrevDIISDipoles, elementSize, "diisMatrix");
    diisCoefficients = new CudaArray(cu, MaxPrevDIISDipoles+1, sizeof(float), "diisMatrix");
    cu.addAutoclearBuffer(*field);
    cu.addAutoclearBuffer(*fieldPolar);
@@ -1625,33 +1625,37 @@ double CudaCalcAmoebaMultipoleForceKernel::iterateDipolesByDIIS(int iteration) {
        void* recordDIISDipolesGkArgs[] = {&field->getDevicePointer(), &fieldPolar->getDevicePointer(), &gkKernel->getField()->getDevicePointer(), &gkKernel->getInducedField()->getDevicePointer(),
            &gkKernel->getInducedFieldPolar()->getDevicePointer(), &gkKernel->getInducedDipoles()->getDevicePointer(), &gkKernel->getInducedDipolesPolar()->getDevicePointer(), 
            &polarizability->getDevicePointer(), &inducedDipoleErrors->getDevicePointer(), &prevDipolesGk->getDevicePointer(),
-            &prevDipolesGkPolar->getDevicePointer(), &prevErrors->getDevicePointer(), &iteration, &falseValue};
+            &prevDipolesGkPolar->getDevicePointer(), &prevErrors->getDevicePointer(), &iteration, &falseValue, &diisMatrix->getDevicePointer()};
        cu.executeKernel(recordDIISDipolesKernel, recordDIISDipolesGkArgs, cu.getNumThreadBlocks()*cu.ThreadBlockSize, cu.ThreadBlockSize, cu.ThreadBlockSize*elementSize*2);
    }
    void* recordDIISDipolesArgs[] = {&field->getDevicePointer(), &fieldPolar->getDevicePointer(), &npt, &inducedField->getDevicePointer(),
        &inducedFieldPolar->getDevicePointer(), &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(),
        &polarizability->getDevicePointer(), &inducedDipoleErrors->getDevicePointer(), &prevDipoles->getDevicePointer(),
-        &prevDipolesPolar->getDevicePointer(), &prevErrors->getDevicePointer(), &iteration, &trueValue};
+        &prevDipolesPolar->getDevicePointer(), &prevErrors->getDevicePointer(), &iteration, &trueValue, &diisMatrix->getDevicePointer()};
    cu.executeKernel(recordDIISDipolesKernel, recordDIISDipolesArgs, cu.getNumThreadBlocks()*cu.ThreadBlockSize, cu.ThreadBlockSize, cu.ThreadBlockSize*elementSize*2);
    float2* errors = (float2*) cu.getPinnedBuffer();
    inducedDipoleErrors->download(errors, false);
    
    // Determine the coefficients for selecting the new dipoles.
    
-    vector<float> coefficients(MaxPrevDIISDipoles);
    int numPrev = (iteration+1 < MaxPrevDIISDipoles ? iteration+1 : MaxPrevDIISDipoles);
+    void* buildMatrixArgs[] = {&prevErrors->getDevicePointer(), &iteration, &diisMatrix->getDevicePointer()};
+    int threadBlocks = min(numPrev, cu.getNumThreadBlocks());
+    cu.executeKernel(buildMatrixKernel, buildMatrixArgs, threadBlocks*128, 128, 128*elementSize);
+    vector<float> coefficients(MaxPrevDIISDipoles);
    if (iteration == 0)
        coefficients[0] = 1;
    else {
-        void* buildMatrixArgs[] = {&prevErrors->getDevicePointer(), &iteration, &diisMatrix->getDevicePointer()};
-        cu.executeKernel(buildMatrixKernel, buildMatrixArgs, cu.getNumThreadBlocks()*128, 128, 128*elementSize);
        vector<float> matrix;
        diisMatrix->download(matrix);
        int rank = numPrev+1;
        Array2D<double> b(rank, rank);
-        for (int i = 0; i < rank; i++)
-            for (int j = 0; j < rank; j++)
-                b[i][j] = matrix[i*rank+j];
+        b[0][0] = 0;
+        for (int i = 1; i < rank; i++)
+            b[i][0] = b[0][i] = -1;
+        for (int i = 0; i < numPrev; i++)
+            for (int j = 0; j < numPrev; j++)
+                b[i+1][j+1] = matrix[i*MaxPrevDIISDipoles+j];

        // Solve using SVD.  Since the right hand side is (-1, 0, 0, 0, ...), this is simpler than the general case.


--- a/plugins/amoeba/platforms/cuda/src/kernels/multipoleInducedField.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/multipoleInducedField.cu
@@ -499,7 +499,7 @@ extern "C" __global__ void updateInducedFieldBySOR(const long long* __restrict__
 extern "C" __global__ void recordInducedDipolesForDIIS(const long long* __restrict__ fixedField, const long long* __restrict__ fixedFieldPolar,
        const long long* __restrict__ fixedFieldS, const long long* __restrict__ inducedField, const long long* __restrict__ inducedFieldPolar,
        const real* __restrict__ inducedDipole, const real* __restrict__ inducedDipolePolar, const float* __restrict__ polarizability, float2* __restrict__ errors,
-        real* __restrict__ prevDipoles, real* __restrict__ prevDipolesPolar, real* __restrict__ prevErrors, int iteration, bool recordPrevErrors) {
+        real* __restrict__ prevDipoles, real* __restrict__ prevDipolesPolar, real* __restrict__ prevErrors, int iteration, bool recordPrevErrors, real* __restrict__ matrix) {
    extern __shared__ real2 buffer[];
 #ifdef USE_EWALD
    const real ewaldScale = (4/(real) 3)*(EWALD_ALPHA*EWALD_ALPHA*EWALD_ALPHA)/SQRT_PI;
@@ -557,29 +557,27 @@ extern "C" __global__ void recordInducedDipolesForDIIS(const long long* __restri
    }
    if (threadIdx.x == 0)
        errors[blockIdx.x] = make_float2((float) buffer[0].x, (float) buffer[0].y);
+    
+    if (iteration >= MAX_PREV_DIIS_DIPOLES && recordPrevErrors && blockIdx.x == 0) {
+        // Shift over the existing matrix elements.
+        
+        for (int i = 0; i < MAX_PREV_DIIS_DIPOLES-1; i++) {
+            if (threadIdx.x < MAX_PREV_DIIS_DIPOLES-1)
+                matrix[threadIdx.x+i*MAX_PREV_DIIS_DIPOLES] = matrix[(threadIdx.x+1)+(i+1)*MAX_PREV_DIIS_DIPOLES];
+            __syncthreads();
+        }
+    }
 }

 extern "C" __global__ void computeDIISMatrix(real* __restrict__ prevErrors, int iteration, real* __restrict__ matrix) {
    extern __shared__ real sumBuffer[];
-    int rank = min(iteration+1, MAX_PREV_DIIS_DIPOLES)+1;
-    for (int element = blockIdx.x; element < rank*rank; element += gridDim.x) {
+    int j = min(iteration, MAX_PREV_DIIS_DIPOLES-1);
+    for (int i = blockIdx.x; i <= j; i += gridDim.x) {
        // All the threads in this thread block work together to compute a single matrix element.

-        int i = element/rank;
-        int j = element-i*rank;
-        if (i > j)
-            continue;
-        real value;
-        if (i == 0 && j == 0)
-            value = 0;
-        else if (i == 0 || j == 0)
-            value = -1;
-        else {
-            // Compute the inner product of the two error vectors.
-            
        real sum = 0;
        for (int index = threadIdx.x; index < NUM_ATOMS*3; index += blockDim.x)
-                sum += prevErrors[index+(i-1)*NUM_ATOMS*3]*prevErrors[index+(j-1)*NUM_ATOMS*3];
+            sum += prevErrors[index+i*NUM_ATOMS*3]*prevErrors[index+j*NUM_ATOMS*3];
        sumBuffer[threadIdx.x] = sum;
        __syncthreads();
        for (int offset = 1; offset < blockDim.x; offset *= 2) { 
@@ -587,13 +585,10 @@ extern "C" __global__ void computeDIISMatrix(real* __restrict__ prevErrors, int
                sumBuffer[threadIdx.x] += sumBuffer[threadIdx.x+offset];
            __syncthreads();
        }
-            value = sumBuffer[0];
-        }
-        __syncthreads();
        if (threadIdx.x == 0) {
-            matrix[element] = value;
+            matrix[i+MAX_PREV_DIIS_DIPOLES*j] = sumBuffer[0];
            if (i != j)
-                matrix[j*rank+i] = value;
+                matrix[j+MAX_PREV_DIIS_DIPOLES*i] = sumBuffer[0];
        }
    }
 }