Merge pull request #1582 from peastman/diisopt

Optimizations to DIIS

Merge pull request #1582 from peastman/diisopt
Optimizations to DIIS
e38c6907 · peastman · GitHub · d027c825 · 319018ed · e38c6907
Commit e38c6907 authored Sep 07, 2016 by peastman Committed by GitHub Sep 07, 2016
Showing with 15 additions and 18 deletions

libraries/jama/include/tnt_sparse_matrix_csr.h libraries/jama/include/tnt_sparse_matrix_csr.h +2 -2

plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp +13 -16

No files found.
--- a/libraries/jama/include/tnt_sparse_matrix_csr.h
+++ b/libraries/jama/include/tnt_sparse_matrix_csr.h
@@ -59,7 +59,7 @@ public:
 	Sparse_Matrix_CompRow(const Sparse_Matrix_CompRow &S);
 	Sparse_Matrix_CompRow(int M, int N, int nz, const T *val, 
-						const int *r, const int *c);
+						int *r, int *c);
@@ -93,7 +93,7 @@ public:
 */
 template <class T>
 Sparse_Matrix_CompRow<T>::Sparse_Matrix_CompRow(int M, int N, int nz,
-	const T *val, const int *r, const int *c) : val_(nz,val), 
+	const T *val, int *r, int *c) : val_(nz,val), 
 		rowptr_(M, r), colind_(nz, c), dim1_(M), dim2_(N) {}

--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
@@ -41,7 +41,7 @@
 #include "CudaForceInfo.h"
 #include "CudaKernelSources.h"
 #include "CudaNonbondedUtilities.h"
-#include "jama_svd.h"
+#include "jama_lu.h"
 #include <algorithm>
 #include <cmath>
@@ -1837,7 +1837,8 @@ bool CudaCalcAmoebaMultipoleForceKernel::iterateDipolesByDIIS(int iteration) {
    int numPrev = (iteration+1 < MaxPrevDIISDipoles ? iteration+1 : MaxPrevDIISDipoles);
    void* buildMatrixArgs[] = {&prevErrors->getDevicePointer(), &iteration, &diisMatrix->getDevicePointer()};
    int threadBlocks = min(numPrev, cu.getNumThreadBlocks());
-    cu.executeKernel(buildMatrixKernel, buildMatrixArgs, threadBlocks*128, 128, 128*elementSize);
+    int blockSize = 512;
+    cu.executeKernel(buildMatrixKernel, buildMatrixArgs, threadBlocks*blockSize, blockSize, blockSize*elementSize);
    vector<float> matrixf;
    vector<double> matrix;
    if (cu.getUseDoublePrecision())
@@ -1877,20 +1878,16 @@ bool CudaCalcAmoebaMultipoleForceKernel::iterateDipolesByDIIS(int iteration) {
                    b[i+1][j+1] = matrixf[i*MaxPrevDIISDipoles+j];
        }
-        // Solve using SVD.  Since the right hand side is (-1, 0, 0, 0, ...), this is simpler than the general case.
+        // Solve using LU.
-        JAMA::SVD<double> svd(b);
+        JAMA::LU<double> lu(b);
-        Array2D<double> u, v;
+        Array1D<double> x(rank, 0.0);
-        svd.getU(u);
+        x[0] = -1.0;
-        svd.getV(v);
+        Array1D<double> coeff = lu.solve(x);
-        Array1D<double> s;
+        coefficients[rank-1] = 1.0;
-        svd.getSingularValues(s);
+        for (int i = 0; i < rank-1; i++) {
-        int effectiveRank = svd.rank();
+            coefficients[i] = coeff[i+1];
-        for (int i = 1; i < rank; i++) {
+            coefficients[rank-1] -= coefficients[i];
-            double d = 0;
-            for (int j = 0; j < effectiveRank; j++)
-                d -= u[0][j]*v[i][j]/s[j];
-            coefficients[i-1] = d;
        }
    }
    diisCoefficients->upload(coefficients, false);