Commit 805bba93 authored by Peter Eastman's avatar Peter Eastman
Browse files

Solve DIIS matrix on GPU

parent 388bf972
......@@ -52,10 +52,10 @@
using namespace OpenMM;
using namespace std;
#define CHECK_RESULT(result) \
#define CHECK_RESULT(result, prefix) \
if (result != CUDA_SUCCESS) { \
std::stringstream m; \
m<<errorMessage<<": "<<cu.getErrorString(result)<<" ("<<result<<")"<<" at "<<__FILE__<<":"<<__LINE__; \
m<<prefix<<": "<<cu.getErrorString(result)<<" ("<<result<<")"<<" at "<<__FILE__<<":"<<__LINE__; \
throw OpenMMException(m.str());\
}
......@@ -813,7 +813,7 @@ private:
};
CudaCalcAmoebaMultipoleForceKernel::CudaCalcAmoebaMultipoleForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) :
CalcAmoebaMultipoleForceKernel(name, platform), cu(cu), system(system), hasInitializedScaleFactors(false), hasInitializedFFT(false), multipolesAreValid(false),
CalcAmoebaMultipoleForceKernel(name, platform), cu(cu), system(system), hasInitializedScaleFactors(false), hasInitializedFFT(false), multipolesAreValid(false), hasCreatedEvent(false),
multipoleParticles(NULL), molecularDipoles(NULL), molecularQuadrupoles(NULL), labFrameDipoles(NULL), labFrameQuadrupoles(NULL), sphericalDipoles(NULL), sphericalQuadrupoles(NULL),
fracDipoles(NULL), fracQuadrupoles(NULL), field(NULL), fieldPolar(NULL), inducedField(NULL), inducedFieldPolar(NULL), torque(NULL), dampingAndThole(NULL), inducedDipole(NULL),
diisCoefficients(NULL), inducedDipolePolar(NULL), inducedDipoleErrors(NULL), prevDipoles(NULL), prevDipolesPolar(NULL), prevDipolesGk(NULL),
......@@ -933,6 +933,8 @@ CudaCalcAmoebaMultipoleForceKernel::~CudaCalcAmoebaMultipoleForceKernel() {
delete sort;
if (hasInitializedFFT)
cufftDestroy(fft);
if (hasCreatedEvent)
cuEventDestroy(syncEvent);
}
void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const AmoebaMultipoleForce& force) {
......@@ -1019,6 +1021,8 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
prevErrors = new CudaArray(cu, 3*numMultipoles*MaxPrevDIISDipoles, elementSize, "prevErrors");
diisMatrix = new CudaArray(cu, MaxPrevDIISDipoles*MaxPrevDIISDipoles, elementSize, "diisMatrix");
diisCoefficients = new CudaArray(cu, MaxPrevDIISDipoles+1, sizeof(float), "diisMatrix");
CHECK_RESULT(cuEventCreate(&syncEvent, CU_EVENT_DISABLE_TIMING), "Error creating event for AmoebaMultipoleForce");
hasCreatedEvent = true;
}
else if (polarizationType == AmoebaMultipoleForce::Extrapolated) {
int numOrders = force.getExtrapolationCoefficients().size();
......@@ -1210,6 +1214,7 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
updateInducedFieldKernel = cu.getKernel(module, "updateInducedFieldByDIIS");
recordDIISDipolesKernel = cu.getKernel(module, "recordInducedDipolesForDIIS");
buildMatrixKernel = cu.getKernel(module, "computeDIISMatrix");
solveMatrixKernel = cu.getKernel(module, "solveDIISMatrix");
initExtrapolatedKernel = cu.getKernel(module, "initExtrapolatedDipoles");
iterateExtrapolatedKernel = cu.getKernel(module, "iterateExtrapolatedDipoles");
computeExtrapolatedKernel = cu.getKernel(module, "computeExtrapolatedDipoles");
......@@ -1820,6 +1825,7 @@ bool CudaCalcAmoebaMultipoleForceKernel::iterateDipolesByDIIS(int iteration) {
cu.executeKernel(recordDIISDipolesKernel, recordDIISDipolesArgs, cu.getNumThreadBlocks()*cu.ThreadBlockSize, cu.ThreadBlockSize, cu.ThreadBlockSize*elementSize*2);
float2* errors = (float2*) cu.getPinnedBuffer();
inducedDipoleErrors->download(errors, false);
cuEventRecord(syncEvent, cu.getCurrentStream());
// Build the DIIS matrix.
......@@ -1828,15 +1834,15 @@ bool CudaCalcAmoebaMultipoleForceKernel::iterateDipolesByDIIS(int iteration) {
int threadBlocks = min(numPrev, cu.getNumThreadBlocks());
int blockSize = 512;
cu.executeKernel(buildMatrixKernel, buildMatrixArgs, threadBlocks*blockSize, blockSize, blockSize*elementSize);
vector<float> matrixf;
vector<double> matrix;
if (cu.getUseDoublePrecision())
diisMatrix->download(matrix);
else
diisMatrix->download(matrixf);
// Solve the matrix.
void* solveMatrixArgs[] = {&iteration, &diisMatrix->getDevicePointer(), &diisCoefficients->getDevicePointer()};
cu.executeKernel(solveMatrixKernel, solveMatrixArgs, 32, 32);
// Determine whether the iteration has converged.
cuEventSynchronize(syncEvent);
double total1 = 0.0, total2 = 0.0;
for (int j = 0; j < inducedDipoleErrors->getSize(); j++) {
total1 += errors[j].x;
......@@ -1844,42 +1850,6 @@ bool CudaCalcAmoebaMultipoleForceKernel::iterateDipolesByDIIS(int iteration) {
}
if (48.033324*sqrt(max(total1, total2)/cu.getNumAtoms()) < inducedEpsilon)
return true;
// Compute the coefficients for selecting the new dipoles.
float* coefficients = (float*) cu.getPinnedBuffer();
if (iteration == 0)
coefficients[0] = 1;
else {
int rank = numPrev+1;
Array2D<double> b(rank, rank);
b[0][0] = 0;
for (int i = 1; i < rank; i++)
b[i][0] = b[0][i] = -1;
if (cu.getUseDoublePrecision()) {
for (int i = 0; i < numPrev; i++)
for (int j = 0; j < numPrev; j++)
b[i+1][j+1] = matrix[i*MaxPrevDIISDipoles+j];
}
else {
for (int i = 0; i < numPrev; i++)
for (int j = 0; j < numPrev; j++)
b[i+1][j+1] = matrixf[i*MaxPrevDIISDipoles+j];
}
// Solve using LU.
JAMA::LU<double> lu(b);
Array1D<double> x(rank, 0.0);
x[0] = -1.0;
Array1D<double> coeff = lu.solve(x);
coefficients[rank-1] = 1.0;
for (int i = 0; i < rank-1; i++) {
coefficients[i] = coeff[i+1];
coefficients[rank-1] -= coefficients[i];
}
}
diisCoefficients->upload(coefficients, false);
// Compute the dipoles.
......
......@@ -408,7 +408,7 @@ private:
int fixedFieldThreads, inducedFieldThreads, electrostaticsThreads;
int gridSizeX, gridSizeY, gridSizeZ;
double alpha, inducedEpsilon;
bool usePME, hasQuadrupoles, hasInitializedScaleFactors, hasInitializedFFT, multipolesAreValid;
bool usePME, hasQuadrupoles, hasInitializedScaleFactors, hasInitializedFFT, multipolesAreValid, hasCreatedEvent;
AmoebaMultipoleForce::PolarizationType polarizationType;
CudaContext& cu;
const System& system;
......@@ -471,9 +471,10 @@ private:
CUfunction computeMomentsKernel, recordInducedDipolesKernel, computeFixedFieldKernel, computeInducedFieldKernel, updateInducedFieldKernel, electrostaticsKernel, mapTorqueKernel;
CUfunction pmeSpreadFixedMultipolesKernel, pmeSpreadInducedDipolesKernel, pmeFinishSpreadChargeKernel, pmeConvolutionKernel;
CUfunction pmeFixedPotentialKernel, pmeInducedPotentialKernel, pmeFixedForceKernel, pmeInducedForceKernel, pmeRecordInducedFieldDipolesKernel, computePotentialKernel;
CUfunction recordDIISDipolesKernel, buildMatrixKernel;
CUfunction recordDIISDipolesKernel, buildMatrixKernel, solveMatrixKernel;
CUfunction initExtrapolatedKernel, iterateExtrapolatedKernel, computeExtrapolatedKernel, addExtrapolatedGradientKernel;
CUfunction pmeTransformMultipolesKernel, pmeTransformPotentialKernel;
CUevent syncEvent;
CudaCalcAmoebaGeneralizedKirkwoodForceKernel* gkKernel;
static const int PmeOrder = 5;
static const int MaxPrevDIISDipoles = 20;
......
......@@ -694,6 +694,117 @@ extern "C" __global__ void computeDIISMatrix(real* __restrict__ prevErrors, int
}
}
extern "C" __global__ void solveDIISMatrix(int iteration, const real* __restrict__ matrix, float* __restrict__ coefficients) {
__shared__ real b[MAX_PREV_DIIS_DIPOLES+1][MAX_PREV_DIIS_DIPOLES+1];
__shared__ real piv[MAX_PREV_DIIS_DIPOLES+1];
__shared__ real x[MAX_PREV_DIIS_DIPOLES+1];
// On the first iteration we don't need to do any calculation.
if (iteration == 0) {
if (threadIdx.x == 0)
coefficients[0] = 1;
return;
}
// Load the matrix.
int numPrev = min(iteration+1, MAX_PREV_DIIS_DIPOLES);
int rank = numPrev+1;
for (int index = threadIdx.x; index < numPrev*numPrev; index += blockDim.x) {
int i = index/numPrev;
int j = index-i*numPrev;
b[i+1][j+1] = matrix[i*MAX_PREV_DIIS_DIPOLES+j];
}
for (int i = threadIdx.x; i < rank; i += blockDim.x) {
b[i][0] = -1;
piv[i] = i;
}
__syncthreads();
// Compute the mean absolute value of the values we just loaded. We use that for preconditioning it,
// which is essential for doing the computation in single precision.
if (threadIdx.x == 0) {
real mean = 0;
for (int i = 0; i < numPrev; i++)
for (int j = 0; j < numPrev; j++)
mean += b[i+1][j+1];
mean /= numPrev*numPrev;
b[0][0] = 0;
for (int i = 1; i < rank; i++)
b[0][i] = -mean;
// Compute the LU decomposition of the matrix. This code is adapted from JAMA.
int pivsign = 1;
for (int j = 0; j < rank; j++) {
// Apply previous transformations.
for (int i = 0; i < rank; i++) {
// Most of the time is spent in the following dot product.
int kmax = min(i, j);
real s = 0;
for (int k = 0; k < kmax; k++)
s += b[i][k] * b[k][j];
b[i][j] -= s;
}
// Find pivot and exchange if necessary.
int p = j;
for (int i = j+1; i < rank; i++)
if (abs(b[i][j]) > abs(b[p][j]))
p = i;
if (p != j) {
int k = 0;
for (k = 0; k < rank; k++) {
real t = b[p][k];
b[p][k] = b[j][k];
b[j][k] = t;
}
k = piv[p];
piv[p] = piv[j];
piv[j] = k;
pivsign = -pivsign;
}
// Compute multipliers.
if ((j < rank) && (b[j][j] != 0))
for (int i = j+1; i < rank; i++)
b[i][j] /= b[j][j];
}
// Solve b*Y = X(piv)
for (int i = 0; i < rank; i++)
x[i] = (piv[i] == 0 ? -1 : 0);
for (int k = 0; k < rank; k++)
for (int i = k+1; i < rank; i++)
x[i] -= x[k] * b[i][k];
// Solve U*X = Y;
for (int k = rank-1; k >= 0; k--) {
x[k] /= b[k][k];
for (int i = 0; i < k; i++)
x[i] -= x[k] * b[i][k];
}
// Record the coefficients.
real lastCoeff = 1;
for (int i = 0; i < rank-1; i++) {
real c = x[i+1]*mean;
coefficients[i] = c;
lastCoeff -= c;
}
coefficients[rank-1] = lastCoeff;
}
}
extern "C" __global__ void updateInducedFieldByDIIS(real* __restrict__ inducedDipole, real* __restrict__ inducedDipolePolar,
const real* __restrict__ prevDipoles, const real* __restrict__ prevDipolesPolar, const float* __restrict__ coefficients, int numPrev) {
for (int index = blockIdx.x*blockDim.x + threadIdx.x; index < 3*NUM_ATOMS; index += blockDim.x*gridDim.x) {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment