"wrappers/python/src/vscode:/vscode.git/clone" did not exist on "743c4c876dd63727e23d9758a16712ea22f588d2"
Commit bbbc3160 authored by peastman's avatar peastman
Browse files

Optimization to building DIIS matrix

parent a346a6db
......@@ -978,7 +978,7 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
prevDipoles = new CudaArray(cu, 3*numMultipoles*MaxPrevDIISDipoles, elementSize, "prevDipoles");
prevDipolesPolar = new CudaArray(cu, 3*numMultipoles*MaxPrevDIISDipoles, elementSize, "prevDipolesPolar");
prevErrors = new CudaArray(cu, 3*numMultipoles*MaxPrevDIISDipoles, elementSize, "prevErrors");
diisMatrix = new CudaArray(cu, (MaxPrevDIISDipoles+1)*(MaxPrevDIISDipoles+1), elementSize, "diisMatrix");
diisMatrix = new CudaArray(cu, MaxPrevDIISDipoles*MaxPrevDIISDipoles, elementSize, "diisMatrix");
diisCoefficients = new CudaArray(cu, MaxPrevDIISDipoles+1, sizeof(float), "diisMatrix");
cu.addAutoclearBuffer(*field);
cu.addAutoclearBuffer(*fieldPolar);
......@@ -1625,33 +1625,37 @@ double CudaCalcAmoebaMultipoleForceKernel::iterateDipolesByDIIS(int iteration) {
void* recordDIISDipolesGkArgs[] = {&field->getDevicePointer(), &fieldPolar->getDevicePointer(), &gkKernel->getField()->getDevicePointer(), &gkKernel->getInducedField()->getDevicePointer(),
&gkKernel->getInducedFieldPolar()->getDevicePointer(), &gkKernel->getInducedDipoles()->getDevicePointer(), &gkKernel->getInducedDipolesPolar()->getDevicePointer(),
&polarizability->getDevicePointer(), &inducedDipoleErrors->getDevicePointer(), &prevDipolesGk->getDevicePointer(),
&prevDipolesGkPolar->getDevicePointer(), &prevErrors->getDevicePointer(), &iteration, &falseValue};
&prevDipolesGkPolar->getDevicePointer(), &prevErrors->getDevicePointer(), &iteration, &falseValue, &diisMatrix->getDevicePointer()};
cu.executeKernel(recordDIISDipolesKernel, recordDIISDipolesGkArgs, cu.getNumThreadBlocks()*cu.ThreadBlockSize, cu.ThreadBlockSize, cu.ThreadBlockSize*elementSize*2);
}
void* recordDIISDipolesArgs[] = {&field->getDevicePointer(), &fieldPolar->getDevicePointer(), &npt, &inducedField->getDevicePointer(),
&inducedFieldPolar->getDevicePointer(), &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(),
&polarizability->getDevicePointer(), &inducedDipoleErrors->getDevicePointer(), &prevDipoles->getDevicePointer(),
&prevDipolesPolar->getDevicePointer(), &prevErrors->getDevicePointer(), &iteration, &trueValue};
&prevDipolesPolar->getDevicePointer(), &prevErrors->getDevicePointer(), &iteration, &trueValue, &diisMatrix->getDevicePointer()};
cu.executeKernel(recordDIISDipolesKernel, recordDIISDipolesArgs, cu.getNumThreadBlocks()*cu.ThreadBlockSize, cu.ThreadBlockSize, cu.ThreadBlockSize*elementSize*2);
float2* errors = (float2*) cu.getPinnedBuffer();
inducedDipoleErrors->download(errors, false);
// Determine the coefficients for selecting the new dipoles.
vector<float> coefficients(MaxPrevDIISDipoles);
int numPrev = (iteration+1 < MaxPrevDIISDipoles ? iteration+1 : MaxPrevDIISDipoles);
void* buildMatrixArgs[] = {&prevErrors->getDevicePointer(), &iteration, &diisMatrix->getDevicePointer()};
int threadBlocks = min(numPrev, cu.getNumThreadBlocks());
cu.executeKernel(buildMatrixKernel, buildMatrixArgs, threadBlocks*128, 128, 128*elementSize);
vector<float> coefficients(MaxPrevDIISDipoles);
if (iteration == 0)
coefficients[0] = 1;
else {
void* buildMatrixArgs[] = {&prevErrors->getDevicePointer(), &iteration, &diisMatrix->getDevicePointer()};
cu.executeKernel(buildMatrixKernel, buildMatrixArgs, cu.getNumThreadBlocks()*128, 128, 128*elementSize);
vector<float> matrix;
diisMatrix->download(matrix);
int rank = numPrev+1;
Array2D<double> b(rank, rank);
for (int i = 0; i < rank; i++)
for (int j = 0; j < rank; j++)
b[i][j] = matrix[i*rank+j];
b[0][0] = 0;
for (int i = 1; i < rank; i++)
b[i][0] = b[0][i] = -1;
for (int i = 0; i < numPrev; i++)
for (int j = 0; j < numPrev; j++)
b[i+1][j+1] = matrix[i*MaxPrevDIISDipoles+j];
// Solve using SVD. Since the right hand side is (-1, 0, 0, 0, ...), this is simpler than the general case.
......
......@@ -499,7 +499,7 @@ extern "C" __global__ void updateInducedFieldBySOR(const long long* __restrict__
extern "C" __global__ void recordInducedDipolesForDIIS(const long long* __restrict__ fixedField, const long long* __restrict__ fixedFieldPolar,
const long long* __restrict__ fixedFieldS, const long long* __restrict__ inducedField, const long long* __restrict__ inducedFieldPolar,
const real* __restrict__ inducedDipole, const real* __restrict__ inducedDipolePolar, const float* __restrict__ polarizability, float2* __restrict__ errors,
real* __restrict__ prevDipoles, real* __restrict__ prevDipolesPolar, real* __restrict__ prevErrors, int iteration, bool recordPrevErrors) {
real* __restrict__ prevDipoles, real* __restrict__ prevDipolesPolar, real* __restrict__ prevErrors, int iteration, bool recordPrevErrors, real* __restrict__ matrix) {
extern __shared__ real2 buffer[];
#ifdef USE_EWALD
const real ewaldScale = (4/(real) 3)*(EWALD_ALPHA*EWALD_ALPHA*EWALD_ALPHA)/SQRT_PI;
......@@ -557,29 +557,27 @@ extern "C" __global__ void recordInducedDipolesForDIIS(const long long* __restri
}
if (threadIdx.x == 0)
errors[blockIdx.x] = make_float2((float) buffer[0].x, (float) buffer[0].y);
if (iteration >= MAX_PREV_DIIS_DIPOLES && recordPrevErrors && blockIdx.x == 0) {
// Shift over the existing matrix elements.
for (int i = 0; i < MAX_PREV_DIIS_DIPOLES-1; i++) {
if (threadIdx.x < MAX_PREV_DIIS_DIPOLES-1)
matrix[threadIdx.x+i*MAX_PREV_DIIS_DIPOLES] = matrix[(threadIdx.x+1)+(i+1)*MAX_PREV_DIIS_DIPOLES];
__syncthreads();
}
}
}
extern "C" __global__ void computeDIISMatrix(real* __restrict__ prevErrors, int iteration, real* __restrict__ matrix) {
extern __shared__ real sumBuffer[];
int rank = min(iteration+1, MAX_PREV_DIIS_DIPOLES)+1;
for (int element = blockIdx.x; element < rank*rank; element += gridDim.x) {
int j = min(iteration, MAX_PREV_DIIS_DIPOLES-1);
for (int i = blockIdx.x; i <= j; i += gridDim.x) {
// All the threads in this thread block work together to compute a single matrix element.
int i = element/rank;
int j = element-i*rank;
if (i > j)
continue;
real value;
if (i == 0 && j == 0)
value = 0;
else if (i == 0 || j == 0)
value = -1;
else {
// Compute the inner product of the two error vectors.
real sum = 0;
for (int index = threadIdx.x; index < NUM_ATOMS*3; index += blockDim.x)
sum += prevErrors[index+(i-1)*NUM_ATOMS*3]*prevErrors[index+(j-1)*NUM_ATOMS*3];
sum += prevErrors[index+i*NUM_ATOMS*3]*prevErrors[index+j*NUM_ATOMS*3];
sumBuffer[threadIdx.x] = sum;
__syncthreads();
for (int offset = 1; offset < blockDim.x; offset *= 2) {
......@@ -587,13 +585,10 @@ extern "C" __global__ void computeDIISMatrix(real* __restrict__ prevErrors, int
sumBuffer[threadIdx.x] += sumBuffer[threadIdx.x+offset];
__syncthreads();
}
value = sumBuffer[0];
}
__syncthreads();
if (threadIdx.x == 0) {
matrix[element] = value;
matrix[i+MAX_PREV_DIIS_DIPOLES*j] = sumBuffer[0];
if (i != j)
matrix[j*rank+i] = value;
matrix[j+MAX_PREV_DIIS_DIPOLES*i] = sumBuffer[0];
}
}
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment