Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
805bba93
Commit
805bba93
authored
Oct 17, 2016
by
Peter Eastman
Browse files
Solve DIIS matrix on GPU
parent
388bf972
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
129 additions
and
47 deletions
+129
-47
plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
+15
-45
plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.h
plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.h
+3
-2
plugins/amoeba/platforms/cuda/src/kernels/multipoleInducedField.cu
...moeba/platforms/cuda/src/kernels/multipoleInducedField.cu
+111
-0
No files found.
plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
View file @
805bba93
...
...
@@ -52,10 +52,10 @@
using
namespace
OpenMM
;
using
namespace
std
;
#define CHECK_RESULT(result) \
#define CHECK_RESULT(result
, prefix
) \
if (result != CUDA_SUCCESS) { \
std::stringstream m; \
m<<
errorMessage
<<": "<<cu.getErrorString(result)<<" ("<<result<<")"<<" at "<<__FILE__<<":"<<__LINE__; \
m<<
prefix
<<": "<<cu.getErrorString(result)<<" ("<<result<<")"<<" at "<<__FILE__<<":"<<__LINE__; \
throw OpenMMException(m.str());\
}
...
...
@@ -813,7 +813,7 @@ private:
};
CudaCalcAmoebaMultipoleForceKernel
::
CudaCalcAmoebaMultipoleForceKernel
(
std
::
string
name
,
const
Platform
&
platform
,
CudaContext
&
cu
,
const
System
&
system
)
:
CalcAmoebaMultipoleForceKernel
(
name
,
platform
),
cu
(
cu
),
system
(
system
),
hasInitializedScaleFactors
(
false
),
hasInitializedFFT
(
false
),
multipolesAreValid
(
false
),
CalcAmoebaMultipoleForceKernel
(
name
,
platform
),
cu
(
cu
),
system
(
system
),
hasInitializedScaleFactors
(
false
),
hasInitializedFFT
(
false
),
multipolesAreValid
(
false
),
hasCreatedEvent
(
false
),
multipoleParticles
(
NULL
),
molecularDipoles
(
NULL
),
molecularQuadrupoles
(
NULL
),
labFrameDipoles
(
NULL
),
labFrameQuadrupoles
(
NULL
),
sphericalDipoles
(
NULL
),
sphericalQuadrupoles
(
NULL
),
fracDipoles
(
NULL
),
fracQuadrupoles
(
NULL
),
field
(
NULL
),
fieldPolar
(
NULL
),
inducedField
(
NULL
),
inducedFieldPolar
(
NULL
),
torque
(
NULL
),
dampingAndThole
(
NULL
),
inducedDipole
(
NULL
),
diisCoefficients
(
NULL
),
inducedDipolePolar
(
NULL
),
inducedDipoleErrors
(
NULL
),
prevDipoles
(
NULL
),
prevDipolesPolar
(
NULL
),
prevDipolesGk
(
NULL
),
...
...
@@ -933,6 +933,8 @@ CudaCalcAmoebaMultipoleForceKernel::~CudaCalcAmoebaMultipoleForceKernel() {
delete
sort
;
if
(
hasInitializedFFT
)
cufftDestroy
(
fft
);
if
(
hasCreatedEvent
)
cuEventDestroy
(
syncEvent
);
}
void
CudaCalcAmoebaMultipoleForceKernel
::
initialize
(
const
System
&
system
,
const
AmoebaMultipoleForce
&
force
)
{
...
...
@@ -1019,6 +1021,8 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
prevErrors
=
new
CudaArray
(
cu
,
3
*
numMultipoles
*
MaxPrevDIISDipoles
,
elementSize
,
"prevErrors"
);
diisMatrix
=
new
CudaArray
(
cu
,
MaxPrevDIISDipoles
*
MaxPrevDIISDipoles
,
elementSize
,
"diisMatrix"
);
diisCoefficients
=
new
CudaArray
(
cu
,
MaxPrevDIISDipoles
+
1
,
sizeof
(
float
),
"diisMatrix"
);
CHECK_RESULT
(
cuEventCreate
(
&
syncEvent
,
CU_EVENT_DISABLE_TIMING
),
"Error creating event for AmoebaMultipoleForce"
);
hasCreatedEvent
=
true
;
}
else
if
(
polarizationType
==
AmoebaMultipoleForce
::
Extrapolated
)
{
int
numOrders
=
force
.
getExtrapolationCoefficients
().
size
();
...
...
@@ -1210,6 +1214,7 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
updateInducedFieldKernel
=
cu
.
getKernel
(
module
,
"updateInducedFieldByDIIS"
);
recordDIISDipolesKernel
=
cu
.
getKernel
(
module
,
"recordInducedDipolesForDIIS"
);
buildMatrixKernel
=
cu
.
getKernel
(
module
,
"computeDIISMatrix"
);
solveMatrixKernel
=
cu
.
getKernel
(
module
,
"solveDIISMatrix"
);
initExtrapolatedKernel
=
cu
.
getKernel
(
module
,
"initExtrapolatedDipoles"
);
iterateExtrapolatedKernel
=
cu
.
getKernel
(
module
,
"iterateExtrapolatedDipoles"
);
computeExtrapolatedKernel
=
cu
.
getKernel
(
module
,
"computeExtrapolatedDipoles"
);
...
...
@@ -1820,6 +1825,7 @@ bool CudaCalcAmoebaMultipoleForceKernel::iterateDipolesByDIIS(int iteration) {
cu
.
executeKernel
(
recordDIISDipolesKernel
,
recordDIISDipolesArgs
,
cu
.
getNumThreadBlocks
()
*
cu
.
ThreadBlockSize
,
cu
.
ThreadBlockSize
,
cu
.
ThreadBlockSize
*
elementSize
*
2
);
float2
*
errors
=
(
float2
*
)
cu
.
getPinnedBuffer
();
inducedDipoleErrors
->
download
(
errors
,
false
);
cuEventRecord
(
syncEvent
,
cu
.
getCurrentStream
());
// Build the DIIS matrix.
...
...
@@ -1828,15 +1834,15 @@ bool CudaCalcAmoebaMultipoleForceKernel::iterateDipolesByDIIS(int iteration) {
int
threadBlocks
=
min
(
numPrev
,
cu
.
getNumThreadBlocks
());
int
blockSize
=
512
;
cu
.
executeKernel
(
buildMatrixKernel
,
buildMatrixArgs
,
threadBlocks
*
blockSize
,
blockSize
,
blockSize
*
elementSize
);
vector
<
float
>
matrixf
;
vector
<
double
>
matrix
;
if
(
cu
.
getUseDoublePrecision
())
diisMatrix
->
download
(
matrix
);
else
diisMatrix
->
download
(
matrixf
);
// Solve the matrix.
void
*
solveMatrixArgs
[]
=
{
&
iteration
,
&
diisMatrix
->
getDevicePointer
(),
&
diisCoefficients
->
getDevicePointer
()};
cu
.
executeKernel
(
solveMatrixKernel
,
solveMatrixArgs
,
32
,
32
);
// Determine whether the iteration has converged.
cuEventSynchronize
(
syncEvent
);
double
total1
=
0.0
,
total2
=
0.0
;
for
(
int
j
=
0
;
j
<
inducedDipoleErrors
->
getSize
();
j
++
)
{
total1
+=
errors
[
j
].
x
;
...
...
@@ -1844,42 +1850,6 @@ bool CudaCalcAmoebaMultipoleForceKernel::iterateDipolesByDIIS(int iteration) {
}
if
(
48.033324
*
sqrt
(
max
(
total1
,
total2
)
/
cu
.
getNumAtoms
())
<
inducedEpsilon
)
return
true
;
// Compute the coefficients for selecting the new dipoles.
float
*
coefficients
=
(
float
*
)
cu
.
getPinnedBuffer
();
if
(
iteration
==
0
)
coefficients
[
0
]
=
1
;
else
{
int
rank
=
numPrev
+
1
;
Array2D
<
double
>
b
(
rank
,
rank
);
b
[
0
][
0
]
=
0
;
for
(
int
i
=
1
;
i
<
rank
;
i
++
)
b
[
i
][
0
]
=
b
[
0
][
i
]
=
-
1
;
if
(
cu
.
getUseDoublePrecision
())
{
for
(
int
i
=
0
;
i
<
numPrev
;
i
++
)
for
(
int
j
=
0
;
j
<
numPrev
;
j
++
)
b
[
i
+
1
][
j
+
1
]
=
matrix
[
i
*
MaxPrevDIISDipoles
+
j
];
}
else
{
for
(
int
i
=
0
;
i
<
numPrev
;
i
++
)
for
(
int
j
=
0
;
j
<
numPrev
;
j
++
)
b
[
i
+
1
][
j
+
1
]
=
matrixf
[
i
*
MaxPrevDIISDipoles
+
j
];
}
// Solve using LU.
JAMA
::
LU
<
double
>
lu
(
b
);
Array1D
<
double
>
x
(
rank
,
0.0
);
x
[
0
]
=
-
1.0
;
Array1D
<
double
>
coeff
=
lu
.
solve
(
x
);
coefficients
[
rank
-
1
]
=
1.0
;
for
(
int
i
=
0
;
i
<
rank
-
1
;
i
++
)
{
coefficients
[
i
]
=
coeff
[
i
+
1
];
coefficients
[
rank
-
1
]
-=
coefficients
[
i
];
}
}
diisCoefficients
->
upload
(
coefficients
,
false
);
// Compute the dipoles.
...
...
plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.h
View file @
805bba93
...
...
@@ -408,7 +408,7 @@ private:
int
fixedFieldThreads
,
inducedFieldThreads
,
electrostaticsThreads
;
int
gridSizeX
,
gridSizeY
,
gridSizeZ
;
double
alpha
,
inducedEpsilon
;
bool
usePME
,
hasQuadrupoles
,
hasInitializedScaleFactors
,
hasInitializedFFT
,
multipolesAreValid
;
bool
usePME
,
hasQuadrupoles
,
hasInitializedScaleFactors
,
hasInitializedFFT
,
multipolesAreValid
,
hasCreatedEvent
;
AmoebaMultipoleForce
::
PolarizationType
polarizationType
;
CudaContext
&
cu
;
const
System
&
system
;
...
...
@@ -471,9 +471,10 @@ private:
CUfunction
computeMomentsKernel
,
recordInducedDipolesKernel
,
computeFixedFieldKernel
,
computeInducedFieldKernel
,
updateInducedFieldKernel
,
electrostaticsKernel
,
mapTorqueKernel
;
CUfunction
pmeSpreadFixedMultipolesKernel
,
pmeSpreadInducedDipolesKernel
,
pmeFinishSpreadChargeKernel
,
pmeConvolutionKernel
;
CUfunction
pmeFixedPotentialKernel
,
pmeInducedPotentialKernel
,
pmeFixedForceKernel
,
pmeInducedForceKernel
,
pmeRecordInducedFieldDipolesKernel
,
computePotentialKernel
;
CUfunction
recordDIISDipolesKernel
,
buildMatrixKernel
;
CUfunction
recordDIISDipolesKernel
,
buildMatrixKernel
,
solveMatrixKernel
;
CUfunction
initExtrapolatedKernel
,
iterateExtrapolatedKernel
,
computeExtrapolatedKernel
,
addExtrapolatedGradientKernel
;
CUfunction
pmeTransformMultipolesKernel
,
pmeTransformPotentialKernel
;
CUevent
syncEvent
;
CudaCalcAmoebaGeneralizedKirkwoodForceKernel
*
gkKernel
;
static
const
int
PmeOrder
=
5
;
static
const
int
MaxPrevDIISDipoles
=
20
;
...
...
plugins/amoeba/platforms/cuda/src/kernels/multipoleInducedField.cu
View file @
805bba93
...
...
@@ -694,6 +694,117 @@ extern "C" __global__ void computeDIISMatrix(real* __restrict__ prevErrors, int
}
}
extern
"C"
__global__
void
solveDIISMatrix
(
int
iteration
,
const
real
*
__restrict__
matrix
,
float
*
__restrict__
coefficients
)
{
__shared__
real
b
[
MAX_PREV_DIIS_DIPOLES
+
1
][
MAX_PREV_DIIS_DIPOLES
+
1
];
__shared__
real
piv
[
MAX_PREV_DIIS_DIPOLES
+
1
];
__shared__
real
x
[
MAX_PREV_DIIS_DIPOLES
+
1
];
// On the first iteration we don't need to do any calculation.
if
(
iteration
==
0
)
{
if
(
threadIdx
.
x
==
0
)
coefficients
[
0
]
=
1
;
return
;
}
// Load the matrix.
int
numPrev
=
min
(
iteration
+
1
,
MAX_PREV_DIIS_DIPOLES
);
int
rank
=
numPrev
+
1
;
for
(
int
index
=
threadIdx
.
x
;
index
<
numPrev
*
numPrev
;
index
+=
blockDim
.
x
)
{
int
i
=
index
/
numPrev
;
int
j
=
index
-
i
*
numPrev
;
b
[
i
+
1
][
j
+
1
]
=
matrix
[
i
*
MAX_PREV_DIIS_DIPOLES
+
j
];
}
for
(
int
i
=
threadIdx
.
x
;
i
<
rank
;
i
+=
blockDim
.
x
)
{
b
[
i
][
0
]
=
-
1
;
piv
[
i
]
=
i
;
}
__syncthreads
();
// Compute the mean absolute value of the values we just loaded. We use that for preconditioning it,
// which is essential for doing the computation in single precision.
if
(
threadIdx
.
x
==
0
)
{
real
mean
=
0
;
for
(
int
i
=
0
;
i
<
numPrev
;
i
++
)
for
(
int
j
=
0
;
j
<
numPrev
;
j
++
)
mean
+=
b
[
i
+
1
][
j
+
1
];
mean
/=
numPrev
*
numPrev
;
b
[
0
][
0
]
=
0
;
for
(
int
i
=
1
;
i
<
rank
;
i
++
)
b
[
0
][
i
]
=
-
mean
;
// Compute the LU decomposition of the matrix. This code is adapted from JAMA.
int
pivsign
=
1
;
for
(
int
j
=
0
;
j
<
rank
;
j
++
)
{
// Apply previous transformations.
for
(
int
i
=
0
;
i
<
rank
;
i
++
)
{
// Most of the time is spent in the following dot product.
int
kmax
=
min
(
i
,
j
);
real
s
=
0
;
for
(
int
k
=
0
;
k
<
kmax
;
k
++
)
s
+=
b
[
i
][
k
]
*
b
[
k
][
j
];
b
[
i
][
j
]
-=
s
;
}
// Find pivot and exchange if necessary.
int
p
=
j
;
for
(
int
i
=
j
+
1
;
i
<
rank
;
i
++
)
if
(
abs
(
b
[
i
][
j
])
>
abs
(
b
[
p
][
j
]))
p
=
i
;
if
(
p
!=
j
)
{
int
k
=
0
;
for
(
k
=
0
;
k
<
rank
;
k
++
)
{
real
t
=
b
[
p
][
k
];
b
[
p
][
k
]
=
b
[
j
][
k
];
b
[
j
][
k
]
=
t
;
}
k
=
piv
[
p
];
piv
[
p
]
=
piv
[
j
];
piv
[
j
]
=
k
;
pivsign
=
-
pivsign
;
}
// Compute multipliers.
if
((
j
<
rank
)
&&
(
b
[
j
][
j
]
!=
0
))
for
(
int
i
=
j
+
1
;
i
<
rank
;
i
++
)
b
[
i
][
j
]
/=
b
[
j
][
j
];
}
// Solve b*Y = X(piv)
for
(
int
i
=
0
;
i
<
rank
;
i
++
)
x
[
i
]
=
(
piv
[
i
]
==
0
?
-
1
:
0
);
for
(
int
k
=
0
;
k
<
rank
;
k
++
)
for
(
int
i
=
k
+
1
;
i
<
rank
;
i
++
)
x
[
i
]
-=
x
[
k
]
*
b
[
i
][
k
];
// Solve U*X = Y;
for
(
int
k
=
rank
-
1
;
k
>=
0
;
k
--
)
{
x
[
k
]
/=
b
[
k
][
k
];
for
(
int
i
=
0
;
i
<
k
;
i
++
)
x
[
i
]
-=
x
[
k
]
*
b
[
i
][
k
];
}
// Record the coefficients.
real
lastCoeff
=
1
;
for
(
int
i
=
0
;
i
<
rank
-
1
;
i
++
)
{
real
c
=
x
[
i
+
1
]
*
mean
;
coefficients
[
i
]
=
c
;
lastCoeff
-=
c
;
}
coefficients
[
rank
-
1
]
=
lastCoeff
;
}
}
extern
"C"
__global__
void
updateInducedFieldByDIIS
(
real
*
__restrict__
inducedDipole
,
real
*
__restrict__
inducedDipolePolar
,
const
real
*
__restrict__
prevDipoles
,
const
real
*
__restrict__
prevDipolesPolar
,
const
float
*
__restrict__
coefficients
,
int
numPrev
)
{
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
3
*
NUM_ATOMS
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment