Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
bbbc3160
Commit
bbbc3160
authored
Jul 21, 2014
by
peastman
Browse files
Optimization to building DIIS matrix
parent
a346a6db
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
37 additions
and
38 deletions
+37
-38
plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
+13
-9
plugins/amoeba/platforms/cuda/src/kernels/multipoleInducedField.cu
...moeba/platforms/cuda/src/kernels/multipoleInducedField.cu
+24
-29
No files found.
plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
View file @
bbbc3160
...
@@ -978,7 +978,7 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
...
@@ -978,7 +978,7 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
prevDipoles
=
new
CudaArray
(
cu
,
3
*
numMultipoles
*
MaxPrevDIISDipoles
,
elementSize
,
"prevDipoles"
);
prevDipoles
=
new
CudaArray
(
cu
,
3
*
numMultipoles
*
MaxPrevDIISDipoles
,
elementSize
,
"prevDipoles"
);
prevDipolesPolar
=
new
CudaArray
(
cu
,
3
*
numMultipoles
*
MaxPrevDIISDipoles
,
elementSize
,
"prevDipolesPolar"
);
prevDipolesPolar
=
new
CudaArray
(
cu
,
3
*
numMultipoles
*
MaxPrevDIISDipoles
,
elementSize
,
"prevDipolesPolar"
);
prevErrors
=
new
CudaArray
(
cu
,
3
*
numMultipoles
*
MaxPrevDIISDipoles
,
elementSize
,
"prevErrors"
);
prevErrors
=
new
CudaArray
(
cu
,
3
*
numMultipoles
*
MaxPrevDIISDipoles
,
elementSize
,
"prevErrors"
);
diisMatrix
=
new
CudaArray
(
cu
,
(
MaxPrevDIISDipoles
+
1
)
*
(
MaxPrevDIISDipoles
+
1
)
,
elementSize
,
"diisMatrix"
);
diisMatrix
=
new
CudaArray
(
cu
,
MaxPrevDIISDipoles
*
MaxPrevDIISDipoles
,
elementSize
,
"diisMatrix"
);
diisCoefficients
=
new
CudaArray
(
cu
,
MaxPrevDIISDipoles
+
1
,
sizeof
(
float
),
"diisMatrix"
);
diisCoefficients
=
new
CudaArray
(
cu
,
MaxPrevDIISDipoles
+
1
,
sizeof
(
float
),
"diisMatrix"
);
cu
.
addAutoclearBuffer
(
*
field
);
cu
.
addAutoclearBuffer
(
*
field
);
cu
.
addAutoclearBuffer
(
*
fieldPolar
);
cu
.
addAutoclearBuffer
(
*
fieldPolar
);
...
@@ -1625,33 +1625,37 @@ double CudaCalcAmoebaMultipoleForceKernel::iterateDipolesByDIIS(int iteration) {
...
@@ -1625,33 +1625,37 @@ double CudaCalcAmoebaMultipoleForceKernel::iterateDipolesByDIIS(int iteration) {
void
*
recordDIISDipolesGkArgs
[]
=
{
&
field
->
getDevicePointer
(),
&
fieldPolar
->
getDevicePointer
(),
&
gkKernel
->
getField
()
->
getDevicePointer
(),
&
gkKernel
->
getInducedField
()
->
getDevicePointer
(),
void
*
recordDIISDipolesGkArgs
[]
=
{
&
field
->
getDevicePointer
(),
&
fieldPolar
->
getDevicePointer
(),
&
gkKernel
->
getField
()
->
getDevicePointer
(),
&
gkKernel
->
getInducedField
()
->
getDevicePointer
(),
&
gkKernel
->
getInducedFieldPolar
()
->
getDevicePointer
(),
&
gkKernel
->
getInducedDipoles
()
->
getDevicePointer
(),
&
gkKernel
->
getInducedDipolesPolar
()
->
getDevicePointer
(),
&
gkKernel
->
getInducedFieldPolar
()
->
getDevicePointer
(),
&
gkKernel
->
getInducedDipoles
()
->
getDevicePointer
(),
&
gkKernel
->
getInducedDipolesPolar
()
->
getDevicePointer
(),
&
polarizability
->
getDevicePointer
(),
&
inducedDipoleErrors
->
getDevicePointer
(),
&
prevDipolesGk
->
getDevicePointer
(),
&
polarizability
->
getDevicePointer
(),
&
inducedDipoleErrors
->
getDevicePointer
(),
&
prevDipolesGk
->
getDevicePointer
(),
&
prevDipolesGkPolar
->
getDevicePointer
(),
&
prevErrors
->
getDevicePointer
(),
&
iteration
,
&
falseValue
};
&
prevDipolesGkPolar
->
getDevicePointer
(),
&
prevErrors
->
getDevicePointer
(),
&
iteration
,
&
falseValue
,
&
diisMatrix
->
getDevicePointer
()
};
cu
.
executeKernel
(
recordDIISDipolesKernel
,
recordDIISDipolesGkArgs
,
cu
.
getNumThreadBlocks
()
*
cu
.
ThreadBlockSize
,
cu
.
ThreadBlockSize
,
cu
.
ThreadBlockSize
*
elementSize
*
2
);
cu
.
executeKernel
(
recordDIISDipolesKernel
,
recordDIISDipolesGkArgs
,
cu
.
getNumThreadBlocks
()
*
cu
.
ThreadBlockSize
,
cu
.
ThreadBlockSize
,
cu
.
ThreadBlockSize
*
elementSize
*
2
);
}
}
void
*
recordDIISDipolesArgs
[]
=
{
&
field
->
getDevicePointer
(),
&
fieldPolar
->
getDevicePointer
(),
&
npt
,
&
inducedField
->
getDevicePointer
(),
void
*
recordDIISDipolesArgs
[]
=
{
&
field
->
getDevicePointer
(),
&
fieldPolar
->
getDevicePointer
(),
&
npt
,
&
inducedField
->
getDevicePointer
(),
&
inducedFieldPolar
->
getDevicePointer
(),
&
inducedDipole
->
getDevicePointer
(),
&
inducedDipolePolar
->
getDevicePointer
(),
&
inducedFieldPolar
->
getDevicePointer
(),
&
inducedDipole
->
getDevicePointer
(),
&
inducedDipolePolar
->
getDevicePointer
(),
&
polarizability
->
getDevicePointer
(),
&
inducedDipoleErrors
->
getDevicePointer
(),
&
prevDipoles
->
getDevicePointer
(),
&
polarizability
->
getDevicePointer
(),
&
inducedDipoleErrors
->
getDevicePointer
(),
&
prevDipoles
->
getDevicePointer
(),
&
prevDipolesPolar
->
getDevicePointer
(),
&
prevErrors
->
getDevicePointer
(),
&
iteration
,
&
trueValue
};
&
prevDipolesPolar
->
getDevicePointer
(),
&
prevErrors
->
getDevicePointer
(),
&
iteration
,
&
trueValue
,
&
diisMatrix
->
getDevicePointer
()
};
cu
.
executeKernel
(
recordDIISDipolesKernel
,
recordDIISDipolesArgs
,
cu
.
getNumThreadBlocks
()
*
cu
.
ThreadBlockSize
,
cu
.
ThreadBlockSize
,
cu
.
ThreadBlockSize
*
elementSize
*
2
);
cu
.
executeKernel
(
recordDIISDipolesKernel
,
recordDIISDipolesArgs
,
cu
.
getNumThreadBlocks
()
*
cu
.
ThreadBlockSize
,
cu
.
ThreadBlockSize
,
cu
.
ThreadBlockSize
*
elementSize
*
2
);
float2
*
errors
=
(
float2
*
)
cu
.
getPinnedBuffer
();
float2
*
errors
=
(
float2
*
)
cu
.
getPinnedBuffer
();
inducedDipoleErrors
->
download
(
errors
,
false
);
inducedDipoleErrors
->
download
(
errors
,
false
);
// Determine the coefficients for selecting the new dipoles.
// Determine the coefficients for selecting the new dipoles.
vector
<
float
>
coefficients
(
MaxPrevDIISDipoles
);
int
numPrev
=
(
iteration
+
1
<
MaxPrevDIISDipoles
?
iteration
+
1
:
MaxPrevDIISDipoles
);
int
numPrev
=
(
iteration
+
1
<
MaxPrevDIISDipoles
?
iteration
+
1
:
MaxPrevDIISDipoles
);
void
*
buildMatrixArgs
[]
=
{
&
prevErrors
->
getDevicePointer
(),
&
iteration
,
&
diisMatrix
->
getDevicePointer
()};
int
threadBlocks
=
min
(
numPrev
,
cu
.
getNumThreadBlocks
());
cu
.
executeKernel
(
buildMatrixKernel
,
buildMatrixArgs
,
threadBlocks
*
128
,
128
,
128
*
elementSize
);
vector
<
float
>
coefficients
(
MaxPrevDIISDipoles
);
if
(
iteration
==
0
)
if
(
iteration
==
0
)
coefficients
[
0
]
=
1
;
coefficients
[
0
]
=
1
;
else
{
else
{
void
*
buildMatrixArgs
[]
=
{
&
prevErrors
->
getDevicePointer
(),
&
iteration
,
&
diisMatrix
->
getDevicePointer
()};
cu
.
executeKernel
(
buildMatrixKernel
,
buildMatrixArgs
,
cu
.
getNumThreadBlocks
()
*
128
,
128
,
128
*
elementSize
);
vector
<
float
>
matrix
;
vector
<
float
>
matrix
;
diisMatrix
->
download
(
matrix
);
diisMatrix
->
download
(
matrix
);
int
rank
=
numPrev
+
1
;
int
rank
=
numPrev
+
1
;
Array2D
<
double
>
b
(
rank
,
rank
);
Array2D
<
double
>
b
(
rank
,
rank
);
for
(
int
i
=
0
;
i
<
rank
;
i
++
)
b
[
0
][
0
]
=
0
;
for
(
int
j
=
0
;
j
<
rank
;
j
++
)
for
(
int
i
=
1
;
i
<
rank
;
i
++
)
b
[
i
][
j
]
=
matrix
[
i
*
rank
+
j
];
b
[
i
][
0
]
=
b
[
0
][
i
]
=
-
1
;
for
(
int
i
=
0
;
i
<
numPrev
;
i
++
)
for
(
int
j
=
0
;
j
<
numPrev
;
j
++
)
b
[
i
+
1
][
j
+
1
]
=
matrix
[
i
*
MaxPrevDIISDipoles
+
j
];
// Solve using SVD. Since the right hand side is (-1, 0, 0, 0, ...), this is simpler than the general case.
// Solve using SVD. Since the right hand side is (-1, 0, 0, 0, ...), this is simpler than the general case.
...
...
plugins/amoeba/platforms/cuda/src/kernels/multipoleInducedField.cu
View file @
bbbc3160
...
@@ -499,7 +499,7 @@ extern "C" __global__ void updateInducedFieldBySOR(const long long* __restrict__
...
@@ -499,7 +499,7 @@ extern "C" __global__ void updateInducedFieldBySOR(const long long* __restrict__
extern
"C"
__global__
void
recordInducedDipolesForDIIS
(
const
long
long
*
__restrict__
fixedField
,
const
long
long
*
__restrict__
fixedFieldPolar
,
extern
"C"
__global__
void
recordInducedDipolesForDIIS
(
const
long
long
*
__restrict__
fixedField
,
const
long
long
*
__restrict__
fixedFieldPolar
,
const
long
long
*
__restrict__
fixedFieldS
,
const
long
long
*
__restrict__
inducedField
,
const
long
long
*
__restrict__
inducedFieldPolar
,
const
long
long
*
__restrict__
fixedFieldS
,
const
long
long
*
__restrict__
inducedField
,
const
long
long
*
__restrict__
inducedFieldPolar
,
const
real
*
__restrict__
inducedDipole
,
const
real
*
__restrict__
inducedDipolePolar
,
const
float
*
__restrict__
polarizability
,
float2
*
__restrict__
errors
,
const
real
*
__restrict__
inducedDipole
,
const
real
*
__restrict__
inducedDipolePolar
,
const
float
*
__restrict__
polarizability
,
float2
*
__restrict__
errors
,
real
*
__restrict__
prevDipoles
,
real
*
__restrict__
prevDipolesPolar
,
real
*
__restrict__
prevErrors
,
int
iteration
,
bool
recordPrevErrors
)
{
real
*
__restrict__
prevDipoles
,
real
*
__restrict__
prevDipolesPolar
,
real
*
__restrict__
prevErrors
,
int
iteration
,
bool
recordPrevErrors
,
real
*
__restrict__
matrix
)
{
extern
__shared__
real2
buffer
[];
extern
__shared__
real2
buffer
[];
#ifdef USE_EWALD
#ifdef USE_EWALD
const
real
ewaldScale
=
(
4
/
(
real
)
3
)
*
(
EWALD_ALPHA
*
EWALD_ALPHA
*
EWALD_ALPHA
)
/
SQRT_PI
;
const
real
ewaldScale
=
(
4
/
(
real
)
3
)
*
(
EWALD_ALPHA
*
EWALD_ALPHA
*
EWALD_ALPHA
)
/
SQRT_PI
;
...
@@ -557,43 +557,38 @@ extern "C" __global__ void recordInducedDipolesForDIIS(const long long* __restri
...
@@ -557,43 +557,38 @@ extern "C" __global__ void recordInducedDipolesForDIIS(const long long* __restri
}
}
if
(
threadIdx
.
x
==
0
)
if
(
threadIdx
.
x
==
0
)
errors
[
blockIdx
.
x
]
=
make_float2
((
float
)
buffer
[
0
].
x
,
(
float
)
buffer
[
0
].
y
);
errors
[
blockIdx
.
x
]
=
make_float2
((
float
)
buffer
[
0
].
x
,
(
float
)
buffer
[
0
].
y
);
if
(
iteration
>=
MAX_PREV_DIIS_DIPOLES
&&
recordPrevErrors
&&
blockIdx
.
x
==
0
)
{
// Shift over the existing matrix elements.
for
(
int
i
=
0
;
i
<
MAX_PREV_DIIS_DIPOLES
-
1
;
i
++
)
{
if
(
threadIdx
.
x
<
MAX_PREV_DIIS_DIPOLES
-
1
)
matrix
[
threadIdx
.
x
+
i
*
MAX_PREV_DIIS_DIPOLES
]
=
matrix
[(
threadIdx
.
x
+
1
)
+
(
i
+
1
)
*
MAX_PREV_DIIS_DIPOLES
];
__syncthreads
();
}
}
}
}
extern
"C"
__global__
void
computeDIISMatrix
(
real
*
__restrict__
prevErrors
,
int
iteration
,
real
*
__restrict__
matrix
)
{
extern
"C"
__global__
void
computeDIISMatrix
(
real
*
__restrict__
prevErrors
,
int
iteration
,
real
*
__restrict__
matrix
)
{
extern
__shared__
real
sumBuffer
[];
extern
__shared__
real
sumBuffer
[];
int
rank
=
min
(
iteration
+
1
,
MAX_PREV_DIIS_DIPOLES
)
+
1
;
int
j
=
min
(
iteration
,
MAX_PREV_DIIS_DIPOLES
-
1
)
;
for
(
int
element
=
blockIdx
.
x
;
element
<
rank
*
rank
;
element
+=
gridDim
.
x
)
{
for
(
int
i
=
blockIdx
.
x
;
i
<=
j
;
i
+=
gridDim
.
x
)
{
// All the threads in this thread block work together to compute a single matrix element.
// All the threads in this thread block work together to compute a single matrix element.
int
i
=
element
/
rank
;
real
sum
=
0
;
int
j
=
element
-
i
*
rank
;
for
(
int
index
=
threadIdx
.
x
;
index
<
NUM_ATOMS
*
3
;
index
+=
blockDim
.
x
)
if
(
i
>
j
)
sum
+=
prevErrors
[
index
+
i
*
NUM_ATOMS
*
3
]
*
prevErrors
[
index
+
j
*
NUM_ATOMS
*
3
];
continue
;
sumBuffer
[
threadIdx
.
x
]
=
sum
;
real
value
;
__syncthreads
();
if
(
i
==
0
&&
j
==
0
)
for
(
int
offset
=
1
;
offset
<
blockDim
.
x
;
offset
*=
2
)
{
value
=
0
;
if
(
threadIdx
.
x
+
offset
<
blockDim
.
x
&&
(
threadIdx
.
x
&
(
2
*
offset
-
1
))
==
0
)
else
if
(
i
==
0
||
j
==
0
)
sumBuffer
[
threadIdx
.
x
]
+=
sumBuffer
[
threadIdx
.
x
+
offset
];
value
=
-
1
;
else
{
// Compute the inner product of the two error vectors.
real
sum
=
0
;
for
(
int
index
=
threadIdx
.
x
;
index
<
NUM_ATOMS
*
3
;
index
+=
blockDim
.
x
)
sum
+=
prevErrors
[
index
+
(
i
-
1
)
*
NUM_ATOMS
*
3
]
*
prevErrors
[
index
+
(
j
-
1
)
*
NUM_ATOMS
*
3
];
sumBuffer
[
threadIdx
.
x
]
=
sum
;
__syncthreads
();
__syncthreads
();
for
(
int
offset
=
1
;
offset
<
blockDim
.
x
;
offset
*=
2
)
{
if
(
threadIdx
.
x
+
offset
<
blockDim
.
x
&&
(
threadIdx
.
x
&
(
2
*
offset
-
1
))
==
0
)
sumBuffer
[
threadIdx
.
x
]
+=
sumBuffer
[
threadIdx
.
x
+
offset
];
__syncthreads
();
}
value
=
sumBuffer
[
0
];
}
}
__syncthreads
();
if
(
threadIdx
.
x
==
0
)
{
if
(
threadIdx
.
x
==
0
)
{
matrix
[
element
]
=
value
;
matrix
[
i
+
MAX_PREV_DIIS_DIPOLES
*
j
]
=
sumBuffer
[
0
]
;
if
(
i
!=
j
)
if
(
i
!=
j
)
matrix
[
j
*
rank
+
i
]
=
value
;
matrix
[
j
+
MAX_PREV_DIIS_DIPOLES
*
i
]
=
sumBuffer
[
0
]
;
}
}
}
}
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment