Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
6d82a920
Commit
6d82a920
authored
Oct 20, 2012
by
Peter Eastman
Browse files
Limit thread block size to avoid exceeding available shared memory
parent
ff7f1035
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
55 additions
and
11 deletions
+55
-11
platforms/cuda/src/CudaContext.cpp
platforms/cuda/src/CudaContext.cpp
+13
-0
platforms/cuda/src/CudaContext.h
platforms/cuda/src/CudaContext.h
+8
-0
plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
+33
-11
plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.h
plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.h
+1
-0
No files found.
platforms/cuda/src/CudaContext.cpp
View file @
6d82a920
...
...
@@ -493,6 +493,19 @@ void CudaContext::executeKernel(CUfunction kernel, void** arguments, int threads
}
}
int
CudaContext
::
computeThreadBlockSize
(
double
memory
,
bool
preferShared
)
const
{
int
maxShared
=
16
*
1024
;
if
(
computeCapability
>=
2.0
&&
preferShared
)
maxShared
=
48
*
1024
;
int
max
=
(
int
)
(
maxShared
/
memory
);
if
(
max
<
64
)
return
32
;
int
threads
=
64
;
while
(
threads
+
64
<
max
)
threads
+=
64
;
return
threads
;
}
void
CudaContext
::
clearBuffer
(
CudaArray
&
array
)
{
clearBuffer
(
array
.
getDevicePointer
(),
array
.
getSize
()
*
array
.
getElementSize
());
}
...
...
platforms/cuda/src/CudaContext.h
View file @
6d82a920
...
...
@@ -226,6 +226,14 @@ public:
* @param sharedSize the amount of dynamic shared memory to allocated for the kernel, in bytes
*/
void
executeKernel
(
CUfunction
kernel
,
void
**
arguments
,
int
workUnits
,
int
blockSize
=
-
1
,
unsigned
int
sharedSize
=
0
);
/**
* Compute the largest thread block size that can be used for a kernel that requires a particular amount of
* shared memory per thread.
*
* @param memory the number of bytes of shared memory per thread
* @param preferShared whether the kernel is set to prefer shared memory over cache
*/
int
computeThreadBlockSize
(
double
memory
,
bool
preferShared
=
true
)
const
;
/**
* Set all elements of an array to 0.
*/
...
...
plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
View file @
6d82a920
...
...
@@ -408,6 +408,7 @@ void CudaCalcAmoebaStretchBendForceKernel::initialize(const System& system, cons
}
double
CudaCalcAmoebaStretchBendForceKernel
::
execute
(
ContextImpl
&
context
,
bool
includeForces
,
bool
includeEnergy
)
{
return
0.0
;
}
/* -------------------------------------------------------------------------- *
...
...
@@ -841,10 +842,15 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
// Create the kernels.
bool
useShuffle
=
(
cu
.
getComputeCapability
()
>=
3.0
&&
!
cu
.
getUseDoublePrecision
());
double
fixedThreadMemory
=
19
*
elementSize
+
2
*
sizeof
(
float
)
+
3
*
sizeof
(
int
)
/
(
double
)
cu
.
TileSize
;
double
inducedThreadMemory
=
15
*
elementSize
+
2
*
sizeof
(
float
);
double
electrostaticsThreadMemory
=
0
;
if
(
useShuffle
)
fixedThreadMemory
+=
3
*
elementSize
;
map
<
string
,
string
>
defines
;
defines
[
"NUM_ATOMS"
]
=
cu
.
intToString
(
numMultipoles
);
defines
[
"PADDED_NUM_ATOMS"
]
=
cu
.
intToString
(
cu
.
getPaddedNumAtoms
());
defines
[
"THREAD_BLOCK_SIZE"
]
=
cu
.
intToString
(
cu
.
getNonbondedUtilities
().
getForceThreadBlockSize
());
defines
[
"NUM_BLOCKS"
]
=
cu
.
intToString
(
cu
.
getNumAtomBlocks
());
defines
[
"ENERGY_SCALE_FACTOR"
]
=
cu
.
doubleToString
(
138.9354558456
/
innerDielectric
);
if
(
force
.
getPolarizationType
()
==
AmoebaMultipoleForce
::
Direct
)
...
...
@@ -881,15 +887,22 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
defines
[
"GK_FC"
]
=
cu
.
doubleToString
(
1
*
(
1
-
solventDielectric
)
/
(
0
+
1
*
solventDielectric
));
defines
[
"GK_FD"
]
=
cu
.
doubleToString
(
2
*
(
1
-
solventDielectric
)
/
(
1
+
2
*
solventDielectric
));
defines
[
"GK_FQ"
]
=
cu
.
doubleToString
(
3
*
(
1
-
solventDielectric
)
/
(
2
+
3
*
solventDielectric
));
fixedThreadMemory
+=
4
*
elementSize
;
inducedThreadMemory
+=
13
*
elementSize
;
}
int
maxThreads
=
cu
.
getNonbondedUtilities
().
getForceThreadBlockSize
();
fixedFieldThreads
=
min
(
maxThreads
,
cu
.
computeThreadBlockSize
(
fixedThreadMemory
));
inducedFieldThreads
=
min
(
maxThreads
,
cu
.
computeThreadBlockSize
(
inducedThreadMemory
));
CUmodule
module
=
cu
.
createModule
(
CudaKernelSources
::
vectorOps
+
CudaAmoebaKernelSources
::
multipoles
,
defines
);
computeMomentsKernel
=
cu
.
getKernel
(
module
,
"computeLabFrameMoments"
);
recordInducedDipolesKernel
=
cu
.
getKernel
(
module
,
"recordInducedDipoles"
);
mapTorqueKernel
=
cu
.
getKernel
(
module
,
"mapTorqueToForce"
);
computePotentialKernel
=
cu
.
getKernel
(
module
,
"computePotentialAtPoints"
);
defines
[
"THREAD_BLOCK_SIZE"
]
=
cu
.
intToString
(
fixedFieldThreads
);
module
=
cu
.
createModule
(
CudaKernelSources
::
vectorOps
+
CudaAmoebaKernelSources
::
multipoleFixedField
,
defines
);
computeFixedFieldKernel
=
cu
.
getKernel
(
module
,
"computeFixedField"
);
if
(
maxInducedIterations
>
0
)
{
defines
[
"THREAD_BLOCK_SIZE"
]
=
cu
.
intToString
(
inducedFieldThreads
);
module
=
cu
.
createModule
(
CudaKernelSources
::
vectorOps
+
CudaAmoebaKernelSources
::
multipoleInducedField
,
defines
);
computeInducedFieldKernel
=
cu
.
getKernel
(
module
,
"computeInducedField"
);
updateInducedFieldKernel
=
cu
.
getKernel
(
module
,
"updateInducedFieldBySOR"
);
...
...
@@ -901,6 +914,9 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
electrostaticsSource
<<
CudaAmoebaKernelSources
::
pmeElectrostaticPairForce
;
electrostaticsSource
<<
"#define APPLY_SCALE
\n
"
;
electrostaticsSource
<<
CudaAmoebaKernelSources
::
pmeElectrostaticPairForce
;
electrostaticsThreadMemory
=
24
*
elementSize
+
3
*
sizeof
(
float
)
+
3
*
sizeof
(
int
)
/
(
double
)
cu
.
TileSize
;
if
(
useShuffle
)
electrostaticsThreadMemory
+=
3
*
elementSize
;
}
else
{
electrostaticsSource
<<
CudaKernelSources
::
vectorOps
;
...
...
@@ -913,7 +929,14 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
electrostaticsSource
<<
"#undef T1
\n
"
;
electrostaticsSource
<<
"#define T3
\n
"
;
electrostaticsSource
<<
CudaAmoebaKernelSources
::
electrostaticPairForce
;
}
electrostaticsThreadMemory
=
21
*
elementSize
+
2
*
sizeof
(
float
)
+
3
*
sizeof
(
int
)
/
(
double
)
cu
.
TileSize
;
if
(
useShuffle
)
electrostaticsThreadMemory
+=
3
*
elementSize
;
if
(
gk
!=
NULL
)
electrostaticsThreadMemory
+=
4
*
elementSize
;
}
electrostaticsThreads
=
min
(
maxThreads
,
cu
.
computeThreadBlockSize
(
electrostaticsThreadMemory
));
defines
[
"THREAD_BLOCK_SIZE"
]
=
cu
.
intToString
(
electrostaticsThreads
);
module
=
cu
.
createModule
(
electrostaticsSource
.
str
(),
defines
);
electrostaticsKernel
=
cu
.
getKernel
(
module
,
"computeElectrostatics"
);
...
...
@@ -1151,7 +1174,6 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
int
startTileIndex
=
nb
.
getStartTileIndex
();
int
numTileIndices
=
nb
.
getNumTiles
();
int
numForceThreadBlocks
=
nb
.
getNumForceThreadBlocks
();
int
forceThreadBlockSize
=
nb
.
getForceThreadBlockSize
();
int
elementSize
=
(
cu
.
getUseDoublePrecision
()
?
sizeof
(
double
)
:
sizeof
(
float
));
void
*
npt
=
NULL
;
if
(
pmeGrid
==
NULL
)
{
...
...
@@ -1162,7 +1184,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
&
nb
.
getExclusionIndices
().
getDevicePointer
(),
&
nb
.
getExclusionRowIndices
().
getDevicePointer
(),
&
covalentFlags
->
getDevicePointer
(),
&
polarizationGroupFlags
->
getDevicePointer
(),
&
startTileIndex
,
&
numTileIndices
,
&
labFrameDipoles
->
getDevicePointer
(),
&
labFrameQuadrupoles
->
getDevicePointer
(),
&
dampingAndThole
->
getDevicePointer
()};
cu
.
executeKernel
(
computeFixedFieldKernel
,
computeFixedFieldArgs
,
numForceThreadBlocks
*
f
orceThreadBlockSize
,
forceThreadBlockSize
);
cu
.
executeKernel
(
computeFixedFieldKernel
,
computeFixedFieldArgs
,
numForceThreadBlocks
*
f
ixedFieldThreads
,
fixedFieldThreads
);
void
*
recordInducedDipolesArgs
[]
=
{
&
field
->
getDevicePointer
(),
&
fieldPolar
->
getDevicePointer
(),
&
inducedDipole
->
getDevicePointer
(),
&
inducedDipolePolar
->
getDevicePointer
(),
&
polarizability
->
getDevicePointer
()};
cu
.
executeKernel
(
recordInducedDipolesKernel
,
recordInducedDipolesArgs
,
cu
.
getNumAtoms
());
...
...
@@ -1174,7 +1196,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
&
covalentFlags
->
getDevicePointer
(),
&
polarizationGroupFlags
->
getDevicePointer
(),
&
startTileIndex
,
&
numTileIndices
,
&
gkKernel
->
getBornRadii
()
->
getDevicePointer
(),
&
gkKernel
->
getField
()
->
getDevicePointer
(),
&
labFrameDipoles
->
getDevicePointer
(),
&
labFrameQuadrupoles
->
getDevicePointer
(),
&
dampingAndThole
->
getDevicePointer
()};
cu
.
executeKernel
(
computeFixedFieldKernel
,
computeFixedFieldArgs
,
numForceThreadBlocks
*
f
orceThreadBlockSize
,
forceThreadBlockSize
);
cu
.
executeKernel
(
computeFixedFieldKernel
,
computeFixedFieldArgs
,
numForceThreadBlocks
*
f
ixedFieldThreads
,
fixedFieldThreads
);
void
*
recordInducedDipolesArgs
[]
=
{
&
field
->
getDevicePointer
(),
&
fieldPolar
->
getDevicePointer
(),
&
gkKernel
->
getField
()
->
getDevicePointer
(),
&
gkKernel
->
getInducedDipoles
()
->
getDevicePointer
(),
&
gkKernel
->
getInducedDipolesPolar
()
->
getDevicePointer
(),
&
inducedDipole
->
getDevicePointer
(),
...
...
@@ -1192,7 +1214,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
void
*
computeInducedFieldArgs
[]
=
{
&
inducedField
->
getDevicePointer
(),
&
inducedFieldPolar
->
getDevicePointer
(),
&
cu
.
getPosq
().
getDevicePointer
(),
&
inducedDipole
->
getDevicePointer
(),
&
inducedDipolePolar
->
getDevicePointer
(),
&
startTileIndex
,
&
numTileIndices
,
&
dampingAndThole
->
getDevicePointer
()};
cu
.
executeKernel
(
computeInducedFieldKernel
,
computeInducedFieldArgs
,
numForceThreadBlocks
*
forceThreadBlockSize
,
forceThreadBlockSize
);
cu
.
executeKernel
(
computeInducedFieldKernel
,
computeInducedFieldArgs
,
numForceThreadBlocks
*
inducedFieldThreads
,
inducedFieldThreads
);
}
else
{
cu
.
clearBuffer
(
*
gkKernel
->
getInducedField
());
...
...
@@ -1202,7 +1224,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
&
gkKernel
->
getInducedField
()
->
getDevicePointer
(),
&
gkKernel
->
getInducedFieldPolar
()
->
getDevicePointer
(),
&
gkKernel
->
getInducedDipoles
()
->
getDevicePointer
(),
&
gkKernel
->
getInducedDipolesPolar
()
->
getDevicePointer
(),
&
gkKernel
->
getBornRadii
()
->
getDevicePointer
(),
&
dampingAndThole
->
getDevicePointer
()};
cu
.
executeKernel
(
computeInducedFieldKernel
,
computeInducedFieldArgs
,
numForceThreadBlocks
*
forceThreadBlockSize
,
forceThreadBlockSize
);
cu
.
executeKernel
(
computeInducedFieldKernel
,
computeInducedFieldArgs
,
numForceThreadBlocks
*
inducedFieldThreads
,
inducedFieldThreads
);
void
*
updateInducedGkFieldArgs
[]
=
{
&
field
->
getDevicePointer
(),
&
fieldPolar
->
getDevicePointer
(),
&
gkKernel
->
getField
()
->
getDevicePointer
(),
&
gkKernel
->
getInducedField
()
->
getDevicePointer
(),
&
gkKernel
->
getInducedFieldPolar
()
->
getDevicePointer
(),
&
gkKernel
->
getInducedDipoles
()
->
getDevicePointer
(),
...
...
@@ -1230,7 +1252,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
&
covalentFlags
->
getDevicePointer
(),
&
polarizationGroupFlags
->
getDevicePointer
(),
&
startTileIndex
,
&
numTileIndices
,
&
labFrameDipoles
->
getDevicePointer
(),
&
labFrameQuadrupoles
->
getDevicePointer
(),
&
inducedDipole
->
getDevicePointer
(),
&
inducedDipolePolar
->
getDevicePointer
(),
&
dampingAndThole
->
getDevicePointer
()};
cu
.
executeKernel
(
electrostaticsKernel
,
electrostaticsArgs
,
numForceThreadBlocks
*
forceThreadBlockSize
,
forceThreadBlockSize
);
cu
.
executeKernel
(
electrostaticsKernel
,
electrostaticsArgs
,
numForceThreadBlocks
*
electrostaticsThreads
,
electrostaticsThreads
);
if
(
gkKernel
!=
NULL
)
gkKernel
->
finishComputation
(
*
torque
,
*
labFrameDipoles
,
*
labFrameQuadrupoles
,
*
inducedDipole
,
*
inducedDipolePolar
,
*
dampingAndThole
,
*
covalentFlags
,
*
polarizationGroupFlags
);
}
...
...
@@ -1280,7 +1302,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
&
nb
.
getInteractingTiles
().
getDevicePointer
(),
&
nb
.
getInteractionCount
().
getDevicePointer
(),
cu
.
getPeriodicBoxSizePointer
(),
cu
.
getInvPeriodicBoxSizePointer
(),
&
maxTiles
,
&
nb
.
getInteractionFlags
().
getDevicePointer
(),
&
labFrameDipoles
->
getDevicePointer
(),
&
labFrameQuadrupoles
->
getDevicePointer
(),
&
dampingAndThole
->
getDevicePointer
()};
cu
.
executeKernel
(
computeFixedFieldKernel
,
computeFixedFieldArgs
,
numForceThreadBlocks
*
f
orceThreadBlockSize
,
forceThreadBlockSize
);
cu
.
executeKernel
(
computeFixedFieldKernel
,
computeFixedFieldArgs
,
numForceThreadBlocks
*
f
ixedFieldThreads
,
fixedFieldThreads
);
void
*
recordInducedDipolesArgs
[]
=
{
&
field
->
getDevicePointer
(),
&
fieldPolar
->
getDevicePointer
(),
&
inducedDipole
->
getDevicePointer
(),
&
inducedDipolePolar
->
getDevicePointer
(),
&
polarizability
->
getDevicePointer
()};
cu
.
executeKernel
(
recordInducedDipolesKernel
,
recordInducedDipolesArgs
,
cu
.
getNumAtoms
());
...
...
@@ -1316,7 +1338,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
&
nb
.
getInteractingTiles
().
getDevicePointer
(),
&
nb
.
getInteractionCount
().
getDevicePointer
(),
cu
.
getPeriodicBoxSizePointer
(),
cu
.
getInvPeriodicBoxSizePointer
(),
&
maxTiles
,
&
nb
.
getInteractionFlags
().
getDevicePointer
(),
&
dampingAndThole
->
getDevicePointer
()};
cu
.
executeKernel
(
computeInducedFieldKernel
,
computeInducedFieldArgs
,
numForceThreadBlocks
*
forceThreadBlockSize
,
forceThreadBlockSize
);
cu
.
executeKernel
(
computeInducedFieldKernel
,
computeInducedFieldArgs
,
numForceThreadBlocks
*
inducedFieldThreads
,
inducedFieldThreads
);
cu
.
executeKernel
(
pmeSpreadInducedDipolesKernel
,
pmeSpreadInducedDipolesArgs
,
cu
.
getNumAtoms
());
if
(
cu
.
getUseDoublePrecision
())
cufftExecZ2Z
(
fft
,
(
double2
*
)
pmeGrid
->
getDevicePointer
(),
(
double2
*
)
pmeGrid
->
getDevicePointer
(),
CUFFT_FORWARD
);
...
...
@@ -1354,7 +1376,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
cu
.
getPeriodicBoxSizePointer
(),
cu
.
getInvPeriodicBoxSizePointer
(),
&
maxTiles
,
&
nb
.
getInteractionFlags
().
getDevicePointer
(),
&
labFrameDipoles
->
getDevicePointer
(),
&
labFrameQuadrupoles
->
getDevicePointer
(),
&
inducedDipole
->
getDevicePointer
(),
&
inducedDipolePolar
->
getDevicePointer
(),
&
dampingAndThole
->
getDevicePointer
()};
cu
.
executeKernel
(
electrostaticsKernel
,
electrostaticsArgs
,
numForceThreadBlocks
*
forceThreadBlockSize
,
forceThreadBlockSize
);
cu
.
executeKernel
(
electrostaticsKernel
,
electrostaticsArgs
,
numForceThreadBlocks
*
electrostaticsThreads
,
electrostaticsThreads
);
void
*
pmeInducedForceArgs
[]
=
{
&
cu
.
getPosq
().
getDevicePointer
(),
&
cu
.
getForce
().
getDevicePointer
(),
&
torque
->
getDevicePointer
(),
&
cu
.
getEnergyBuffer
().
getDevicePointer
(),
&
labFrameDipoles
->
getDevicePointer
(),
&
labFrameQuadrupoles
->
getDevicePointer
(),
&
inducedDipole
->
getDevicePointer
(),
&
inducedDipolePolar
->
getDevicePointer
(),
&
pmePhi
->
getDevicePointer
(),
&
pmePhid
->
getDevicePointer
(),
...
...
plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.h
View file @
6d82a920
...
...
@@ -323,6 +323,7 @@ private:
void
initializeScaleFactors
();
template
<
class
T
,
class
T4
,
class
M4
>
void
computeSystemMultipoleMoments
(
ContextImpl
&
context
,
std
::
vector
<
double
>&
outputMultipoleMoments
);
int
numMultipoles
,
maxInducedIterations
;
int
fixedFieldThreads
,
inducedFieldThreads
,
electrostaticsThreads
;
double
inducedEpsilon
;
bool
hasInitializedScaleFactors
,
hasInitializedFFT
;
CudaContext
&
cu
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment