Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
b8cba79f
Commit
b8cba79f
authored
Nov 16, 2012
by
Peter Eastman
Browse files
Restrict the number of threads in GK kernels based on available shared memory
parent
549d62bd
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
28 additions
and
15 deletions
+28
-15
plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
+20
-7
plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.h
plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.h
+1
-0
plugins/amoeba/platforms/cuda/src/kernels/amoebaGk.cu
plugins/amoeba/platforms/cuda/src/kernels/amoebaGk.cu
+7
-8
No files found.
plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
View file @
b8cba79f
...
...
@@ -1644,12 +1644,27 @@ void CudaCalcAmoebaGeneralizedKirkwoodForceKernel::initialize(const System& syst
}
params
->
upload
(
paramsVector
);
// Select the number of threads for each kernel.
double
computeBornSumThreadMemory
=
4
*
elementSize
+
3
*
sizeof
(
float
);
double
gkForceThreadMemory
=
24
*
elementSize
;
double
chainRuleThreadMemory
=
10
*
elementSize
;
double
ediffThreadMemory
=
28
*
elementSize
+
2
*
sizeof
(
float
)
+
3
*
sizeof
(
int
)
/
(
double
)
cu
.
TileSize
;
int
maxThreads
=
cu
.
getNonbondedUtilities
().
getForceThreadBlockSize
();
computeBornSumThreads
=
min
(
maxThreads
,
cu
.
computeThreadBlockSize
(
computeBornSumThreadMemory
));
gkForceThreads
=
min
(
maxThreads
,
cu
.
computeThreadBlockSize
(
gkForceThreadMemory
));
chainRuleThreads
=
min
(
maxThreads
,
cu
.
computeThreadBlockSize
(
chainRuleThreadMemory
));
ediffThreads
=
min
(
maxThreads
,
cu
.
computeThreadBlockSize
(
ediffThreadMemory
));
// Create the kernels.
map
<
string
,
string
>
defines
;
defines
[
"NUM_ATOMS"
]
=
cu
.
intToString
(
cu
.
getNumAtoms
());
defines
[
"PADDED_NUM_ATOMS"
]
=
cu
.
intToString
(
paddedNumAtoms
);
defines
[
"THREAD_BLOCK_SIZE"
]
=
cu
.
intToString
(
nb
.
getForceThreadBlockSize
());
defines
[
"BORN_SUM_THREAD_BLOCK_SIZE"
]
=
cu
.
intToString
(
computeBornSumThreads
);
defines
[
"GK_FORCE_THREAD_BLOCK_SIZE"
]
=
cu
.
intToString
(
gkForceThreads
);
defines
[
"CHAIN_RULE_THREAD_BLOCK_SIZE"
]
=
cu
.
intToString
(
chainRuleThreads
);
defines
[
"EDIFF_THREAD_BLOCK_SIZE"
]
=
cu
.
intToString
(
ediffThreads
);
defines
[
"NUM_BLOCKS"
]
=
cu
.
intToString
(
cu
.
getNumAtomBlocks
());
defines
[
"GK_C"
]
=
cu
.
doubleToString
(
2.455
);
double
solventDielectric
=
force
.
getSolventDielectric
();
...
...
@@ -1710,10 +1725,9 @@ void CudaCalcAmoebaGeneralizedKirkwoodForceKernel::computeBornRadii() {
CudaNonbondedUtilities
&
nb
=
cu
.
getNonbondedUtilities
();
int
numTiles
=
nb
.
getNumTiles
();
int
numForceThreadBlocks
=
nb
.
getNumForceThreadBlocks
();
int
forceThreadBlockSize
=
nb
.
getForceThreadBlockSize
();
void
*
computeBornSumArgs
[]
=
{
&
bornSum
->
getDevicePointer
(),
&
cu
.
getPosq
().
getDevicePointer
(),
&
params
->
getDevicePointer
(),
&
numTiles
};
cu
.
executeKernel
(
computeBornSumKernel
,
computeBornSumArgs
,
numForceThreadBlocks
*
forceThreadBlockSize
,
forceThreadBlockSize
);
cu
.
executeKernel
(
computeBornSumKernel
,
computeBornSumArgs
,
numForceThreadBlocks
*
computeBornSumThreads
,
computeBornSumThreads
);
void
*
reduceBornSumArgs
[]
=
{
&
bornSum
->
getDevicePointer
(),
&
params
->
getDevicePointer
(),
&
bornRadii
->
getDevicePointer
()};
cu
.
executeKernel
(
reduceBornSumKernel
,
reduceBornSumArgs
,
cu
.
getNumAtoms
());
}
...
...
@@ -1724,7 +1738,6 @@ void CudaCalcAmoebaGeneralizedKirkwoodForceKernel::finishComputation(CudaArray&
int
startTileIndex
=
nb
.
getStartTileIndex
();
int
numTileIndices
=
nb
.
getNumTiles
();
int
numForceThreadBlocks
=
nb
.
getNumForceThreadBlocks
();
int
forceThreadBlockSize
=
nb
.
getForceThreadBlockSize
();
// Compute the GK force.
...
...
@@ -1732,7 +1745,7 @@ void CudaCalcAmoebaGeneralizedKirkwoodForceKernel::finishComputation(CudaArray&
&
cu
.
getPosq
().
getDevicePointer
(),
&
startTileIndex
,
&
numTileIndices
,
&
labFrameDipoles
.
getDevicePointer
(),
&
labFrameQuadrupoles
.
getDevicePointer
(),
&
inducedDipoleS
->
getDevicePointer
(),
&
inducedDipolePolarS
->
getDevicePointer
(),
&
bornRadii
->
getDevicePointer
(),
&
bornForce
->
getDevicePointer
()};
cu
.
executeKernel
(
gkForceKernel
,
gkForceArgs
,
numForceThreadBlocks
*
forceThreadBlockSize
,
forceThreadBlockSize
);
cu
.
executeKernel
(
gkForceKernel
,
gkForceArgs
,
numForceThreadBlocks
*
gkForceThreads
,
gkForceThreads
);
// Compute the surface area force.
...
...
@@ -1745,14 +1758,14 @@ void CudaCalcAmoebaGeneralizedKirkwoodForceKernel::finishComputation(CudaArray&
void
*
chainRuleArgs
[]
=
{
&
cu
.
getForce
().
getDevicePointer
(),
&
cu
.
getPosq
().
getDevicePointer
(),
&
startTileIndex
,
&
numTileIndices
,
&
params
->
getDevicePointer
(),
&
bornRadii
->
getDevicePointer
(),
&
bornForce
->
getDevicePointer
()};
cu
.
executeKernel
(
chainRuleKernel
,
chainRuleArgs
,
numForceThreadBlocks
*
forceThreadBlockSize
,
forceThreadBlockSize
);
cu
.
executeKernel
(
chainRuleKernel
,
chainRuleArgs
,
numForceThreadBlocks
*
chainRuleThreads
,
chainRuleThreads
);
void
*
ediffArgs
[]
=
{
&
cu
.
getForce
().
getDevicePointer
(),
&
torque
.
getDevicePointer
(),
&
cu
.
getEnergyBuffer
().
getDevicePointer
(),
&
cu
.
getPosq
().
getDevicePointer
(),
&
nb
.
getExclusionIndices
().
getDevicePointer
(),
&
nb
.
getExclusionRowIndices
().
getDevicePointer
(),
&
covalentFlags
.
getDevicePointer
(),
&
polarizationGroupFlags
.
getDevicePointer
(),
&
startTileIndex
,
&
numTileIndices
,
&
labFrameDipoles
.
getDevicePointer
(),
&
labFrameQuadrupoles
.
getDevicePointer
(),
&
inducedDipole
.
getDevicePointer
(),
&
inducedDipolePolar
.
getDevicePointer
(),
&
inducedDipoleS
->
getDevicePointer
(),
&
inducedDipolePolarS
->
getDevicePointer
(),
&
dampingAndThole
.
getDevicePointer
()};
cu
.
executeKernel
(
ediffKernel
,
ediffArgs
,
numForceThreadBlocks
*
forceThreadBlockSize
,
forceThreadBlockSize
);
cu
.
executeKernel
(
ediffKernel
,
ediffArgs
,
numForceThreadBlocks
*
ediffThreads
,
ediffThreads
);
}
/* -------------------------------------------------------------------------- *
...
...
plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.h
View file @
b8cba79f
...
...
@@ -424,6 +424,7 @@ private:
CudaContext
&
cu
;
System
&
system
;
bool
includeSurfaceArea
;
int
computeBornSumThreads
,
gkForceThreads
,
chainRuleThreads
,
ediffThreads
;
CudaArray
*
params
;
CudaArray
*
bornSum
;
CudaArray
*
bornRadii
;
...
...
plugins/amoeba/platforms/cuda/src/kernels/amoebaGk.cu
View file @
b8cba79f
#define TILE_SIZE 32
#define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)
/**
* Reduce the Born sums to compute the Born radii.
...
...
@@ -93,7 +92,7 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ bornS
unsigned
int
pos
=
warp
*
numTiles
/
totalWarps
;
unsigned
int
end
=
(
warp
+
1
)
*
numTiles
/
totalWarps
;
unsigned
int
lasty
=
0xFFFFFFFF
;
__shared__
AtomData1
localData
[
THREAD_BLOCK_SIZE
];
__shared__
AtomData1
localData
[
BORN_SUM_THREAD_BLOCK_SIZE
];
do
{
// Extract the coordinates of this tile
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
...
...
@@ -227,7 +226,7 @@ extern "C" __global__ void computeGKForces(
unsigned
int
pos
=
startTileIndex
+
warp
*
numTiles
/
totalWarps
;
unsigned
int
end
=
startTileIndex
+
(
warp
+
1
)
*
numTiles
/
totalWarps
;
real
energy
=
0
;
__shared__
AtomData2
localData
[
THREAD_BLOCK_SIZE
];
__shared__
AtomData2
localData
[
GK_FORCE_THREAD_BLOCK_SIZE
];
do
{
// Extract the coordinates of this tile
...
...
@@ -466,7 +465,7 @@ extern "C" __global__ void computeChainRuleForce(
const
unsigned
int
numTiles
=
numTileIndices
;
unsigned
int
pos
=
startTileIndex
+
warp
*
numTiles
/
totalWarps
;
unsigned
int
end
=
startTileIndex
+
(
warp
+
1
)
*
numTiles
/
totalWarps
;
__shared__
AtomData3
localData
[
THREAD_BLOCK_SIZE
];
__shared__
AtomData3
localData
[
CHAIN_RULE_THREAD_BLOCK_SIZE
];
do
{
// Extract the coordinates of this tile
...
...
@@ -551,7 +550,7 @@ typedef struct {
real3
pos
,
force
,
dipole
,
inducedDipole
,
inducedDipolePolar
,
inducedDipoleS
,
inducedDipolePolarS
;
real
q
,
quadrupoleXX
,
quadrupoleXY
,
quadrupoleXZ
;
real
quadrupoleYY
,
quadrupoleYZ
,
quadrupoleZZ
;
float
thole
,
damp
,
padding
;
float
thole
,
damp
;
}
AtomData4
;
__device__
void
computeOneEDiffInteractionF1
(
AtomData4
&
atom1
,
volatile
AtomData4
&
atom2
,
float
dScale
,
float
pScale
,
real
&
outputEnergy
,
real3
&
outputForce
);
...
...
@@ -618,9 +617,9 @@ extern "C" __global__ void computeEDiffForce(
unsigned
int
pos
=
startTileIndex
+
warp
*
numTiles
/
totalWarps
;
unsigned
int
end
=
startTileIndex
+
(
warp
+
1
)
*
numTiles
/
totalWarps
;
real
energy
=
0
;
__shared__
AtomData4
localData
[
THREAD_BLOCK_SIZE
];
__shared__
unsigned
int
exclusionRange
[
2
*
WARPS_PER_GROUP
];
__shared__
int
exclusionIndex
[
WARPS_PER_GROUP
];
__shared__
AtomData4
localData
[
EDIFF_THREAD_BLOCK_SIZE
];
__shared__
unsigned
int
exclusionRange
[
2
*
(
EDIFF_THREAD_BLOCK_SIZE
/
TILE_SIZE
)
];
__shared__
int
exclusionIndex
[
EDIFF_THREAD_BLOCK_SIZE
/
TILE_SIZE
];
do
{
// Extract the coordinates of this tile
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment