Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
388bf972
Commit
388bf972
authored
Oct 14, 2016
by
peastman
Committed by
GitHub
Oct 14, 2016
Browse files
Merge pull request #1632 from peastman/pairs
Fine grained pair list of nonbonded interactions
parents
064a781f
7c911459
Changes
7
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
226 additions
and
71 deletions
+226
-71
platforms/cuda/include/CudaNonbondedUtilities.h
platforms/cuda/include/CudaNonbondedUtilities.h
+19
-10
platforms/cuda/include/CudaParallelKernels.h
platforms/cuda/include/CudaParallelKernels.h
+1
-1
platforms/cuda/src/CudaKernels.cpp
platforms/cuda/src/CudaKernels.cpp
+2
-2
platforms/cuda/src/CudaNonbondedUtilities.cpp
platforms/cuda/src/CudaNonbondedUtilities.cpp
+45
-23
platforms/cuda/src/CudaParallelKernels.cpp
platforms/cuda/src/CudaParallelKernels.cpp
+15
-14
platforms/cuda/src/kernels/findInteractingBlocks.cu
platforms/cuda/src/kernels/findInteractingBlocks.cu
+88
-19
platforms/cuda/src/kernels/nonbonded.cu
platforms/cuda/src/kernels/nonbonded.cu
+56
-2
No files found.
platforms/cuda/include/CudaNonbondedUtilities.h
View file @
388bf972
...
...
@@ -78,8 +78,9 @@ public:
* @param exclusionList for each atom, specifies the list of other atoms whose interactions should be excluded
* @param kernel the code to evaluate the interaction
* @param forceGroup the force group in which the interaction should be calculated
* @param supportsPairList specifies whether this interaction can work with a neighbor list that uses a separate pair list
*/
void
addInteraction
(
bool
usesCutoff
,
bool
usesPeriodic
,
bool
usesExclusions
,
double
cutoffDistance
,
const
std
::
vector
<
std
::
vector
<
int
>
>&
exclusionList
,
const
std
::
string
&
kernel
,
int
forceGroup
);
void
addInteraction
(
bool
usesCutoff
,
bool
usesPeriodic
,
bool
usesExclusions
,
double
cutoffDistance
,
const
std
::
vector
<
std
::
vector
<
int
>
>&
exclusionList
,
const
std
::
string
&
kernel
,
int
forceGroup
,
bool
supportsPairList
=
false
);
/**
* Add a per-atom parameter that the default interaction kernel may depend on.
*/
...
...
@@ -189,6 +190,12 @@ public:
CudaArray
&
getInteractingAtoms
()
{
return
*
interactingAtoms
;
}
/**
* Get the array containing single pairs in the neighbor list.
*/
CudaArray
&
getSinglePairs
()
{
return
*
singlePairs
;
}
/**
* Get the array containing exclusion flags.
*/
...
...
@@ -270,6 +277,8 @@ private:
CudaArray
*
interactingTiles
;
CudaArray
*
interactingAtoms
;
CudaArray
*
interactionCount
;
CudaArray
*
singlePairs
;
CudaArray
*
singlePairCount
;
CudaArray
*
blockCenter
;
CudaArray
*
blockBoundingBox
;
CudaArray
*
sortedBlocks
;
...
...
@@ -288,8 +297,8 @@ private:
std
::
map
<
int
,
double
>
groupCutoff
;
std
::
map
<
int
,
std
::
string
>
groupKernelSource
;
double
lastCutoff
;
bool
useCutoff
,
usePeriodic
,
anyExclusions
,
usePadding
,
forceRebuildNeighborList
;
int
startTileIndex
,
numTiles
,
startBlockIndex
,
numBlocks
,
maxTiles
,
maxExclusions
,
numForceThreadBlocks
,
forceThreadBlockSize
,
numAtoms
,
groupFlags
;
bool
useCutoff
,
usePeriodic
,
anyExclusions
,
usePadding
,
forceRebuildNeighborList
,
canUsePairList
;
int
startTileIndex
,
numTiles
,
startBlockIndex
,
numBlocks
,
maxTiles
,
maxSinglePairs
,
maxExclusions
,
numForceThreadBlocks
,
forceThreadBlockSize
,
numAtoms
,
groupFlags
;
};
/**
...
...
platforms/cuda/include/CudaParallelKernels.h
View file @
388bf972
...
...
@@ -83,7 +83,7 @@ private:
std
::
vector
<
Kernel
>
kernels
;
std
::
vector
<
long
long
>
completionTimes
;
std
::
vector
<
double
>
contextNonbondedFractions
;
int
*
tile
Counts
;
int
2
*
interaction
Counts
;
CudaArray
*
contextForces
;
void
*
pinnedPositionBuffer
;
long
long
*
pinnedForceBuffer
;
...
...
platforms/cuda/src/CudaKernels.cpp
View file @
388bf972
...
...
@@ -1920,7 +1920,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
// Add the interaction to the default nonbonded kernel.
string source = cu.replaceStrings(CudaKernelSources::coulombLennardJones, defines);
cu
.
getNonbondedUtilities
().
addInteraction
(
useCutoff
,
usePeriodic
,
true
,
force
.
getCutoffDistance
(),
exclusionList
,
source
,
force
.
getForceGroup
());
cu.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, true, force.getCutoffDistance(), exclusionList, source, force.getForceGroup()
, true
);
if (hasLJ)
cu.getNonbondedUtilities().addParameter(CudaNonbondedUtilities::ParameterInfo("sigmaEpsilon", "float", 2, sizeof(float2), sigmaEpsilon->getDevicePointer()));
...
...
@@ -2314,7 +2314,7 @@ void CudaCalcCustomNonbondedForceKernel::initialize(const System& system, const
if (force.getNumInteractionGroups() > 0)
initInteractionGroups(force, source, tableTypes);
else {
cu
.
getNonbondedUtilities
().
addInteraction
(
useCutoff
,
usePeriodic
,
true
,
force
.
getCutoffDistance
(),
exclusionList
,
source
,
force
.
getForceGroup
());
cu.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, true, force.getCutoffDistance(), exclusionList, source, force.getForceGroup()
, true
);
for (int i = 0; i < (int) params->getBuffers().size(); i++) {
CudaNonbondedUtilities::ParameterInfo& buffer = params->getBuffers()[i];
cu.getNonbondedUtilities().addParameter(CudaNonbondedUtilities::ParameterInfo(prefix+"params"+cu.intToString(i+1), buffer.getComponentType(), buffer.getNumComponents(), buffer.getSize(), buffer.getMemory()));
...
...
platforms/cuda/src/CudaNonbondedUtilities.cpp
View file @
388bf972
...
...
@@ -64,15 +64,16 @@ private:
CudaNonbondedUtilities
::
CudaNonbondedUtilities
(
CudaContext
&
context
)
:
context
(
context
),
useCutoff
(
false
),
usePeriodic
(
false
),
anyExclusions
(
false
),
usePadding
(
true
),
exclusionIndices
(
NULL
),
exclusionRowIndices
(
NULL
),
exclusionTiles
(
NULL
),
exclusions
(
NULL
),
interactingTiles
(
NULL
),
interactingAtoms
(
NULL
),
interactionCount
(
NULL
),
blockCenter
(
NULL
),
blockBoundingBox
(
NULL
),
sortedBlocks
(
NULL
),
sortedBlockCenter
(
NULL
),
sortedBlockBoundingBox
(
NULL
),
oldPositions
(
NULL
),
rebuildNeighborList
(
NULL
),
blockSorter
(
NULL
),
pinnedCountBuffer
(
NULL
),
forceRebuildNeighborList
(
true
),
lastCutoff
(
0.0
),
groupFlags
(
0
)
{
interactionCount
(
NULL
),
singlePairs
(
NULL
),
blockCenter
(
NULL
),
blockBoundingBox
(
NULL
),
sortedBlocks
(
NULL
),
sortedBlockCenter
(
NULL
),
sortedBlockBoundingBox
(
NULL
),
oldPositions
(
NULL
),
rebuildNeighborList
(
NULL
),
blockSorter
(
NULL
),
pinnedCountBuffer
(
NULL
),
forceRebuildNeighborList
(
true
),
lastCutoff
(
0.0
),
groupFlags
(
0
),
canUsePairList
(
true
)
{
// Decide how many thread blocks to use.
string
errorMessage
=
"Error initializing nonbonded utilities"
;
int
multiprocessors
;
CHECK_RESULT
(
cuDeviceGetAttribute
(
&
multiprocessors
,
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT
,
context
.
getDevice
()));
CHECK_RESULT
(
cuEventCreate
(
&
downloadCountEvent
,
0
));
CHECK_RESULT
(
cuMemHostAlloc
((
void
**
)
&
pinnedCountBuffer
,
sizeof
(
int
),
CU_MEMHOSTALLOC_PORTABLE
));
CHECK_RESULT
(
cuMemHostAlloc
((
void
**
)
&
pinnedCountBuffer
,
2
*
sizeof
(
int
),
CU_MEMHOSTALLOC_PORTABLE
));
numForceThreadBlocks
=
4
*
multiprocessors
;
forceThreadBlockSize
=
(
context
.
getComputeCapability
()
<
2.0
?
128
:
256
);
}
...
...
@@ -92,6 +93,8 @@ CudaNonbondedUtilities::~CudaNonbondedUtilities() {
delete
interactingAtoms
;
if
(
interactionCount
!=
NULL
)
delete
interactionCount
;
if
(
singlePairs
!=
NULL
)
delete
singlePairs
;
if
(
blockCenter
!=
NULL
)
delete
blockCenter
;
if
(
blockBoundingBox
!=
NULL
)
...
...
@@ -113,7 +116,7 @@ CudaNonbondedUtilities::~CudaNonbondedUtilities() {
cuEventDestroy
(
downloadCountEvent
);
}
void
CudaNonbondedUtilities
::
addInteraction
(
bool
usesCutoff
,
bool
usesPeriodic
,
bool
usesExclusions
,
double
cutoffDistance
,
const
vector
<
vector
<
int
>
>&
exclusionList
,
const
string
&
kernel
,
int
forceGroup
)
{
void
CudaNonbondedUtilities
::
addInteraction
(
bool
usesCutoff
,
bool
usesPeriodic
,
bool
usesExclusions
,
double
cutoffDistance
,
const
vector
<
vector
<
int
>
>&
exclusionList
,
const
string
&
kernel
,
int
forceGroup
,
bool
supportsPairList
)
{
if
(
groupCutoff
.
size
()
>
0
)
{
if
(
usesCutoff
!=
useCutoff
)
throw
OpenMMException
(
"All Forces must agree on whether to use a cutoff"
);
...
...
@@ -128,6 +131,7 @@ void CudaNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic,
usePeriodic
=
usesPeriodic
;
groupCutoff
[
forceGroup
]
=
cutoffDistance
;
groupFlags
|=
1
<<
forceGroup
;
canUsePairList
&=
supportsPairList
;
if
(
kernel
.
size
()
>
0
)
{
if
(
groupKernelSource
.
find
(
forceGroup
)
==
groupKernelSource
.
end
())
groupKernelSource
[
forceGroup
]
=
""
;
...
...
@@ -279,9 +283,11 @@ void CudaNonbondedUtilities::initialize(const System& system) {
maxTiles
=
numTiles
;
if
(
maxTiles
<
1
)
maxTiles
=
1
;
maxSinglePairs
=
5
*
numAtoms
;
interactingTiles
=
CudaArray
::
create
<
int
>
(
context
,
maxTiles
,
"interactingTiles"
);
interactingAtoms
=
CudaArray
::
create
<
int
>
(
context
,
CudaContext
::
TileSize
*
maxTiles
,
"interactingAtoms"
);
interactionCount
=
CudaArray
::
create
<
unsigned
int
>
(
context
,
1
,
"interactionCount"
);
interactionCount
=
CudaArray
::
create
<
unsigned
int
>
(
context
,
2
,
"interactionCount"
);
singlePairs
=
CudaArray
::
create
<
int2
>
(
context
,
maxSinglePairs
,
"singlePairs"
);
int
elementSize
=
(
context
.
getUseDoublePrecision
()
?
sizeof
(
double
)
:
sizeof
(
float
));
blockCenter
=
new
CudaArray
(
context
,
numAtomBlocks
,
4
*
elementSize
,
"blockCenter"
);
blockBoundingBox
=
new
CudaArray
(
context
,
numAtomBlocks
,
4
*
elementSize
,
"blockBoundingBox"
);
...
...
@@ -291,7 +297,7 @@ void CudaNonbondedUtilities::initialize(const System& system) {
oldPositions
=
new
CudaArray
(
context
,
numAtoms
,
4
*
elementSize
,
"oldPositions"
);
rebuildNeighborList
=
CudaArray
::
create
<
int
>
(
context
,
1
,
"rebuildNeighborList"
);
blockSorter
=
new
CudaSort
(
context
,
new
BlockSortTrait
(
context
.
getUseDoublePrecision
()),
numAtomBlocks
);
vector
<
unsigned
int
>
count
(
1
,
0
);
vector
<
unsigned
int
>
count
(
2
,
0
);
interactionCount
->
upload
(
count
);
}
...
...
@@ -316,6 +322,8 @@ void CudaNonbondedUtilities::initialize(const System& system) {
forceArgs
.
push_back
(
&
blockCenter
->
getDevicePointer
());
forceArgs
.
push_back
(
&
blockBoundingBox
->
getDevicePointer
());
forceArgs
.
push_back
(
&
interactingAtoms
->
getDevicePointer
());
forceArgs
.
push_back
(
&
maxSinglePairs
);
forceArgs
.
push_back
(
&
singlePairs
->
getDevicePointer
());
}
for
(
int
i
=
0
;
i
<
(
int
)
parameters
.
size
();
i
++
)
forceArgs
.
push_back
(
&
parameters
[
i
].
getMemory
());
...
...
@@ -353,8 +361,10 @@ void CudaNonbondedUtilities::initialize(const System& system) {
findInteractingBlocksArgs
.
push_back
(
&
interactionCount
->
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
interactingTiles
->
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
interactingAtoms
->
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
singlePairs
->
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
context
.
getPosq
().
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
maxTiles
);
findInteractingBlocksArgs
.
push_back
(
&
maxSinglePairs
);
findInteractingBlocksArgs
.
push_back
(
&
startBlockIndex
);
findInteractingBlocksArgs
.
push_back
(
&
numBlocks
);
findInteractingBlocksArgs
.
push_back
(
&
sortedBlocks
->
getDevicePointer
());
...
...
@@ -424,12 +434,13 @@ void CudaNonbondedUtilities::computeInteractions(int forceGroups, bool includeFo
bool
CudaNonbondedUtilities
::
updateNeighborListSize
()
{
if
(
!
useCutoff
)
return
false
;
if
(
pinnedCountBuffer
[
0
]
<=
(
unsigned
int
)
maxTile
s
)
if
(
pinnedCountBuffer
[
0
]
<=
maxTiles
&&
pinnedCountBuffer
[
1
]
<=
maxSinglePair
s
)
return
false
;
// The most recent timestep had too many interactions to fit in the arrays. Make the arrays bigger to prevent
// this from happening in the future.
if
(
pinnedCountBuffer
[
0
]
>
maxTiles
)
{
maxTiles
=
(
int
)
(
1.2
*
pinnedCountBuffer
[
0
]);
int
totalTiles
=
context
.
getNumAtomBlocks
()
*
(
context
.
getNumAtomBlocks
()
+
1
)
/
2
;
if
(
maxTiles
>
totalTiles
)
...
...
@@ -446,6 +457,16 @@ bool CudaNonbondedUtilities::updateNeighborListSize() {
if
(
forceArgs
.
size
()
>
0
)
forceArgs
[
17
]
=
&
interactingAtoms
->
getDevicePointer
();
findInteractingBlocksArgs
[
7
]
=
&
interactingAtoms
->
getDevicePointer
();
}
if
(
pinnedCountBuffer
[
1
]
>
maxSinglePairs
)
{
maxSinglePairs
=
(
int
)
(
1.2
*
pinnedCountBuffer
[
1
]);
delete
singlePairs
;
singlePairs
=
NULL
;
// Avoid an error in the destructor if the following allocation fails
singlePairs
=
CudaArray
::
create
<
int2
>
(
context
,
maxSinglePairs
,
"singlePairs"
);
if
(
forceArgs
.
size
()
>
0
)
forceArgs
[
19
]
=
&
singlePairs
->
getDevicePointer
();
findInteractingBlocksArgs
[
8
]
=
&
singlePairs
->
getDevicePointer
();
}
forceRebuildNeighborList
=
true
;
context
.
setForcesValid
(
false
);
return
true
;
...
...
@@ -493,6 +514,7 @@ void CudaNonbondedUtilities::createKernelsForGroups(int groups) {
if
(
usePeriodic
)
defines
[
"USE_PERIODIC"
]
=
"1"
;
defines
[
"MAX_EXCLUSIONS"
]
=
context
.
intToString
(
maxExclusions
);
defines
[
"MAX_BITS_FOR_PAIRS"
]
=
(
canUsePairList
?
"2"
:
"0"
);
CUmodule
interactingBlocksProgram
=
context
.
createModule
(
CudaKernelSources
::
vectorOps
+
CudaKernelSources
::
findInteractingBlocks
,
defines
);
kernels
.
findBlockBoundsKernel
=
context
.
getKernel
(
interactingBlocksProgram
,
"findBlockBounds"
);
kernels
.
sortBoxDataKernel
=
context
.
getKernel
(
interactingBlocksProgram
,
"sortBoxData"
);
...
...
platforms/cuda/src/CudaParallelKernels.cpp
View file @
388bf972
...
...
@@ -63,8 +63,8 @@ if (result != CUDA_SUCCESS) { \
class
CudaParallelCalcForcesAndEnergyKernel
::
BeginComputationTask
:
public
CudaContext
::
WorkTask
{
public:
BeginComputationTask
(
ContextImpl
&
context
,
CudaContext
&
cu
,
CudaCalcForcesAndEnergyKernel
&
kernel
,
bool
includeForce
,
bool
includeEnergy
,
int
groups
,
void
*
pinnedMemory
,
CUevent
event
,
int
&
numTiles
)
:
context
(
context
),
cu
(
cu
),
kernel
(
kernel
),
includeForce
(
includeForce
),
includeEnergy
(
includeEnergy
),
groups
(
groups
),
pinnedMemory
(
pinnedMemory
),
event
(
event
),
numTiles
(
numTiles
)
{
bool
includeForce
,
bool
includeEnergy
,
int
groups
,
void
*
pinnedMemory
,
CUevent
event
,
int
2
&
interactionCount
)
:
context
(
context
),
cu
(
cu
),
kernel
(
kernel
),
includeForce
(
includeForce
),
includeEnergy
(
includeEnergy
),
groups
(
groups
),
pinnedMemory
(
pinnedMemory
),
event
(
event
),
interactionCount
(
interactionCount
)
{
}
void
execute
()
{
// Copy coordinates over to this device and execute the kernel.
...
...
@@ -77,7 +77,7 @@ public:
}
kernel
.
beginComputation
(
context
,
includeForce
,
includeEnergy
,
groups
);
if
(
cu
.
getNonbondedUtilities
().
getUsePeriodic
())
cu
.
getNonbondedUtilities
().
getInteractionCount
().
download
(
&
numTiles
,
false
);
cu
.
getNonbondedUtilities
().
getInteractionCount
().
download
(
&
interactionCount
,
false
);
}
private:
ContextImpl
&
context
;
...
...
@@ -87,15 +87,15 @@ private:
int
groups
;
void
*
pinnedMemory
;
CUevent
event
;
int
&
numTiles
;
int
2
&
interactionCount
;
};
class
CudaParallelCalcForcesAndEnergyKernel
::
FinishComputationTask
:
public
CudaContext
::
WorkTask
{
public:
FinishComputationTask
(
ContextImpl
&
context
,
CudaContext
&
cu
,
CudaCalcForcesAndEnergyKernel
&
kernel
,
bool
includeForce
,
bool
includeEnergy
,
int
groups
,
double
&
energy
,
long
long
&
completionTime
,
long
long
*
pinnedMemory
,
CudaArray
&
contextForces
,
bool
&
valid
,
int
&
numTiles
)
:
bool
includeForce
,
bool
includeEnergy
,
int
groups
,
double
&
energy
,
long
long
&
completionTime
,
long
long
*
pinnedMemory
,
CudaArray
&
contextForces
,
bool
&
valid
,
int
2
&
interactionCount
)
:
context
(
context
),
cu
(
cu
),
kernel
(
kernel
),
includeForce
(
includeForce
),
includeEnergy
(
includeEnergy
),
groups
(
groups
),
energy
(
energy
),
completionTime
(
completionTime
),
pinnedMemory
(
pinnedMemory
),
contextForces
(
contextForces
),
valid
(
valid
),
numTiles
(
numTiles
)
{
completionTime
(
completionTime
),
pinnedMemory
(
pinnedMemory
),
contextForces
(
contextForces
),
valid
(
valid
),
interactionCount
(
interactionCount
)
{
}
void
execute
()
{
// Execute the kernel, then download forces.
...
...
@@ -120,7 +120,8 @@ public:
cu
.
getForce
().
download
(
&
pinnedMemory
[(
cu
.
getContextIndex
()
-
1
)
*
numAtoms
*
3
]);
}
}
if
(
cu
.
getNonbondedUtilities
().
getUsePeriodic
()
&&
numTiles
>
cu
.
getNonbondedUtilities
().
getInteractingTiles
().
getSize
())
{
if
(
cu
.
getNonbondedUtilities
().
getUsePeriodic
()
&&
(
interactionCount
.
x
>
cu
.
getNonbondedUtilities
().
getInteractingTiles
().
getSize
()
||
interactionCount
.
y
>
cu
.
getNonbondedUtilities
().
getSinglePairs
().
getSize
()))
{
valid
=
false
;
cu
.
getNonbondedUtilities
().
updateNeighborListSize
();
}
...
...
@@ -136,12 +137,12 @@ private:
long
long
*
pinnedMemory
;
CudaArray
&
contextForces
;
bool
&
valid
;
int
&
numTiles
;
int
2
&
interactionCount
;
};
CudaParallelCalcForcesAndEnergyKernel
::
CudaParallelCalcForcesAndEnergyKernel
(
string
name
,
const
Platform
&
platform
,
CudaPlatform
::
PlatformData
&
data
)
:
CalcForcesAndEnergyKernel
(
name
,
platform
),
data
(
data
),
completionTimes
(
data
.
contexts
.
size
()),
contextNonbondedFractions
(
data
.
contexts
.
size
()),
tile
Counts
(
NULL
),
contextForces
(
NULL
),
pinnedPositionBuffer
(
NULL
),
pinnedForceBuffer
(
NULL
)
{
interaction
Counts
(
NULL
),
contextForces
(
NULL
),
pinnedPositionBuffer
(
NULL
),
pinnedForceBuffer
(
NULL
)
{
for
(
int
i
=
0
;
i
<
(
int
)
data
.
contexts
.
size
();
i
++
)
kernels
.
push_back
(
Kernel
(
new
CudaCalcForcesAndEnergyKernel
(
name
,
platform
,
*
data
.
contexts
[
i
])));
}
...
...
@@ -156,8 +157,8 @@ CudaParallelCalcForcesAndEnergyKernel::~CudaParallelCalcForcesAndEnergyKernel()
cuMemFreeHost
(
pinnedForceBuffer
);
cuEventDestroy
(
event
);
cuStreamDestroy
(
peerCopyStream
);
if
(
tile
Counts
!=
NULL
)
cuMemFreeHost
(
tile
Counts
);
if
(
interaction
Counts
!=
NULL
)
cuMemFreeHost
(
interaction
Counts
);
}
void
CudaParallelCalcForcesAndEnergyKernel
::
initialize
(
const
System
&
system
)
{
...
...
@@ -172,7 +173,7 @@ void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
contextNonbondedFractions
[
i
]
=
1
/
(
double
)
numContexts
;
CHECK_RESULT
(
cuEventCreate
(
&
event
,
0
),
"Error creating event"
);
CHECK_RESULT
(
cuStreamCreate
(
&
peerCopyStream
,
CU_STREAM_NON_BLOCKING
),
"Error creating stream"
);
CHECK_RESULT
(
cuMemHostAlloc
((
void
**
)
&
tile
Counts
,
numContexts
*
sizeof
(
int
),
0
),
"Error creating
tile
count buffer"
);
CHECK_RESULT
(
cuMemHostAlloc
((
void
**
)
&
interaction
Counts
,
numContexts
*
sizeof
(
int
2
),
0
),
"Error creating
interaction
count
s
buffer"
);
}
void
CudaParallelCalcForcesAndEnergyKernel
::
beginComputation
(
ContextImpl
&
context
,
bool
includeForce
,
bool
includeEnergy
,
int
groups
)
{
...
...
@@ -202,7 +203,7 @@ void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& contex
data
.
contextEnergy
[
i
]
=
0.0
;
CudaContext
&
cu
=
*
data
.
contexts
[
i
];
CudaContext
::
WorkThread
&
thread
=
cu
.
getWorkThread
();
thread
.
addTask
(
new
BeginComputationTask
(
context
,
cu
,
getKernel
(
i
),
includeForce
,
includeEnergy
,
groups
,
pinnedPositionBuffer
,
event
,
tile
Counts
[
i
]));
thread
.
addTask
(
new
BeginComputationTask
(
context
,
cu
,
getKernel
(
i
),
includeForce
,
includeEnergy
,
groups
,
pinnedPositionBuffer
,
event
,
interaction
Counts
[
i
]));
}
}
...
...
@@ -210,7 +211,7 @@ double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& con
for
(
int
i
=
0
;
i
<
(
int
)
data
.
contexts
.
size
();
i
++
)
{
CudaContext
&
cu
=
*
data
.
contexts
[
i
];
CudaContext
::
WorkThread
&
thread
=
cu
.
getWorkThread
();
thread
.
addTask
(
new
FinishComputationTask
(
context
,
cu
,
getKernel
(
i
),
includeForce
,
includeEnergy
,
groups
,
data
.
contextEnergy
[
i
],
completionTimes
[
i
],
pinnedForceBuffer
,
*
contextForces
,
valid
,
tile
Counts
[
i
]));
thread
.
addTask
(
new
FinishComputationTask
(
context
,
cu
,
getKernel
(
i
),
includeForce
,
includeEnergy
,
groups
,
data
.
contextEnergy
[
i
],
completionTimes
[
i
],
pinnedForceBuffer
,
*
contextForces
,
valid
,
interaction
Counts
[
i
]));
}
data
.
syncContexts
();
double
energy
=
0.0
;
...
...
platforms/cuda/src/kernels/findInteractingBlocks.cu
View file @
388bf972
...
...
@@ -71,9 +71,60 @@ extern "C" __global__ void sortBoxData(const real2* __restrict__ sortedBlock, co
if
(
rebuild
)
{
rebuildNeighborList
[
0
]
=
1
;
interactionCount
[
0
]
=
0
;
interactionCount
[
1
]
=
0
;
}
}
__device__
int
saveSinglePairs
(
int
x
,
int
*
atoms
,
int
*
flags
,
int
length
,
unsigned
int
maxSinglePairs
,
unsigned
int
*
singlePairCount
,
int2
*
singlePairs
,
int
*
sumBuffer
,
volatile
int
&
pairStartIndex
)
{
// Record interactions that should be computed as single pairs rather than in blocks.
const
int
indexInWarp
=
threadIdx
.
x
%
32
;
int
sum
=
0
;
for
(
int
i
=
indexInWarp
;
i
<
length
;
i
+=
32
)
{
int
count
=
__popc
(
flags
[
i
]);
sum
+=
(
count
<=
MAX_BITS_FOR_PAIRS
?
count
:
0
);
}
sumBuffer
[
indexInWarp
]
=
sum
;
for
(
int
step
=
1
;
step
<
32
;
step
*=
2
)
{
int
add
=
(
indexInWarp
>=
step
?
sumBuffer
[
indexInWarp
-
step
]
:
0
);
sumBuffer
[
indexInWarp
]
+=
add
;
}
int
pairsToStore
=
sumBuffer
[
31
];
if
(
indexInWarp
==
0
)
pairStartIndex
=
atomicAdd
(
singlePairCount
,
pairsToStore
);
int
pairIndex
=
pairStartIndex
+
(
indexInWarp
>
0
?
sumBuffer
[
indexInWarp
-
1
]
:
0
);
for
(
int
i
=
indexInWarp
;
i
<
length
;
i
+=
32
)
{
int
count
=
__popc
(
flags
[
i
]);
if
(
count
<=
MAX_BITS_FOR_PAIRS
&&
pairIndex
+
count
<
maxSinglePairs
)
{
int
f
=
flags
[
i
];
while
(
f
!=
0
)
{
singlePairs
[
pairIndex
]
=
make_int2
(
atoms
[
i
],
x
*
TILE_SIZE
+
__ffs
(
f
)
-
1
);
f
&=
f
-
1
;
pairIndex
++
;
}
}
}
// Compact the remaining interactions.
const
int
warpMask
=
(
1
<<
indexInWarp
)
-
1
;
int
numCompacted
=
0
;
for
(
int
start
=
0
;
start
<
length
;
start
+=
32
)
{
int
i
=
start
+
indexInWarp
;
int
atom
=
atoms
[
i
];
int
flag
=
flags
[
i
];
bool
include
=
(
i
<
length
&&
__popc
(
flags
[
i
])
>
MAX_BITS_FOR_PAIRS
);
int
includeFlags
=
__ballot
(
include
);
if
(
include
)
{
int
index
=
numCompacted
+
__popc
(
includeFlags
&
warpMask
);
atoms
[
index
]
=
atom
;
flags
[
index
]
=
flag
;
}
numCompacted
+=
__popc
(
includeFlags
);
}
return
numCompacted
;
}
/**
* Compare the bounding boxes for each pair of atom blocks (comprised of 32 atoms each), forming a tile. If the two
* atom blocks are sufficiently far apart, mark them as non-interacting. There are two stages in the algorithm.
...
...
@@ -124,8 +175,9 @@ extern "C" __global__ void sortBoxData(const real2* __restrict__ sortedBlock, co
*
*/
extern
"C"
__global__
void
findBlocksWithInteractions
(
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
unsigned
int
*
__restrict__
interactionCount
,
int
*
__restrict__
interactingTiles
,
unsigned
int
*
__restrict__
interactingAtoms
,
const
real4
*
__restrict__
posq
,
unsigned
int
maxTiles
,
unsigned
int
startBlockIndex
,
unsigned
int
numBlocks
,
real2
*
__restrict__
sortedBlocks
,
const
real4
*
__restrict__
sortedBlockCenter
,
unsigned
int
*
__restrict__
interactionCount
,
int
*
__restrict__
interactingTiles
,
unsigned
int
*
__restrict__
interactingAtoms
,
int2
*
__restrict__
singlePairs
,
const
real4
*
__restrict__
posq
,
unsigned
int
maxTiles
,
unsigned
int
maxSinglePairs
,
unsigned
int
startBlockIndex
,
unsigned
int
numBlocks
,
real2
*
__restrict__
sortedBlocks
,
const
real4
*
__restrict__
sortedBlockCenter
,
const
real4
*
__restrict__
sortedBlockBoundingBox
,
const
unsigned
int
*
__restrict__
exclusionIndices
,
const
unsigned
int
*
__restrict__
exclusionRowIndices
,
real4
*
__restrict__
oldPositions
,
const
int
*
__restrict__
rebuildNeighborList
)
{
...
...
@@ -138,12 +190,17 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
const
int
warpIndex
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
32
;
const
int
warpMask
=
(
1
<<
indexInWarp
)
-
1
;
__shared__
int
workgroupBuffer
[
BUFFER_SIZE
*
(
GROUP_SIZE
/
32
)];
__shared__
int
workgroupFlagsBuffer
[
BUFFER_SIZE
*
(
GROUP_SIZE
/
32
)];
__shared__
int
warpExclusions
[
MAX_EXCLUSIONS
*
(
GROUP_SIZE
/
32
)];
__shared__
real3
posBuffer
[
GROUP_SIZE
];
__shared__
volatile
int
workgroupTileIndex
[
GROUP_SIZE
/
32
];
__shared__
int
sumBuffer
[
GROUP_SIZE
];
__shared__
int
worksgroupPairStartIndex
[
GROUP_SIZE
/
32
];
int
*
buffer
=
workgroupBuffer
+
BUFFER_SIZE
*
(
warpStart
/
32
);
int
*
flagsBuffer
=
workgroupFlagsBuffer
+
BUFFER_SIZE
*
(
warpStart
/
32
);
int
*
exclusionsForX
=
warpExclusions
+
MAX_EXCLUSIONS
*
(
warpStart
/
32
);
volatile
int
&
tileStartIndex
=
workgroupTileIndex
[
warpStart
/
32
];
volatile
int
&
pairStartIndex
=
worksgroupPairStartIndex
[
warpStart
/
32
];
// Loop over blocks.
...
...
@@ -227,7 +284,7 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
APPLY_PERIODIC_TO_DELTA
(
atomDelta
)
#endif
int
atomFlags
=
ballot
(
atomDelta
.
x
*
atomDelta
.
x
+
atomDelta
.
y
*
atomDelta
.
y
+
atomDelta
.
z
*
atomDelta
.
z
<
(
PADDED_CUTOFF
+
blockCenterY
.
w
)
*
(
PADDED_CUTOFF
+
blockCenterY
.
w
));
bool
interacts
=
false
;
int
interacts
=
0
;
if
(
atom2
<
NUM_ATOMS
&&
atomFlags
!=
0
)
{
int
first
=
__ffs
(
atomFlags
)
-
1
;
int
last
=
32
-
__clz
(
atomFlags
);
...
...
@@ -236,14 +293,14 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
for
(
int
j
=
first
;
j
<
last
;
j
++
)
{
real3
delta
=
pos2
-
posBuffer
[
warpStart
+
j
];
APPLY_PERIODIC_TO_DELTA
(
delta
)
interacts
|=
(
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
<
PADDED_CUTOFF_SQUARED
);
interacts
|=
(
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
<
PADDED_CUTOFF_SQUARED
?
1
<<
j
:
0
);
}
}
else
{
#endif
for
(
int
j
=
first
;
j
<
last
;
j
++
)
{
real3
delta
=
pos2
-
posBuffer
[
warpStart
+
j
];
interacts
|=
(
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
<
PADDED_CUTOFF_SQUARED
);
interacts
|=
(
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
<
PADDED_CUTOFF_SQUARED
?
1
<<
j
:
0
);
}
#ifdef USE_PERIODIC
}
...
...
@@ -253,15 +310,22 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
// Add any interacting atoms to the buffer.
int
includeAtomFlags
=
__ballot
(
interacts
);
if
(
interacts
)
buffer
[
neighborsInBuffer
+
__popc
(
includeAtomFlags
&
warpMask
)]
=
atom2
;
if
(
interacts
)
{
int
index
=
neighborsInBuffer
+
__popc
(
includeAtomFlags
&
warpMask
);
buffer
[
index
]
=
atom2
;
flagsBuffer
[
index
]
=
interacts
;
}
neighborsInBuffer
+=
__popc
(
includeAtomFlags
);
if
(
neighborsInBuffer
>
BUFFER_SIZE
-
TILE_SIZE
)
{
// Store the new tiles to memory.
#if MAX_BITS_FOR_PAIRS > 0
neighborsInBuffer
=
saveSinglePairs
(
x
,
buffer
,
flagsBuffer
,
neighborsInBuffer
,
maxSinglePairs
,
&
interactionCount
[
1
],
singlePairs
,
sumBuffer
+
warpStart
,
pairStartIndex
);
#endif
int
tilesToStore
=
neighborsInBuffer
/
TILE_SIZE
;
if
(
tilesToStore
>
0
)
{
if
(
indexInWarp
==
0
)
tileStartIndex
=
atomicAdd
(
interactionCount
,
tilesToStore
);
tileStartIndex
=
atomicAdd
(
&
interactionCount
[
0
]
,
tilesToStore
);
int
newTileStartIndex
=
tileStartIndex
;
if
(
newTileStartIndex
+
tilesToStore
<=
maxTiles
)
{
if
(
indexInWarp
<
tilesToStore
)
...
...
@@ -274,13 +338,18 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
}
}
}
}
// If we have a partially filled buffer, store it to memory.
#if MAX_BITS_FOR_PAIRS > 0
if
(
neighborsInBuffer
>
32
)
neighborsInBuffer
=
saveSinglePairs
(
x
,
buffer
,
flagsBuffer
,
neighborsInBuffer
,
maxSinglePairs
,
&
interactionCount
[
1
],
singlePairs
,
sumBuffer
+
warpStart
,
pairStartIndex
);
#endif
if
(
neighborsInBuffer
>
0
)
{
int
tilesToStore
=
(
neighborsInBuffer
+
TILE_SIZE
-
1
)
/
TILE_SIZE
;
if
(
indexInWarp
==
0
)
tileStartIndex
=
atomicAdd
(
interactionCount
,
tilesToStore
);
tileStartIndex
=
atomicAdd
(
&
interactionCount
[
0
]
,
tilesToStore
);
int
newTileStartIndex
=
tileStartIndex
;
if
(
newTileStartIndex
+
tilesToStore
<=
maxTiles
)
{
if
(
indexInWarp
<
tilesToStore
)
...
...
platforms/cuda/src/kernels/nonbonded.cu
View file @
388bf972
...
...
@@ -103,9 +103,10 @@ extern "C" __global__ void computeNonbonded(
unsigned
long
long
*
__restrict__
forceBuffers
,
mixed
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
posq
,
const
tileflags
*
__restrict__
exclusions
,
const
ushort2
*
__restrict__
exclusionTiles
,
unsigned
int
startTileIndex
,
unsigned
int
numTileIndices
#ifdef USE_CUTOFF
,
const
int
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
,
const
int
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
unsigned
int
maxTiles
,
const
real4
*
__restrict__
blockCenter
,
const
real4
*
__restrict__
blockSize
,
const
unsigned
int
*
__restrict__
interactingAtoms
const
real4
*
__restrict__
blockSize
,
const
unsigned
int
*
__restrict__
interactingAtoms
,
unsigned
int
maxSinglePairs
,
const
int2
*
__restrict__
singlePairs
#endif
PARAMETER_ARGUMENTS
)
{
const
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
...
...
@@ -588,6 +589,59 @@ extern "C" __global__ void computeNonbonded(
}
pos
++
;
}
// Third loop: single pairs that aren't part of a tile.
#if USE_CUTOFF
const
unsigned
int
numPairs
=
interactionCount
[
1
];
if
(
numPairs
>
maxSinglePairs
)
return
;
// There wasn't enough memory for the neighbor list.
for
(
int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
i
<
numPairs
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
int2
pair
=
singlePairs
[
i
];
int
atom1
=
pair
.
x
;
int
atom2
=
pair
.
y
;
real4
posq1
=
posq
[
atom1
];
real4
posq2
=
posq
[
atom2
];
LOAD_ATOM1_PARAMETERS
int
j
=
atom2
;
atom2
=
threadIdx
.
x
;
DECLARE_LOCAL_PARAMETERS
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
LOAD_ATOM2_PARAMETERS
atom2
=
pair
.
y
;
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
delta
)
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
real
invR
=
RSQRT
(
r2
);
real
r
=
r2
*
invR
;
#ifdef USE_SYMMETRIC
real
dEdR
=
0.0
f
;
#else
real3
dEdR1
=
make_real3
(
0
);
real3
dEdR2
=
make_real3
(
0
);
#endif
bool
hasExclusions
=
false
;
bool
isExcluded
=
false
;
real
tempEnergy
=
0.0
f
;
const
real
interactionScale
=
1.0
f
;
COMPUTE_INTERACTION
energy
+=
tempEnergy
;
#ifdef INCLUDE_FORCES
#ifdef USE_SYMMETRIC
real3
dEdR1
=
delta
*
dEdR
;
real3
dEdR2
=
-
dEdR1
;
#endif
atomicAdd
(
&
forceBuffers
[
atom1
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
-
dEdR1
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom1
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
-
dEdR1
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom1
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
-
dEdR1
.
z
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom2
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
-
dEdR2
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom2
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
-
dEdR2
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom2
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
-
dEdR2
.
z
*
0x100000000
)));
#endif
}
#endif
#ifdef INCLUDE_ENERGY
energyBuffer
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
energy
;
#endif
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment