Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
5fa4345f
Commit
5fa4345f
authored
Sep 14, 2016
by
Peter Eastman
Browse files
Beginning of implementing fine grained pair list
parent
10b51d25
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
211 additions
and
29 deletions
+211
-29
platforms/cuda/include/CudaNonbondedUtilities.h
platforms/cuda/include/CudaNonbondedUtilities.h
+3
-1
platforms/cuda/src/CudaNonbondedUtilities.cpp
platforms/cuda/src/CudaNonbondedUtilities.cpp
+46
-19
platforms/cuda/src/kernels/findInteractingBlocks.cu
platforms/cuda/src/kernels/findInteractingBlocks.cu
+107
-8
platforms/cuda/src/kernels/nonbonded.cu
platforms/cuda/src/kernels/nonbonded.cu
+55
-1
No files found.
platforms/cuda/include/CudaNonbondedUtilities.h
View file @
5fa4345f
...
@@ -270,6 +270,8 @@ private:
...
@@ -270,6 +270,8 @@ private:
CudaArray
*
interactingTiles
;
CudaArray
*
interactingTiles
;
CudaArray
*
interactingAtoms
;
CudaArray
*
interactingAtoms
;
CudaArray
*
interactionCount
;
CudaArray
*
interactionCount
;
CudaArray
*
singlePairs
;
CudaArray
*
singlePairCount
;
CudaArray
*
blockCenter
;
CudaArray
*
blockCenter
;
CudaArray
*
blockBoundingBox
;
CudaArray
*
blockBoundingBox
;
CudaArray
*
sortedBlocks
;
CudaArray
*
sortedBlocks
;
...
@@ -289,7 +291,7 @@ private:
...
@@ -289,7 +291,7 @@ private:
std
::
map
<
int
,
std
::
string
>
groupKernelSource
;
std
::
map
<
int
,
std
::
string
>
groupKernelSource
;
double
lastCutoff
;
double
lastCutoff
;
bool
useCutoff
,
usePeriodic
,
anyExclusions
,
usePadding
,
forceRebuildNeighborList
;
bool
useCutoff
,
usePeriodic
,
anyExclusions
,
usePadding
,
forceRebuildNeighborList
;
int
startTileIndex
,
numTiles
,
startBlockIndex
,
numBlocks
,
maxTiles
,
maxExclusions
,
numForceThreadBlocks
,
forceThreadBlockSize
,
numAtoms
,
groupFlags
;
int
startTileIndex
,
numTiles
,
startBlockIndex
,
numBlocks
,
maxTiles
,
maxSinglePairs
,
maxExclusions
,
numForceThreadBlocks
,
forceThreadBlockSize
,
numAtoms
,
groupFlags
;
};
};
/**
/**
...
...
platforms/cuda/src/CudaNonbondedUtilities.cpp
View file @
5fa4345f
...
@@ -64,7 +64,7 @@ private:
...
@@ -64,7 +64,7 @@ private:
CudaNonbondedUtilities
::
CudaNonbondedUtilities
(
CudaContext
&
context
)
:
context
(
context
),
useCutoff
(
false
),
usePeriodic
(
false
),
anyExclusions
(
false
),
usePadding
(
true
),
CudaNonbondedUtilities
::
CudaNonbondedUtilities
(
CudaContext
&
context
)
:
context
(
context
),
useCutoff
(
false
),
usePeriodic
(
false
),
anyExclusions
(
false
),
usePadding
(
true
),
exclusionIndices
(
NULL
),
exclusionRowIndices
(
NULL
),
exclusionTiles
(
NULL
),
exclusions
(
NULL
),
interactingTiles
(
NULL
),
interactingAtoms
(
NULL
),
exclusionIndices
(
NULL
),
exclusionRowIndices
(
NULL
),
exclusionTiles
(
NULL
),
exclusions
(
NULL
),
interactingTiles
(
NULL
),
interactingAtoms
(
NULL
),
interactionCount
(
NULL
),
blockCenter
(
NULL
),
blockBoundingBox
(
NULL
),
sortedBlocks
(
NULL
),
sortedBlockCenter
(
NULL
),
sortedBlockBoundingBox
(
NULL
),
interactionCount
(
NULL
),
singlePairs
(
NULL
),
singlePairCount
(
NULL
),
blockCenter
(
NULL
),
blockBoundingBox
(
NULL
),
sortedBlocks
(
NULL
),
sortedBlockCenter
(
NULL
),
sortedBlockBoundingBox
(
NULL
),
oldPositions
(
NULL
),
rebuildNeighborList
(
NULL
),
blockSorter
(
NULL
),
pinnedCountBuffer
(
NULL
),
forceRebuildNeighborList
(
true
),
lastCutoff
(
0.0
),
groupFlags
(
0
)
{
oldPositions
(
NULL
),
rebuildNeighborList
(
NULL
),
blockSorter
(
NULL
),
pinnedCountBuffer
(
NULL
),
forceRebuildNeighborList
(
true
),
lastCutoff
(
0.0
),
groupFlags
(
0
)
{
// Decide how many thread blocks to use.
// Decide how many thread blocks to use.
...
@@ -72,7 +72,7 @@ CudaNonbondedUtilities::CudaNonbondedUtilities(CudaContext& context) : context(c
...
@@ -72,7 +72,7 @@ CudaNonbondedUtilities::CudaNonbondedUtilities(CudaContext& context) : context(c
int
multiprocessors
;
int
multiprocessors
;
CHECK_RESULT
(
cuDeviceGetAttribute
(
&
multiprocessors
,
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT
,
context
.
getDevice
()));
CHECK_RESULT
(
cuDeviceGetAttribute
(
&
multiprocessors
,
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT
,
context
.
getDevice
()));
CHECK_RESULT
(
cuEventCreate
(
&
downloadCountEvent
,
0
));
CHECK_RESULT
(
cuEventCreate
(
&
downloadCountEvent
,
0
));
CHECK_RESULT
(
cuMemHostAlloc
((
void
**
)
&
pinnedCountBuffer
,
sizeof
(
int
),
CU_MEMHOSTALLOC_PORTABLE
));
CHECK_RESULT
(
cuMemHostAlloc
((
void
**
)
&
pinnedCountBuffer
,
2
*
sizeof
(
int
),
CU_MEMHOSTALLOC_PORTABLE
));
numForceThreadBlocks
=
4
*
multiprocessors
;
numForceThreadBlocks
=
4
*
multiprocessors
;
forceThreadBlockSize
=
(
context
.
getComputeCapability
()
<
2.0
?
128
:
256
);
forceThreadBlockSize
=
(
context
.
getComputeCapability
()
<
2.0
?
128
:
256
);
}
}
...
@@ -92,6 +92,10 @@ CudaNonbondedUtilities::~CudaNonbondedUtilities() {
...
@@ -92,6 +92,10 @@ CudaNonbondedUtilities::~CudaNonbondedUtilities() {
delete
interactingAtoms
;
delete
interactingAtoms
;
if
(
interactionCount
!=
NULL
)
if
(
interactionCount
!=
NULL
)
delete
interactionCount
;
delete
interactionCount
;
if
(
singlePairs
!=
NULL
)
delete
singlePairs
;
if
(
singlePairCount
!=
NULL
)
delete
singlePairCount
;
if
(
blockCenter
!=
NULL
)
if
(
blockCenter
!=
NULL
)
delete
blockCenter
;
delete
blockCenter
;
if
(
blockBoundingBox
!=
NULL
)
if
(
blockBoundingBox
!=
NULL
)
...
@@ -279,9 +283,12 @@ void CudaNonbondedUtilities::initialize(const System& system) {
...
@@ -279,9 +283,12 @@ void CudaNonbondedUtilities::initialize(const System& system) {
maxTiles
=
numTiles
;
maxTiles
=
numTiles
;
if
(
maxTiles
<
1
)
if
(
maxTiles
<
1
)
maxTiles
=
1
;
maxTiles
=
1
;
maxSinglePairs
=
5
*
numAtoms
;
interactingTiles
=
CudaArray
::
create
<
int
>
(
context
,
maxTiles
,
"interactingTiles"
);
interactingTiles
=
CudaArray
::
create
<
int
>
(
context
,
maxTiles
,
"interactingTiles"
);
interactingAtoms
=
CudaArray
::
create
<
int
>
(
context
,
CudaContext
::
TileSize
*
maxTiles
,
"interactingAtoms"
);
interactingAtoms
=
CudaArray
::
create
<
int
>
(
context
,
CudaContext
::
TileSize
*
maxTiles
,
"interactingAtoms"
);
interactionCount
=
CudaArray
::
create
<
unsigned
int
>
(
context
,
1
,
"interactionCount"
);
interactionCount
=
CudaArray
::
create
<
unsigned
int
>
(
context
,
1
,
"interactionCount"
);
singlePairs
=
CudaArray
::
create
<
int2
>
(
context
,
maxSinglePairs
,
"singlePairs"
);
singlePairCount
=
CudaArray
::
create
<
unsigned
int
>
(
context
,
1
,
"singlePairCount"
);
int
elementSize
=
(
context
.
getUseDoublePrecision
()
?
sizeof
(
double
)
:
sizeof
(
float
));
int
elementSize
=
(
context
.
getUseDoublePrecision
()
?
sizeof
(
double
)
:
sizeof
(
float
));
blockCenter
=
new
CudaArray
(
context
,
numAtomBlocks
,
4
*
elementSize
,
"blockCenter"
);
blockCenter
=
new
CudaArray
(
context
,
numAtomBlocks
,
4
*
elementSize
,
"blockCenter"
);
blockBoundingBox
=
new
CudaArray
(
context
,
numAtomBlocks
,
4
*
elementSize
,
"blockBoundingBox"
);
blockBoundingBox
=
new
CudaArray
(
context
,
numAtomBlocks
,
4
*
elementSize
,
"blockBoundingBox"
);
...
@@ -293,6 +300,7 @@ void CudaNonbondedUtilities::initialize(const System& system) {
...
@@ -293,6 +300,7 @@ void CudaNonbondedUtilities::initialize(const System& system) {
blockSorter
=
new
CudaSort
(
context
,
new
BlockSortTrait
(
context
.
getUseDoublePrecision
()),
numAtomBlocks
);
blockSorter
=
new
CudaSort
(
context
,
new
BlockSortTrait
(
context
.
getUseDoublePrecision
()),
numAtomBlocks
);
vector
<
unsigned
int
>
count
(
1
,
0
);
vector
<
unsigned
int
>
count
(
1
,
0
);
interactionCount
->
upload
(
count
);
interactionCount
->
upload
(
count
);
singlePairCount
->
upload
(
count
);
}
}
// Record arguments for kernels.
// Record arguments for kernels.
...
@@ -316,6 +324,9 @@ void CudaNonbondedUtilities::initialize(const System& system) {
...
@@ -316,6 +324,9 @@ void CudaNonbondedUtilities::initialize(const System& system) {
forceArgs
.
push_back
(
&
blockCenter
->
getDevicePointer
());
forceArgs
.
push_back
(
&
blockCenter
->
getDevicePointer
());
forceArgs
.
push_back
(
&
blockBoundingBox
->
getDevicePointer
());
forceArgs
.
push_back
(
&
blockBoundingBox
->
getDevicePointer
());
forceArgs
.
push_back
(
&
interactingAtoms
->
getDevicePointer
());
forceArgs
.
push_back
(
&
interactingAtoms
->
getDevicePointer
());
forceArgs
.
push_back
(
&
maxSinglePairs
);
forceArgs
.
push_back
(
&
singlePairCount
->
getDevicePointer
());
forceArgs
.
push_back
(
&
singlePairs
->
getDevicePointer
());
}
}
for
(
int
i
=
0
;
i
<
(
int
)
parameters
.
size
();
i
++
)
for
(
int
i
=
0
;
i
<
(
int
)
parameters
.
size
();
i
++
)
forceArgs
.
push_back
(
&
parameters
[
i
].
getMemory
());
forceArgs
.
push_back
(
&
parameters
[
i
].
getMemory
());
...
@@ -343,6 +354,7 @@ void CudaNonbondedUtilities::initialize(const System& system) {
...
@@ -343,6 +354,7 @@ void CudaNonbondedUtilities::initialize(const System& system) {
sortBoxDataArgs
.
push_back
(
&
context
.
getPosq
().
getDevicePointer
());
sortBoxDataArgs
.
push_back
(
&
context
.
getPosq
().
getDevicePointer
());
sortBoxDataArgs
.
push_back
(
&
oldPositions
->
getDevicePointer
());
sortBoxDataArgs
.
push_back
(
&
oldPositions
->
getDevicePointer
());
sortBoxDataArgs
.
push_back
(
&
interactionCount
->
getDevicePointer
());
sortBoxDataArgs
.
push_back
(
&
interactionCount
->
getDevicePointer
());
sortBoxDataArgs
.
push_back
(
&
singlePairCount
->
getDevicePointer
());
sortBoxDataArgs
.
push_back
(
&
rebuildNeighborList
->
getDevicePointer
());
sortBoxDataArgs
.
push_back
(
&
rebuildNeighborList
->
getDevicePointer
());
sortBoxDataArgs
.
push_back
(
&
forceRebuildNeighborList
);
sortBoxDataArgs
.
push_back
(
&
forceRebuildNeighborList
);
findInteractingBlocksArgs
.
push_back
(
context
.
getPeriodicBoxSizePointer
());
findInteractingBlocksArgs
.
push_back
(
context
.
getPeriodicBoxSizePointer
());
...
@@ -353,8 +365,11 @@ void CudaNonbondedUtilities::initialize(const System& system) {
...
@@ -353,8 +365,11 @@ void CudaNonbondedUtilities::initialize(const System& system) {
findInteractingBlocksArgs
.
push_back
(
&
interactionCount
->
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
interactionCount
->
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
interactingTiles
->
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
interactingTiles
->
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
interactingAtoms
->
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
interactingAtoms
->
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
singlePairCount
->
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
singlePairs
->
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
context
.
getPosq
().
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
context
.
getPosq
().
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
maxTiles
);
findInteractingBlocksArgs
.
push_back
(
&
maxTiles
);
findInteractingBlocksArgs
.
push_back
(
&
maxSinglePairs
);
findInteractingBlocksArgs
.
push_back
(
&
startBlockIndex
);
findInteractingBlocksArgs
.
push_back
(
&
startBlockIndex
);
findInteractingBlocksArgs
.
push_back
(
&
numBlocks
);
findInteractingBlocksArgs
.
push_back
(
&
numBlocks
);
findInteractingBlocksArgs
.
push_back
(
&
sortedBlocks
->
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
sortedBlocks
->
getDevicePointer
());
...
@@ -402,6 +417,7 @@ void CudaNonbondedUtilities::prepareInteractions(int forceGroups) {
...
@@ -402,6 +417,7 @@ void CudaNonbondedUtilities::prepareInteractions(int forceGroups) {
forceRebuildNeighborList
=
false
;
forceRebuildNeighborList
=
false
;
lastCutoff
=
kernels
.
cutoffDistance
;
lastCutoff
=
kernels
.
cutoffDistance
;
interactionCount
->
download
(
pinnedCountBuffer
,
false
);
interactionCount
->
download
(
pinnedCountBuffer
,
false
);
singlePairCount
->
download
(
pinnedCountBuffer
+
1
,
false
);
cuEventRecord
(
downloadCountEvent
,
context
.
getCurrentStream
());
cuEventRecord
(
downloadCountEvent
,
context
.
getCurrentStream
());
}
}
...
@@ -424,28 +440,39 @@ void CudaNonbondedUtilities::computeInteractions(int forceGroups, bool includeFo
...
@@ -424,28 +440,39 @@ void CudaNonbondedUtilities::computeInteractions(int forceGroups, bool includeFo
bool
CudaNonbondedUtilities
::
updateNeighborListSize
()
{
bool
CudaNonbondedUtilities
::
updateNeighborListSize
()
{
if
(
!
useCutoff
)
if
(
!
useCutoff
)
return
false
;
return
false
;
if
(
pinnedCountBuffer
[
0
]
<=
(
unsigned
int
)
maxTile
s
)
if
(
pinnedCountBuffer
[
0
]
<=
maxTiles
&&
pinnedCountBuffer
[
1
]
<=
maxSinglePair
s
)
return
false
;
return
false
;
// The most recent timestep had too many interactions to fit in the arrays. Make the arrays bigger to prevent
// The most recent timestep had too many interactions to fit in the arrays. Make the arrays bigger to prevent
// this from happening in the future.
// this from happening in the future.
maxTiles
=
(
int
)
(
1.2
*
pinnedCountBuffer
[
0
]);
if
(
pinnedCountBuffer
[
0
]
>
maxTiles
)
{
int
totalTiles
=
context
.
getNumAtomBlocks
()
*
(
context
.
getNumAtomBlocks
()
+
1
)
/
2
;
maxTiles
=
(
int
)
(
1.2
*
pinnedCountBuffer
[
0
]);
if
(
maxTiles
>
totalTiles
)
int
totalTiles
=
context
.
getNumAtomBlocks
()
*
(
context
.
getNumAtomBlocks
()
+
1
)
/
2
;
maxTiles
=
totalTiles
;
if
(
maxTiles
>
totalTiles
)
delete
interactingTiles
;
maxTiles
=
totalTiles
;
delete
interactingAtoms
;
delete
interactingTiles
;
interactingTiles
=
NULL
;
// Avoid an error in the destructor if the following allocation fails
delete
interactingAtoms
;
interactingAtoms
=
NULL
;
interactingTiles
=
NULL
;
// Avoid an error in the destructor if the following allocation fails
interactingTiles
=
CudaArray
::
create
<
int
>
(
context
,
maxTiles
,
"interactingTiles"
);
interactingAtoms
=
NULL
;
interactingAtoms
=
CudaArray
::
create
<
int
>
(
context
,
CudaContext
::
TileSize
*
maxTiles
,
"interactingAtoms"
);
interactingTiles
=
CudaArray
::
create
<
int
>
(
context
,
maxTiles
,
"interactingTiles"
);
if
(
forceArgs
.
size
()
>
0
)
interactingAtoms
=
CudaArray
::
create
<
int
>
(
context
,
CudaContext
::
TileSize
*
maxTiles
,
"interactingAtoms"
);
forceArgs
[
7
]
=
&
interactingTiles
->
getDevicePointer
();
if
(
forceArgs
.
size
()
>
0
)
findInteractingBlocksArgs
[
6
]
=
&
interactingTiles
->
getDevicePointer
();
forceArgs
[
7
]
=
&
interactingTiles
->
getDevicePointer
();
if
(
forceArgs
.
size
()
>
0
)
findInteractingBlocksArgs
[
6
]
=
&
interactingTiles
->
getDevicePointer
();
forceArgs
[
17
]
=
&
interactingAtoms
->
getDevicePointer
();
if
(
forceArgs
.
size
()
>
0
)
findInteractingBlocksArgs
[
7
]
=
&
interactingAtoms
->
getDevicePointer
();
forceArgs
[
17
]
=
&
interactingAtoms
->
getDevicePointer
();
findInteractingBlocksArgs
[
7
]
=
&
interactingAtoms
->
getDevicePointer
();
}
if
(
pinnedCountBuffer
[
1
]
>
maxSinglePairs
)
{
maxSinglePairs
=
(
int
)
(
1.2
*
pinnedCountBuffer
[
1
]);
delete
singlePairs
;
singlePairs
=
NULL
;
// Avoid an error in the destructor if the following allocation fails
singlePairs
=
CudaArray
::
create
<
int2
>
(
context
,
maxSinglePairs
,
"singlePairs"
);
if
(
forceArgs
.
size
()
>
0
)
forceArgs
[
20
]
=
&
singlePairs
->
getDevicePointer
();
findInteractingBlocksArgs
[
9
]
=
&
singlePairs
->
getDevicePointer
();
}
forceRebuildNeighborList
=
true
;
forceRebuildNeighborList
=
true
;
context
.
setForcesValid
(
false
);
context
.
setForcesValid
(
false
);
return
true
;
return
true
;
...
...
platforms/cuda/src/kernels/findInteractingBlocks.cu
View file @
5fa4345f
...
@@ -53,7 +53,7 @@ extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize,
...
@@ -53,7 +53,7 @@ extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize,
extern
"C"
__global__
void
sortBoxData
(
const
real2
*
__restrict__
sortedBlock
,
const
real4
*
__restrict__
blockCenter
,
extern
"C"
__global__
void
sortBoxData
(
const
real2
*
__restrict__
sortedBlock
,
const
real4
*
__restrict__
blockCenter
,
const
real4
*
__restrict__
blockBoundingBox
,
real4
*
__restrict__
sortedBlockCenter
,
const
real4
*
__restrict__
blockBoundingBox
,
real4
*
__restrict__
sortedBlockCenter
,
real4
*
__restrict__
sortedBlockBoundingBox
,
const
real4
*
__restrict__
posq
,
const
real4
*
__restrict__
oldPositions
,
real4
*
__restrict__
sortedBlockBoundingBox
,
const
real4
*
__restrict__
posq
,
const
real4
*
__restrict__
oldPositions
,
unsigned
int
*
__restrict__
interactionCount
,
int
*
__restrict__
rebuildNeighborList
,
bool
forceRebuild
)
{
unsigned
int
*
__restrict__
interactionCount
,
unsigned
int
*
__restrict__
singlePairCount
,
int
*
__restrict__
rebuildNeighborList
,
bool
forceRebuild
)
{
for
(
int
i
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
i
<
NUM_BLOCKS
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
for
(
int
i
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
i
<
NUM_BLOCKS
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
index
=
(
int
)
sortedBlock
[
i
].
y
;
int
index
=
(
int
)
sortedBlock
[
i
].
y
;
sortedBlockCenter
[
i
]
=
blockCenter
[
index
];
sortedBlockCenter
[
i
]
=
blockCenter
[
index
];
...
@@ -71,9 +71,96 @@ extern "C" __global__ void sortBoxData(const real2* __restrict__ sortedBlock, co
...
@@ -71,9 +71,96 @@ extern "C" __global__ void sortBoxData(const real2* __restrict__ sortedBlock, co
if
(
rebuild
)
{
if
(
rebuild
)
{
rebuildNeighborList
[
0
]
=
1
;
rebuildNeighborList
[
0
]
=
1
;
interactionCount
[
0
]
=
0
;
interactionCount
[
0
]
=
0
;
singlePairCount
[
0
]
=
0
;
}
}
}
}
__device__
int
sortBlockAtoms
(
int
x
,
int
*
atoms
,
int
*
flags
,
int
length
,
unsigned
int
maxSinglePairs
,
unsigned
int
*
singlePairCount
,
int2
*
singlePairs
,
int
*
sumBuffer
,
volatile
int
&
pairStartIndex
)
{
const
int
indexInWarp
=
threadIdx
.
x
%
32
;
const
int
maxBitsForPairs
=
2
;
int
sum
=
0
;
for
(
int
i
=
indexInWarp
;
i
<
length
;
i
+=
32
)
{
int
count
=
__popc
(
flags
[
i
]);
sum
+=
(
count
<=
maxBitsForPairs
?
count
:
0
);
}
sumBuffer
[
indexInWarp
]
=
sum
;
for
(
int
step
=
1
;
step
<
32
;
step
*=
2
)
{
int
add
=
(
indexInWarp
>=
step
?
sumBuffer
[
indexInWarp
-
step
]
:
0
);
sumBuffer
[
indexInWarp
]
+=
add
;
}
int
pairsToStore
=
sumBuffer
[
31
];
if
(
indexInWarp
==
0
)
pairStartIndex
=
atomicAdd
(
singlePairCount
,
pairsToStore
);
int
pairIndex
=
pairStartIndex
+
(
indexInWarp
>
0
?
sumBuffer
[
indexInWarp
-
1
]
:
0
);
for
(
int
i
=
indexInWarp
;
i
<
length
;
i
+=
32
)
{
int
count
=
__popc
(
flags
[
i
]);
if
(
count
<=
maxBitsForPairs
&&
pairIndex
+
count
<
maxSinglePairs
)
{
int
f
=
flags
[
i
];
while
(
f
!=
0
)
{
singlePairs
[
pairIndex
]
=
make_int2
(
atoms
[
i
],
x
*
TILE_SIZE
+
__ffs
(
f
)
-
1
);
f
&=
f
-
1
;
pairIndex
++
;
}
}
}
// sum = 0;
// for (int i = indexInWarp; i < length; i += 32) {
// int count = __popc(flags[i]);
// sum += (count <= maxBitsForPairs ? 1 : 0);
// }
// sumBuffer[indexInWarp] = sum;
// for (int step = 1; step < 32; step *= 2) {
// int add = (indexInWarp >= step ? sumBuffer[indexInWarp-step] : 0);
// sumBuffer[indexInWarp] += add;
// }
//
//
// for (unsigned int k = 2; k < 2*length; k *= 2) {
// for (unsigned int j = k/2; j > 0; j /= 2) {
// for (unsigned int i = indexInWarp; i < length; i += 32) {
// int ixj = i^j;
// if (ixj > i && ixj < length) {
// int key1 = __popc(flags[i]);
// int key2 = __popc(flags[ixj]);
// bool ascending = ((i&k) == 0);
// for (unsigned int mask = k*2; mask < 2*length; mask *= 2)
// ascending = ((i&mask) == 0 ? !ascending : ascending);
// int lowKey = (ascending ? key1 : key2);
// int highKey = (ascending ? key2 : key1);
// if (lowKey < highKey) {
// int tempAtom = atoms[i];
// int tempFlags = flags[i];
// atoms[i] = atoms[ixj];
// flags[i] = flags[ixj];
// atoms[ixj] = tempAtom;
// flags[ixj] = tempFlags;
// }
// }
// }
// }
// }
// return length-sumBuffer[31];
const
int
warpMask
=
(
1
<<
indexInWarp
)
-
1
;
int
numCompacted
=
0
;
for
(
int
i
=
indexInWarp
;
i
<
BUFFER_SIZE
;
i
+=
32
)
{
int
atom
=
atoms
[
i
];
int
flag
=
flags
[
i
];
bool
include
=
(
i
<
length
&&
__popc
(
flags
[
i
])
>
maxBitsForPairs
);
int
includeFlags
=
__ballot
(
include
);
if
(
include
)
{
int
index
=
numCompacted
+
__popc
(
includeFlags
&
warpMask
);
atoms
[
index
]
=
atom
;
flags
[
index
]
=
flag
;
}
numCompacted
+=
__popc
(
includeFlags
);
}
return
numCompacted
;
}
/**
/**
* Compare the bounding boxes for each pair of atom blocks (comprised of 32 atoms each), forming a tile. If the two
* Compare the bounding boxes for each pair of atom blocks (comprised of 32 atoms each), forming a tile. If the two
* atom blocks are sufficiently far apart, mark them as non-interacting. There are two stages in the algorithm.
* atom blocks are sufficiently far apart, mark them as non-interacting. There are two stages in the algorithm.
...
@@ -124,8 +211,9 @@ extern "C" __global__ void sortBoxData(const real2* __restrict__ sortedBlock, co
...
@@ -124,8 +211,9 @@ extern "C" __global__ void sortBoxData(const real2* __restrict__ sortedBlock, co
*
*
*/
*/
extern
"C"
__global__
void
findBlocksWithInteractions
(
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
extern
"C"
__global__
void
findBlocksWithInteractions
(
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
unsigned
int
*
__restrict__
interactionCount
,
int
*
__restrict__
interactingTiles
,
unsigned
int
*
__restrict__
interactingAtoms
,
const
real4
*
__restrict__
posq
,
unsigned
int
*
__restrict__
interactionCount
,
int
*
__restrict__
interactingTiles
,
unsigned
int
*
__restrict__
interactingAtoms
,
unsigned
int
maxTiles
,
unsigned
int
startBlockIndex
,
unsigned
int
numBlocks
,
real2
*
__restrict__
sortedBlocks
,
const
real4
*
__restrict__
sortedBlockCenter
,
unsigned
int
*
__restrict__
singlePairCount
,
int2
*
__restrict__
singlePairs
,
const
real4
*
__restrict__
posq
,
unsigned
int
maxTiles
,
unsigned
int
maxSinglePairs
,
unsigned
int
startBlockIndex
,
unsigned
int
numBlocks
,
real2
*
__restrict__
sortedBlocks
,
const
real4
*
__restrict__
sortedBlockCenter
,
const
real4
*
__restrict__
sortedBlockBoundingBox
,
const
unsigned
int
*
__restrict__
exclusionIndices
,
const
unsigned
int
*
__restrict__
exclusionRowIndices
,
const
real4
*
__restrict__
sortedBlockBoundingBox
,
const
unsigned
int
*
__restrict__
exclusionIndices
,
const
unsigned
int
*
__restrict__
exclusionRowIndices
,
real4
*
__restrict__
oldPositions
,
const
int
*
__restrict__
rebuildNeighborList
)
{
real4
*
__restrict__
oldPositions
,
const
int
*
__restrict__
rebuildNeighborList
)
{
...
@@ -138,12 +226,17 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
...
@@ -138,12 +226,17 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
const
int
warpIndex
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
32
;
const
int
warpIndex
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
32
;
const
int
warpMask
=
(
1
<<
indexInWarp
)
-
1
;
const
int
warpMask
=
(
1
<<
indexInWarp
)
-
1
;
__shared__
int
workgroupBuffer
[
BUFFER_SIZE
*
(
GROUP_SIZE
/
32
)];
__shared__
int
workgroupBuffer
[
BUFFER_SIZE
*
(
GROUP_SIZE
/
32
)];
__shared__
int
workgroupFlagsBuffer
[
BUFFER_SIZE
*
(
GROUP_SIZE
/
32
)];
__shared__
int
warpExclusions
[
MAX_EXCLUSIONS
*
(
GROUP_SIZE
/
32
)];
__shared__
int
warpExclusions
[
MAX_EXCLUSIONS
*
(
GROUP_SIZE
/
32
)];
__shared__
real3
posBuffer
[
GROUP_SIZE
];
__shared__
real3
posBuffer
[
GROUP_SIZE
];
__shared__
volatile
int
workgroupTileIndex
[
GROUP_SIZE
/
32
];
__shared__
volatile
int
workgroupTileIndex
[
GROUP_SIZE
/
32
];
__shared__
int
sumBuffer
[
GROUP_SIZE
];
__shared__
int
worksgroupPairStartIndex
[
GROUP_SIZE
/
32
];
int
*
buffer
=
workgroupBuffer
+
BUFFER_SIZE
*
(
warpStart
/
32
);
int
*
buffer
=
workgroupBuffer
+
BUFFER_SIZE
*
(
warpStart
/
32
);
int
*
flagsBuffer
=
workgroupFlagsBuffer
+
BUFFER_SIZE
*
(
warpStart
/
32
);
int
*
exclusionsForX
=
warpExclusions
+
MAX_EXCLUSIONS
*
(
warpStart
/
32
);
int
*
exclusionsForX
=
warpExclusions
+
MAX_EXCLUSIONS
*
(
warpStart
/
32
);
volatile
int
&
tileStartIndex
=
workgroupTileIndex
[
warpStart
/
32
];
volatile
int
&
tileStartIndex
=
workgroupTileIndex
[
warpStart
/
32
];
volatile
int
&
pairStartIndex
=
worksgroupPairStartIndex
[
warpStart
/
32
];
// Loop over blocks.
// Loop over blocks.
...
@@ -227,7 +320,7 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
...
@@ -227,7 +320,7 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
APPLY_PERIODIC_TO_DELTA
(
atomDelta
)
APPLY_PERIODIC_TO_DELTA
(
atomDelta
)
#endif
#endif
int
atomFlags
=
ballot
(
atomDelta
.
x
*
atomDelta
.
x
+
atomDelta
.
y
*
atomDelta
.
y
+
atomDelta
.
z
*
atomDelta
.
z
<
(
PADDED_CUTOFF
+
blockCenterY
.
w
)
*
(
PADDED_CUTOFF
+
blockCenterY
.
w
));
int
atomFlags
=
ballot
(
atomDelta
.
x
*
atomDelta
.
x
+
atomDelta
.
y
*
atomDelta
.
y
+
atomDelta
.
z
*
atomDelta
.
z
<
(
PADDED_CUTOFF
+
blockCenterY
.
w
)
*
(
PADDED_CUTOFF
+
blockCenterY
.
w
));
bool
interacts
=
false
;
int
interacts
=
0
;
if
(
atom2
<
NUM_ATOMS
&&
atomFlags
!=
0
)
{
if
(
atom2
<
NUM_ATOMS
&&
atomFlags
!=
0
)
{
int
first
=
__ffs
(
atomFlags
)
-
1
;
int
first
=
__ffs
(
atomFlags
)
-
1
;
int
last
=
32
-
__clz
(
atomFlags
);
int
last
=
32
-
__clz
(
atomFlags
);
...
@@ -236,14 +329,14 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
...
@@ -236,14 +329,14 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
for
(
int
j
=
first
;
j
<
last
;
j
++
)
{
for
(
int
j
=
first
;
j
<
last
;
j
++
)
{
real3
delta
=
pos2
-
posBuffer
[
warpStart
+
j
];
real3
delta
=
pos2
-
posBuffer
[
warpStart
+
j
];
APPLY_PERIODIC_TO_DELTA
(
delta
)
APPLY_PERIODIC_TO_DELTA
(
delta
)
interacts
|=
(
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
<
PADDED_CUTOFF_SQUARED
);
interacts
|=
(
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
<
PADDED_CUTOFF_SQUARED
?
1
<<
j
:
0
);
}
}
}
}
else
{
else
{
#endif
#endif
for
(
int
j
=
first
;
j
<
last
;
j
++
)
{
for
(
int
j
=
first
;
j
<
last
;
j
++
)
{
real3
delta
=
pos2
-
posBuffer
[
warpStart
+
j
];
real3
delta
=
pos2
-
posBuffer
[
warpStart
+
j
];
interacts
|=
(
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
<
PADDED_CUTOFF_SQUARED
);
interacts
|=
(
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
<
PADDED_CUTOFF_SQUARED
?
1
<<
j
:
0
);
}
}
#ifdef USE_PERIODIC
#ifdef USE_PERIODIC
}
}
...
@@ -253,12 +346,16 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
...
@@ -253,12 +346,16 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
// Add any interacting atoms to the buffer.
// Add any interacting atoms to the buffer.
int
includeAtomFlags
=
__ballot
(
interacts
);
int
includeAtomFlags
=
__ballot
(
interacts
);
if
(
interacts
)
if
(
interacts
)
{
buffer
[
neighborsInBuffer
+
__popc
(
includeAtomFlags
&
warpMask
)]
=
atom2
;
int
index
=
neighborsInBuffer
+
__popc
(
includeAtomFlags
&
warpMask
);
buffer
[
index
]
=
atom2
;
flagsBuffer
[
index
]
=
interacts
;
}
neighborsInBuffer
+=
__popc
(
includeAtomFlags
);
neighborsInBuffer
+=
__popc
(
includeAtomFlags
);
if
(
neighborsInBuffer
>
BUFFER_SIZE
-
TILE_SIZE
)
{
if
(
neighborsInBuffer
>
BUFFER_SIZE
-
TILE_SIZE
)
{
// Store the new tiles to memory.
// Store the new tiles to memory.
neighborsInBuffer
=
sortBlockAtoms
(
x
,
buffer
,
flagsBuffer
,
neighborsInBuffer
,
maxSinglePairs
,
singlePairCount
,
singlePairs
,
sumBuffer
+
warpStart
,
pairStartIndex
);
int
tilesToStore
=
neighborsInBuffer
/
TILE_SIZE
;
int
tilesToStore
=
neighborsInBuffer
/
TILE_SIZE
;
if
(
indexInWarp
==
0
)
if
(
indexInWarp
==
0
)
tileStartIndex
=
atomicAdd
(
interactionCount
,
tilesToStore
);
tileStartIndex
=
atomicAdd
(
interactionCount
,
tilesToStore
);
...
@@ -277,6 +374,8 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
...
@@ -277,6 +374,8 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
// If we have a partially filled buffer, store it to memory.
// If we have a partially filled buffer, store it to memory.
if
(
neighborsInBuffer
>
32
)
neighborsInBuffer
=
sortBlockAtoms
(
x
,
buffer
,
flagsBuffer
,
neighborsInBuffer
,
maxSinglePairs
,
singlePairCount
,
singlePairs
,
sumBuffer
+
warpStart
,
pairStartIndex
);
if
(
neighborsInBuffer
>
0
)
{
if
(
neighborsInBuffer
>
0
)
{
int
tilesToStore
=
(
neighborsInBuffer
+
TILE_SIZE
-
1
)
/
TILE_SIZE
;
int
tilesToStore
=
(
neighborsInBuffer
+
TILE_SIZE
-
1
)
/
TILE_SIZE
;
if
(
indexInWarp
==
0
)
if
(
indexInWarp
==
0
)
...
...
platforms/cuda/src/kernels/nonbonded.cu
View file @
5fa4345f
...
@@ -105,7 +105,8 @@ extern "C" __global__ void computeNonbonded(
...
@@ -105,7 +105,8 @@ extern "C" __global__ void computeNonbonded(
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
,
const
int
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
,
const
int
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
unsigned
int
maxTiles
,
const
real4
*
__restrict__
blockCenter
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
unsigned
int
maxTiles
,
const
real4
*
__restrict__
blockCenter
,
const
real4
*
__restrict__
blockSize
,
const
unsigned
int
*
__restrict__
interactingAtoms
const
real4
*
__restrict__
blockSize
,
const
unsigned
int
*
__restrict__
interactingAtoms
,
unsigned
int
maxSinglePairs
,
const
int
*
__restrict__
singlePairCount
,
const
int2
*
__restrict__
singlePairs
#endif
#endif
PARAMETER_ARGUMENTS
)
{
PARAMETER_ARGUMENTS
)
{
const
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
const
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
...
@@ -588,6 +589,59 @@ extern "C" __global__ void computeNonbonded(
...
@@ -588,6 +589,59 @@ extern "C" __global__ void computeNonbonded(
}
}
pos
++
;
pos
++
;
}
}
// Third loop: single pairs that aren't part of a tile.
#if USE_CUTOFF
const
unsigned
int
numPairs
=
singlePairCount
[
0
];
if
(
numPairs
>
maxSinglePairs
)
return
;
// There wasn't enough memory for the neighbor list.
for
(
int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
i
<
numPairs
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
int2
pair
=
singlePairs
[
i
];
int
atom1
=
pair
.
x
;
int
atom2
=
pair
.
y
;
real4
posq1
=
posq
[
atom1
];
real4
posq2
=
posq
[
atom2
];
LOAD_ATOM1_PARAMETERS
int
j
=
atom2
;
atom2
=
threadIdx
.
x
;
DECLARE_LOCAL_PARAMETERS
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
LOAD_ATOM2_PARAMETERS
atom2
=
pair
.
y
;
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
delta
)
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
real
invR
=
RSQRT
(
r2
);
real
r
=
r2
*
invR
;
#ifdef USE_SYMMETRIC
real
dEdR
=
0.0
f
;
#else
real3
dEdR1
=
make_real3
(
0
);
real3
dEdR2
=
make_real3
(
0
);
#endif
bool
hasExclusions
=
false
;
bool
isExcluded
=
false
;
real
tempEnergy
=
0.0
f
;
const
real
interactionScale
=
1.0
f
;
COMPUTE_INTERACTION
energy
+=
tempEnergy
;
#ifdef INCLUDE_FORCES
#ifdef USE_SYMMETRIC
real3
dEdR1
=
delta
*
dEdR
;
real3
dEdR2
=
-
dEdR1
;
#endif
atomicAdd
(
&
forceBuffers
[
atom1
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
-
dEdR1
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom1
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
-
dEdR1
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom1
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
-
dEdR1
.
z
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom2
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
-
dEdR2
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom2
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
-
dEdR2
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom2
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
-
dEdR2
.
z
*
0x100000000
)));
#endif
}
#endif
#ifdef INCLUDE_ENERGY
#ifdef INCLUDE_ENERGY
energyBuffer
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
energy
;
energyBuffer
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
energy
;
#endif
#endif
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment