Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
5fa4345f
"vscode:/vscode.git/clone" did not exist on "dbb4827e6569c0c1b2eb42ad9bd40e17b400a40a"
Commit
5fa4345f
authored
Sep 14, 2016
by
Peter Eastman
Browse files
Beginning of implementing fine grained pair list
parent
10b51d25
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
211 additions
and
29 deletions
+211
-29
platforms/cuda/include/CudaNonbondedUtilities.h
platforms/cuda/include/CudaNonbondedUtilities.h
+3
-1
platforms/cuda/src/CudaNonbondedUtilities.cpp
platforms/cuda/src/CudaNonbondedUtilities.cpp
+46
-19
platforms/cuda/src/kernels/findInteractingBlocks.cu
platforms/cuda/src/kernels/findInteractingBlocks.cu
+107
-8
platforms/cuda/src/kernels/nonbonded.cu
platforms/cuda/src/kernels/nonbonded.cu
+55
-1
No files found.
platforms/cuda/include/CudaNonbondedUtilities.h
View file @
5fa4345f
...
@@ -270,6 +270,8 @@ private:
...
@@ -270,6 +270,8 @@ private:
CudaArray
*
interactingTiles
;
CudaArray
*
interactingTiles
;
CudaArray
*
interactingAtoms
;
CudaArray
*
interactingAtoms
;
CudaArray
*
interactionCount
;
CudaArray
*
interactionCount
;
CudaArray
*
singlePairs
;
CudaArray
*
singlePairCount
;
CudaArray
*
blockCenter
;
CudaArray
*
blockCenter
;
CudaArray
*
blockBoundingBox
;
CudaArray
*
blockBoundingBox
;
CudaArray
*
sortedBlocks
;
CudaArray
*
sortedBlocks
;
...
@@ -289,7 +291,7 @@ private:
...
@@ -289,7 +291,7 @@ private:
std
::
map
<
int
,
std
::
string
>
groupKernelSource
;
std
::
map
<
int
,
std
::
string
>
groupKernelSource
;
double
lastCutoff
;
double
lastCutoff
;
bool
useCutoff
,
usePeriodic
,
anyExclusions
,
usePadding
,
forceRebuildNeighborList
;
bool
useCutoff
,
usePeriodic
,
anyExclusions
,
usePadding
,
forceRebuildNeighborList
;
int
startTileIndex
,
numTiles
,
startBlockIndex
,
numBlocks
,
maxTiles
,
maxExclusions
,
numForceThreadBlocks
,
forceThreadBlockSize
,
numAtoms
,
groupFlags
;
int
startTileIndex
,
numTiles
,
startBlockIndex
,
numBlocks
,
maxTiles
,
maxSinglePairs
,
maxExclusions
,
numForceThreadBlocks
,
forceThreadBlockSize
,
numAtoms
,
groupFlags
;
};
};
/**
/**
...
...
platforms/cuda/src/CudaNonbondedUtilities.cpp
View file @
5fa4345f
...
@@ -64,7 +64,7 @@ private:
...
@@ -64,7 +64,7 @@ private:
CudaNonbondedUtilities
::
CudaNonbondedUtilities
(
CudaContext
&
context
)
:
context
(
context
),
useCutoff
(
false
),
usePeriodic
(
false
),
anyExclusions
(
false
),
usePadding
(
true
),
CudaNonbondedUtilities
::
CudaNonbondedUtilities
(
CudaContext
&
context
)
:
context
(
context
),
useCutoff
(
false
),
usePeriodic
(
false
),
anyExclusions
(
false
),
usePadding
(
true
),
exclusionIndices
(
NULL
),
exclusionRowIndices
(
NULL
),
exclusionTiles
(
NULL
),
exclusions
(
NULL
),
interactingTiles
(
NULL
),
interactingAtoms
(
NULL
),
exclusionIndices
(
NULL
),
exclusionRowIndices
(
NULL
),
exclusionTiles
(
NULL
),
exclusions
(
NULL
),
interactingTiles
(
NULL
),
interactingAtoms
(
NULL
),
interactionCount
(
NULL
),
blockCenter
(
NULL
),
blockBoundingBox
(
NULL
),
sortedBlocks
(
NULL
),
sortedBlockCenter
(
NULL
),
sortedBlockBoundingBox
(
NULL
),
interactionCount
(
NULL
),
singlePairs
(
NULL
),
singlePairCount
(
NULL
),
blockCenter
(
NULL
),
blockBoundingBox
(
NULL
),
sortedBlocks
(
NULL
),
sortedBlockCenter
(
NULL
),
sortedBlockBoundingBox
(
NULL
),
oldPositions
(
NULL
),
rebuildNeighborList
(
NULL
),
blockSorter
(
NULL
),
pinnedCountBuffer
(
NULL
),
forceRebuildNeighborList
(
true
),
lastCutoff
(
0.0
),
groupFlags
(
0
)
{
oldPositions
(
NULL
),
rebuildNeighborList
(
NULL
),
blockSorter
(
NULL
),
pinnedCountBuffer
(
NULL
),
forceRebuildNeighborList
(
true
),
lastCutoff
(
0.0
),
groupFlags
(
0
)
{
// Decide how many thread blocks to use.
// Decide how many thread blocks to use.
...
@@ -72,7 +72,7 @@ CudaNonbondedUtilities::CudaNonbondedUtilities(CudaContext& context) : context(c
...
@@ -72,7 +72,7 @@ CudaNonbondedUtilities::CudaNonbondedUtilities(CudaContext& context) : context(c
int
multiprocessors
;
int
multiprocessors
;
CHECK_RESULT
(
cuDeviceGetAttribute
(
&
multiprocessors
,
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT
,
context
.
getDevice
()));
CHECK_RESULT
(
cuDeviceGetAttribute
(
&
multiprocessors
,
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT
,
context
.
getDevice
()));
CHECK_RESULT
(
cuEventCreate
(
&
downloadCountEvent
,
0
));
CHECK_RESULT
(
cuEventCreate
(
&
downloadCountEvent
,
0
));
CHECK_RESULT
(
cuMemHostAlloc
((
void
**
)
&
pinnedCountBuffer
,
sizeof
(
int
),
CU_MEMHOSTALLOC_PORTABLE
));
CHECK_RESULT
(
cuMemHostAlloc
((
void
**
)
&
pinnedCountBuffer
,
2
*
sizeof
(
int
),
CU_MEMHOSTALLOC_PORTABLE
));
numForceThreadBlocks
=
4
*
multiprocessors
;
numForceThreadBlocks
=
4
*
multiprocessors
;
forceThreadBlockSize
=
(
context
.
getComputeCapability
()
<
2.0
?
128
:
256
);
forceThreadBlockSize
=
(
context
.
getComputeCapability
()
<
2.0
?
128
:
256
);
}
}
...
@@ -92,6 +92,10 @@ CudaNonbondedUtilities::~CudaNonbondedUtilities() {
...
@@ -92,6 +92,10 @@ CudaNonbondedUtilities::~CudaNonbondedUtilities() {
delete
interactingAtoms
;
delete
interactingAtoms
;
if
(
interactionCount
!=
NULL
)
if
(
interactionCount
!=
NULL
)
delete
interactionCount
;
delete
interactionCount
;
if
(
singlePairs
!=
NULL
)
delete
singlePairs
;
if
(
singlePairCount
!=
NULL
)
delete
singlePairCount
;
if
(
blockCenter
!=
NULL
)
if
(
blockCenter
!=
NULL
)
delete
blockCenter
;
delete
blockCenter
;
if
(
blockBoundingBox
!=
NULL
)
if
(
blockBoundingBox
!=
NULL
)
...
@@ -279,9 +283,12 @@ void CudaNonbondedUtilities::initialize(const System& system) {
...
@@ -279,9 +283,12 @@ void CudaNonbondedUtilities::initialize(const System& system) {
maxTiles
=
numTiles
;
maxTiles
=
numTiles
;
if
(
maxTiles
<
1
)
if
(
maxTiles
<
1
)
maxTiles
=
1
;
maxTiles
=
1
;
maxSinglePairs
=
5
*
numAtoms
;
interactingTiles
=
CudaArray
::
create
<
int
>
(
context
,
maxTiles
,
"interactingTiles"
);
interactingTiles
=
CudaArray
::
create
<
int
>
(
context
,
maxTiles
,
"interactingTiles"
);
interactingAtoms
=
CudaArray
::
create
<
int
>
(
context
,
CudaContext
::
TileSize
*
maxTiles
,
"interactingAtoms"
);
interactingAtoms
=
CudaArray
::
create
<
int
>
(
context
,
CudaContext
::
TileSize
*
maxTiles
,
"interactingAtoms"
);
interactionCount
=
CudaArray
::
create
<
unsigned
int
>
(
context
,
1
,
"interactionCount"
);
interactionCount
=
CudaArray
::
create
<
unsigned
int
>
(
context
,
1
,
"interactionCount"
);
singlePairs
=
CudaArray
::
create
<
int2
>
(
context
,
maxSinglePairs
,
"singlePairs"
);
singlePairCount
=
CudaArray
::
create
<
unsigned
int
>
(
context
,
1
,
"singlePairCount"
);
int
elementSize
=
(
context
.
getUseDoublePrecision
()
?
sizeof
(
double
)
:
sizeof
(
float
));
int
elementSize
=
(
context
.
getUseDoublePrecision
()
?
sizeof
(
double
)
:
sizeof
(
float
));
blockCenter
=
new
CudaArray
(
context
,
numAtomBlocks
,
4
*
elementSize
,
"blockCenter"
);
blockCenter
=
new
CudaArray
(
context
,
numAtomBlocks
,
4
*
elementSize
,
"blockCenter"
);
blockBoundingBox
=
new
CudaArray
(
context
,
numAtomBlocks
,
4
*
elementSize
,
"blockBoundingBox"
);
blockBoundingBox
=
new
CudaArray
(
context
,
numAtomBlocks
,
4
*
elementSize
,
"blockBoundingBox"
);
...
@@ -293,6 +300,7 @@ void CudaNonbondedUtilities::initialize(const System& system) {
...
@@ -293,6 +300,7 @@ void CudaNonbondedUtilities::initialize(const System& system) {
blockSorter
=
new
CudaSort
(
context
,
new
BlockSortTrait
(
context
.
getUseDoublePrecision
()),
numAtomBlocks
);
blockSorter
=
new
CudaSort
(
context
,
new
BlockSortTrait
(
context
.
getUseDoublePrecision
()),
numAtomBlocks
);
vector
<
unsigned
int
>
count
(
1
,
0
);
vector
<
unsigned
int
>
count
(
1
,
0
);
interactionCount
->
upload
(
count
);
interactionCount
->
upload
(
count
);
singlePairCount
->
upload
(
count
);
}
}
// Record arguments for kernels.
// Record arguments for kernels.
...
@@ -316,6 +324,9 @@ void CudaNonbondedUtilities::initialize(const System& system) {
...
@@ -316,6 +324,9 @@ void CudaNonbondedUtilities::initialize(const System& system) {
forceArgs
.
push_back
(
&
blockCenter
->
getDevicePointer
());
forceArgs
.
push_back
(
&
blockCenter
->
getDevicePointer
());
forceArgs
.
push_back
(
&
blockBoundingBox
->
getDevicePointer
());
forceArgs
.
push_back
(
&
blockBoundingBox
->
getDevicePointer
());
forceArgs
.
push_back
(
&
interactingAtoms
->
getDevicePointer
());
forceArgs
.
push_back
(
&
interactingAtoms
->
getDevicePointer
());
forceArgs
.
push_back
(
&
maxSinglePairs
);
forceArgs
.
push_back
(
&
singlePairCount
->
getDevicePointer
());
forceArgs
.
push_back
(
&
singlePairs
->
getDevicePointer
());
}
}
for
(
int
i
=
0
;
i
<
(
int
)
parameters
.
size
();
i
++
)
for
(
int
i
=
0
;
i
<
(
int
)
parameters
.
size
();
i
++
)
forceArgs
.
push_back
(
&
parameters
[
i
].
getMemory
());
forceArgs
.
push_back
(
&
parameters
[
i
].
getMemory
());
...
@@ -343,6 +354,7 @@ void CudaNonbondedUtilities::initialize(const System& system) {
...
@@ -343,6 +354,7 @@ void CudaNonbondedUtilities::initialize(const System& system) {
sortBoxDataArgs
.
push_back
(
&
context
.
getPosq
().
getDevicePointer
());
sortBoxDataArgs
.
push_back
(
&
context
.
getPosq
().
getDevicePointer
());
sortBoxDataArgs
.
push_back
(
&
oldPositions
->
getDevicePointer
());
sortBoxDataArgs
.
push_back
(
&
oldPositions
->
getDevicePointer
());
sortBoxDataArgs
.
push_back
(
&
interactionCount
->
getDevicePointer
());
sortBoxDataArgs
.
push_back
(
&
interactionCount
->
getDevicePointer
());
sortBoxDataArgs
.
push_back
(
&
singlePairCount
->
getDevicePointer
());
sortBoxDataArgs
.
push_back
(
&
rebuildNeighborList
->
getDevicePointer
());
sortBoxDataArgs
.
push_back
(
&
rebuildNeighborList
->
getDevicePointer
());
sortBoxDataArgs
.
push_back
(
&
forceRebuildNeighborList
);
sortBoxDataArgs
.
push_back
(
&
forceRebuildNeighborList
);
findInteractingBlocksArgs
.
push_back
(
context
.
getPeriodicBoxSizePointer
());
findInteractingBlocksArgs
.
push_back
(
context
.
getPeriodicBoxSizePointer
());
...
@@ -353,8 +365,11 @@ void CudaNonbondedUtilities::initialize(const System& system) {
...
@@ -353,8 +365,11 @@ void CudaNonbondedUtilities::initialize(const System& system) {
findInteractingBlocksArgs
.
push_back
(
&
interactionCount
->
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
interactionCount
->
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
interactingTiles
->
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
interactingTiles
->
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
interactingAtoms
->
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
interactingAtoms
->
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
singlePairCount
->
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
singlePairs
->
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
context
.
getPosq
().
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
context
.
getPosq
().
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
maxTiles
);
findInteractingBlocksArgs
.
push_back
(
&
maxTiles
);
findInteractingBlocksArgs
.
push_back
(
&
maxSinglePairs
);
findInteractingBlocksArgs
.
push_back
(
&
startBlockIndex
);
findInteractingBlocksArgs
.
push_back
(
&
startBlockIndex
);
findInteractingBlocksArgs
.
push_back
(
&
numBlocks
);
findInteractingBlocksArgs
.
push_back
(
&
numBlocks
);
findInteractingBlocksArgs
.
push_back
(
&
sortedBlocks
->
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
sortedBlocks
->
getDevicePointer
());
...
@@ -402,6 +417,7 @@ void CudaNonbondedUtilities::prepareInteractions(int forceGroups) {
...
@@ -402,6 +417,7 @@ void CudaNonbondedUtilities::prepareInteractions(int forceGroups) {
forceRebuildNeighborList
=
false
;
forceRebuildNeighborList
=
false
;
lastCutoff
=
kernels
.
cutoffDistance
;
lastCutoff
=
kernels
.
cutoffDistance
;
interactionCount
->
download
(
pinnedCountBuffer
,
false
);
interactionCount
->
download
(
pinnedCountBuffer
,
false
);
singlePairCount
->
download
(
pinnedCountBuffer
+
1
,
false
);
cuEventRecord
(
downloadCountEvent
,
context
.
getCurrentStream
());
cuEventRecord
(
downloadCountEvent
,
context
.
getCurrentStream
());
}
}
...
@@ -424,12 +440,13 @@ void CudaNonbondedUtilities::computeInteractions(int forceGroups, bool includeFo
...
@@ -424,12 +440,13 @@ void CudaNonbondedUtilities::computeInteractions(int forceGroups, bool includeFo
bool
CudaNonbondedUtilities
::
updateNeighborListSize
()
{
bool
CudaNonbondedUtilities
::
updateNeighborListSize
()
{
if
(
!
useCutoff
)
if
(
!
useCutoff
)
return
false
;
return
false
;
if
(
pinnedCountBuffer
[
0
]
<=
(
unsigned
int
)
maxTile
s
)
if
(
pinnedCountBuffer
[
0
]
<=
maxTiles
&&
pinnedCountBuffer
[
1
]
<=
maxSinglePair
s
)
return
false
;
return
false
;
// The most recent timestep had too many interactions to fit in the arrays. Make the arrays bigger to prevent
// The most recent timestep had too many interactions to fit in the arrays. Make the arrays bigger to prevent
// this from happening in the future.
// this from happening in the future.
if
(
pinnedCountBuffer
[
0
]
>
maxTiles
)
{
maxTiles
=
(
int
)
(
1.2
*
pinnedCountBuffer
[
0
]);
maxTiles
=
(
int
)
(
1.2
*
pinnedCountBuffer
[
0
]);
int
totalTiles
=
context
.
getNumAtomBlocks
()
*
(
context
.
getNumAtomBlocks
()
+
1
)
/
2
;
int
totalTiles
=
context
.
getNumAtomBlocks
()
*
(
context
.
getNumAtomBlocks
()
+
1
)
/
2
;
if
(
maxTiles
>
totalTiles
)
if
(
maxTiles
>
totalTiles
)
...
@@ -446,6 +463,16 @@ bool CudaNonbondedUtilities::updateNeighborListSize() {
...
@@ -446,6 +463,16 @@ bool CudaNonbondedUtilities::updateNeighborListSize() {
if
(
forceArgs
.
size
()
>
0
)
if
(
forceArgs
.
size
()
>
0
)
forceArgs
[
17
]
=
&
interactingAtoms
->
getDevicePointer
();
forceArgs
[
17
]
=
&
interactingAtoms
->
getDevicePointer
();
findInteractingBlocksArgs
[
7
]
=
&
interactingAtoms
->
getDevicePointer
();
findInteractingBlocksArgs
[
7
]
=
&
interactingAtoms
->
getDevicePointer
();
}
if
(
pinnedCountBuffer
[
1
]
>
maxSinglePairs
)
{
maxSinglePairs
=
(
int
)
(
1.2
*
pinnedCountBuffer
[
1
]);
delete
singlePairs
;
singlePairs
=
NULL
;
// Avoid an error in the destructor if the following allocation fails
singlePairs
=
CudaArray
::
create
<
int2
>
(
context
,
maxSinglePairs
,
"singlePairs"
);
if
(
forceArgs
.
size
()
>
0
)
forceArgs
[
20
]
=
&
singlePairs
->
getDevicePointer
();
findInteractingBlocksArgs
[
9
]
=
&
singlePairs
->
getDevicePointer
();
}
forceRebuildNeighborList
=
true
;
forceRebuildNeighborList
=
true
;
context
.
setForcesValid
(
false
);
context
.
setForcesValid
(
false
);
return
true
;
return
true
;
...
...
platforms/cuda/src/kernels/findInteractingBlocks.cu
View file @
5fa4345f
...
@@ -53,7 +53,7 @@ extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize,
...
@@ -53,7 +53,7 @@ extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize,
extern
"C"
__global__
void
sortBoxData
(
const
real2
*
__restrict__
sortedBlock
,
const
real4
*
__restrict__
blockCenter
,
extern
"C"
__global__
void
sortBoxData
(
const
real2
*
__restrict__
sortedBlock
,
const
real4
*
__restrict__
blockCenter
,
const
real4
*
__restrict__
blockBoundingBox
,
real4
*
__restrict__
sortedBlockCenter
,
const
real4
*
__restrict__
blockBoundingBox
,
real4
*
__restrict__
sortedBlockCenter
,
real4
*
__restrict__
sortedBlockBoundingBox
,
const
real4
*
__restrict__
posq
,
const
real4
*
__restrict__
oldPositions
,
real4
*
__restrict__
sortedBlockBoundingBox
,
const
real4
*
__restrict__
posq
,
const
real4
*
__restrict__
oldPositions
,
unsigned
int
*
__restrict__
interactionCount
,
int
*
__restrict__
rebuildNeighborList
,
bool
forceRebuild
)
{
unsigned
int
*
__restrict__
interactionCount
,
unsigned
int
*
__restrict__
singlePairCount
,
int
*
__restrict__
rebuildNeighborList
,
bool
forceRebuild
)
{
for
(
int
i
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
i
<
NUM_BLOCKS
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
for
(
int
i
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
i
<
NUM_BLOCKS
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
index
=
(
int
)
sortedBlock
[
i
].
y
;
int
index
=
(
int
)
sortedBlock
[
i
].
y
;
sortedBlockCenter
[
i
]
=
blockCenter
[
index
];
sortedBlockCenter
[
i
]
=
blockCenter
[
index
];
...
@@ -71,9 +71,96 @@ extern "C" __global__ void sortBoxData(const real2* __restrict__ sortedBlock, co
...
@@ -71,9 +71,96 @@ extern "C" __global__ void sortBoxData(const real2* __restrict__ sortedBlock, co
if
(
rebuild
)
{
if
(
rebuild
)
{
rebuildNeighborList
[
0
]
=
1
;
rebuildNeighborList
[
0
]
=
1
;
interactionCount
[
0
]
=
0
;
interactionCount
[
0
]
=
0
;
singlePairCount
[
0
]
=
0
;
}
}
}
}
/**
 * Split the neighbor entries buffered by one warp into "single pairs" and entries that remain
 * in the buffer, and compact the remaining entries to the front of the buffer.
 *
 * Entry i consists of an atom index (atoms[i]) and a bit mask (flags[i]).  Bit j of the mask
 * refers to atom x*TILE_SIZE+j.  Entries with at most maxBitsForPairs (2) bits set are written
 * to singlePairs as one (atoms[i], x*TILE_SIZE+j) pair per set bit.  Entries with more bits set
 * are kept, moved to the front of atoms/flags in their original order.
 *
 * This must be called by all 32 lanes of a warp together: it uses __ballot() and per-lane slots
 * of sumBuffer.  No explicit synchronization is performed between lanes, so the scan and the
 * broadcast through pairStartIndex assume the lanes execute in lock step.
 * NOTE(review): confirm this assumption holds on every architecture being targeted.
 *
 * @param x                the block index; x*TILE_SIZE is the index of the atom matching bit 0
 * @param atoms            buffer of atom indices, modified in place (the compaction loop reads BUFFER_SIZE entries)
 * @param flags            buffer of bit masks parallel to atoms, modified in place
 * @param length           the number of valid entries in atoms and flags
 * @param maxSinglePairs   the number of elements that fit in singlePairs
 * @param singlePairCount  counter that is atomically increased by the number of pairs this call
 *                         wants to store.  It is increased even when the pairs do not fit, so code
 *                         that reads it can compare it against maxSinglePairs to detect overflow.
 * @param singlePairs      output array the pairs are written to
 * @param sumBuffer        scratch space with one int per lane (32 elements are used)
 * @param pairStartIndex   slot used to pass the value returned by atomicAdd() from lane 0 to the other lanes
 * @return the number of entries left at the front of atoms and flags
 */
__device__ int sortBlockAtoms(int x, int* atoms, int* flags, int length, unsigned int maxSinglePairs, unsigned int* singlePairCount, int2* singlePairs, int* sumBuffer, volatile int& pairStartIndex) {
    const int indexInWarp = threadIdx.x%32;
    const int maxBitsForPairs = 2;

    // Count how many pairs this lane will emit for the entries it owns (i, i+32, i+64, ...).

    int sum = 0;
    for (int i = indexInWarp; i < length; i += 32) {
        int count = __popc(flags[i]);
        sum += (count <= maxBitsForPairs ? count : 0);
    }

    // Inclusive prefix sum over the 32 lanes, so each lane knows where its pairs start.

    sumBuffer[indexInWarp] = sum;
    for (int step = 1; step < 32; step *= 2) {
        int add = (indexInWarp >= step ? sumBuffer[indexInWarp-step] : 0);
        sumBuffer[indexInWarp] += add;
    }

    // Lane 0 reserves space in the global array for the whole warp.

    int pairsToStore = sumBuffer[31];
    if (indexInWarp == 0)
        pairStartIndex = atomicAdd(singlePairCount, pairsToStore);
    int pairIndex = pairStartIndex + (indexInWarp > 0 ? sumBuffer[indexInWarp-1] : 0);

    // Write out the pairs.

    for (int i = indexInWarp; i < length; i += 32) {
        int f = flags[i];
        int count = __popc(f);

        // pairIndex+count is one past the last element that will be written, so it is allowed to
        // equal maxSinglePairs.  Testing with < would silently drop pairs that exactly fill the
        // array, while the counter would still report a list that fits.

        if (count <= maxBitsForPairs && (unsigned int) (pairIndex+count) <= maxSinglePairs) {
            while (f != 0) {
                singlePairs[pairIndex] = make_int2(atoms[i], x*TILE_SIZE+__ffs(f)-1);
                f &= f-1; // Clear the lowest set bit.
                pairIndex++;
            }
        }
    }

    // Disabled alternative, kept as in the original.  It appears to sort the buffer by the number
    // of set bits rather than compacting it.

//    sum = 0;
//    for (int i = indexInWarp; i < length; i += 32) {
//        int count = __popc(flags[i]);
//        sum += (count <= maxBitsForPairs ? 1 : 0);
//    }
//    sumBuffer[indexInWarp] = sum;
//    for (int step = 1; step < 32; step *= 2) {
//        int add = (indexInWarp >= step ? sumBuffer[indexInWarp-step] : 0);
//        sumBuffer[indexInWarp] += add;
//    }
//
//
//    for (unsigned int k = 2; k < 2*length; k *= 2) {
//        for (unsigned int j = k/2; j > 0; j /= 2) {
//            for (unsigned int i = indexInWarp; i < length; i += 32) {
//                int ixj = i^j;
//                if (ixj > i && ixj < length) {
//                    int key1 = __popc(flags[i]);
//                    int key2 = __popc(flags[ixj]);
//                    bool ascending = ((i&k) == 0);
//                    for (unsigned int mask = k*2; mask < 2*length; mask *= 2)
//                        ascending = ((i&mask) == 0 ? !ascending : ascending);
//                    int lowKey = (ascending ? key1 : key2);
//                    int highKey = (ascending ? key2 : key1);
//                    if (lowKey < highKey) {
//                        int tempAtom = atoms[i];
//                        int tempFlags = flags[i];
//                        atoms[i] = atoms[ixj];
//                        flags[i] = flags[ixj];
//                        atoms[ixj] = tempAtom;
//                        flags[ixj] = tempFlags;
//                    }
//                }
//            }
//        }
//    }
//    return length-sumBuffer[31];

    // Compact the entries that were not turned into single pairs.  The loop bound is BUFFER_SIZE
    // rather than length so that every lane performs the same number of iterations and takes
    // part in every __ballot().

    const int warpMask = (1<<indexInWarp)-1;
    int numCompacted = 0;
    for (int i = indexInWarp; i < BUFFER_SIZE; i += 32) {
        int atom = atoms[i];
        int flag = flags[i];
        bool include = (i < length && __popc(flag) > maxBitsForPairs);
        int includeFlags = __ballot(include);
        if (include) {
            // The destination is the number of kept entries that precede this one.
            int index = numCompacted+__popc(includeFlags&warpMask);
            atoms[index] = atom;
            flags[index] = flag;
        }
        numCompacted += __popc(includeFlags);
    }
    return numCompacted;
}
/**
/**
* Compare the bounding boxes for each pair of atom blocks (comprised of 32 atoms each), forming a tile. If the two
* Compare the bounding boxes for each pair of atom blocks (comprised of 32 atoms each), forming a tile. If the two
* atom blocks are sufficiently far apart, mark them as non-interacting. There are two stages in the algorithm.
* atom blocks are sufficiently far apart, mark them as non-interacting. There are two stages in the algorithm.
...
@@ -124,8 +211,9 @@ extern "C" __global__ void sortBoxData(const real2* __restrict__ sortedBlock, co
...
@@ -124,8 +211,9 @@ extern "C" __global__ void sortBoxData(const real2* __restrict__ sortedBlock, co
*
*
*/
*/
extern
"C"
__global__
void
findBlocksWithInteractions
(
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
extern
"C"
__global__
void
findBlocksWithInteractions
(
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
unsigned
int
*
__restrict__
interactionCount
,
int
*
__restrict__
interactingTiles
,
unsigned
int
*
__restrict__
interactingAtoms
,
const
real4
*
__restrict__
posq
,
unsigned
int
*
__restrict__
interactionCount
,
int
*
__restrict__
interactingTiles
,
unsigned
int
*
__restrict__
interactingAtoms
,
unsigned
int
maxTiles
,
unsigned
int
startBlockIndex
,
unsigned
int
numBlocks
,
real2
*
__restrict__
sortedBlocks
,
const
real4
*
__restrict__
sortedBlockCenter
,
unsigned
int
*
__restrict__
singlePairCount
,
int2
*
__restrict__
singlePairs
,
const
real4
*
__restrict__
posq
,
unsigned
int
maxTiles
,
unsigned
int
maxSinglePairs
,
unsigned
int
startBlockIndex
,
unsigned
int
numBlocks
,
real2
*
__restrict__
sortedBlocks
,
const
real4
*
__restrict__
sortedBlockCenter
,
const
real4
*
__restrict__
sortedBlockBoundingBox
,
const
unsigned
int
*
__restrict__
exclusionIndices
,
const
unsigned
int
*
__restrict__
exclusionRowIndices
,
const
real4
*
__restrict__
sortedBlockBoundingBox
,
const
unsigned
int
*
__restrict__
exclusionIndices
,
const
unsigned
int
*
__restrict__
exclusionRowIndices
,
real4
*
__restrict__
oldPositions
,
const
int
*
__restrict__
rebuildNeighborList
)
{
real4
*
__restrict__
oldPositions
,
const
int
*
__restrict__
rebuildNeighborList
)
{
...
@@ -138,12 +226,17 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
...
@@ -138,12 +226,17 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
const
int
warpIndex
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
32
;
const
int
warpIndex
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
32
;
const
int
warpMask
=
(
1
<<
indexInWarp
)
-
1
;
const
int
warpMask
=
(
1
<<
indexInWarp
)
-
1
;
__shared__
int
workgroupBuffer
[
BUFFER_SIZE
*
(
GROUP_SIZE
/
32
)];
__shared__
int
workgroupBuffer
[
BUFFER_SIZE
*
(
GROUP_SIZE
/
32
)];
__shared__
int
workgroupFlagsBuffer
[
BUFFER_SIZE
*
(
GROUP_SIZE
/
32
)];
__shared__
int
warpExclusions
[
MAX_EXCLUSIONS
*
(
GROUP_SIZE
/
32
)];
__shared__
int
warpExclusions
[
MAX_EXCLUSIONS
*
(
GROUP_SIZE
/
32
)];
__shared__
real3
posBuffer
[
GROUP_SIZE
];
__shared__
real3
posBuffer
[
GROUP_SIZE
];
__shared__
volatile
int
workgroupTileIndex
[
GROUP_SIZE
/
32
];
__shared__
volatile
int
workgroupTileIndex
[
GROUP_SIZE
/
32
];
__shared__
int
sumBuffer
[
GROUP_SIZE
];
__shared__
int
worksgroupPairStartIndex
[
GROUP_SIZE
/
32
];
int
*
buffer
=
workgroupBuffer
+
BUFFER_SIZE
*
(
warpStart
/
32
);
int
*
buffer
=
workgroupBuffer
+
BUFFER_SIZE
*
(
warpStart
/
32
);
int
*
flagsBuffer
=
workgroupFlagsBuffer
+
BUFFER_SIZE
*
(
warpStart
/
32
);
int
*
exclusionsForX
=
warpExclusions
+
MAX_EXCLUSIONS
*
(
warpStart
/
32
);
int
*
exclusionsForX
=
warpExclusions
+
MAX_EXCLUSIONS
*
(
warpStart
/
32
);
volatile
int
&
tileStartIndex
=
workgroupTileIndex
[
warpStart
/
32
];
volatile
int
&
tileStartIndex
=
workgroupTileIndex
[
warpStart
/
32
];
volatile
int
&
pairStartIndex
=
worksgroupPairStartIndex
[
warpStart
/
32
];
// Loop over blocks.
// Loop over blocks.
...
@@ -227,7 +320,7 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
...
@@ -227,7 +320,7 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
APPLY_PERIODIC_TO_DELTA
(
atomDelta
)
APPLY_PERIODIC_TO_DELTA
(
atomDelta
)
#endif
#endif
int
atomFlags
=
ballot
(
atomDelta
.
x
*
atomDelta
.
x
+
atomDelta
.
y
*
atomDelta
.
y
+
atomDelta
.
z
*
atomDelta
.
z
<
(
PADDED_CUTOFF
+
blockCenterY
.
w
)
*
(
PADDED_CUTOFF
+
blockCenterY
.
w
));
int
atomFlags
=
ballot
(
atomDelta
.
x
*
atomDelta
.
x
+
atomDelta
.
y
*
atomDelta
.
y
+
atomDelta
.
z
*
atomDelta
.
z
<
(
PADDED_CUTOFF
+
blockCenterY
.
w
)
*
(
PADDED_CUTOFF
+
blockCenterY
.
w
));
bool
interacts
=
false
;
int
interacts
=
0
;
if
(
atom2
<
NUM_ATOMS
&&
atomFlags
!=
0
)
{
if
(
atom2
<
NUM_ATOMS
&&
atomFlags
!=
0
)
{
int
first
=
__ffs
(
atomFlags
)
-
1
;
int
first
=
__ffs
(
atomFlags
)
-
1
;
int
last
=
32
-
__clz
(
atomFlags
);
int
last
=
32
-
__clz
(
atomFlags
);
...
@@ -236,14 +329,14 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
...
@@ -236,14 +329,14 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
for
(
int
j
=
first
;
j
<
last
;
j
++
)
{
for
(
int
j
=
first
;
j
<
last
;
j
++
)
{
real3
delta
=
pos2
-
posBuffer
[
warpStart
+
j
];
real3
delta
=
pos2
-
posBuffer
[
warpStart
+
j
];
APPLY_PERIODIC_TO_DELTA
(
delta
)
APPLY_PERIODIC_TO_DELTA
(
delta
)
interacts
|=
(
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
<
PADDED_CUTOFF_SQUARED
);
interacts
|=
(
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
<
PADDED_CUTOFF_SQUARED
?
1
<<
j
:
0
);
}
}
}
}
else
{
else
{
#endif
#endif
for
(
int
j
=
first
;
j
<
last
;
j
++
)
{
for
(
int
j
=
first
;
j
<
last
;
j
++
)
{
real3
delta
=
pos2
-
posBuffer
[
warpStart
+
j
];
real3
delta
=
pos2
-
posBuffer
[
warpStart
+
j
];
interacts
|=
(
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
<
PADDED_CUTOFF_SQUARED
);
interacts
|=
(
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
<
PADDED_CUTOFF_SQUARED
?
1
<<
j
:
0
);
}
}
#ifdef USE_PERIODIC
#ifdef USE_PERIODIC
}
}
...
@@ -253,12 +346,16 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
...
@@ -253,12 +346,16 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
// Add any interacting atoms to the buffer.
// Add any interacting atoms to the buffer.
int
includeAtomFlags
=
__ballot
(
interacts
);
int
includeAtomFlags
=
__ballot
(
interacts
);
if
(
interacts
)
if
(
interacts
)
{
buffer
[
neighborsInBuffer
+
__popc
(
includeAtomFlags
&
warpMask
)]
=
atom2
;
int
index
=
neighborsInBuffer
+
__popc
(
includeAtomFlags
&
warpMask
);
buffer
[
index
]
=
atom2
;
flagsBuffer
[
index
]
=
interacts
;
}
neighborsInBuffer
+=
__popc
(
includeAtomFlags
);
neighborsInBuffer
+=
__popc
(
includeAtomFlags
);
if
(
neighborsInBuffer
>
BUFFER_SIZE
-
TILE_SIZE
)
{
if
(
neighborsInBuffer
>
BUFFER_SIZE
-
TILE_SIZE
)
{
// Store the new tiles to memory.
// Store the new tiles to memory.
neighborsInBuffer
=
sortBlockAtoms
(
x
,
buffer
,
flagsBuffer
,
neighborsInBuffer
,
maxSinglePairs
,
singlePairCount
,
singlePairs
,
sumBuffer
+
warpStart
,
pairStartIndex
);
int
tilesToStore
=
neighborsInBuffer
/
TILE_SIZE
;
int
tilesToStore
=
neighborsInBuffer
/
TILE_SIZE
;
if
(
indexInWarp
==
0
)
if
(
indexInWarp
==
0
)
tileStartIndex
=
atomicAdd
(
interactionCount
,
tilesToStore
);
tileStartIndex
=
atomicAdd
(
interactionCount
,
tilesToStore
);
...
@@ -277,6 +374,8 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
...
@@ -277,6 +374,8 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
// If we have a partially filled buffer, store it to memory.
// If we have a partially filled buffer, store it to memory.
if
(
neighborsInBuffer
>
32
)
neighborsInBuffer
=
sortBlockAtoms
(
x
,
buffer
,
flagsBuffer
,
neighborsInBuffer
,
maxSinglePairs
,
singlePairCount
,
singlePairs
,
sumBuffer
+
warpStart
,
pairStartIndex
);
if
(
neighborsInBuffer
>
0
)
{
if
(
neighborsInBuffer
>
0
)
{
int
tilesToStore
=
(
neighborsInBuffer
+
TILE_SIZE
-
1
)
/
TILE_SIZE
;
int
tilesToStore
=
(
neighborsInBuffer
+
TILE_SIZE
-
1
)
/
TILE_SIZE
;
if
(
indexInWarp
==
0
)
if
(
indexInWarp
==
0
)
...
...
platforms/cuda/src/kernels/nonbonded.cu
View file @
5fa4345f
...
@@ -105,7 +105,8 @@ extern "C" __global__ void computeNonbonded(
...
@@ -105,7 +105,8 @@ extern "C" __global__ void computeNonbonded(
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
,
const
int
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
,
const
int
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
unsigned
int
maxTiles
,
const
real4
*
__restrict__
blockCenter
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
unsigned
int
maxTiles
,
const
real4
*
__restrict__
blockCenter
,
const
real4
*
__restrict__
blockSize
,
const
unsigned
int
*
__restrict__
interactingAtoms
const
real4
*
__restrict__
blockSize
,
const
unsigned
int
*
__restrict__
interactingAtoms
,
unsigned
int
maxSinglePairs
,
const
int
*
__restrict__
singlePairCount
,
const
int2
*
__restrict__
singlePairs
#endif
#endif
PARAMETER_ARGUMENTS
)
{
PARAMETER_ARGUMENTS
)
{
const
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
const
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
...
@@ -588,6 +589,59 @@ extern "C" __global__ void computeNonbonded(
...
@@ -588,6 +589,59 @@ extern "C" __global__ void computeNonbonded(
}
}
pos
++
;
pos
++
;
}
}
// Third loop: single pairs that aren't part of a tile.
#if USE_CUTOFF
const
unsigned
int
numPairs
=
singlePairCount
[
0
];
if
(
numPairs
>
maxSinglePairs
)
return
;
// There wasn't enough memory for the neighbor list.
for
(
int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
i
<
numPairs
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
int2
pair
=
singlePairs
[
i
];
int
atom1
=
pair
.
x
;
int
atom2
=
pair
.
y
;
real4
posq1
=
posq
[
atom1
];
real4
posq2
=
posq
[
atom2
];
LOAD_ATOM1_PARAMETERS
int
j
=
atom2
;
atom2
=
threadIdx
.
x
;
DECLARE_LOCAL_PARAMETERS
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
LOAD_ATOM2_PARAMETERS
atom2
=
pair
.
y
;
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
delta
)
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
real
invR
=
RSQRT
(
r2
);
real
r
=
r2
*
invR
;
#ifdef USE_SYMMETRIC
real
dEdR
=
0.0
f
;
#else
real3
dEdR1
=
make_real3
(
0
);
real3
dEdR2
=
make_real3
(
0
);
#endif
bool
hasExclusions
=
false
;
bool
isExcluded
=
false
;
real
tempEnergy
=
0.0
f
;
const
real
interactionScale
=
1.0
f
;
COMPUTE_INTERACTION
energy
+=
tempEnergy
;
#ifdef INCLUDE_FORCES
#ifdef USE_SYMMETRIC
real3
dEdR1
=
delta
*
dEdR
;
real3
dEdR2
=
-
dEdR1
;
#endif
atomicAdd
(
&
forceBuffers
[
atom1
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
-
dEdR1
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom1
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
-
dEdR1
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom1
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
-
dEdR1
.
z
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom2
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
-
dEdR2
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom2
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
-
dEdR2
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom2
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
-
dEdR2
.
z
*
0x100000000
)));
#endif
}
#endif
#ifdef INCLUDE_ENERGY
#ifdef INCLUDE_ENERGY
energyBuffer
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
energy
;
energyBuffer
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
energy
;
#endif
#endif
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment