Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
e3b631f6
Commit
e3b631f6
authored
Aug 20, 2014
by
peastman
Browse files
Wrote code to build neighbor list for CUDA version of CustomManyParticleForce
parent
786e1ee7
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
344 additions
and
10 deletions
+344
-10
openmmapi/include/openmm/CustomManyParticleForce.h
openmmapi/include/openmm/CustomManyParticleForce.h
+2
-2
openmmapi/src/CustomManyParticleForce.cpp
openmmapi/src/CustomManyParticleForce.cpp
+1
-1
platforms/cuda/include/CudaKernels.h
platforms/cuda/include/CudaKernels.h
+13
-3
platforms/cuda/src/CudaKernels.cpp
platforms/cuda/src/CudaKernels.cpp
+126
-3
platforms/cuda/src/kernels/customManyParticle.cu
platforms/cuda/src/kernels/customManyParticle.cu
+202
-1
No files found.
openmmapi/include/openmm/CustomManyParticleForce.h
View file @
e3b631f6
...
...
@@ -85,11 +85,11 @@ namespace OpenMM {
* As an example, the following code creates a CustomManyParticleForce that implements an Axilrod-Teller potential. This
* is an interaction between three particles that depends on all three distances and angles formed by the particles.
*
* <tt>CustomManyParticleForce* force = new CustomManyParticleForce(3,
* <tt>
<pre>
CustomManyParticleForce* force = new CustomManyParticleForce(3,
* "C*(1+3*cos(theta1)*cos(theta2)*cos(theta3))/(r12*r13*r23)^3;"
* "theta1=angle(p1,p2,p3); theta2=angle(p2,p3,p1); theta3=angle(p3,p1,p2);"
* "r12=distance(p1,p2); r13=distance(p1,p3); r23=distance(p2,p3)");
* </tt>
*
<pre>
</tt>
*
* This force depends on one parameter, C. The following code defines it as a global parameter:
*
...
...
openmmapi/src/CustomManyParticleForce.cpp
View file @
e3b631f6
...
...
@@ -44,7 +44,7 @@ using namespace OpenMM;
using
namespace
std
;
CustomManyParticleForce
::
CustomManyParticleForce
(
int
particlesPerSet
,
const
string
&
energy
)
:
particlesPerSet
(
particlesPerSet
),
energyExpression
(
energy
),
nonbondedMethod
(
NoCutoff
),
typeFilters
(
particlesPerSet
)
{
particlesPerSet
(
particlesPerSet
),
energyExpression
(
energy
),
nonbondedMethod
(
NoCutoff
),
cutoffDistance
(
1.0
),
typeFilters
(
particlesPerSet
)
{
}
CustomManyParticleForce
::~
CustomManyParticleForce
()
{
...
...
platforms/cuda/include/CudaKernels.h
View file @
e3b631f6
...
...
@@ -932,7 +932,8 @@ class CudaCalcCustomManyParticleForceKernel : public CalcCustomManyParticleForce
public:
CudaCalcCustomManyParticleForceKernel
(
std
::
string
name
,
const
Platform
&
platform
,
CudaContext
&
cu
,
const
System
&
system
)
:
CalcCustomManyParticleForceKernel
(
name
,
platform
),
hasInitializedKernel
(
false
),
cu
(
cu
),
params
(
NULL
),
globals
(
NULL
),
particleTypes
(
NULL
),
orderIndex
(
NULL
),
particleOrder
(
NULL
),
exclusions
(
NULL
),
exclusionStartIndex
(
NULL
),
system
(
system
)
{
exclusionStartIndex
(
NULL
),
blockCenter
(
NULL
),
blockBoundingBox
(
NULL
),
neighborPairs
(
NULL
),
numNeighborPairs
(
NULL
),
neighborStartIndex
(
NULL
),
numNeighborsForAtom
(
NULL
),
neighbors
(
NULL
),
system
(
system
)
{
}
~
CudaCalcCustomManyParticleForceKernel
();
/**
...
...
@@ -963,6 +964,7 @@ private:
CudaContext
&
cu
;
bool
hasInitializedKernel
;
NonbondedMethod
nonbondedMethod
;
int
maxNeighborPairs
;
CudaParameterSet
*
params
;
CudaArray
*
globals
;
CudaArray
*
particleTypes
;
...
...
@@ -970,12 +972,20 @@ private:
CudaArray
*
particleOrder
;
CudaArray
*
exclusions
;
CudaArray
*
exclusionStartIndex
;
CudaArray
*
blockCenter
;
CudaArray
*
blockBoundingBox
;
CudaArray
*
neighborPairs
;
CudaArray
*
numNeighborPairs
;
CudaArray
*
neighborStartIndex
;
CudaArray
*
numNeighborsForAtom
;
CudaArray
*
neighbors
;
std
::
vector
<
std
::
string
>
globalParamNames
;
std
::
vector
<
float
>
globalParamValues
;
std
::
vector
<
CudaArray
*>
tabulatedFunctions
;
std
::
vector
<
void
*>
forceArgs
;
std
::
vector
<
void
*>
forceArgs
,
blockBoundsArgs
,
neighborsArgs
,
startIndicesArgs
,
copyPairsArgs
;
const
System
&
system
;
CUfunction
forceKernel
;
CUfunction
forceKernel
,
blockBoundsKernel
,
neighborsKernel
,
startIndicesKernel
,
copyPairsKernel
;
CUevent
event
;
};
/**
...
...
platforms/cuda/src/CudaKernels.cpp
View file @
e3b631f6
...
...
@@ -56,6 +56,13 @@ using namespace std;
using
Lepton
::
ExpressionTreeNode
;
using
Lepton
::
Operation
;
#define CHECK_RESULT(result, prefix) \
if (result != CUDA_SUCCESS) { \
std::stringstream m; \
m<<prefix<<": "<<CudaContext::getErrorString(result)<<" ("<<result<<")"<<" at "<<__FILE__<<":"<<__LINE__; \
throw OpenMMException(m.str());\
}
static
bool
isZeroExpression
(
const
Lepton
::
ParsedExpression
&
expression
)
{
const
Lepton
::
Operation
&
op
=
expression
.
getRootNode
().
getOperation
();
if
(
op
.
getId
()
!=
Lepton
::
Operation
::
CONSTANT
)
...
...
@@ -4445,6 +4452,20 @@ CudaCalcCustomManyParticleForceKernel::~CudaCalcCustomManyParticleForceKernel()
delete
exclusions
;
if
(
exclusionStartIndex
!=
NULL
)
delete
exclusionStartIndex
;
if
(
blockCenter
!=
NULL
)
delete
blockCenter
;
if
(
blockBoundingBox
!=
NULL
)
delete
blockBoundingBox
;
if
(
neighborPairs
!=
NULL
)
delete
neighborPairs
;
if
(
numNeighborPairs
!=
NULL
)
delete
numNeighborPairs
;
if
(
neighborStartIndex
!=
NULL
)
delete
neighborStartIndex
;
if
(
neighbors
!=
NULL
)
delete
neighbors
;
if
(
numNeighborsForAtom
!=
NULL
)
delete
numNeighborsForAtom
;
for
(
int
i
=
0
;
i
<
(
int
)
tabulatedFunctions
.
size
();
i
++
)
delete
tabulatedFunctions
[
i
];
}
...
...
@@ -4570,6 +4591,26 @@ void CudaCalcCustomManyParticleForceKernel::initialize(const System& system, con
exclusions
->
upload
(
exclusionsVec
);
exclusionStartIndex
->
upload
(
exclusionStartIndexVec
);
}
// Build data structures for the neighbor list.
if
(
nonbondedMethod
!=
NoCutoff
)
{
int
numAtomBlocks
=
cu
.
getNumAtomBlocks
();
int
elementSize
=
(
cu
.
getUseDoublePrecision
()
?
sizeof
(
double
)
:
sizeof
(
float
));
blockCenter
=
new
CudaArray
(
cu
,
numAtomBlocks
,
4
*
elementSize
,
"blockCenter"
);
blockBoundingBox
=
new
CudaArray
(
cu
,
numAtomBlocks
,
4
*
elementSize
,
"blockBoundingBox"
);
numNeighborPairs
=
CudaArray
::
create
<
int
>
(
cu
,
1
,
"customManyParticleNumNeighborPairs"
);
neighborStartIndex
=
CudaArray
::
create
<
int
>
(
cu
,
numParticles
+
1
,
"customManyParticleNeighborStartIndex"
);
numNeighborsForAtom
=
CudaArray
::
create
<
int
>
(
cu
,
numParticles
,
"customManyParticleNumNeighborsForAtom"
);
CHECK_RESULT
(
cuEventCreate
(
&
event
,
CU_EVENT_DISABLE_TIMING
),
"Error creating event for CustomManyParticleForce"
);
// Select a size for the array that holds the neighbor list. We have to make a fairly
// arbitrary guess, but if this turns out to be too small we'll increase it later.
maxNeighborPairs
=
150
*
numParticles
;
neighborPairs
=
CudaArray
::
create
<
int2
>
(
cu
,
maxNeighborPairs
,
"customManyParticleNeighborPairs"
);
neighbors
=
CudaArray
::
create
<
int
>
(
cu
,
maxNeighborPairs
,
"customManyParticleNeighbors"
);
}
// Now to generate the kernel. First, it needs to calculate all distances, angles,
// and dihedrals the expression depends on.
...
...
@@ -4820,15 +4861,23 @@ void CudaCalcCustomManyParticleForceKernel::initialize(const System& system, con
defines
[
"PADDED_NUM_ATOMS"
]
=
cu
.
intToString
(
cu
.
getPaddedNumAtoms
());
defines
[
"M_PI"
]
=
cu
.
doubleToString
(
M_PI
);
defines
[
"CUTOFF_SQUARED"
]
=
cu
.
doubleToString
(
force
.
getCutoffDistance
()
*
force
.
getCutoffDistance
());
std
::
cout
<<
cu
.
replaceStrings
(
CudaKernelSources
::
vectorOps
+
CudaKernelSources
::
customManyParticle
,
replacements
)
<<
std
::
endl
;
defines
[
"TILE_SIZE"
]
=
cu
.
intToString
(
CudaContext
::
TileSize
);
defines
[
"NUM_BLOCKS"
]
=
cu
.
intToString
(
cu
.
getNumAtomBlocks
());
// std::cout << cu.replaceStrings(CudaKernelSources::vectorOps+CudaKernelSources::customManyParticle, replacements)<< std::endl;
CUmodule
module
=
cu
.
createModule
(
cu
.
replaceStrings
(
CudaKernelSources
::
vectorOps
+
CudaKernelSources
::
customManyParticle
,
replacements
),
defines
);
forceKernel
=
cu
.
getKernel
(
module
,
"computeInteraction"
);
blockBoundsKernel
=
cu
.
getKernel
(
module
,
"findBlockBounds"
);
neighborsKernel
=
cu
.
getKernel
(
module
,
"findNeighbors"
);
startIndicesKernel
=
cu
.
getKernel
(
module
,
"computeNeighborStartIndices"
);
copyPairsKernel
=
cu
.
getKernel
(
module
,
"copyPairsToNeighborList"
);
}
double
CudaCalcCustomManyParticleForceKernel
::
execute
(
ContextImpl
&
context
,
bool
includeForces
,
bool
includeEnergy
)
{
if
(
!
hasInitializedKernel
)
{
hasInitializedKernel
=
true
;
int
index
=
0
;
// Set arguments for the force kernel.
forceArgs
.
push_back
(
&
cu
.
getForce
().
getDevicePointer
());
forceArgs
.
push_back
(
&
cu
.
getEnergyBuffer
().
getDevicePointer
());
forceArgs
.
push_back
(
&
cu
.
getPosq
().
getDevicePointer
());
...
...
@@ -4851,6 +4900,47 @@ double CudaCalcCustomManyParticleForceKernel::execute(ContextImpl& context, bool
}
for
(
int
i
=
0
;
i
<
(
int
)
tabulatedFunctions
.
size
();
i
++
)
forceArgs
.
push_back
(
&
tabulatedFunctions
[
i
]
->
getDevicePointer
());
if
(
nonbondedMethod
!=
NoCutoff
)
{
// Set arguments for the block bounds kernel.
blockBoundsArgs
.
push_back
(
cu
.
getPeriodicBoxSizePointer
());
blockBoundsArgs
.
push_back
(
cu
.
getInvPeriodicBoxSizePointer
());
blockBoundsArgs
.
push_back
(
&
cu
.
getPosq
().
getDevicePointer
());
blockBoundsArgs
.
push_back
(
&
blockCenter
->
getDevicePointer
());
blockBoundsArgs
.
push_back
(
&
blockBoundingBox
->
getDevicePointer
());
blockBoundsArgs
.
push_back
(
&
numNeighborPairs
->
getDevicePointer
());
// Set arguments for the neighbor list kernel.
neighborsArgs
.
push_back
(
cu
.
getPeriodicBoxSizePointer
());
neighborsArgs
.
push_back
(
cu
.
getInvPeriodicBoxSizePointer
());
neighborsArgs
.
push_back
(
&
cu
.
getPosq
().
getDevicePointer
());
neighborsArgs
.
push_back
(
&
blockCenter
->
getDevicePointer
());
neighborsArgs
.
push_back
(
&
blockBoundingBox
->
getDevicePointer
());
neighborsArgs
.
push_back
(
&
neighborPairs
->
getDevicePointer
());
neighborsArgs
.
push_back
(
&
numNeighborPairs
->
getDevicePointer
());
neighborsArgs
.
push_back
(
&
numNeighborsForAtom
->
getDevicePointer
());
neighborsArgs
.
push_back
(
&
maxNeighborPairs
);
if
(
exclusions
!=
NULL
)
{
neighborsArgs
.
push_back
(
&
exclusions
->
getDevicePointer
());
neighborsArgs
.
push_back
(
&
exclusionStartIndex
->
getDevicePointer
());
}
// Set arguments for the kernel to find neighbor list start indices.
startIndicesArgs
.
push_back
(
&
numNeighborsForAtom
->
getDevicePointer
());
startIndicesArgs
.
push_back
(
&
neighborStartIndex
->
getDevicePointer
());
// Set arguments for the kernel to assemble the final neighbor list.
copyPairsArgs
.
push_back
(
&
neighborPairs
->
getDevicePointer
());
copyPairsArgs
.
push_back
(
&
neighbors
->
getDevicePointer
());
copyPairsArgs
.
push_back
(
&
numNeighborPairs
->
getDevicePointer
());
copyPairsArgs
.
push_back
(
&
maxNeighborPairs
);
copyPairsArgs
.
push_back
(
&
numNeighborsForAtom
->
getDevicePointer
());
copyPairsArgs
.
push_back
(
&
neighborStartIndex
->
getDevicePointer
());
}
}
if
(
globals
!=
NULL
)
{
bool
changed
=
false
;
...
...
@@ -4863,7 +4953,40 @@ double CudaCalcCustomManyParticleForceKernel::execute(ContextImpl& context, bool
if
(
changed
)
globals
->
upload
(
globalParamValues
);
}
cu
.
executeKernel
(
forceKernel
,
&
forceArgs
[
0
],
cu
.
getNumAtoms
()
*
CudaContext
::
ThreadBlockSize
,
CudaContext
::
ThreadBlockSize
);
while
(
true
)
{
int
*
numPairs
=
(
int
*
)
cu
.
getPinnedBuffer
();
if
(
nonbondedMethod
!=
NoCutoff
)
{
cu
.
executeKernel
(
blockBoundsKernel
,
&
blockBoundsArgs
[
0
],
cu
.
getNumAtomBlocks
());
cu
.
executeKernel
(
neighborsKernel
,
&
neighborsArgs
[
0
],
cu
.
getNumAtoms
());
// We need to make sure there was enough memory for the neighbor list. Download the
// information asynchronously so kernels can be running at the same time.
numNeighborPairs
->
download
(
numPairs
,
false
);
CHECK_RESULT
(
cuEventRecord
(
event
,
0
),
"Error recording event for CustomManyParticleForce"
);
cu
.
executeKernel
(
startIndicesKernel
,
&
startIndicesArgs
[
0
],
256
,
256
,
256
*
sizeof
(
int
));
cu
.
executeKernel
(
copyPairsKernel
,
&
copyPairsArgs
[
0
],
maxNeighborPairs
);
}
cu
.
executeKernel
(
forceKernel
,
&
forceArgs
[
0
],
cu
.
getNumAtoms
()
*
CudaContext
::
ThreadBlockSize
,
CudaContext
::
ThreadBlockSize
);
if
(
nonbondedMethod
!=
NoCutoff
)
{
// Make sure there was enough memory for the neighbor list.
CHECK_RESULT
(
cuEventSynchronize
(
event
),
"Error synchronizing on event for CustomManyParticleForce"
);
if
(
*
numPairs
>
maxNeighborPairs
)
{
// Resize the arrays and run the calculation again.
delete
neighborPairs
;
neighborPairs
=
NULL
;
delete
neighbors
;
neighbors
=
NULL
;
maxNeighborPairs
=
(
int
)
(
1.1
*
(
*
numPairs
));
neighborPairs
=
CudaArray
::
create
<
int2
>
(
cu
,
maxNeighborPairs
,
"customManyParticleNeighborPairs"
);
neighbors
=
CudaArray
::
create
<
int
>
(
cu
,
maxNeighborPairs
,
"customManyParticleNeighbors"
);
continue
;
}
}
break
;
}
return
0.0
;
}
...
...
platforms/cuda/src/kernels/customManyParticle.cu
View file @
e3b631f6
...
...
@@ -74,6 +74,49 @@ inline __device__ bool isInteractionExcluded(int atom1, int atom2, int* __restri
return
false
;
}
#define WARP_SIZE 32
/**
* Perform a parallel prefix sum of boolean values over an array. This is done as the first stage of compacting an array.
*/
__device__
void
prefixSum
(
bool
value
,
short
*
sum
,
ushort2
*
temp
)
{
#if __CUDA_ARCH__ >= 300
const
int
indexInWarp
=
threadIdx
.
x
%
WARP_SIZE
;
const
int
warpMask
=
(
2
<<
indexInWarp
)
-
1
;
temp
[
threadIdx
.
x
].
x
=
__popc
(
__ballot
(
value
)
&
warpMask
);
__syncthreads
();
if
(
threadIdx
.
x
<
WARP_SIZE
)
{
int
multiWarpSum
=
temp
[(
threadIdx
.
x
+
1
)
*
WARP_SIZE
-
1
].
x
;
for
(
int
offset
=
1
;
offset
<
blockDim
.
x
/
WARP_SIZE
;
offset
*=
2
)
{
short
n
=
__shfl_up
(
multiWarpSum
,
offset
,
WARP_SIZE
);
if
(
indexInWarp
>=
offset
)
multiWarpSum
+=
n
;
}
temp
[
threadIdx
.
x
].
y
=
multiWarpSum
;
}
__syncthreads
();
sum
[
threadIdx
.
x
]
=
temp
[
threadIdx
.
x
].
x
+
(
threadIdx
.
x
<
WARP_SIZE
?
0
:
temp
[
threadIdx
.
x
/
WARP_SIZE
-
1
].
y
);
__syncthreads
();
#else
temp
[
threadIdx
.
x
].
x
=
value
;
__syncthreads
();
int
whichBuffer
=
0
;
for
(
int
offset
=
1
;
offset
<
blockDim
.
x
;
offset
*=
2
)
{
if
(
whichBuffer
==
0
)
temp
[
threadIdx
.
x
].
y
=
(
threadIdx
.
x
<
offset
?
temp
[
threadIdx
.
x
].
x
:
temp
[
threadIdx
.
x
].
x
+
temp
[
threadIdx
.
x
-
offset
].
x
);
else
temp
[
threadIdx
.
x
].
x
=
(
threadIdx
.
x
<
offset
?
temp
[
threadIdx
.
x
].
y
:
temp
[
threadIdx
.
x
].
y
+
temp
[
threadIdx
.
x
-
offset
].
y
);
whichBuffer
=
1
-
whichBuffer
;
__syncthreads
();
}
if
(
whichBuffer
==
0
)
sum
[
threadIdx
.
x
]
=
temp
[
threadIdx
.
x
].
x
;
else
sum
[
threadIdx
.
x
]
=
temp
[
threadIdx
.
x
].
y
;
__syncthreads
();
#endif
}
/**
* Compute the interaction.
*/
...
...
@@ -120,4 +163,162 @@ extern "C" __global__ void computeInteraction(
}
}
energyBuffer
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
energy
;
}
\ No newline at end of file
}
/**
* Find a bounding box for the atoms in each block.
*/
extern
"C"
__global__
void
findBlockBounds
(
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
const
real4
*
__restrict__
posq
,
real4
*
__restrict__
blockCenter
,
real4
*
__restrict__
blockBoundingBox
,
int
*
__restrict__
numNeighborPairs
)
{
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
int
base
=
index
*
TILE_SIZE
;
while
(
base
<
NUM_ATOMS
)
{
real4
pos
=
posq
[
base
];
#ifdef USE_PERIODIC
pos
.
x
-=
floor
(
pos
.
x
*
invPeriodicBoxSize
.
x
)
*
periodicBoxSize
.
x
;
pos
.
y
-=
floor
(
pos
.
y
*
invPeriodicBoxSize
.
y
)
*
periodicBoxSize
.
y
;
pos
.
z
-=
floor
(
pos
.
z
*
invPeriodicBoxSize
.
z
)
*
periodicBoxSize
.
z
;
#endif
real4
minPos
=
pos
;
real4
maxPos
=
pos
;
int
last
=
min
(
base
+
TILE_SIZE
,
NUM_ATOMS
);
for
(
int
i
=
base
+
1
;
i
<
last
;
i
++
)
{
pos
=
posq
[
i
];
#ifdef USE_PERIODIC
real4
center
=
0.5
f
*
(
maxPos
+
minPos
);
pos
.
x
-=
floor
((
pos
.
x
-
center
.
x
)
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
pos
.
y
-=
floor
((
pos
.
y
-
center
.
y
)
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
pos
.
z
-=
floor
((
pos
.
z
-
center
.
z
)
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
#endif
minPos
=
make_real4
(
min
(
minPos
.
x
,
pos
.
x
),
min
(
minPos
.
y
,
pos
.
y
),
min
(
minPos
.
z
,
pos
.
z
),
0
);
maxPos
=
make_real4
(
max
(
maxPos
.
x
,
pos
.
x
),
max
(
maxPos
.
y
,
pos
.
y
),
max
(
maxPos
.
z
,
pos
.
z
),
0
);
}
real4
blockSize
=
0.5
f
*
(
maxPos
-
minPos
);
blockBoundingBox
[
index
]
=
blockSize
;
blockCenter
[
index
]
=
0.5
f
*
(
maxPos
+
minPos
);
index
+=
blockDim
.
x
*
gridDim
.
x
;
base
=
index
*
TILE_SIZE
;
}
if
(
blockIdx
.
x
==
0
&&
threadIdx
.
x
==
0
)
*
numNeighborPairs
=
0
;
}
/**
* Find a list of neighbors for each atom.
*/
extern
"C"
__global__
void
findNeighbors
(
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
const
real4
*
__restrict__
posq
,
const
real4
*
__restrict__
blockCenter
,
const
real4
*
__restrict__
blockBoundingBox
,
int2
*
__restrict__
neighborPairs
,
int
*
__restrict__
numNeighborPairs
,
int
*
__restrict__
numNeighborsForAtom
,
int
maxNeighborPairs
#ifdef USE_EXCLUSIONS
,
int
*
__restrict__
exclusions
,
int
*
__restrict__
exclusionStartIndex
#endif
)
{
for
(
int
atom1
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
atom1
<
NUM_ATOMS
;
atom1
+=
blockDim
.
x
*
gridDim
.
x
)
{
// Load data for this atom.
real4
pos1
=
posq
[
atom1
];
int
block1
=
atom1
/
TILE_SIZE
;
real4
blockCenter1
=
blockCenter
[
block1
];
real4
blockSize1
=
blockBoundingBox
[
block1
];
int
totalNeighborsForAtom1
=
0
;
// Loop over atom blocks to search for neighbors.
for
(
int
block2
=
block1
;
block2
<
NUM_BLOCKS
;
block2
++
)
{
real4
blockCenter2
=
blockCenter
[
block2
];
real4
blockSize2
=
blockBoundingBox
[
block2
];
real4
blockDelta
=
blockCenter1
-
blockCenter2
;
#ifdef USE_PERIODIC
blockDelta
.
x
-=
floor
(
blockDelta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
blockDelta
.
y
-=
floor
(
blockDelta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
blockDelta
.
z
-=
floor
(
blockDelta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
#endif
blockDelta
.
x
=
max
(
0.0
f
,
fabs
(
blockDelta
.
x
)
-
blockSize1
.
x
-
blockSize2
.
x
);
blockDelta
.
y
=
max
(
0.0
f
,
fabs
(
blockDelta
.
y
)
-
blockSize1
.
y
-
blockSize2
.
y
);
blockDelta
.
z
=
max
(
0.0
f
,
fabs
(
blockDelta
.
z
)
-
blockSize1
.
z
-
blockSize2
.
z
);
if
(
blockDelta
.
x
*
blockDelta
.
x
+
blockDelta
.
y
*
blockDelta
.
y
+
blockDelta
.
z
*
blockDelta
.
z
<
CUTOFF_SQUARED
)
{
// Loop over atoms in this block.
int
start
=
block2
*
TILE_SIZE
;
int
end
=
(
block2
+
1
)
*
TILE_SIZE
;
int
included
[
TILE_SIZE
];
int
numIncluded
=
0
;
for
(
int
atom2
=
start
;
atom2
<
end
;
atom2
++
)
{
real4
pos2
=
posq
[
atom2
];
// Decide whether to include this atom pair in the neighbor list.
real4
atomDelta
=
delta
(
pos1
,
pos2
,
periodicBoxSize
,
invPeriodicBoxSize
);
bool
includeAtom
=
(
atom2
>
atom1
&&
atom2
<
NUM_ATOMS
&&
atomDelta
.
w
<
CUTOFF_SQUARED
);
#ifdef USE_EXCLUSIONS
if
(
includeAtom
)
includeAtom
&=
!
isInteractionExcluded
(
atom1
,
atom2
,
exclusions
,
exclusionStartIndex
);
#endif
if
(
includeAtom
)
included
[
numIncluded
++
]
=
atom2
;
}
// If we found any neighbors, store them to the neighbor list.
if
(
numIncluded
>
0
)
{
int
baseIndex
=
atomicAdd
(
numNeighborPairs
,
numIncluded
);
if
(
baseIndex
+
numIncluded
<=
maxNeighborPairs
)
for
(
int
i
=
0
;
i
<
numIncluded
;
i
++
)
neighborPairs
[
baseIndex
+
i
]
=
make_int2
(
atom1
,
included
[
i
]);
totalNeighborsForAtom1
+=
numIncluded
;
}
}
}
numNeighborsForAtom
[
atom1
]
=
totalNeighborsForAtom1
;
}
}
/**
* Sum the neighbor counts to compute the start position of each atom. This kernel
* is executed as a single work group.
*/
extern
"C"
__global__
void
computeNeighborStartIndices
(
int
*
__restrict__
numNeighborsForAtom
,
int
*
__restrict__
neighborStartIndex
)
{
extern
__shared__
unsigned
int
posBuffer
[];
unsigned
int
globalOffset
=
0
;
for
(
unsigned
int
startAtom
=
0
;
startAtom
<
NUM_ATOMS
;
startAtom
+=
blockDim
.
x
)
{
// Load the neighbor counts into local memory.
unsigned
int
globalIndex
=
startAtom
+
threadIdx
.
x
;
posBuffer
[
threadIdx
.
x
]
=
(
globalIndex
<
NUM_ATOMS
?
numNeighborsForAtom
[
globalIndex
]
:
0
);
__syncthreads
();
// Perform a parallel prefix sum.
for
(
unsigned
int
step
=
1
;
step
<
blockDim
.
x
;
step
*=
2
)
{
unsigned
int
add
=
(
threadIdx
.
x
>=
step
?
posBuffer
[
threadIdx
.
x
-
step
]
:
0
);
__syncthreads
();
posBuffer
[
threadIdx
.
x
]
+=
add
;
__syncthreads
();
}
// Write the results back to global memory.
if
(
globalIndex
<
NUM_ATOMS
)
neighborStartIndex
[
globalIndex
+
1
]
=
posBuffer
[
threadIdx
.
x
]
+
globalOffset
;
numNeighborsForAtom
[
globalIndex
]
=
0
;
// Clear this so the next kernel can use it as a counter
globalOffset
+=
posBuffer
[
blockDim
.
x
-
1
];
}
if
(
threadIdx
.
x
==
0
)
neighborStartIndex
[
0
]
=
0
;
}
/**
* Assemble the final neighbor list.
*/
extern
"C"
__global__
void
copyPairsToNeighborList
(
const
int2
*
__restrict__
neighborPairs
,
int
*
__restrict__
neighbors
,
int
*
__restrict__
numNeighborPairs
,
int
maxNeighborPairs
,
int
*
__restrict__
numNeighborsForAtom
,
const
int
*
__restrict__
neighborStartIndex
)
{
int
actualPairs
=
*
numNeighborPairs
;
if
(
actualPairs
>
maxNeighborPairs
)
return
;
// There wasn't enough memory for the neighbor list, so we'll need to rebuild it.
for
(
unsigned
int
index
=
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
;
index
<
actualPairs
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
int2
pair
=
neighborPairs
[
index
];
int
startIndex
=
neighborStartIndex
[
pair
.
x
];
int
offset
=
atomicAdd
(
numNeighborsForAtom
+
pair
.
x
,
1
);
neighbors
[
startIndex
+
offset
]
=
pair
.
y
;
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment