Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
e3b631f6
"platforms/opencl/tests/TestOpenCLPythonForce.cpp" did not exist on "05472c9a812927c863be67abbb3376e944b2c7ef"
Commit
e3b631f6
authored
Aug 20, 2014
by
peastman
Browse files
Wrote code to build neighbor list for CUDA version of CustomManyParticleForce
parent
786e1ee7
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
344 additions
and
10 deletions
+344
-10
openmmapi/include/openmm/CustomManyParticleForce.h
openmmapi/include/openmm/CustomManyParticleForce.h
+2
-2
openmmapi/src/CustomManyParticleForce.cpp
openmmapi/src/CustomManyParticleForce.cpp
+1
-1
platforms/cuda/include/CudaKernels.h
platforms/cuda/include/CudaKernels.h
+13
-3
platforms/cuda/src/CudaKernels.cpp
platforms/cuda/src/CudaKernels.cpp
+126
-3
platforms/cuda/src/kernels/customManyParticle.cu
platforms/cuda/src/kernels/customManyParticle.cu
+202
-1
No files found.
openmmapi/include/openmm/CustomManyParticleForce.h
View file @
e3b631f6
...
@@ -85,11 +85,11 @@ namespace OpenMM {
...
@@ -85,11 +85,11 @@ namespace OpenMM {
* As an example, the following code creates a CustomManyParticleForce that implements an Axilrod-Teller potential. This
* As an example, the following code creates a CustomManyParticleForce that implements an Axilrod-Teller potential. This
* is an interaction between three particles that depends on all three distances and angles formed by the particles.
* is an interaction between three particles that depends on all three distances and angles formed by the particles.
*
*
* <tt>CustomManyParticleForce* force = new CustomManyParticleForce(3,
* <tt>
<pre>
CustomManyParticleForce* force = new CustomManyParticleForce(3,
* "C*(1+3*cos(theta1)*cos(theta2)*cos(theta3))/(r12*r13*r23)^3;"
* "C*(1+3*cos(theta1)*cos(theta2)*cos(theta3))/(r12*r13*r23)^3;"
* "theta1=angle(p1,p2,p3); theta2=angle(p2,p3,p1); theta3=angle(p3,p1,p2);"
* "theta1=angle(p1,p2,p3); theta2=angle(p2,p3,p1); theta3=angle(p3,p1,p2);"
* "r12=distance(p1,p2); r13=distance(p1,p3); r23=distance(p2,p3)");
* "r12=distance(p1,p2); r13=distance(p1,p3); r23=distance(p2,p3)");
* </tt>
*
<pre>
</tt>
*
*
* This force depends on one parameter, C. The following code defines it as a global parameter:
* This force depends on one parameter, C. The following code defines it as a global parameter:
*
*
...
...
openmmapi/src/CustomManyParticleForce.cpp
View file @
e3b631f6
...
@@ -44,7 +44,7 @@ using namespace OpenMM;
...
@@ -44,7 +44,7 @@ using namespace OpenMM;
using
namespace
std
;
using
namespace
std
;
CustomManyParticleForce
::
CustomManyParticleForce
(
int
particlesPerSet
,
const
string
&
energy
)
:
CustomManyParticleForce
::
CustomManyParticleForce
(
int
particlesPerSet
,
const
string
&
energy
)
:
particlesPerSet
(
particlesPerSet
),
energyExpression
(
energy
),
nonbondedMethod
(
NoCutoff
),
typeFilters
(
particlesPerSet
)
{
particlesPerSet
(
particlesPerSet
),
energyExpression
(
energy
),
nonbondedMethod
(
NoCutoff
),
cutoffDistance
(
1.0
),
typeFilters
(
particlesPerSet
)
{
}
}
CustomManyParticleForce
::~
CustomManyParticleForce
()
{
CustomManyParticleForce
::~
CustomManyParticleForce
()
{
...
...
platforms/cuda/include/CudaKernels.h
View file @
e3b631f6
...
@@ -932,7 +932,8 @@ class CudaCalcCustomManyParticleForceKernel : public CalcCustomManyParticleForce
...
@@ -932,7 +932,8 @@ class CudaCalcCustomManyParticleForceKernel : public CalcCustomManyParticleForce
public:
public:
CudaCalcCustomManyParticleForceKernel
(
std
::
string
name
,
const
Platform
&
platform
,
CudaContext
&
cu
,
const
System
&
system
)
:
CalcCustomManyParticleForceKernel
(
name
,
platform
),
CudaCalcCustomManyParticleForceKernel
(
std
::
string
name
,
const
Platform
&
platform
,
CudaContext
&
cu
,
const
System
&
system
)
:
CalcCustomManyParticleForceKernel
(
name
,
platform
),
hasInitializedKernel
(
false
),
cu
(
cu
),
params
(
NULL
),
globals
(
NULL
),
particleTypes
(
NULL
),
orderIndex
(
NULL
),
particleOrder
(
NULL
),
exclusions
(
NULL
),
hasInitializedKernel
(
false
),
cu
(
cu
),
params
(
NULL
),
globals
(
NULL
),
particleTypes
(
NULL
),
orderIndex
(
NULL
),
particleOrder
(
NULL
),
exclusions
(
NULL
),
exclusionStartIndex
(
NULL
),
system
(
system
)
{
exclusionStartIndex
(
NULL
),
blockCenter
(
NULL
),
blockBoundingBox
(
NULL
),
neighborPairs
(
NULL
),
numNeighborPairs
(
NULL
),
neighborStartIndex
(
NULL
),
numNeighborsForAtom
(
NULL
),
neighbors
(
NULL
),
system
(
system
)
{
}
}
~
CudaCalcCustomManyParticleForceKernel
();
~
CudaCalcCustomManyParticleForceKernel
();
/**
/**
...
@@ -963,6 +964,7 @@ private:
...
@@ -963,6 +964,7 @@ private:
CudaContext
&
cu
;
CudaContext
&
cu
;
bool
hasInitializedKernel
;
bool
hasInitializedKernel
;
NonbondedMethod
nonbondedMethod
;
NonbondedMethod
nonbondedMethod
;
int
maxNeighborPairs
;
CudaParameterSet
*
params
;
CudaParameterSet
*
params
;
CudaArray
*
globals
;
CudaArray
*
globals
;
CudaArray
*
particleTypes
;
CudaArray
*
particleTypes
;
...
@@ -970,12 +972,20 @@ private:
...
@@ -970,12 +972,20 @@ private:
CudaArray
*
particleOrder
;
CudaArray
*
particleOrder
;
CudaArray
*
exclusions
;
CudaArray
*
exclusions
;
CudaArray
*
exclusionStartIndex
;
CudaArray
*
exclusionStartIndex
;
CudaArray
*
blockCenter
;
CudaArray
*
blockBoundingBox
;
CudaArray
*
neighborPairs
;
CudaArray
*
numNeighborPairs
;
CudaArray
*
neighborStartIndex
;
CudaArray
*
numNeighborsForAtom
;
CudaArray
*
neighbors
;
std
::
vector
<
std
::
string
>
globalParamNames
;
std
::
vector
<
std
::
string
>
globalParamNames
;
std
::
vector
<
float
>
globalParamValues
;
std
::
vector
<
float
>
globalParamValues
;
std
::
vector
<
CudaArray
*>
tabulatedFunctions
;
std
::
vector
<
CudaArray
*>
tabulatedFunctions
;
std
::
vector
<
void
*>
forceArgs
;
std
::
vector
<
void
*>
forceArgs
,
blockBoundsArgs
,
neighborsArgs
,
startIndicesArgs
,
copyPairsArgs
;
const
System
&
system
;
const
System
&
system
;
CUfunction
forceKernel
;
CUfunction
forceKernel
,
blockBoundsKernel
,
neighborsKernel
,
startIndicesKernel
,
copyPairsKernel
;
CUevent
event
;
};
};
/**
/**
...
...
platforms/cuda/src/CudaKernels.cpp
View file @
e3b631f6
...
@@ -56,6 +56,13 @@ using namespace std;
...
@@ -56,6 +56,13 @@ using namespace std;
using
Lepton
::
ExpressionTreeNode
;
using
Lepton
::
ExpressionTreeNode
;
using
Lepton
::
Operation
;
using
Lepton
::
Operation
;
#define CHECK_RESULT(result, prefix) \
if (result != CUDA_SUCCESS) { \
std::stringstream m; \
m<<prefix<<": "<<CudaContext::getErrorString(result)<<" ("<<result<<")"<<" at "<<__FILE__<<":"<<__LINE__; \
throw OpenMMException(m.str());\
}
static
bool
isZeroExpression
(
const
Lepton
::
ParsedExpression
&
expression
)
{
static
bool
isZeroExpression
(
const
Lepton
::
ParsedExpression
&
expression
)
{
const
Lepton
::
Operation
&
op
=
expression
.
getRootNode
().
getOperation
();
const
Lepton
::
Operation
&
op
=
expression
.
getRootNode
().
getOperation
();
if
(
op
.
getId
()
!=
Lepton
::
Operation
::
CONSTANT
)
if
(
op
.
getId
()
!=
Lepton
::
Operation
::
CONSTANT
)
...
@@ -4445,6 +4452,20 @@ CudaCalcCustomManyParticleForceKernel::~CudaCalcCustomManyParticleForceKernel()
...
@@ -4445,6 +4452,20 @@ CudaCalcCustomManyParticleForceKernel::~CudaCalcCustomManyParticleForceKernel()
delete
exclusions
;
delete
exclusions
;
if
(
exclusionStartIndex
!=
NULL
)
if
(
exclusionStartIndex
!=
NULL
)
delete
exclusionStartIndex
;
delete
exclusionStartIndex
;
if
(
blockCenter
!=
NULL
)
delete
blockCenter
;
if
(
blockBoundingBox
!=
NULL
)
delete
blockBoundingBox
;
if
(
neighborPairs
!=
NULL
)
delete
neighborPairs
;
if
(
numNeighborPairs
!=
NULL
)
delete
numNeighborPairs
;
if
(
neighborStartIndex
!=
NULL
)
delete
neighborStartIndex
;
if
(
neighbors
!=
NULL
)
delete
neighbors
;
if
(
numNeighborsForAtom
!=
NULL
)
delete
numNeighborsForAtom
;
for
(
int
i
=
0
;
i
<
(
int
)
tabulatedFunctions
.
size
();
i
++
)
for
(
int
i
=
0
;
i
<
(
int
)
tabulatedFunctions
.
size
();
i
++
)
delete
tabulatedFunctions
[
i
];
delete
tabulatedFunctions
[
i
];
}
}
...
@@ -4570,6 +4591,26 @@ void CudaCalcCustomManyParticleForceKernel::initialize(const System& system, con
...
@@ -4570,6 +4591,26 @@ void CudaCalcCustomManyParticleForceKernel::initialize(const System& system, con
exclusions
->
upload
(
exclusionsVec
);
exclusions
->
upload
(
exclusionsVec
);
exclusionStartIndex
->
upload
(
exclusionStartIndexVec
);
exclusionStartIndex
->
upload
(
exclusionStartIndexVec
);
}
}
// Build data structures for the neighbor list.
if
(
nonbondedMethod
!=
NoCutoff
)
{
int
numAtomBlocks
=
cu
.
getNumAtomBlocks
();
int
elementSize
=
(
cu
.
getUseDoublePrecision
()
?
sizeof
(
double
)
:
sizeof
(
float
));
blockCenter
=
new
CudaArray
(
cu
,
numAtomBlocks
,
4
*
elementSize
,
"blockCenter"
);
blockBoundingBox
=
new
CudaArray
(
cu
,
numAtomBlocks
,
4
*
elementSize
,
"blockBoundingBox"
);
numNeighborPairs
=
CudaArray
::
create
<
int
>
(
cu
,
1
,
"customManyParticleNumNeighborPairs"
);
neighborStartIndex
=
CudaArray
::
create
<
int
>
(
cu
,
numParticles
+
1
,
"customManyParticleNeighborStartIndex"
);
numNeighborsForAtom
=
CudaArray
::
create
<
int
>
(
cu
,
numParticles
,
"customManyParticleNumNeighborsForAtom"
);
CHECK_RESULT
(
cuEventCreate
(
&
event
,
CU_EVENT_DISABLE_TIMING
),
"Error creating event for CustomManyParticleForce"
);
// Select a size for the array that holds the neighbor list. We have to make a fairly
// arbitrary guess, but if this turns out to be too small we'll increase it later.
maxNeighborPairs
=
150
*
numParticles
;
neighborPairs
=
CudaArray
::
create
<
int2
>
(
cu
,
maxNeighborPairs
,
"customManyParticleNeighborPairs"
);
neighbors
=
CudaArray
::
create
<
int
>
(
cu
,
maxNeighborPairs
,
"customManyParticleNeighbors"
);
}
// Now to generate the kernel. First, it needs to calculate all distances, angles,
// Now to generate the kernel. First, it needs to calculate all distances, angles,
// and dihedrals the expression depends on.
// and dihedrals the expression depends on.
...
@@ -4820,15 +4861,23 @@ void CudaCalcCustomManyParticleForceKernel::initialize(const System& system, con
...
@@ -4820,15 +4861,23 @@ void CudaCalcCustomManyParticleForceKernel::initialize(const System& system, con
defines
[
"PADDED_NUM_ATOMS"
]
=
cu
.
intToString
(
cu
.
getPaddedNumAtoms
());
defines
[
"PADDED_NUM_ATOMS"
]
=
cu
.
intToString
(
cu
.
getPaddedNumAtoms
());
defines
[
"M_PI"
]
=
cu
.
doubleToString
(
M_PI
);
defines
[
"M_PI"
]
=
cu
.
doubleToString
(
M_PI
);
defines
[
"CUTOFF_SQUARED"
]
=
cu
.
doubleToString
(
force
.
getCutoffDistance
()
*
force
.
getCutoffDistance
());
defines
[
"CUTOFF_SQUARED"
]
=
cu
.
doubleToString
(
force
.
getCutoffDistance
()
*
force
.
getCutoffDistance
());
std
::
cout
<<
cu
.
replaceStrings
(
CudaKernelSources
::
vectorOps
+
CudaKernelSources
::
customManyParticle
,
replacements
)
<<
std
::
endl
;
defines
[
"TILE_SIZE"
]
=
cu
.
intToString
(
CudaContext
::
TileSize
);
defines
[
"NUM_BLOCKS"
]
=
cu
.
intToString
(
cu
.
getNumAtomBlocks
());
// std::cout << cu.replaceStrings(CudaKernelSources::vectorOps+CudaKernelSources::customManyParticle, replacements)<< std::endl;
CUmodule
module
=
cu
.
createModule
(
cu
.
replaceStrings
(
CudaKernelSources
::
vectorOps
+
CudaKernelSources
::
customManyParticle
,
replacements
),
defines
);
CUmodule
module
=
cu
.
createModule
(
cu
.
replaceStrings
(
CudaKernelSources
::
vectorOps
+
CudaKernelSources
::
customManyParticle
,
replacements
),
defines
);
forceKernel
=
cu
.
getKernel
(
module
,
"computeInteraction"
);
forceKernel
=
cu
.
getKernel
(
module
,
"computeInteraction"
);
blockBoundsKernel
=
cu
.
getKernel
(
module
,
"findBlockBounds"
);
neighborsKernel
=
cu
.
getKernel
(
module
,
"findNeighbors"
);
startIndicesKernel
=
cu
.
getKernel
(
module
,
"computeNeighborStartIndices"
);
copyPairsKernel
=
cu
.
getKernel
(
module
,
"copyPairsToNeighborList"
);
}
}
double
CudaCalcCustomManyParticleForceKernel
::
execute
(
ContextImpl
&
context
,
bool
includeForces
,
bool
includeEnergy
)
{
double
CudaCalcCustomManyParticleForceKernel
::
execute
(
ContextImpl
&
context
,
bool
includeForces
,
bool
includeEnergy
)
{
if
(
!
hasInitializedKernel
)
{
if
(
!
hasInitializedKernel
)
{
hasInitializedKernel
=
true
;
hasInitializedKernel
=
true
;
int
index
=
0
;
// Set arguments for the force kernel.
forceArgs
.
push_back
(
&
cu
.
getForce
().
getDevicePointer
());
forceArgs
.
push_back
(
&
cu
.
getForce
().
getDevicePointer
());
forceArgs
.
push_back
(
&
cu
.
getEnergyBuffer
().
getDevicePointer
());
forceArgs
.
push_back
(
&
cu
.
getEnergyBuffer
().
getDevicePointer
());
forceArgs
.
push_back
(
&
cu
.
getPosq
().
getDevicePointer
());
forceArgs
.
push_back
(
&
cu
.
getPosq
().
getDevicePointer
());
...
@@ -4851,6 +4900,47 @@ double CudaCalcCustomManyParticleForceKernel::execute(ContextImpl& context, bool
...
@@ -4851,6 +4900,47 @@ double CudaCalcCustomManyParticleForceKernel::execute(ContextImpl& context, bool
}
}
for
(
int
i
=
0
;
i
<
(
int
)
tabulatedFunctions
.
size
();
i
++
)
for
(
int
i
=
0
;
i
<
(
int
)
tabulatedFunctions
.
size
();
i
++
)
forceArgs
.
push_back
(
&
tabulatedFunctions
[
i
]
->
getDevicePointer
());
forceArgs
.
push_back
(
&
tabulatedFunctions
[
i
]
->
getDevicePointer
());
if
(
nonbondedMethod
!=
NoCutoff
)
{
// Set arguments for the block bounds kernel.
blockBoundsArgs
.
push_back
(
cu
.
getPeriodicBoxSizePointer
());
blockBoundsArgs
.
push_back
(
cu
.
getInvPeriodicBoxSizePointer
());
blockBoundsArgs
.
push_back
(
&
cu
.
getPosq
().
getDevicePointer
());
blockBoundsArgs
.
push_back
(
&
blockCenter
->
getDevicePointer
());
blockBoundsArgs
.
push_back
(
&
blockBoundingBox
->
getDevicePointer
());
blockBoundsArgs
.
push_back
(
&
numNeighborPairs
->
getDevicePointer
());
// Set arguments for the neighbor list kernel.
neighborsArgs
.
push_back
(
cu
.
getPeriodicBoxSizePointer
());
neighborsArgs
.
push_back
(
cu
.
getInvPeriodicBoxSizePointer
());
neighborsArgs
.
push_back
(
&
cu
.
getPosq
().
getDevicePointer
());
neighborsArgs
.
push_back
(
&
blockCenter
->
getDevicePointer
());
neighborsArgs
.
push_back
(
&
blockBoundingBox
->
getDevicePointer
());
neighborsArgs
.
push_back
(
&
neighborPairs
->
getDevicePointer
());
neighborsArgs
.
push_back
(
&
numNeighborPairs
->
getDevicePointer
());
neighborsArgs
.
push_back
(
&
numNeighborsForAtom
->
getDevicePointer
());
neighborsArgs
.
push_back
(
&
maxNeighborPairs
);
if
(
exclusions
!=
NULL
)
{
neighborsArgs
.
push_back
(
&
exclusions
->
getDevicePointer
());
neighborsArgs
.
push_back
(
&
exclusionStartIndex
->
getDevicePointer
());
}
// Set arguments for the kernel to find neighbor list start indices.
startIndicesArgs
.
push_back
(
&
numNeighborsForAtom
->
getDevicePointer
());
startIndicesArgs
.
push_back
(
&
neighborStartIndex
->
getDevicePointer
());
// Set arguments for the kernel to assemble the final neighbor list.
copyPairsArgs
.
push_back
(
&
neighborPairs
->
getDevicePointer
());
copyPairsArgs
.
push_back
(
&
neighbors
->
getDevicePointer
());
copyPairsArgs
.
push_back
(
&
numNeighborPairs
->
getDevicePointer
());
copyPairsArgs
.
push_back
(
&
maxNeighborPairs
);
copyPairsArgs
.
push_back
(
&
numNeighborsForAtom
->
getDevicePointer
());
copyPairsArgs
.
push_back
(
&
neighborStartIndex
->
getDevicePointer
());
}
}
}
if
(
globals
!=
NULL
)
{
if
(
globals
!=
NULL
)
{
bool
changed
=
false
;
bool
changed
=
false
;
...
@@ -4863,7 +4953,40 @@ double CudaCalcCustomManyParticleForceKernel::execute(ContextImpl& context, bool
...
@@ -4863,7 +4953,40 @@ double CudaCalcCustomManyParticleForceKernel::execute(ContextImpl& context, bool
if
(
changed
)
if
(
changed
)
globals
->
upload
(
globalParamValues
);
globals
->
upload
(
globalParamValues
);
}
}
cu
.
executeKernel
(
forceKernel
,
&
forceArgs
[
0
],
cu
.
getNumAtoms
()
*
CudaContext
::
ThreadBlockSize
,
CudaContext
::
ThreadBlockSize
);
while
(
true
)
{
int
*
numPairs
=
(
int
*
)
cu
.
getPinnedBuffer
();
if
(
nonbondedMethod
!=
NoCutoff
)
{
cu
.
executeKernel
(
blockBoundsKernel
,
&
blockBoundsArgs
[
0
],
cu
.
getNumAtomBlocks
());
cu
.
executeKernel
(
neighborsKernel
,
&
neighborsArgs
[
0
],
cu
.
getNumAtoms
());
// We need to make sure there was enough memory for the neighbor list. Download the
// information asynchronously so kernels can be running at the same time.
numNeighborPairs
->
download
(
numPairs
,
false
);
CHECK_RESULT
(
cuEventRecord
(
event
,
0
),
"Error recording event for CustomManyParticleForce"
);
cu
.
executeKernel
(
startIndicesKernel
,
&
startIndicesArgs
[
0
],
256
,
256
,
256
*
sizeof
(
int
));
cu
.
executeKernel
(
copyPairsKernel
,
&
copyPairsArgs
[
0
],
maxNeighborPairs
);
}
cu
.
executeKernel
(
forceKernel
,
&
forceArgs
[
0
],
cu
.
getNumAtoms
()
*
CudaContext
::
ThreadBlockSize
,
CudaContext
::
ThreadBlockSize
);
if
(
nonbondedMethod
!=
NoCutoff
)
{
// Make sure there was enough memory for the neighbor list.
CHECK_RESULT
(
cuEventSynchronize
(
event
),
"Error synchronizing on event for CustomManyParticleForce"
);
if
(
*
numPairs
>
maxNeighborPairs
)
{
// Resize the arrays and run the calculation again.
delete
neighborPairs
;
neighborPairs
=
NULL
;
delete
neighbors
;
neighbors
=
NULL
;
maxNeighborPairs
=
(
int
)
(
1.1
*
(
*
numPairs
));
neighborPairs
=
CudaArray
::
create
<
int2
>
(
cu
,
maxNeighborPairs
,
"customManyParticleNeighborPairs"
);
neighbors
=
CudaArray
::
create
<
int
>
(
cu
,
maxNeighborPairs
,
"customManyParticleNeighbors"
);
continue
;
}
}
break
;
}
return
0.0
;
return
0.0
;
}
}
...
...
platforms/cuda/src/kernels/customManyParticle.cu
View file @
e3b631f6
...
@@ -74,6 +74,49 @@ inline __device__ bool isInteractionExcluded(int atom1, int atom2, int* __restri
...
@@ -74,6 +74,49 @@ inline __device__ bool isInteractionExcluded(int atom1, int atom2, int* __restri
return
false
;
return
false
;
}
}
#define WARP_SIZE 32
/**
* Perform a parallel prefix sum of boolean values over an array. This is done as the first stage of compacting an array.
*/
__device__
void
prefixSum
(
bool
value
,
short
*
sum
,
ushort2
*
temp
)
{
#if __CUDA_ARCH__ >= 300
const
int
indexInWarp
=
threadIdx
.
x
%
WARP_SIZE
;
const
int
warpMask
=
(
2
<<
indexInWarp
)
-
1
;
temp
[
threadIdx
.
x
].
x
=
__popc
(
__ballot
(
value
)
&
warpMask
);
__syncthreads
();
if
(
threadIdx
.
x
<
WARP_SIZE
)
{
int
multiWarpSum
=
temp
[(
threadIdx
.
x
+
1
)
*
WARP_SIZE
-
1
].
x
;
for
(
int
offset
=
1
;
offset
<
blockDim
.
x
/
WARP_SIZE
;
offset
*=
2
)
{
short
n
=
__shfl_up
(
multiWarpSum
,
offset
,
WARP_SIZE
);
if
(
indexInWarp
>=
offset
)
multiWarpSum
+=
n
;
}
temp
[
threadIdx
.
x
].
y
=
multiWarpSum
;
}
__syncthreads
();
sum
[
threadIdx
.
x
]
=
temp
[
threadIdx
.
x
].
x
+
(
threadIdx
.
x
<
WARP_SIZE
?
0
:
temp
[
threadIdx
.
x
/
WARP_SIZE
-
1
].
y
);
__syncthreads
();
#else
temp
[
threadIdx
.
x
].
x
=
value
;
__syncthreads
();
int
whichBuffer
=
0
;
for
(
int
offset
=
1
;
offset
<
blockDim
.
x
;
offset
*=
2
)
{
if
(
whichBuffer
==
0
)
temp
[
threadIdx
.
x
].
y
=
(
threadIdx
.
x
<
offset
?
temp
[
threadIdx
.
x
].
x
:
temp
[
threadIdx
.
x
].
x
+
temp
[
threadIdx
.
x
-
offset
].
x
);
else
temp
[
threadIdx
.
x
].
x
=
(
threadIdx
.
x
<
offset
?
temp
[
threadIdx
.
x
].
y
:
temp
[
threadIdx
.
x
].
y
+
temp
[
threadIdx
.
x
-
offset
].
y
);
whichBuffer
=
1
-
whichBuffer
;
__syncthreads
();
}
if
(
whichBuffer
==
0
)
sum
[
threadIdx
.
x
]
=
temp
[
threadIdx
.
x
].
x
;
else
sum
[
threadIdx
.
x
]
=
temp
[
threadIdx
.
x
].
y
;
__syncthreads
();
#endif
}
/**
/**
* Compute the interaction.
* Compute the interaction.
*/
*/
...
@@ -120,4 +163,162 @@ extern "C" __global__ void computeInteraction(
...
@@ -120,4 +163,162 @@ extern "C" __global__ void computeInteraction(
}
}
}
}
energyBuffer
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
energy
;
energyBuffer
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
energy
;
}
}
\ No newline at end of file
/**
* Find a bounding box for the atoms in each block.
*/
extern
"C"
__global__
void
findBlockBounds
(
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
const
real4
*
__restrict__
posq
,
real4
*
__restrict__
blockCenter
,
real4
*
__restrict__
blockBoundingBox
,
int
*
__restrict__
numNeighborPairs
)
{
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
int
base
=
index
*
TILE_SIZE
;
while
(
base
<
NUM_ATOMS
)
{
real4
pos
=
posq
[
base
];
#ifdef USE_PERIODIC
pos
.
x
-=
floor
(
pos
.
x
*
invPeriodicBoxSize
.
x
)
*
periodicBoxSize
.
x
;
pos
.
y
-=
floor
(
pos
.
y
*
invPeriodicBoxSize
.
y
)
*
periodicBoxSize
.
y
;
pos
.
z
-=
floor
(
pos
.
z
*
invPeriodicBoxSize
.
z
)
*
periodicBoxSize
.
z
;
#endif
real4
minPos
=
pos
;
real4
maxPos
=
pos
;
int
last
=
min
(
base
+
TILE_SIZE
,
NUM_ATOMS
);
for
(
int
i
=
base
+
1
;
i
<
last
;
i
++
)
{
pos
=
posq
[
i
];
#ifdef USE_PERIODIC
real4
center
=
0.5
f
*
(
maxPos
+
minPos
);
pos
.
x
-=
floor
((
pos
.
x
-
center
.
x
)
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
pos
.
y
-=
floor
((
pos
.
y
-
center
.
y
)
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
pos
.
z
-=
floor
((
pos
.
z
-
center
.
z
)
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
#endif
minPos
=
make_real4
(
min
(
minPos
.
x
,
pos
.
x
),
min
(
minPos
.
y
,
pos
.
y
),
min
(
minPos
.
z
,
pos
.
z
),
0
);
maxPos
=
make_real4
(
max
(
maxPos
.
x
,
pos
.
x
),
max
(
maxPos
.
y
,
pos
.
y
),
max
(
maxPos
.
z
,
pos
.
z
),
0
);
}
real4
blockSize
=
0.5
f
*
(
maxPos
-
minPos
);
blockBoundingBox
[
index
]
=
blockSize
;
blockCenter
[
index
]
=
0.5
f
*
(
maxPos
+
minPos
);
index
+=
blockDim
.
x
*
gridDim
.
x
;
base
=
index
*
TILE_SIZE
;
}
if
(
blockIdx
.
x
==
0
&&
threadIdx
.
x
==
0
)
*
numNeighborPairs
=
0
;
}
/**
* Find a list of neighbors for each atom.
*/
extern
"C"
__global__
void
findNeighbors
(
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
const
real4
*
__restrict__
posq
,
const
real4
*
__restrict__
blockCenter
,
const
real4
*
__restrict__
blockBoundingBox
,
int2
*
__restrict__
neighborPairs
,
int
*
__restrict__
numNeighborPairs
,
int
*
__restrict__
numNeighborsForAtom
,
int
maxNeighborPairs
#ifdef USE_EXCLUSIONS
,
int
*
__restrict__
exclusions
,
int
*
__restrict__
exclusionStartIndex
#endif
)
{
for
(
int
atom1
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
atom1
<
NUM_ATOMS
;
atom1
+=
blockDim
.
x
*
gridDim
.
x
)
{
// Load data for this atom.
real4
pos1
=
posq
[
atom1
];
int
block1
=
atom1
/
TILE_SIZE
;
real4
blockCenter1
=
blockCenter
[
block1
];
real4
blockSize1
=
blockBoundingBox
[
block1
];
int
totalNeighborsForAtom1
=
0
;
// Loop over atom blocks to search for neighbors.
for
(
int
block2
=
block1
;
block2
<
NUM_BLOCKS
;
block2
++
)
{
real4
blockCenter2
=
blockCenter
[
block2
];
real4
blockSize2
=
blockBoundingBox
[
block2
];
real4
blockDelta
=
blockCenter1
-
blockCenter2
;
#ifdef USE_PERIODIC
blockDelta
.
x
-=
floor
(
blockDelta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
blockDelta
.
y
-=
floor
(
blockDelta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
blockDelta
.
z
-=
floor
(
blockDelta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
#endif
blockDelta
.
x
=
max
(
0.0
f
,
fabs
(
blockDelta
.
x
)
-
blockSize1
.
x
-
blockSize2
.
x
);
blockDelta
.
y
=
max
(
0.0
f
,
fabs
(
blockDelta
.
y
)
-
blockSize1
.
y
-
blockSize2
.
y
);
blockDelta
.
z
=
max
(
0.0
f
,
fabs
(
blockDelta
.
z
)
-
blockSize1
.
z
-
blockSize2
.
z
);
if
(
blockDelta
.
x
*
blockDelta
.
x
+
blockDelta
.
y
*
blockDelta
.
y
+
blockDelta
.
z
*
blockDelta
.
z
<
CUTOFF_SQUARED
)
{
// Loop over atoms in this block.
int
start
=
block2
*
TILE_SIZE
;
int
end
=
(
block2
+
1
)
*
TILE_SIZE
;
int
included
[
TILE_SIZE
];
int
numIncluded
=
0
;
for
(
int
atom2
=
start
;
atom2
<
end
;
atom2
++
)
{
real4
pos2
=
posq
[
atom2
];
// Decide whether to include this atom pair in the neighbor list.
real4
atomDelta
=
delta
(
pos1
,
pos2
,
periodicBoxSize
,
invPeriodicBoxSize
);
bool
includeAtom
=
(
atom2
>
atom1
&&
atom2
<
NUM_ATOMS
&&
atomDelta
.
w
<
CUTOFF_SQUARED
);
#ifdef USE_EXCLUSIONS
if
(
includeAtom
)
includeAtom
&=
!
isInteractionExcluded
(
atom1
,
atom2
,
exclusions
,
exclusionStartIndex
);
#endif
if
(
includeAtom
)
included
[
numIncluded
++
]
=
atom2
;
}
// If we found any neighbors, store them to the neighbor list.
if
(
numIncluded
>
0
)
{
int
baseIndex
=
atomicAdd
(
numNeighborPairs
,
numIncluded
);
if
(
baseIndex
+
numIncluded
<=
maxNeighborPairs
)
for
(
int
i
=
0
;
i
<
numIncluded
;
i
++
)
neighborPairs
[
baseIndex
+
i
]
=
make_int2
(
atom1
,
included
[
i
]);
totalNeighborsForAtom1
+=
numIncluded
;
}
}
}
numNeighborsForAtom
[
atom1
]
=
totalNeighborsForAtom1
;
}
}
/**
* Sum the neighbor counts to compute the start position of each atom. This kernel
* is executed as a single work group.
*/
extern
"C"
__global__
void
computeNeighborStartIndices
(
int
*
__restrict__
numNeighborsForAtom
,
int
*
__restrict__
neighborStartIndex
)
{
extern
__shared__
unsigned
int
posBuffer
[];
unsigned
int
globalOffset
=
0
;
for
(
unsigned
int
startAtom
=
0
;
startAtom
<
NUM_ATOMS
;
startAtom
+=
blockDim
.
x
)
{
// Load the neighbor counts into local memory.
unsigned
int
globalIndex
=
startAtom
+
threadIdx
.
x
;
posBuffer
[
threadIdx
.
x
]
=
(
globalIndex
<
NUM_ATOMS
?
numNeighborsForAtom
[
globalIndex
]
:
0
);
__syncthreads
();
// Perform a parallel prefix sum.
for
(
unsigned
int
step
=
1
;
step
<
blockDim
.
x
;
step
*=
2
)
{
unsigned
int
add
=
(
threadIdx
.
x
>=
step
?
posBuffer
[
threadIdx
.
x
-
step
]
:
0
);
__syncthreads
();
posBuffer
[
threadIdx
.
x
]
+=
add
;
__syncthreads
();
}
// Write the results back to global memory.
if
(
globalIndex
<
NUM_ATOMS
)
neighborStartIndex
[
globalIndex
+
1
]
=
posBuffer
[
threadIdx
.
x
]
+
globalOffset
;
numNeighborsForAtom
[
globalIndex
]
=
0
;
// Clear this so the next kernel can use it as a counter
globalOffset
+=
posBuffer
[
blockDim
.
x
-
1
];
}
if
(
threadIdx
.
x
==
0
)
neighborStartIndex
[
0
]
=
0
;
}
/**
* Assemble the final neighbor list.
*/
extern
"C"
__global__
void
copyPairsToNeighborList
(
const
int2
*
__restrict__
neighborPairs
,
int
*
__restrict__
neighbors
,
int
*
__restrict__
numNeighborPairs
,
int
maxNeighborPairs
,
int
*
__restrict__
numNeighborsForAtom
,
const
int
*
__restrict__
neighborStartIndex
)
{
int
actualPairs
=
*
numNeighborPairs
;
if
(
actualPairs
>
maxNeighborPairs
)
return
;
// There wasn't enough memory for the neighbor list, so we'll need to rebuild it.
for
(
unsigned
int
index
=
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
;
index
<
actualPairs
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
int2
pair
=
neighborPairs
[
index
];
int
startIndex
=
neighborStartIndex
[
pair
.
x
];
int
offset
=
atomicAdd
(
numNeighborsForAtom
+
pair
.
x
,
1
);
neighbors
[
startIndex
+
offset
]
=
pair
.
y
;
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment