Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
bddaf4e7
Commit
bddaf4e7
authored
Aug 21, 2014
by
peastman
Browse files
CUDA version of CustomManyParticleForce uses neighbor list
parent
e3b631f6
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
22 additions
and
50 deletions
+22
-50
platforms/cuda/include/CudaKernels.h
platforms/cuda/include/CudaKernels.h
+1
-1
platforms/cuda/src/CudaKernels.cpp
platforms/cuda/src/CudaKernels.cpp
+13
-6
platforms/cuda/src/kernels/customManyParticle.cu
platforms/cuda/src/kernels/customManyParticle.cu
+8
-43
No files found.
platforms/cuda/include/CudaKernels.h
View file @
bddaf4e7
...
...
@@ -964,7 +964,7 @@ private:
CudaContext
&
cu
;
bool
hasInitializedKernel
;
NonbondedMethod
nonbondedMethod
;
int
maxNeighborPairs
;
int
maxNeighborPairs
,
forceWorkgroupSize
;
CudaParameterSet
*
params
;
CudaArray
*
globals
;
CudaArray
*
particleTypes
;
...
...
platforms/cuda/src/CudaKernels.cpp
View file @
bddaf4e7
...
...
@@ -4475,6 +4475,7 @@ void CudaCalcCustomManyParticleForceKernel::initialize(const System& system, con
int numParticles = force.getNumParticles();
int particlesPerSet = force.getNumParticlesPerSet();
nonbondedMethod = CalcCustomManyParticleForceKernel::NonbondedMethod(force.getNonbondedMethod());
forceWorkgroupSize = 128;
// Record parameter values.
...
...
@@ -4804,19 +4805,21 @@ void CudaCalcCustomManyParticleForceKernel::initialize(const System& system, con
if (i > 1)
numCombinations<<"*";
numCombinations<<"numNeighbors";
if (nonbondedMethod == NoCutoff)
atomsForCombination<<"int p"<<(i+1)<<" = p1+1+tempIndex%numNeighbors;\n";
else
atomsForCombination<<"int p"<<(i+1)<<" = neighbors[firstNeighbor+tempIndex%numNeighbors];\n";
atomsForCombination<<"tempIndex /= numNeighbors;\n";
}
if (nonbondedMethod != NoCutoff) {
int startCheckFrom = 0;
for (int i = startCheckFrom; i < particlesPerSet; i++)
for (int i = 1; i < particlesPerSet; i++)
verifyCutoff<<"real4 pos"<<(i+1)<<" = posq[p"<<(i+1)<<"];\n";
for (int i =
startCheckFrom
; i < particlesPerSet; i++)
for (int i =
1
; i < particlesPerSet; i++)
for (int j = i+1; j < particlesPerSet; j++)
verifyCutoff<<"includeInteraction &= (delta(pos"<<(i+1)<<", pos"<<(j+1)<<", periodicBoxSize, invPeriodicBoxSize).w < CUTOFF_SQUARED);\n";
}
if (force.getNumExclusions() > 0) {
int startCheckFrom =
0
;
int startCheckFrom =
(nonbondedMethod == NoCutoff ? 0 : 1)
;
for (int i = startCheckFrom; i < particlesPerSet; i++)
for (int j = i+1; j < particlesPerSet; j++)
verifyExclusions<<"includeInteraction &= !isInteractionExcluded(p"<<(i+1)<<", p"<<(j+1)<<", exclusions, exclusionStartIndex);\n";
...
...
@@ -4883,6 +4886,10 @@ double CudaCalcCustomManyParticleForceKernel::execute(ContextImpl& context, bool
forceArgs.push_back(&cu.getPosq().getDevicePointer());
forceArgs.push_back(cu.getPeriodicBoxSizePointer());
forceArgs.push_back(cu.getInvPeriodicBoxSizePointer());
if (nonbondedMethod != NoCutoff) {
forceArgs.push_back(&neighbors->getDevicePointer());
forceArgs.push_back(&neighborStartIndex->getDevicePointer());
}
if (particleTypes != NULL) {
forceArgs.push_back(&particleTypes->getDevicePointer());
forceArgs.push_back(&orderIndex->getDevicePointer());
...
...
@@ -4967,7 +4974,7 @@ double CudaCalcCustomManyParticleForceKernel::execute(ContextImpl& context, bool
cu.executeKernel(startIndicesKernel, &startIndicesArgs[0], 256, 256, 256*sizeof(int));
cu.executeKernel(copyPairsKernel, &copyPairsArgs[0], maxNeighborPairs);
}
cu.executeKernel(forceKernel, &forceArgs[0], cu.getNumAtoms()*
CudaContext::ThreadBlockSize, CudaContext::ThreadBlock
Size);
cu.executeKernel(forceKernel, &forceArgs[0], cu.getNumAtoms()*
forceWorkgroupSize, forceWorkgroup
Size);
if (nonbondedMethod != NoCutoff) {
// Make sure there was enough memory for the neighbor list.
...
...
platforms/cuda/src/kernels/customManyParticle.cu
View file @
bddaf4e7
...
...
@@ -74,55 +74,15 @@ inline __device__ bool isInteractionExcluded(int atom1, int atom2, int* __restri
return
false
;
}
#define WARP_SIZE 32
/**
 * Perform a parallel inclusive prefix sum of boolean values over the threads of a block.
 * This is done as the first stage of compacting an array.
 *
 * Must be called by every thread of the block, since it contains __syncthreads() barriers.
 * The SM 3.0+ path divides blockDim.x by WARP_SIZE and uses full-warp vote/shuffle
 * operations, so it presumably expects blockDim.x to be a multiple of WARP_SIZE
 * (NOTE(review): confirm against the launch configuration).
 *
 * @param value  this thread's flag; it contributes 1 to the sum if true, 0 if false
 * @param sum    output array indexed by threadIdx.x; on return sum[threadIdx.x] holds the
 *               number of true flags among threads 0..threadIdx.x (inclusive)
 * @param temp   scratch array indexed by threadIdx.x; both the .x and .y components are
 *               overwritten (NOTE(review): assumed to hold at least blockDim.x elements)
 */
__device__ void prefixSum(bool value, short* sum, ushort2* temp) {
#if __CUDA_ARCH__ >= 300
    const int indexInWarp = threadIdx.x%WARP_SIZE;

    // Bits 0..indexInWarp set.  Computed in unsigned arithmetic so that lane 31 wraps
    // to 0xFFFFFFFF instead of overflowing a signed int.
    const unsigned int warpMask = (2u<<indexInWarp)-1u;

    // The mask-less warp intrinsics are not available when targeting Volta and later, so
    // use the *_sync forms whenever the toolkit provides them (CUDA 9+).
#if defined(__CUDACC_VER_MAJOR__) && __CUDACC_VER_MAJOR__ >= 9
    const unsigned int votes = __ballot_sync(0xffffffffu, value);
#else
    const unsigned int votes = __ballot(value);
#endif

    // Inclusive count of set flags within this thread's own warp.
    temp[threadIdx.x].x = __popc(votes&warpMask);
    __syncthreads();
    if (threadIdx.x < WARP_SIZE) {
        // The first warp scans the per-warp totals.  Lane i starts from the total of warp i,
        // which is the inclusive count stored by that warp's last lane.  Lanes that do not
        // correspond to a warp start from 0 rather than reading past the entries that were
        // written for this block; their values never flow into lower lanes.
        const int numWarps = blockDim.x/WARP_SIZE;
        int multiWarpSum = (threadIdx.x < numWarps ? temp[(threadIdx.x+1)*WARP_SIZE-1].x : 0);
        for (int offset = 1; offset < numWarps; offset *= 2) {
#if defined(__CUDACC_VER_MAJOR__) && __CUDACC_VER_MAJOR__ >= 9
            int n = __shfl_up_sync(0xffffffffu, multiWarpSum, offset, WARP_SIZE);
#else
            int n = __shfl_up(multiWarpSum, offset, WARP_SIZE);
#endif
            if (indexInWarp >= offset)
                multiWarpSum += n;
        }
        temp[threadIdx.x].y = multiWarpSum;
    }
    __syncthreads();

    // Combine the count within the warp with the running total of all preceding warps.
    sum[threadIdx.x] = temp[threadIdx.x].x+(threadIdx.x < WARP_SIZE ? 0 : temp[threadIdx.x/WARP_SIZE-1].y);
    __syncthreads();
#else
    // Fallback without warp intrinsics: a scan over the whole block that ping-pongs
    // between the .x and .y components of temp, doubling the reach on every pass.
    temp[threadIdx.x].x = value;
    __syncthreads();
    int whichBuffer = 0;
    for (int offset = 1; offset < blockDim.x; offset *= 2) {
        if (whichBuffer == 0)
            temp[threadIdx.x].y = (threadIdx.x < offset ? temp[threadIdx.x].x : temp[threadIdx.x].x+temp[threadIdx.x-offset].x);
        else
            temp[threadIdx.x].x = (threadIdx.x < offset ? temp[threadIdx.x].y : temp[threadIdx.x].y+temp[threadIdx.x-offset].y);
        whichBuffer = 1-whichBuffer;
        __syncthreads();
    }

    // The final result lives in whichever component was written last.
    if (whichBuffer == 0)
        sum[threadIdx.x] = temp[threadIdx.x].x;
    else
        sum[threadIdx.x] = temp[threadIdx.x].y;
    __syncthreads();
#endif
}
/**
* Compute the interaction.
*/
extern
"C"
__global__
void
computeInteraction
(
unsigned
long
long
*
__restrict__
forceBuffers
,
real
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
posq
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
#ifdef USE_CUTOFF
,
const
int
*
__restrict__
neighbors
,
const
int
*
__restrict__
neighborStartIndex
#endif
#ifdef USE_FILTERS
,
int
*
__restrict__
particleTypes
,
int
*
__restrict__
orderIndex
,
int
*
__restrict__
particleOrder
#endif
...
...
@@ -135,7 +95,12 @@ extern "C" __global__ void computeInteraction(
// Loop over particles to be the first one in the set.
for
(
int
p1
=
blockIdx
.
x
;
p1
<
NUM_ATOMS
;
p1
+=
gridDim
.
x
)
{
#ifdef USE_CUTOFF
int
firstNeighbor
=
neighborStartIndex
[
p1
];
int
numNeighbors
=
neighborStartIndex
[
p1
+
1
]
-
firstNeighbor
;
#else
int
numNeighbors
=
NUM_ATOMS
-
p1
-
1
;
#endif
int
numCombinations
=
NUM_CANDIDATE_COMBINATIONS
;
for
(
int
index
=
threadIdx
.
x
;
index
<
numCombinations
;
index
+=
blockDim
.
x
)
{
FIND_ATOMS_FOR_COMBINATION_INDEX
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment