Commit bddaf4e7 authored by peastman's avatar peastman
Browse files

CUDA version of CustomManyParticleForce uses neighbor list

parent e3b631f6
...@@ -964,7 +964,7 @@ private: ...@@ -964,7 +964,7 @@ private:
CudaContext& cu; CudaContext& cu;
bool hasInitializedKernel; bool hasInitializedKernel;
NonbondedMethod nonbondedMethod; NonbondedMethod nonbondedMethod;
int maxNeighborPairs; int maxNeighborPairs, forceWorkgroupSize;
CudaParameterSet* params; CudaParameterSet* params;
CudaArray* globals; CudaArray* globals;
CudaArray* particleTypes; CudaArray* particleTypes;
......
...@@ -4475,6 +4475,7 @@ void CudaCalcCustomManyParticleForceKernel::initialize(const System& system, con ...@@ -4475,6 +4475,7 @@ void CudaCalcCustomManyParticleForceKernel::initialize(const System& system, con
int numParticles = force.getNumParticles(); int numParticles = force.getNumParticles();
int particlesPerSet = force.getNumParticlesPerSet(); int particlesPerSet = force.getNumParticlesPerSet();
nonbondedMethod = CalcCustomManyParticleForceKernel::NonbondedMethod(force.getNonbondedMethod()); nonbondedMethod = CalcCustomManyParticleForceKernel::NonbondedMethod(force.getNonbondedMethod());
forceWorkgroupSize = 128;
// Record parameter values. // Record parameter values.
...@@ -4804,19 +4805,21 @@ void CudaCalcCustomManyParticleForceKernel::initialize(const System& system, con ...@@ -4804,19 +4805,21 @@ void CudaCalcCustomManyParticleForceKernel::initialize(const System& system, con
if (i > 1) if (i > 1)
numCombinations<<"*"; numCombinations<<"*";
numCombinations<<"numNeighbors"; numCombinations<<"numNeighbors";
if (nonbondedMethod == NoCutoff)
atomsForCombination<<"int p"<<(i+1)<<" = p1+1+tempIndex%numNeighbors;\n"; atomsForCombination<<"int p"<<(i+1)<<" = p1+1+tempIndex%numNeighbors;\n";
else
atomsForCombination<<"int p"<<(i+1)<<" = neighbors[firstNeighbor+tempIndex%numNeighbors];\n";
atomsForCombination<<"tempIndex /= numNeighbors;\n"; atomsForCombination<<"tempIndex /= numNeighbors;\n";
} }
if (nonbondedMethod != NoCutoff) { if (nonbondedMethod != NoCutoff) {
int startCheckFrom = 0; for (int i = 1; i < particlesPerSet; i++)
for (int i = startCheckFrom; i < particlesPerSet; i++)
verifyCutoff<<"real4 pos"<<(i+1)<<" = posq[p"<<(i+1)<<"];\n"; verifyCutoff<<"real4 pos"<<(i+1)<<" = posq[p"<<(i+1)<<"];\n";
for (int i = startCheckFrom; i < particlesPerSet; i++) for (int i = 1; i < particlesPerSet; i++)
for (int j = i+1; j < particlesPerSet; j++) for (int j = i+1; j < particlesPerSet; j++)
verifyCutoff<<"includeInteraction &= (delta(pos"<<(i+1)<<", pos"<<(j+1)<<", periodicBoxSize, invPeriodicBoxSize).w < CUTOFF_SQUARED);\n"; verifyCutoff<<"includeInteraction &= (delta(pos"<<(i+1)<<", pos"<<(j+1)<<", periodicBoxSize, invPeriodicBoxSize).w < CUTOFF_SQUARED);\n";
} }
if (force.getNumExclusions() > 0) { if (force.getNumExclusions() > 0) {
int startCheckFrom = 0; int startCheckFrom = (nonbondedMethod == NoCutoff ? 0 : 1);
for (int i = startCheckFrom; i < particlesPerSet; i++) for (int i = startCheckFrom; i < particlesPerSet; i++)
for (int j = i+1; j < particlesPerSet; j++) for (int j = i+1; j < particlesPerSet; j++)
verifyExclusions<<"includeInteraction &= !isInteractionExcluded(p"<<(i+1)<<", p"<<(j+1)<<", exclusions, exclusionStartIndex);\n"; verifyExclusions<<"includeInteraction &= !isInteractionExcluded(p"<<(i+1)<<", p"<<(j+1)<<", exclusions, exclusionStartIndex);\n";
...@@ -4883,6 +4886,10 @@ double CudaCalcCustomManyParticleForceKernel::execute(ContextImpl& context, bool ...@@ -4883,6 +4886,10 @@ double CudaCalcCustomManyParticleForceKernel::execute(ContextImpl& context, bool
forceArgs.push_back(&cu.getPosq().getDevicePointer()); forceArgs.push_back(&cu.getPosq().getDevicePointer());
forceArgs.push_back(cu.getPeriodicBoxSizePointer()); forceArgs.push_back(cu.getPeriodicBoxSizePointer());
forceArgs.push_back(cu.getInvPeriodicBoxSizePointer()); forceArgs.push_back(cu.getInvPeriodicBoxSizePointer());
if (nonbondedMethod != NoCutoff) {
forceArgs.push_back(&neighbors->getDevicePointer());
forceArgs.push_back(&neighborStartIndex->getDevicePointer());
}
if (particleTypes != NULL) { if (particleTypes != NULL) {
forceArgs.push_back(&particleTypes->getDevicePointer()); forceArgs.push_back(&particleTypes->getDevicePointer());
forceArgs.push_back(&orderIndex->getDevicePointer()); forceArgs.push_back(&orderIndex->getDevicePointer());
...@@ -4967,7 +4974,7 @@ double CudaCalcCustomManyParticleForceKernel::execute(ContextImpl& context, bool ...@@ -4967,7 +4974,7 @@ double CudaCalcCustomManyParticleForceKernel::execute(ContextImpl& context, bool
cu.executeKernel(startIndicesKernel, &startIndicesArgs[0], 256, 256, 256*sizeof(int)); cu.executeKernel(startIndicesKernel, &startIndicesArgs[0], 256, 256, 256*sizeof(int));
cu.executeKernel(copyPairsKernel, &copyPairsArgs[0], maxNeighborPairs); cu.executeKernel(copyPairsKernel, &copyPairsArgs[0], maxNeighborPairs);
} }
cu.executeKernel(forceKernel, &forceArgs[0], cu.getNumAtoms()*CudaContext::ThreadBlockSize, CudaContext::ThreadBlockSize); cu.executeKernel(forceKernel, &forceArgs[0], cu.getNumAtoms()*forceWorkgroupSize, forceWorkgroupSize);
if (nonbondedMethod != NoCutoff) { if (nonbondedMethod != NoCutoff) {
// Make sure there was enough memory for the neighbor list. // Make sure there was enough memory for the neighbor list.
......
...@@ -74,55 +74,15 @@ inline __device__ bool isInteractionExcluded(int atom1, int atom2, int* __restri ...@@ -74,55 +74,15 @@ inline __device__ bool isInteractionExcluded(int atom1, int atom2, int* __restri
return false; return false;
} }
#define WARP_SIZE 32
/**
* Perform a parallel prefix sum of boolean values over an array. This is done as the first stage of compacting an array.
*/
__device__ void prefixSum(bool value, short* sum, ushort2* temp) {
#if __CUDA_ARCH__ >= 300
const int indexInWarp = threadIdx.x%WARP_SIZE;
const int warpMask = (2<<indexInWarp)-1;
temp[threadIdx.x].x = __popc(__ballot(value)&warpMask);
__syncthreads();
if (threadIdx.x < WARP_SIZE) {
int multiWarpSum = temp[(threadIdx.x+1)*WARP_SIZE-1].x;
for (int offset = 1; offset < blockDim.x/WARP_SIZE; offset *= 2) {
short n = __shfl_up(multiWarpSum, offset, WARP_SIZE);
if (indexInWarp >= offset)
multiWarpSum += n;
}
temp[threadIdx.x].y = multiWarpSum;
}
__syncthreads();
sum[threadIdx.x] = temp[threadIdx.x].x+(threadIdx.x < WARP_SIZE ? 0 : temp[threadIdx.x/WARP_SIZE-1].y);
__syncthreads();
#else
temp[threadIdx.x].x = value;
__syncthreads();
int whichBuffer = 0;
for (int offset = 1; offset < blockDim.x; offset *= 2) {
if (whichBuffer == 0)
temp[threadIdx.x].y = (threadIdx.x < offset ? temp[threadIdx.x].x : temp[threadIdx.x].x+temp[threadIdx.x-offset].x);
else
temp[threadIdx.x].x = (threadIdx.x < offset ? temp[threadIdx.x].y : temp[threadIdx.x].y+temp[threadIdx.x-offset].y);
whichBuffer = 1-whichBuffer;
__syncthreads();
}
if (whichBuffer == 0)
sum[threadIdx.x] = temp[threadIdx.x].x;
else
sum[threadIdx.x] = temp[threadIdx.x].y;
__syncthreads();
#endif
}
/** /**
* Compute the interaction. * Compute the interaction.
*/ */
extern "C" __global__ void computeInteraction( extern "C" __global__ void computeInteraction(
unsigned long long* __restrict__ forceBuffers, real* __restrict__ energyBuffer, const real4* __restrict__ posq, unsigned long long* __restrict__ forceBuffers, real* __restrict__ energyBuffer, const real4* __restrict__ posq,
real4 periodicBoxSize, real4 invPeriodicBoxSize real4 periodicBoxSize, real4 invPeriodicBoxSize
#ifdef USE_CUTOFF
, const int* __restrict__ neighbors, const int* __restrict__ neighborStartIndex
#endif
#ifdef USE_FILTERS #ifdef USE_FILTERS
, int* __restrict__ particleTypes, int* __restrict__ orderIndex, int* __restrict__ particleOrder , int* __restrict__ particleTypes, int* __restrict__ orderIndex, int* __restrict__ particleOrder
#endif #endif
...@@ -135,7 +95,12 @@ extern "C" __global__ void computeInteraction( ...@@ -135,7 +95,12 @@ extern "C" __global__ void computeInteraction(
// Loop over particles to be the first one in the set. // Loop over particles to be the first one in the set.
for (int p1 = blockIdx.x; p1 < NUM_ATOMS; p1 += gridDim.x) { for (int p1 = blockIdx.x; p1 < NUM_ATOMS; p1 += gridDim.x) {
#ifdef USE_CUTOFF
int firstNeighbor = neighborStartIndex[p1];
int numNeighbors = neighborStartIndex[p1+1]-firstNeighbor;
#else
int numNeighbors = NUM_ATOMS-p1-1; int numNeighbors = NUM_ATOMS-p1-1;
#endif
int numCombinations = NUM_CANDIDATE_COMBINATIONS; int numCombinations = NUM_CANDIDATE_COMBINATIONS;
for (int index = threadIdx.x; index < numCombinations; index += blockDim.x) { for (int index = threadIdx.x; index < numCombinations; index += blockDim.x) {
FIND_ATOMS_FOR_COMBINATION_INDEX; FIND_ATOMS_FOR_COMBINATION_INDEX;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment