customManyParticle.cu 13.7 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
/**
 * Record the force on an atom to global memory.
 */
inline __device__ void storeForce(int atom, real3 force, unsigned long long* __restrict__ forceBuffers) {
    atomicAdd(&forceBuffers[atom], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
    atomicAdd(&forceBuffers[atom+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
    atomicAdd(&forceBuffers[atom+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
}

/**
 * Convert a real4 to a real3 by removing its last element.
 */
inline __device__ real3 trim(real4 v) {
    return make_real3(v.x, v.y, v.z);
}

/**
18
19
 * Compute the difference between two vectors, taking periodic boundary conditions into account
 * and setting the fourth component to the squared magnitude.
20
 */
21
inline __device__ real4 delta(real3 vec1, real3 vec2, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
22
23
24
25
26
27
    real4 result = make_real4(vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0.0f);
#ifdef USE_PERIODIC
    result.x -= floor(result.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
    result.y -= floor(result.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
    result.z -= floor(result.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
    result.w = result.x*result.x + result.y*result.y + result.z*result.z;
    return result;
}

/**
 * Compute the angle between two vectors.  The w component of each vector should contain the squared magnitude.
 */
__device__ real computeAngle(real4 vec1, real4 vec2) {
    real dotProduct = vec1.x*vec2.x + vec1.y*vec2.y + vec1.z*vec2.z;
    real cosine = dotProduct*RSQRT(vec1.w*vec2.w);
    real angle;
    if (cosine > 0.99f || cosine < -0.99f) {
        // We're close to the singularity in acos(), so take the cross product and use asin() instead.

        real3 crossProduct = cross(vec1, vec2);
        real scale = vec1.w*vec2.w;
        angle = ASIN(SQRT(dot(crossProduct, crossProduct)/scale));
        if (cosine < 0.0f)
            angle = M_PI-angle;
    }
    else
       angle = ACOS(cosine);
    return angle;
}

/**
 * Compute the cross product of two vectors, setting the fourth component to the squared magnitude.
 */
inline __device__ real4 computeCross(real4 vec1, real4 vec2) {
    real3 cp = cross(vec1, vec2);
    return make_real4(cp.x, cp.y, cp.z, cp.x*cp.x+cp.y*cp.y+cp.z*cp.z);
}

61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
/**
 * Determine whether a particular interaction is in the list of exclusions.
 */
inline __device__ bool isInteractionExcluded(int atom1, int atom2, int* __restrict__ exclusions, int* __restrict__ exclusionStartIndex) {
    int first = exclusionStartIndex[atom1];
    int last = exclusionStartIndex[atom1+1];
    for (int i = last-1; i >= first; i--) {
        int excluded = exclusions[i];
        if (excluded == atom2)
            return true;
        if (excluded <= atom1)
            return false;
    }
    return false;
}

77
78
__constant__ float globals[NUM_GLOBALS];

79
80
81
82
/**
 * Compute the interaction.
 */
extern "C" __global__ void computeInteraction(
83
84
        unsigned long long* __restrict__ forceBuffers, real* __restrict__ energyBuffer, const real4* __restrict__ posq,
        real4 periodicBoxSize, real4 invPeriodicBoxSize
85
86
87
#ifdef USE_CUTOFF
        , const int* __restrict__ neighbors, const int* __restrict__ neighborStartIndex
#endif
88
89
#ifdef USE_FILTERS
        , int* __restrict__ particleTypes, int* __restrict__ orderIndex, int* __restrict__ particleOrder
90
91
92
#endif
#ifdef USE_EXCLUSIONS
        , int* __restrict__ exclusions, int* __restrict__ exclusionStartIndex
93
#endif
94
95
96
97
98
99
        PARAMETER_ARGUMENTS) {
    real energy = 0.0f;
    
    // Loop over particles to be the first one in the set.
    
    for (int p1 = blockIdx.x; p1 < NUM_ATOMS; p1 += gridDim.x) {
100
101
102
#ifdef USE_CENTRAL_PARTICLE
        const int a1 = p1;
#else
103
        const int a1 = 0;
104
#endif
105
106
107
108
#ifdef USE_CUTOFF
        int firstNeighbor = neighborStartIndex[p1];
        int numNeighbors = neighborStartIndex[p1+1]-firstNeighbor;
#else
109
110
111
  #ifdef USE_CENTRAL_PARTICLE
        int numNeighbors = NUM_ATOMS;
  #else
112
        int numNeighbors = NUM_ATOMS-p1-1;
113
  #endif
114
#endif
115
116
117
118
        int numCombinations = NUM_CANDIDATE_COMBINATIONS;
        for (int index = threadIdx.x; index < numCombinations; index += blockDim.x) {
            FIND_ATOMS_FOR_COMBINATION_INDEX;
            bool includeInteraction = IS_VALID_COMBINATION;
119
120
121
122
#ifdef USE_CUTOFF
            if (includeInteraction) {
                VERIFY_CUTOFF;
            }
123
124
125
126
127
#endif
#ifdef USE_FILTERS
            int order = orderIndex[COMPUTE_TYPE_INDEX];
            if (order == -1)
                includeInteraction = false;
128
129
130
131
132
#endif
#ifdef USE_EXCLUSIONS
            if (includeInteraction) {
                VERIFY_EXCLUSIONS;
            }
133
#endif
134
135
136
137
138
139
140
141
            if (includeInteraction) {
                PERMUTE_ATOMS;
                LOAD_PARTICLE_DATA;
                COMPUTE_INTERACTION;
            }
        }
    }
    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
}

/**
 * Find a bounding box for the atoms in each block.
 */
extern "C" __global__ void findBlockBounds(real4 periodicBoxSize, real4 invPeriodicBoxSize, const real4* __restrict__ posq,
        real4* __restrict__ blockCenter, real4* __restrict__ blockBoundingBox, int* __restrict__ numNeighborPairs) {
    int index = blockIdx.x*blockDim.x+threadIdx.x;
    int base = index*TILE_SIZE;
    while (base < NUM_ATOMS) {
        real4 pos = posq[base];
#ifdef USE_PERIODIC
        pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
        pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
        pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
#endif
        real4 minPos = pos;
        real4 maxPos = pos;
        int last = min(base+TILE_SIZE, NUM_ATOMS);
        for (int i = base+1; i < last; i++) {
            pos = posq[i];
#ifdef USE_PERIODIC
            real4 center = 0.5f*(maxPos+minPos);
            pos.x -= floor((pos.x-center.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
            pos.y -= floor((pos.y-center.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
            pos.z -= floor((pos.z-center.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
            minPos = make_real4(min(minPos.x,pos.x), min(minPos.y,pos.y), min(minPos.z,pos.z), 0);
            maxPos = make_real4(max(maxPos.x,pos.x), max(maxPos.y,pos.y), max(maxPos.z,pos.z), 0);
        }
        real4 blockSize = 0.5f*(maxPos-minPos);
        blockBoundingBox[index] = blockSize;
        blockCenter[index] = 0.5f*(maxPos+minPos);
        index += blockDim.x*gridDim.x;
        base = index*TILE_SIZE;
    }
    if (blockIdx.x == 0 && threadIdx.x == 0)
        *numNeighborPairs = 0;
}

/**
 * Find a list of neighbors for each atom.
 */
extern "C" __global__ void findNeighbors(real4 periodicBoxSize, real4 invPeriodicBoxSize, const real4* __restrict__ posq,
        const real4* __restrict__ blockCenter, const real4* __restrict__ blockBoundingBox, int2* __restrict__ neighborPairs,
        int* __restrict__ numNeighborPairs, int* __restrict__ numNeighborsForAtom, int maxNeighborPairs
#ifdef USE_EXCLUSIONS
        , int* __restrict__ exclusions, int* __restrict__ exclusionStartIndex
#endif
        ) {
192
193
    __shared__ real3 positionCache[FIND_NEIGHBORS_WORKGROUP_SIZE];
    int indexInWarp = threadIdx.x%32;
194
    for (int atom1 = blockIdx.x*blockDim.x+threadIdx.x; atom1 < PADDED_NUM_ATOMS; atom1 += blockDim.x*gridDim.x) {
195
        // Load data for this atom.  Note that all threads in a warp are processing atoms from the same block.
196
        
197
        real3 pos1 = trim(posq[atom1]);
198
199
200
201
202
        int block1 = atom1/TILE_SIZE;
        real4 blockCenter1 = blockCenter[block1];
        real4 blockSize1 = blockBoundingBox[block1];
        int totalNeighborsForAtom1 = 0;
        
203
204
205
        // Loop over atom blocks to search for neighbors.  The threads in a warp compare block1 against 32
        // other blocks in parallel.

206
207
208
209
210
211
#ifdef USE_CENTRAL_PARTICLE
        int startBlock = 0;
#else
        int startBlock = block1;
#endif
        for (int block2Base = startBlock; block2Base < NUM_BLOCKS; block2Base += 32) {
212
213
214
215
216
217
            int block2 = block2Base+indexInWarp;
            bool includeBlock2 = (block2 < NUM_BLOCKS);
            if (includeBlock2) {
                real4 blockCenter2 = blockCenter[block2];
                real4 blockSize2 = blockBoundingBox[block2];
                real4 blockDelta = blockCenter1-blockCenter2;
218
#ifdef USE_PERIODIC
219
220
221
                blockDelta.x -= floor(blockDelta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
                blockDelta.y -= floor(blockDelta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
                blockDelta.z -= floor(blockDelta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
222
#endif
223
224
225
226
227
228
229
230
231
232
233
234
235
236
                blockDelta.x = max(0.0f, fabs(blockDelta.x)-blockSize1.x-blockSize2.x);
                blockDelta.y = max(0.0f, fabs(blockDelta.y)-blockSize1.y-blockSize2.y);
                blockDelta.z = max(0.0f, fabs(blockDelta.z)-blockSize1.z-blockSize2.z);
                includeBlock2 &= (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z < CUTOFF_SQUARED);
            }
            
            // Loop over any blocks we identified as potentially containing neighbors.
            
            int includeBlockFlags = __ballot(includeBlock2);
            while (includeBlockFlags != 0) {
                int i = __ffs(includeBlockFlags)-1;
                includeBlockFlags &= includeBlockFlags-1;
                int block2 = block2Base+i;

237
                // Loop over atoms in this block.
238

239
240
241
                int start = block2*TILE_SIZE;
                int included[TILE_SIZE];
                int numIncluded = 0;
242
                positionCache[threadIdx.x] = trim(posq[start+indexInWarp]);
243
244
245
246
                if (atom1 < NUM_ATOMS) {
                    for (int j = 0; j < 32; j++) {
                        int atom2 = start+j;
                        real3 pos2 = positionCache[threadIdx.x-indexInWarp+j];
247

248
                        // Decide whether to include this atom pair in the neighbor list.
249

250
                        real4 atomDelta = delta(pos1, pos2, periodicBoxSize, invPeriodicBoxSize);
251
#ifdef USE_CENTRAL_PARTICLE
252
                        bool includeAtom = (atom2 != atom1 && atom2 < NUM_ATOMS && atomDelta.w < CUTOFF_SQUARED);
253
#else
254
                        bool includeAtom = (atom2 > atom1 && atom2 < NUM_ATOMS && atomDelta.w < CUTOFF_SQUARED);
255
#endif
256
#ifdef USE_EXCLUSIONS
257
258
                        if (includeAtom)
                            includeAtom &= !isInteractionExcluded(atom1, atom2, exclusions, exclusionStartIndex);
259
#endif
260
261
262
                        if (includeAtom)
                            included[numIncluded++] = atom2;
                    }
263
                }
264

265
                // If we found any neighbors, store them to the neighbor list.
266

267
268
269
                if (numIncluded > 0) {
                    int baseIndex = atomicAdd(numNeighborPairs, numIncluded);
                    if (baseIndex+numIncluded <= maxNeighborPairs)
270
271
                        for (int j = 0; j < numIncluded; j++)
                            neighborPairs[baseIndex+j] = make_int2(atom1, included[j]);
272
273
274
275
276
277
278
279
280
281
282
283
                    totalNeighborsForAtom1 += numIncluded;
                }
            }
        }
        numNeighborsForAtom[atom1] = totalNeighborsForAtom1;
    }
}

/**
 * Sum the neighbor counts to compute the start position of each atom.  This kernel
 * is executed as a single work group.
 */
284
285
extern "C" __global__ void computeNeighborStartIndices(int* __restrict__ numNeighborsForAtom, int* __restrict__ neighborStartIndex,
            int* __restrict__ numNeighborPairs, int maxNeighborPairs) {
286
    extern __shared__ unsigned int posBuffer[];
287
288
289
290
291
292
293
294
    if (*numNeighborPairs > maxNeighborPairs) {
        // There wasn't enough memory for the neighbor list, so we'll need to rebuild it.  Set the neighbor start
        // indices to indicate no neighbors for any atom.
        
        for (int i = threadIdx.x; i <= NUM_ATOMS; i += blockDim.x)
            neighborStartIndex[i] = 0;
        return;
    }
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
    unsigned int globalOffset = 0;
    for (unsigned int startAtom = 0; startAtom < NUM_ATOMS; startAtom += blockDim.x) {
        // Load the neighbor counts into local memory.

        unsigned int globalIndex = startAtom+threadIdx.x;
        posBuffer[threadIdx.x] = (globalIndex < NUM_ATOMS ? numNeighborsForAtom[globalIndex] : 0);
        __syncthreads();

        // Perform a parallel prefix sum.

        for (unsigned int step = 1; step < blockDim.x; step *= 2) {
            unsigned int add = (threadIdx.x >= step ? posBuffer[threadIdx.x-step] : 0);
            __syncthreads();
            posBuffer[threadIdx.x] += add;
            __syncthreads();
        }

        // Write the results back to global memory.

314
        if (globalIndex < NUM_ATOMS) {
315
            neighborStartIndex[globalIndex+1] = posBuffer[threadIdx.x]+globalOffset;
316
317
            numNeighborsForAtom[globalIndex] = 0; // Clear this so the next kernel can use it as a counter
        }
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
        globalOffset += posBuffer[blockDim.x-1];
    }
    if (threadIdx.x == 0)
        neighborStartIndex[0] = 0;
}

/**
 * Assemble the final neighbor list.
 */
extern "C" __global__ void copyPairsToNeighborList(const int2* __restrict__ neighborPairs, int* __restrict__ neighbors, int* __restrict__ numNeighborPairs,
            int maxNeighborPairs, int* __restrict__ numNeighborsForAtom, const int* __restrict__ neighborStartIndex) {
    int actualPairs = *numNeighborPairs;
    if (actualPairs > maxNeighborPairs)
        return; // There wasn't enough memory for the neighbor list, so we'll need to rebuild it.
    for (unsigned int index = blockDim.x*blockIdx.x+threadIdx.x; index < actualPairs; index += blockDim.x*gridDim.x) {
        int2 pair = neighborPairs[index];
        int startIndex = neighborStartIndex[pair.x];
        int offset = atomicAdd(numNeighborsForAtom+pair.x, 1);
        neighbors[startIndex+offset] = pair.y;
    }
}