customManyParticle.cc 13.6 KB
Newer Older
1
2
3
/**
 * Record the force on an atom to global memory.
 */
4
5
6
7
inline DEVICE void storeForce(int atom, real3 force, GLOBAL mm_ulong* RESTRICT forceBuffers) {
    ATOMIC_ADD(&forceBuffers[atom], (mm_ulong) ((mm_long) (force.x*0x100000000)));
    ATOMIC_ADD(&forceBuffers[atom+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.y*0x100000000)));
    ATOMIC_ADD(&forceBuffers[atom+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.z*0x100000000)));
8
9
10
11
12
13
}

/**
 * Compute the difference between two vectors, taking periodic boundary conditions into account
 * and setting the fourth component to the squared magnitude.
 */
14
15
inline DEVICE real4 delta(real3 vec1, real3 vec2, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) {
    real4 result = make_real4(vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0.0f);
16
#ifdef USE_PERIODIC
17
    APPLY_PERIODIC_TO_DELTA(result)
18
19
20
21
22
23
24
25
#endif
    result.w = result.x*result.x + result.y*result.y + result.z*result.z;
    return result;
}

/**
 * Compute the angle between two vectors.  The w component of each vector should contain the squared magnitude.
 */
26
DEVICE real computeAngle(real4 vec1, real4 vec2) {
27
28
29
30
31
32
    real dotProduct = vec1.x*vec2.x + vec1.y*vec2.y + vec1.z*vec2.z;
    real cosine = dotProduct*RSQRT(vec1.w*vec2.w);
    real angle;
    if (cosine > 0.99f || cosine < -0.99f) {
        // We're close to the singularity in acos(), so take the cross product and use asin() instead.

33
        real3 crossProduct = trimTo3(cross(vec1, vec2));
34
        real scale = vec1.w*vec2.w;
35
        angle = ASIN(SQRT(dot(crossProduct, crossProduct)/scale));
36
37
38
39
        if (cosine < 0.0f)
            angle = M_PI-angle;
    }
    else
40
       angle = ACOS(cosine);
41
42
43
44
45
46
    return angle;
}

/**
 * Compute the cross product of two vectors, setting the fourth component to the squared magnitude.
 */
47
48
49
inline DEVICE real4 computeCross(real4 vec1, real4 vec2) {
    real3 cp = trimTo3(cross(vec1, vec2));
    return make_real4(cp.x, cp.y, cp.z, cp.x*cp.x+cp.y*cp.y+cp.z*cp.z);
50
51
52
53
54
}

/**
 * Determine whether a particular interaction is in the list of exclusions.
 */
55
inline DEVICE bool isInteractionExcluded(int atom1, int atom2, GLOBAL const int* RESTRICT exclusions, GLOBAL const int* RESTRICT exclusionStartIndex) {
56
57
58
59
60
    if (atom1 > atom2) {
        int temp = atom1;
        atom1 = atom2;
        atom2 = temp;
    }
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
    int first = exclusionStartIndex[atom1];
    int last = exclusionStartIndex[atom1+1];
    for (int i = last-1; i >= first; i--) {
        int excluded = exclusions[i];
        if (excluded == atom2)
            return true;
        if (excluded <= atom1)
            return false;
    }
    return false;
}

/**
 * Compute the interaction.
 */
76
77
KERNEL void computeInteraction(
        GLOBAL mm_ulong* RESTRICT forceBuffers, GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT posq,
78
        real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
79
#ifdef USE_CUTOFF
80
        , GLOBAL const int* RESTRICT neighbors, GLOBAL const int* RESTRICT neighborStartIndex
81
82
#endif
#ifdef USE_FILTERS
83
        , GLOBAL int* RESTRICT particleTypes, GLOBAL int* RESTRICT orderIndex, GLOBAL int* RESTRICT particleOrder
84
85
#endif
#ifdef USE_EXCLUSIONS
86
        , GLOBAL int* RESTRICT exclusions, GLOBAL int* RESTRICT exclusionStartIndex
87
88
#endif
        PARAMETER_ARGUMENTS) {
89
    mixed energy = 0;
90
91
92
    
    // Loop over particles to be the first one in the set.
    
93
    for (int p1 = GROUP_ID; p1 < NUM_ATOMS; p1 += NUM_GROUPS) {
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#ifdef USE_CENTRAL_PARTICLE
        const int a1 = p1;
#else
        const int a1 = 0;
#endif
#ifdef USE_CUTOFF
        int firstNeighbor = neighborStartIndex[p1];
        int numNeighbors = neighborStartIndex[p1+1]-firstNeighbor;
#else
  #ifdef USE_CENTRAL_PARTICLE
        int numNeighbors = NUM_ATOMS;
  #else
        int numNeighbors = NUM_ATOMS-p1-1;
  #endif
#endif
        int numCombinations = NUM_CANDIDATE_COMBINATIONS;
110
        for (int index = LOCAL_ID; index < numCombinations; index += LOCAL_SIZE) {
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
            FIND_ATOMS_FOR_COMBINATION_INDEX;
            bool includeInteraction = IS_VALID_COMBINATION;
#ifdef USE_CUTOFF
            if (includeInteraction) {
                VERIFY_CUTOFF;
            }
#endif
#ifdef USE_FILTERS
            int order = orderIndex[COMPUTE_TYPE_INDEX];
            if (order == -1)
                includeInteraction = false;
#endif
#ifdef USE_EXCLUSIONS
            if (includeInteraction) {
                VERIFY_EXCLUSIONS;
            }
#endif
            if (includeInteraction) {
                PERMUTE_ATOMS;
                LOAD_PARTICLE_DATA;
                COMPUTE_INTERACTION;
            }
        }
    }
135
    energyBuffer[GLOBAL_ID] += energy;
136
137
138
139
140
}

/**
 * Find a bounding box for the atoms in each block.
 */
141
142
143
KERNEL void findBlockBounds(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
        GLOBAL const real4* RESTRICT posq, GLOBAL real4* RESTRICT blockCenter, GLOBAL real4* RESTRICT blockBoundingBox, GLOBAL int* RESTRICT numNeighborPairs) {
    int index = GLOBAL_ID;
144
145
146
147
    int base = index*TILE_SIZE;
    while (base < NUM_ATOMS) {
        real4 pos = posq[base];
#ifdef USE_PERIODIC
148
        APPLY_PERIODIC_TO_POS(pos)
149
150
151
152
153
154
155
156
#endif
        real4 minPos = pos;
        real4 maxPos = pos;
        int last = min(base+TILE_SIZE, NUM_ATOMS);
        for (int i = base+1; i < last; i++) {
            pos = posq[i];
#ifdef USE_PERIODIC
            real4 center = 0.5f*(maxPos+minPos);
157
            APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, center)
158
#endif
159
160
            minPos = make_real4(min(minPos.x,pos.x), min(minPos.y,pos.y), min(minPos.z,pos.z), 0);
            maxPos = make_real4(max(maxPos.x,pos.x), max(maxPos.y,pos.y), max(maxPos.z,pos.z), 0);
161
162
163
164
        }
        real4 blockSize = 0.5f*(maxPos-minPos);
        blockBoundingBox[index] = blockSize;
        blockCenter[index] = 0.5f*(maxPos+minPos);
165
        index += GLOBAL_SIZE;
166
167
        base = index*TILE_SIZE;
    }
168
    if (GROUP_ID == 0 && LOCAL_ID == 0)
169
170
171
172
173
174
        *numNeighborPairs = 0;
}

/**
 * Find a list of neighbors for each atom.
 */
175
176
177
KERNEL void findNeighbors(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
        GLOBAL const real4* RESTRICT posq, GLOBAL const real4* RESTRICT blockCenter, GLOBAL const real4* RESTRICT blockBoundingBox, GLOBAL int2* RESTRICT neighborPairs,
        GLOBAL int* RESTRICT numNeighborPairs, GLOBAL int* RESTRICT numNeighborsForAtom, int maxNeighborPairs
178
#ifdef USE_EXCLUSIONS
179
        , GLOBAL const int* RESTRICT exclusions, GLOBAL const int* RESTRICT exclusionStartIndex
180
181
#endif
        ) {
182
183
184
185
186
187
188
    LOCAL real3 positionCache[FIND_NEIGHBORS_WORKGROUP_SIZE];
    int indexInWarp = LOCAL_ID%32;
#ifndef __CUDA_ARCH__
    LOCAL bool includeBlockFlags[FIND_NEIGHBORS_WORKGROUP_SIZE];
    int warpStart = LOCAL_ID-indexInWarp;
#endif
    for (int atom1 = GLOBAL_ID; atom1 < PADDED_NUM_ATOMS; atom1 += GLOBAL_SIZE) {
189
190
        // Load data for this atom.  Note that all threads in a warp are processing atoms from the same block.
        
191
        real3 pos1 = trimTo3(posq[atom1]);
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
        int block1 = atom1/TILE_SIZE;
        real4 blockCenter1 = blockCenter[block1];
        real4 blockSize1 = blockBoundingBox[block1];
        int totalNeighborsForAtom1 = 0;
        
        // Loop over atom blocks to search for neighbors.  The threads in a warp compare block1 against 32
        // other blocks in parallel.

#ifdef USE_CENTRAL_PARTICLE
        int startBlock = 0;
#else
        int startBlock = block1;
#endif
        for (int block2Base = startBlock; block2Base < NUM_BLOCKS; block2Base += 32) {
            int block2 = block2Base+indexInWarp;
            bool includeBlock2 = (block2 < NUM_BLOCKS);
            if (includeBlock2) {
                real4 blockCenter2 = blockCenter[block2];
                real4 blockSize2 = blockBoundingBox[block2];
                real4 blockDelta = blockCenter1-blockCenter2;
#ifdef USE_PERIODIC
213
                APPLY_PERIODIC_TO_DELTA(blockDelta)
214
#endif
215
216
217
                blockDelta.x = max((real) 0, fabs(blockDelta.x)-blockSize1.x-blockSize2.x);
                blockDelta.y = max((real) 0, fabs(blockDelta.y)-blockSize1.y-blockSize2.y);
                blockDelta.z = max((real) 0, fabs(blockDelta.z)-blockSize1.z-blockSize2.z);
218
219
220
221
222
                includeBlock2 &= (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z < CUTOFF_SQUARED);
            }
            
            // Loop over any blocks we identified as potentially containing neighbors.
            
223
224
225
226
227
228
229
230
#ifdef __CUDA_ARCH__
            int includeBlockFlags = BALLOT(includeBlock2);
            while (includeBlockFlags != 0) {
                int i = __ffs(includeBlockFlags)-1;
                includeBlockFlags &= includeBlockFlags-1;
                {
#else
            includeBlockFlags[LOCAL_ID] = includeBlock2;
231
232
233
            SYNC_WARPS;
            for (int i = 0; i < TILE_SIZE; i++) {
                if (includeBlockFlags[warpStart+i]) {
234
#endif
235
236
237
238
239
240
241
                    int block2 = block2Base+i;

                    // Loop over atoms in this block.

                    int start = block2*TILE_SIZE;
                    int included[TILE_SIZE];
                    int numIncluded = 0;
242
                    SYNC_WARPS;
243
                    positionCache[LOCAL_ID] = trimTo3(posq[start+indexInWarp]);
244
                    SYNC_WARPS;
245
246
247
                    if (atom1 < NUM_ATOMS) {
                        for (int j = 0; j < 32; j++) {
                            int atom2 = start+j;
248
                            real3 pos2 = positionCache[LOCAL_ID-indexInWarp+j];
249
250
251

                            // Decide whether to include this atom pair in the neighbor list.

252
                            real4 atomDelta = delta(pos1, pos2, periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ);
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
#ifdef USE_CENTRAL_PARTICLE
                            bool includeAtom = (atom2 != atom1 && atom2 < NUM_ATOMS && atomDelta.w < CUTOFF_SQUARED);
#else
                            bool includeAtom = (atom2 > atom1 && atom2 < NUM_ATOMS && atomDelta.w < CUTOFF_SQUARED);
#endif
#ifdef USE_EXCLUSIONS
                            if (includeAtom)
                                includeAtom &= !isInteractionExcluded(atom1, atom2, exclusions, exclusionStartIndex);
#endif
                            if (includeAtom)
                                included[numIncluded++] = atom2;
                        }
                    }

                    // If we found any neighbors, store them to the neighbor list.

                    if (numIncluded > 0) {
270
                        int baseIndex = ATOMIC_ADD(numNeighborPairs, numIncluded);
271
272
                        if (baseIndex+numIncluded <= maxNeighborPairs)
                            for (int j = 0; j < numIncluded; j++)
273
                                neighborPairs[baseIndex+j] = make_int2(atom1, included[j]);
274
275
276
277
278
                        totalNeighborsForAtom1 += numIncluded;
                    }
                }
            }
        }
279
280
281
        if (atom1 < NUM_ATOMS)
            numNeighborsForAtom[atom1] = totalNeighborsForAtom1;
        SYNC_WARPS;
282
283
284
285
286
287
288
    }
}

/**
 * Sum the neighbor counts to compute the start position of each atom.  This kernel
 * is executed as a single work group.
 */
289
290
291
KERNEL void computeNeighborStartIndices(GLOBAL int* RESTRICT numNeighborsForAtom, GLOBAL int* RESTRICT neighborStartIndex,
            GLOBAL int* RESTRICT numNeighborPairs, int maxNeighborPairs) {
    LOCAL unsigned int posBuffer[256];
292
293
294
295
    if (*numNeighborPairs > maxNeighborPairs) {
        // There wasn't enough memory for the neighbor list, so we'll need to rebuild it.  Set the neighbor start
        // indices to indicate no neighbors for any atom.
        
296
        for (int i = LOCAL_ID; i <= NUM_ATOMS; i += LOCAL_SIZE)
297
298
299
300
            neighborStartIndex[i] = 0;
        return;
    }
    unsigned int globalOffset = 0;
301
    for (unsigned int startAtom = 0; startAtom < NUM_ATOMS; startAtom += LOCAL_SIZE) {
302
303
        // Load the neighbor counts into local memory.

304
305
306
        unsigned int globalIndex = startAtom+LOCAL_ID;
        posBuffer[LOCAL_ID] = (globalIndex < NUM_ATOMS ? numNeighborsForAtom[globalIndex] : 0);
        SYNC_THREADS;
307
308
309

        // Perform a parallel prefix sum.

310
311
312
313
314
        for (unsigned int step = 1; step < LOCAL_SIZE; step *= 2) {
            unsigned int add = (LOCAL_ID >= step ? posBuffer[LOCAL_ID-step] : 0);
            SYNC_THREADS;
            posBuffer[LOCAL_ID] += add;
            SYNC_THREADS;
315
316
317
318
319
        }

        // Write the results back to global memory.

        if (globalIndex < NUM_ATOMS) {
320
            neighborStartIndex[globalIndex+1] = posBuffer[LOCAL_ID]+globalOffset;
321
322
            numNeighborsForAtom[globalIndex] = 0; // Clear this so the next kernel can use it as a counter
        }
323
324
        globalOffset += posBuffer[LOCAL_SIZE-1];
        SYNC_THREADS;
325
    }
326
    if (LOCAL_ID == 0)
327
328
329
330
331
332
        neighborStartIndex[0] = 0;
}

/**
 * Assemble the final neighbor list.
 */
333
334
KERNEL void copyPairsToNeighborList(GLOBAL const int2* RESTRICT neighborPairs, GLOBAL int* RESTRICT neighbors, GLOBAL int* RESTRICT numNeighborPairs,
            int maxNeighborPairs, GLOBAL int* RESTRICT numNeighborsForAtom, GLOBAL const int* RESTRICT neighborStartIndex) {
335
336
337
    int actualPairs = *numNeighborPairs;
    if (actualPairs > maxNeighborPairs)
        return; // There wasn't enough memory for the neighbor list, so we'll need to rebuild it.
338
    for (unsigned int index = GLOBAL_ID; index < actualPairs; index += GLOBAL_SIZE) {
339
340
        int2 pair = neighborPairs[index];
        int startIndex = neighborStartIndex[pair.x];
341
        int offset = ATOMIC_ADD(numNeighborsForAtom+pair.x, 1);
342
343
344
        neighbors[startIndex+offset] = pair.y;
    }
}