customManyParticle.cc 12.6 KB
Newer Older
1
2
3
/**
 * Record the force on an atom to global memory.
 */
4
inline DEVICE void storeForce(int atom, real3 force, GLOBAL mm_ulong* RESTRICT forceBuffers) {
5
6
7
    ATOMIC_ADD(&forceBuffers[atom], (mm_ulong) realToFixedPoint(force.x));
    ATOMIC_ADD(&forceBuffers[atom+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y));
    ATOMIC_ADD(&forceBuffers[atom+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.z));
8
9
10
11
12
13
}

/**
 * Compute the difference between two vectors, taking periodic boundary conditions into account
 * and setting the fourth component to the squared magnitude.
 */
14
15
inline DEVICE real4 delta(real3 vec1, real3 vec2, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) {
    real4 result = make_real4(vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0.0f);
16
#ifdef USE_PERIODIC
17
    APPLY_PERIODIC_TO_DELTA(result)
18
19
20
21
22
23
24
25
#endif
    result.w = result.x*result.x + result.y*result.y + result.z*result.z;
    return result;
}

/**
 * Determine whether a particular interaction is in the list of exclusions.
 */
26
inline DEVICE bool isInteractionExcluded(int atom1, int atom2, GLOBAL const int* RESTRICT exclusions, GLOBAL const int* RESTRICT exclusionStartIndex) {
27
28
29
30
31
    if (atom1 > atom2) {
        int temp = atom1;
        atom1 = atom2;
        atom2 = temp;
    }
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
    int first = exclusionStartIndex[atom1];
    int last = exclusionStartIndex[atom1+1];
    for (int i = last-1; i >= first; i--) {
        int excluded = exclusions[i];
        if (excluded == atom2)
            return true;
        if (excluded <= atom1)
            return false;
    }
    return false;
}

/**
 * Compute the interaction.
 */
47
48
KERNEL void computeInteraction(
        GLOBAL mm_ulong* RESTRICT forceBuffers, GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT posq,
49
        real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
50
#ifdef USE_CUTOFF
51
        , GLOBAL const int* RESTRICT neighbors, GLOBAL const int* RESTRICT neighborStartIndex
52
53
#endif
#ifdef USE_FILTERS
54
        , GLOBAL int* RESTRICT particleTypes, GLOBAL int* RESTRICT orderIndex, GLOBAL int* RESTRICT particleOrder
55
56
#endif
#ifdef USE_EXCLUSIONS
57
        , GLOBAL int* RESTRICT exclusions, GLOBAL int* RESTRICT exclusionStartIndex
58
59
#endif
        PARAMETER_ARGUMENTS) {
60
    mixed energy = 0;
61
62
63
    
    // Loop over particles to be the first one in the set.
    
64
    for (int p1 = GROUP_ID; p1 < NUM_ATOMS; p1 += NUM_GROUPS) {
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#ifdef USE_CENTRAL_PARTICLE
        const int a1 = p1;
#else
        const int a1 = 0;
#endif
#ifdef USE_CUTOFF
        int firstNeighbor = neighborStartIndex[p1];
        int numNeighbors = neighborStartIndex[p1+1]-firstNeighbor;
#else
  #ifdef USE_CENTRAL_PARTICLE
        int numNeighbors = NUM_ATOMS;
  #else
        int numNeighbors = NUM_ATOMS-p1-1;
  #endif
#endif
        int numCombinations = NUM_CANDIDATE_COMBINATIONS;
81
        for (int index = LOCAL_ID; index < numCombinations; index += LOCAL_SIZE) {
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
            FIND_ATOMS_FOR_COMBINATION_INDEX;
            bool includeInteraction = IS_VALID_COMBINATION;
#ifdef USE_CUTOFF
            if (includeInteraction) {
                VERIFY_CUTOFF;
            }
#endif
#ifdef USE_FILTERS
            int order = orderIndex[COMPUTE_TYPE_INDEX];
            if (order == -1)
                includeInteraction = false;
#endif
#ifdef USE_EXCLUSIONS
            if (includeInteraction) {
                VERIFY_EXCLUSIONS;
            }
#endif
            if (includeInteraction) {
                PERMUTE_ATOMS;
                LOAD_PARTICLE_DATA;
                COMPUTE_INTERACTION;
            }
        }
    }
106
    energyBuffer[GLOBAL_ID] += energy;
107
108
109
110
111
}

/**
 * Find a bounding box for the atoms in each block.
 */
112
113
114
KERNEL void findBlockBounds(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
        GLOBAL const real4* RESTRICT posq, GLOBAL real4* RESTRICT blockCenter, GLOBAL real4* RESTRICT blockBoundingBox, GLOBAL int* RESTRICT numNeighborPairs) {
    int index = GLOBAL_ID;
115
116
117
118
    int base = index*TILE_SIZE;
    while (base < NUM_ATOMS) {
        real4 pos = posq[base];
#ifdef USE_PERIODIC
119
        APPLY_PERIODIC_TO_POS(pos)
120
121
122
123
124
125
126
127
#endif
        real4 minPos = pos;
        real4 maxPos = pos;
        int last = min(base+TILE_SIZE, NUM_ATOMS);
        for (int i = base+1; i < last; i++) {
            pos = posq[i];
#ifdef USE_PERIODIC
            real4 center = 0.5f*(maxPos+minPos);
128
            APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, center)
129
#endif
130
131
            minPos = make_real4(min(minPos.x,pos.x), min(minPos.y,pos.y), min(minPos.z,pos.z), 0);
            maxPos = make_real4(max(maxPos.x,pos.x), max(maxPos.y,pos.y), max(maxPos.z,pos.z), 0);
132
133
134
135
        }
        real4 blockSize = 0.5f*(maxPos-minPos);
        blockBoundingBox[index] = blockSize;
        blockCenter[index] = 0.5f*(maxPos+minPos);
136
        index += GLOBAL_SIZE;
137
138
        base = index*TILE_SIZE;
    }
139
    if (GROUP_ID == 0 && LOCAL_ID == 0)
140
141
142
143
144
145
        *numNeighborPairs = 0;
}

/**
 * Find a list of neighbors for each atom.
 */
146
147
148
KERNEL void findNeighbors(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
        GLOBAL const real4* RESTRICT posq, GLOBAL const real4* RESTRICT blockCenter, GLOBAL const real4* RESTRICT blockBoundingBox, GLOBAL int2* RESTRICT neighborPairs,
        GLOBAL int* RESTRICT numNeighborPairs, GLOBAL int* RESTRICT numNeighborsForAtom, int maxNeighborPairs
149
#ifdef USE_EXCLUSIONS
150
        , GLOBAL const int* RESTRICT exclusions, GLOBAL const int* RESTRICT exclusionStartIndex
151
152
#endif
        ) {
153
154
    LOCAL real3 positionCache[FIND_NEIGHBORS_WORKGROUP_SIZE];
    int indexInWarp = LOCAL_ID%32;
155
#if !(defined(__CUDA_ARCH__) || defined(USE_HIP))
156
157
158
159
    LOCAL bool includeBlockFlags[FIND_NEIGHBORS_WORKGROUP_SIZE];
    int warpStart = LOCAL_ID-indexInWarp;
#endif
    for (int atom1 = GLOBAL_ID; atom1 < PADDED_NUM_ATOMS; atom1 += GLOBAL_SIZE) {
160
161
        // Load data for this atom.  Note that all threads in a warp are processing atoms from the same block.
        
162
        real3 pos1 = trimTo3(posq[atom1]);
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
        int block1 = atom1/TILE_SIZE;
        real4 blockCenter1 = blockCenter[block1];
        real4 blockSize1 = blockBoundingBox[block1];
        int totalNeighborsForAtom1 = 0;
        
        // Loop over atom blocks to search for neighbors.  The threads in a warp compare block1 against 32
        // other blocks in parallel.

#ifdef USE_CENTRAL_PARTICLE
        int startBlock = 0;
#else
        int startBlock = block1;
#endif
        for (int block2Base = startBlock; block2Base < NUM_BLOCKS; block2Base += 32) {
            int block2 = block2Base+indexInWarp;
            bool includeBlock2 = (block2 < NUM_BLOCKS);
            if (includeBlock2) {
                real4 blockCenter2 = blockCenter[block2];
                real4 blockSize2 = blockBoundingBox[block2];
                real4 blockDelta = blockCenter1-blockCenter2;
#ifdef USE_PERIODIC
184
                APPLY_PERIODIC_TO_DELTA(blockDelta)
185
#endif
186
187
188
                blockDelta.x = max((real) 0, fabs(blockDelta.x)-blockSize1.x-blockSize2.x);
                blockDelta.y = max((real) 0, fabs(blockDelta.y)-blockSize1.y-blockSize2.y);
                blockDelta.z = max((real) 0, fabs(blockDelta.z)-blockSize1.z-blockSize2.z);
189
190
191
192
193
                includeBlock2 &= (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z < CUTOFF_SQUARED);
            }
            
            // Loop over any blocks we identified as potentially containing neighbors.
            
194
#if defined(__CUDA_ARCH__) || defined(USE_HIP)
195
196
197
198
199
200
201
            int includeBlockFlags = BALLOT(includeBlock2);
            while (includeBlockFlags != 0) {
                int i = __ffs(includeBlockFlags)-1;
                includeBlockFlags &= includeBlockFlags-1;
                {
#else
            includeBlockFlags[LOCAL_ID] = includeBlock2;
202
203
204
            SYNC_WARPS;
            for (int i = 0; i < TILE_SIZE; i++) {
                if (includeBlockFlags[warpStart+i]) {
205
#endif
206
207
208
209
210
211
212
                    int block2 = block2Base+i;

                    // Loop over atoms in this block.

                    int start = block2*TILE_SIZE;
                    int included[TILE_SIZE];
                    int numIncluded = 0;
213
                    SYNC_WARPS;
214
                    positionCache[LOCAL_ID] = trimTo3(posq[start+indexInWarp]);
215
                    SYNC_WARPS;
216
217
218
                    if (atom1 < NUM_ATOMS) {
                        for (int j = 0; j < 32; j++) {
                            int atom2 = start+j;
219
                            real3 pos2 = positionCache[LOCAL_ID-indexInWarp+j];
220
221
222

                            // Decide whether to include this atom pair in the neighbor list.

223
                            real4 atomDelta = delta(pos1, pos2, periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ);
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
#ifdef USE_CENTRAL_PARTICLE
                            bool includeAtom = (atom2 != atom1 && atom2 < NUM_ATOMS && atomDelta.w < CUTOFF_SQUARED);
#else
                            bool includeAtom = (atom2 > atom1 && atom2 < NUM_ATOMS && atomDelta.w < CUTOFF_SQUARED);
#endif
#ifdef USE_EXCLUSIONS
                            if (includeAtom)
                                includeAtom &= !isInteractionExcluded(atom1, atom2, exclusions, exclusionStartIndex);
#endif
                            if (includeAtom)
                                included[numIncluded++] = atom2;
                        }
                    }

                    // If we found any neighbors, store them to the neighbor list.

                    if (numIncluded > 0) {
241
                        int baseIndex = ATOMIC_ADD(numNeighborPairs, numIncluded);
242
243
                        if (baseIndex+numIncluded <= maxNeighborPairs)
                            for (int j = 0; j < numIncluded; j++)
244
                                neighborPairs[baseIndex+j] = make_int2(atom1, included[j]);
245
246
247
248
249
                        totalNeighborsForAtom1 += numIncluded;
                    }
                }
            }
        }
250
251
252
        if (atom1 < NUM_ATOMS)
            numNeighborsForAtom[atom1] = totalNeighborsForAtom1;
        SYNC_WARPS;
253
254
255
256
257
258
259
    }
}

/**
 * Sum the neighbor counts to compute the start position of each atom.  This kernel
 * is executed as a single work group.
 */
260
261
262
KERNEL void computeNeighborStartIndices(GLOBAL int* RESTRICT numNeighborsForAtom, GLOBAL int* RESTRICT neighborStartIndex,
            GLOBAL int* RESTRICT numNeighborPairs, int maxNeighborPairs) {
    LOCAL unsigned int posBuffer[256];
263
264
265
266
    if (*numNeighborPairs > maxNeighborPairs) {
        // There wasn't enough memory for the neighbor list, so we'll need to rebuild it.  Set the neighbor start
        // indices to indicate no neighbors for any atom.
        
267
        for (int i = LOCAL_ID; i <= NUM_ATOMS; i += LOCAL_SIZE)
268
269
270
271
            neighborStartIndex[i] = 0;
        return;
    }
    unsigned int globalOffset = 0;
272
    for (unsigned int startAtom = 0; startAtom < NUM_ATOMS; startAtom += LOCAL_SIZE) {
273
274
        // Load the neighbor counts into local memory.

275
276
277
        unsigned int globalIndex = startAtom+LOCAL_ID;
        posBuffer[LOCAL_ID] = (globalIndex < NUM_ATOMS ? numNeighborsForAtom[globalIndex] : 0);
        SYNC_THREADS;
278
279
280

        // Perform a parallel prefix sum.

281
282
283
284
285
        for (unsigned int step = 1; step < LOCAL_SIZE; step *= 2) {
            unsigned int add = (LOCAL_ID >= step ? posBuffer[LOCAL_ID-step] : 0);
            SYNC_THREADS;
            posBuffer[LOCAL_ID] += add;
            SYNC_THREADS;
286
287
288
289
290
        }

        // Write the results back to global memory.

        if (globalIndex < NUM_ATOMS) {
291
            neighborStartIndex[globalIndex+1] = posBuffer[LOCAL_ID]+globalOffset;
292
293
            numNeighborsForAtom[globalIndex] = 0; // Clear this so the next kernel can use it as a counter
        }
294
295
        globalOffset += posBuffer[LOCAL_SIZE-1];
        SYNC_THREADS;
296
    }
297
    if (LOCAL_ID == 0)
298
299
300
301
302
303
        neighborStartIndex[0] = 0;
}

/**
 * Assemble the final neighbor list.
 */
304
305
KERNEL void copyPairsToNeighborList(GLOBAL const int2* RESTRICT neighborPairs, GLOBAL int* RESTRICT neighbors, GLOBAL int* RESTRICT numNeighborPairs,
            int maxNeighborPairs, GLOBAL int* RESTRICT numNeighborsForAtom, GLOBAL const int* RESTRICT neighborStartIndex) {
306
307
308
    int actualPairs = *numNeighborPairs;
    if (actualPairs > maxNeighborPairs)
        return; // There wasn't enough memory for the neighbor list, so we'll need to rebuild it.
309
    for (unsigned int index = GLOBAL_ID; index < actualPairs; index += GLOBAL_SIZE) {
310
311
        int2 pair = neighborPairs[index];
        int startIndex = neighborStartIndex[pair.x];
312
        int offset = ATOMIC_ADD(numNeighborsForAtom+pair.x, 1);
313
314
315
        neighbors[startIndex+offset] = pair.y;
    }
}