customManyParticle.cl 13.6 KB
Newer Older
1
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable

/**
 * Record the force on an atom to global memory.
 */
inline void storeForce(int atom, real4 force, __global long* restrict forceBuffers) {
    atom_add(&forceBuffers[atom], (long) (force.x*0x100000000));
    atom_add(&forceBuffers[atom+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
    atom_add(&forceBuffers[atom+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
}

/**
 * Compute the difference between two vectors, taking periodic boundary conditions into account
 * and setting the fourth component to the squared magnitude.
 */
17
inline real4 delta(real4 vec1, real4 vec2, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) {
18
19
    real4 result = (real4) (vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0.0f);
#ifdef USE_PERIODIC
20
    APPLY_PERIODIC_TO_DELTA(result)
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
#endif
    result.w = result.x*result.x + result.y*result.y + result.z*result.z;
    return result;
}

/**
 * Compute the angle between two vectors.  The w component of each vector should contain the squared magnitude.
 */
real computeAngle(real4 vec1, real4 vec2) {
    real dotProduct = vec1.x*vec2.x + vec1.y*vec2.y + vec1.z*vec2.z;
    real cosine = dotProduct*RSQRT(vec1.w*vec2.w);
    real angle;
    if (cosine > 0.99f || cosine < -0.99f) {
        // We're close to the singularity in acos(), so take the cross product and use asin() instead.

        real4 crossProduct = cross(vec1, vec2);
        real scale = vec1.w*vec2.w;
        angle = asin(SQRT(dot(crossProduct, crossProduct)/scale));
        if (cosine < 0.0f)
            angle = M_PI-angle;
    }
    else
       angle = acos(cosine);
    return angle;
}

/**
 * Compute the cross product of two vectors, setting the fourth component to the squared magnitude.
 */
inline real4 computeCross(real4 vec1, real4 vec2) {
    real4 cp = cross(vec1, vec2);
    return (real4) (cp.x, cp.y, cp.z, cp.x*cp.x+cp.y*cp.y+cp.z*cp.z);
}

/**
 * Determine whether a particular interaction is in the list of exclusions.
 */
58
inline bool isInteractionExcluded(int atom1, int atom2, __global const int* restrict exclusions, __global const int* restrict exclusionStartIndex) {
59
60
61
62
63
    if (atom1 > atom2) {
        int temp = atom1;
        atom1 = atom2;
        atom2 = temp;
    }
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
    int first = exclusionStartIndex[atom1];
    int last = exclusionStartIndex[atom1+1];
    for (int i = last-1; i >= first; i--) {
        int excluded = exclusions[i];
        if (excluded == atom2)
            return true;
        if (excluded <= atom1)
            return false;
    }
    return false;
}

/**
 * Compute the interaction.
 */
__kernel void computeInteraction(
80
        __global long* restrict forceBuffers, __global mixed* restrict energyBuffer, __global const real4* restrict posq,
81
        real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
82
83
84
85
86
87
88
#ifdef USE_CUTOFF
        , __global const int* restrict neighbors, __global const int* restrict neighborStartIndex
#endif
#ifdef USE_FILTERS
        , __global int* restrict particleTypes, __global int* restrict orderIndex, __global int* restrict particleOrder
#endif
#ifdef USE_EXCLUSIONS
89
        , __global int* restrict exclusions, __global int* restrict exclusionStartIndex
90
91
#endif
        PARAMETER_ARGUMENTS) {
92
    mixed energy = 0;
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
    
    // Loop over particles to be the first one in the set.
    
    for (int p1 = get_group_id(0); p1 < NUM_ATOMS; p1 += get_num_groups(0)) {
#ifdef USE_CENTRAL_PARTICLE
        const int a1 = p1;
#else
        const int a1 = 0;
#endif
#ifdef USE_CUTOFF
        int firstNeighbor = neighborStartIndex[p1];
        int numNeighbors = neighborStartIndex[p1+1]-firstNeighbor;
#else
  #ifdef USE_CENTRAL_PARTICLE
        int numNeighbors = NUM_ATOMS;
  #else
        int numNeighbors = NUM_ATOMS-p1-1;
  #endif
#endif
        int numCombinations = NUM_CANDIDATE_COMBINATIONS;
        for (int index = get_local_id(0); index < numCombinations; index += get_local_size(0)) {
            FIND_ATOMS_FOR_COMBINATION_INDEX;
            bool includeInteraction = IS_VALID_COMBINATION;
#ifdef USE_CUTOFF
            if (includeInteraction) {
                VERIFY_CUTOFF;
            }
#endif
#ifdef USE_FILTERS
            int order = orderIndex[COMPUTE_TYPE_INDEX];
            if (order == -1)
                includeInteraction = false;
#endif
#ifdef USE_EXCLUSIONS
            if (includeInteraction) {
                VERIFY_EXCLUSIONS;
            }
#endif
            if (includeInteraction) {
                PERMUTE_ATOMS;
                LOAD_PARTICLE_DATA;
                COMPUTE_INTERACTION;
            }
        }
    }
    energyBuffer[get_global_id(0)] += energy;
}

/**
 * Find a bounding box for the atoms in each block.
 */
144
145
__kernel void findBlockBounds(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
        __global const real4* restrict posq, __global real4* restrict blockCenter, __global real4* restrict blockBoundingBox, __global int* restrict numNeighborPairs) {
146
147
148
149
150
    int index = get_global_id(0);
    int base = index*TILE_SIZE;
    while (base < NUM_ATOMS) {
        real4 pos = posq[base];
#ifdef USE_PERIODIC
151
        APPLY_PERIODIC_TO_POS(pos)
152
153
154
155
156
157
158
159
#endif
        real4 minPos = pos;
        real4 maxPos = pos;
        int last = min(base+TILE_SIZE, NUM_ATOMS);
        for (int i = base+1; i < last; i++) {
            pos = posq[i];
#ifdef USE_PERIODIC
            real4 center = 0.5f*(maxPos+minPos);
160
            APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, center)
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
#endif
            minPos = (real4) (min(minPos.x,pos.x), min(minPos.y,pos.y), min(minPos.z,pos.z), 0);
            maxPos = (real4) (max(maxPos.x,pos.x), max(maxPos.y,pos.y), max(maxPos.z,pos.z), 0);
        }
        real4 blockSize = 0.5f*(maxPos-minPos);
        blockBoundingBox[index] = blockSize;
        blockCenter[index] = 0.5f*(maxPos+minPos);
        index += get_global_size(0);
        base = index*TILE_SIZE;
    }
    if (get_group_id(0) == 0 && get_local_id(0) == 0)
        *numNeighborPairs = 0;
}

/**
 * Find a list of neighbors for each atom.
 */
178
179
__kernel void findNeighbors(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
        __global const real4* restrict posq, __global const real4* restrict blockCenter, __global const real4* restrict blockBoundingBox, __global int2* restrict neighborPairs,
180
181
        __global int* restrict numNeighborPairs, __global int* restrict numNeighborsForAtom, int maxNeighborPairs
#ifdef USE_EXCLUSIONS
182
        , __global const int* restrict exclusions, __global const int* restrict exclusionStartIndex
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
#endif
        ) {
    __local real4 positionCache[FIND_NEIGHBORS_WORKGROUP_SIZE];
    __local bool includeBlockFlags[FIND_NEIGHBORS_WORKGROUP_SIZE];
    int indexInWarp = get_local_id(0)%32;
    int warpStart = get_local_id(0)-indexInWarp;
    for (int atom1 = get_global_id(0); atom1 < PADDED_NUM_ATOMS; atom1 += get_global_size(0)) {
        // Load data for this atom.  Note that all threads in a warp are processing atoms from the same block.
        
        real4 pos1 = posq[atom1];
        int block1 = atom1/TILE_SIZE;
        real4 blockCenter1 = blockCenter[block1];
        real4 blockSize1 = blockBoundingBox[block1];
        int totalNeighborsForAtom1 = 0;
        
        // Loop over atom blocks to search for neighbors.  The threads in a warp compare block1 against 32
        // other blocks in parallel.

#ifdef USE_CENTRAL_PARTICLE
        int startBlock = 0;
#else
        int startBlock = block1;
#endif
        for (int block2Base = startBlock; block2Base < NUM_BLOCKS; block2Base += 32) {
            int block2 = block2Base+indexInWarp;
            bool includeBlock2 = (block2 < NUM_BLOCKS);
            if (includeBlock2) {
                real4 blockCenter2 = blockCenter[block2];
                real4 blockSize2 = blockBoundingBox[block2];
                real4 blockDelta = blockCenter1-blockCenter2;
#ifdef USE_PERIODIC
214
                APPLY_PERIODIC_TO_DELTA(blockDelta)
215
#endif
216
217
218
                blockDelta.x = max((real) 0, fabs(blockDelta.x)-blockSize1.x-blockSize2.x);
                blockDelta.y = max((real) 0, fabs(blockDelta.y)-blockSize1.y-blockSize2.y);
                blockDelta.z = max((real) 0, fabs(blockDelta.z)-blockSize1.z-blockSize2.z);
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
                includeBlock2 &= (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z < CUTOFF_SQUARED);
            }
            
            // Loop over any blocks we identified as potentially containing neighbors.
            
            includeBlockFlags[get_local_id(0)] = includeBlock2;
            SYNC_WARPS;
            for (int i = 0; i < TILE_SIZE; i++) {
                if (includeBlockFlags[warpStart+i]) {
                    int block2 = block2Base+i;

                    // Loop over atoms in this block.

                    int start = block2*TILE_SIZE;
                    int included[TILE_SIZE];
                    int numIncluded = 0;
235
                    SYNC_WARPS;
236
                    positionCache[get_local_id(0)] = posq[start+indexInWarp];
237
                    SYNC_WARPS;
238
239
240
241
242
243
244
                    if (atom1 < NUM_ATOMS) {
                        for (int j = 0; j < 32; j++) {
                            int atom2 = start+j;
                            real4 pos2 = positionCache[get_local_id(0)-indexInWarp+j];

                            // Decide whether to include this atom pair in the neighbor list.

245
                            real4 atomDelta = delta(pos1, pos2, periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ);
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
#ifdef USE_CENTRAL_PARTICLE
                            bool includeAtom = (atom2 != atom1 && atom2 < NUM_ATOMS && atomDelta.w < CUTOFF_SQUARED);
#else
                            bool includeAtom = (atom2 > atom1 && atom2 < NUM_ATOMS && atomDelta.w < CUTOFF_SQUARED);
#endif
#ifdef USE_EXCLUSIONS
                            if (includeAtom)
                                includeAtom &= !isInteractionExcluded(atom1, atom2, exclusions, exclusionStartIndex);
#endif
                            if (includeAtom)
                                included[numIncluded++] = atom2;
                        }
                    }

                    // If we found any neighbors, store them to the neighbor list.

                    if (numIncluded > 0) {
                        int baseIndex = atom_add(numNeighborPairs, numIncluded);
                        if (baseIndex+numIncluded <= maxNeighborPairs)
                            for (int j = 0; j < numIncluded; j++)
                                neighborPairs[baseIndex+j] = (int2) (atom1, included[j]);
                        totalNeighborsForAtom1 += numIncluded;
                    }
                }
            }
        }
272
273
274
        if (atom1 < NUM_ATOMS)
            numNeighborsForAtom[atom1] = totalNeighborsForAtom1;
        SYNC_WARPS;
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
    }
}

/**
 * Sum the neighbor counts to compute the start position of each atom.  This kernel
 * is executed as a single work group.
 */
__kernel void computeNeighborStartIndices(__global int* restrict numNeighborsForAtom, __global int* restrict neighborStartIndex,
            __global int* restrict numNeighborPairs, int maxNeighborPairs) {
    __local unsigned int posBuffer[256];
    if (*numNeighborPairs > maxNeighborPairs) {
        // There wasn't enough memory for the neighbor list, so we'll need to rebuild it.  Set the neighbor start
        // indices to indicate no neighbors for any atom.
        
        for (int i = get_local_id(0); i <= NUM_ATOMS; i += get_local_size(0))
            neighborStartIndex[i] = 0;
        return;
    }
    unsigned int globalOffset = 0;
    for (unsigned int startAtom = 0; startAtom < NUM_ATOMS; startAtom += get_local_size(0)) {
        // Load the neighbor counts into local memory.

        unsigned int globalIndex = startAtom+get_local_id(0);
        posBuffer[get_local_id(0)] = (globalIndex < NUM_ATOMS ? numNeighborsForAtom[globalIndex] : 0);
299
        barrier(CLK_LOCAL_MEM_FENCE);
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316

        // Perform a parallel prefix sum.

        for (unsigned int step = 1; step < get_local_size(0); step *= 2) {
            unsigned int add = (get_local_id(0) >= step ? posBuffer[get_local_id(0)-step] : 0);
            barrier(CLK_LOCAL_MEM_FENCE);
            posBuffer[get_local_id(0)] += add;
            barrier(CLK_LOCAL_MEM_FENCE);
        }

        // Write the results back to global memory.

        if (globalIndex < NUM_ATOMS) {
            neighborStartIndex[globalIndex+1] = posBuffer[get_local_id(0)]+globalOffset;
            numNeighborsForAtom[globalIndex] = 0; // Clear this so the next kernel can use it as a counter
        }
        globalOffset += posBuffer[get_local_size(0)-1];
317
        barrier(CLK_LOCAL_MEM_FENCE);
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
    }
    if (get_local_id(0) == 0)
        neighborStartIndex[0] = 0;
}

/**
 * Assemble the final neighbor list.
 */
__kernel void copyPairsToNeighborList(__global const int2* restrict neighborPairs, __global int* restrict neighbors, __global int* restrict numNeighborPairs,
            int maxNeighborPairs, __global int* restrict numNeighborsForAtom, __global const int* restrict neighborStartIndex) {
    int actualPairs = *numNeighborPairs;
    if (actualPairs > maxNeighborPairs)
        return; // There wasn't enough memory for the neighbor list, so we'll need to rebuild it.
    for (unsigned int index = get_global_id(0); index < actualPairs; index += get_global_size(0)) {
        int2 pair = neighborPairs[index];
        int startIndex = neighborStartIndex[pair.x];
        int offset = atom_add(numNeighborsForAtom+pair.x, 1);
        neighbors[startIndex+offset] = pair.y;
    }
}